opsci-toolbox 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,15 +33,48 @@ from transformers import TextClassificationPipeline, AutoModelForSequenceClassif
  # CLEANING
  ####################################################################
 
- def filter_by_query(df, col_text, query, ignore_case=True, ignore_accent=True, match_word=False):
+ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
+ """
+ Filter DataFrame rows by a query on a specific text column.
+
+ Parameters:
+ df : pandas DataFrame
+ The DataFrame to filter.
+ col_text : str
+ The name of the column containing text data to query.
+ query : str
+ The query string to filter the DataFrame.
+ ignore_case : bool, optional
+ Whether to ignore case sensitivity. Default is True.
+ ignore_accent : bool, optional
+ Whether to ignore accents. Default is True.
+ match_word : bool, optional
+ Whether to match the whole word. Default is False.
+
+ Returns:
+ df_filtered : pandas DataFrame
+ The filtered DataFrame.
+ """
  eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
  df = df[df[col_text].apply(eldar_query)]
  df=df.reset_index(drop=True)
  return df
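
A minimal usage sketch for filter_by_query (illustrative only; assumes pandas and eldar are installed, sample data invented):

import pandas as pd
from eldar import Query  # same boolean-query engine the function relies on

df = pd.DataFrame({"text": ["Gone with the Wind", "hello world"]})
# keep only rows whose text matches the boolean query
df = filter_by_query(df, col_text="text", query='("gone with the wind") AND NOT ("star wars")')
print(len(df))  # 1 row kept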
 
- def TM_clean_text(df, col, col_clean):
+ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
  """
- Generic cleaning process for topic modeling
+ Generic cleaning process for topic modeling.
+
+ Parameters:
+ df : pandas DataFrame
+ The DataFrame containing text data.
+ col : str
+ The name of the column containing the original text data.
+ col_clean : str
+ The name of the column to store the cleaned text data.
+
+ Returns:
+ df : pandas DataFrame
+ The DataFrame with cleaned text data.
  """
  df[col_clean] = df[col].apply(lambda x : urls(x, repl= ''))
  df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
@@ -49,13 +82,35 @@ def TM_clean_text(df, col, col_clean):
  # df = df.loc[(df[col_clean] != ""), :]
  return df
 
- def extract_insta_shortcode(url):
+ def extract_insta_shortcode(url: str) -> str:
+ """
+ Extracts the shortcode from an Instagram URL.
+
+ Parameters:
+ url : str
+ The Instagram URL containing the shortcode.
+
+ Returns:
+ shortcode : str
+ The extracted shortcode.
+ """
  pattern =r'(?:https?:\/\/)?(?:www\.)?instagram\.com\/(?:p|reel|tv|stories)\/([a-zA-Z0-9_-]+)\/?'
 
  shortcode = re.findall(pattern, url)
  return shortcode[0]
 
- def remove_emojis(text):
+ def remove_emojis(text: str) -> str:
+ """
+ Removes emojis and their textual representations from a text string.
+
+ Parameters:
+ text : str
+ The input text string containing emojis.
+
+ Returns:
+ text_no_emojis : str
+ The input text string with emojis and their textual representations removed.
+ """
  # Convert emojis to their textual representations
  text_no_emojis = emoji.demojize(text)
 
@@ -64,24 +119,56 @@ def remove_emojis(text):
 
  return text_no_emojis
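
A quick sketch of the intended behaviour (emoji.demojize is the real entry point of the emoji package; the alias-stripping step sits in the elided lines of the function):

import emoji

text = "Great match 🔥"
print(emoji.demojize(text))   # "Great match :fire:"
print(remove_emojis(text))    # "Great match" once the :alias: token is stripped (modulo spacing)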
 
- def extract_urls_from_text(text):
- """Returns a list of URLs contained in text"""
+ def extract_urls_from_text(text: str) -> list:
+ """
+ Extracts URLs from a text string.
+
+ Parameters:
+ text : str
+ The input text string containing URLs.
+
+ Returns:
+ urls : list of str
+ A list of URLs extracted from the input text.
+ """
  extractor = URLExtract()
  urls = extractor.find_urls(text)
  return urls
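
A short usage sketch; URLExtract().find_urls is the actual urlextract API the function wraps:

text = "Docs at https://example.com/a and http://test.org"
print(extract_urls_from_text(text))
# ['https://example.com/a', 'http://test.org']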
 
- def extract_hashtags(text, lower=True):
+ def extract_hashtags(text: str, lower: bool = True) -> list:
  '''
- Using a regular expression to find hashtags in the text
+ Extracts hashtags from the text using a regular expression.
+
+ Parameters:
+ text : str
+ The input text string containing hashtags.
+ lower : bool, optional
+ Whether to convert extracted hashtags to lowercase. Default is True.
+
+ Returns:
+ hashtags : list of str
+ A list of hashtags extracted from the input text.
  '''
  hashtags = re.findall(r'\B#\w+', text)
  if lower :
  hashtags= [h.lower() for h in hashtags]
  return hashtags
 
- def extract_mentions(text, mention_char='@', lower=False):
+ def extract_mentions(text: str, mention_char: str = '@', lower: bool = False) -> list:
  '''
- Using a regular expression to find mentions in the text
+ Extracts mentions from the text using a regular expression.
+
+ Parameters:
+ text : str
+ The input text string containing mentions.
+ mention_char : str, optional
+ The character used to indicate mentions. Default is '@'.
+ lower : bool, optional
+ Whether to convert extracted mentions to lowercase. Default is False.
+
+ Returns:
+ mentions : list of str
+ A list of mentions extracted from the input text.
  '''
  pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))" + re.escape(mention_char) + r"([A-Za-z0-9_]{4,15})"
 
@@ -90,16 +177,36 @@ def extract_mentions(text, mention_char='@', lower=False):
  mentions = [mention.lower() for mention in mentions]
  return mentions
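
Both helpers are plain regex utilities; expected in/out on an invented sample (note that extract_mentions returns handles without the leading '@'):

post = "Big win for @TeamFrance today! #JO2024 #Paris"
print(extract_hashtags(post))   # ['#jo2024', '#paris']
print(extract_mentions(post))   # ['TeamFrance']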
 
- def remove_extra_spaces(text):
+ def remove_extra_spaces(text: str) -> str:
  """
- Remove extra spaces
+ Removes extra spaces from a text string.
+
+ Parameters:
+ text : str
+ The input text string with extra spaces.
+
+ Returns:
+ cleaned_text : str
+ The input text string with extra spaces removed.
  """
  cleaned_text = re.sub(r'\s+', ' ', text)
  return cleaned_text.strip()
 
- def remove_characters(text: str, start_indices: list, end_indices: list):
+ def remove_characters(text: str, start_indices: list, end_indices: list) -> str:
  """
- Remove words from a text using list of indices
+ Remove characters from a text string using lists of start and end indices.
+
+ Parameters:
+ text : str
+ The input text string.
+ start_indices : list of int
+ A list of start indices indicating the positions from which characters should be removed.
+ end_indices : list of int
+ A list of end indices indicating the positions up to which characters should be removed.
+
+ Returns:
+ result : str
+ The input text string with characters removed based on the specified indices.
  """
  if start_indices is None or len(start_indices) <1:
  return text
@@ -123,9 +230,17 @@ def remove_characters(text: str, start_indices: list, end_indices: list):
  return result
 
 
- def load_stopwords_df(lang):
+ def load_stopwords_df(lang: str) -> pd.DataFrame:
  """
  Load a CSV file without header containing stopwords. If the file doesn't exist, it creates an empty file.
+
+ Parameters:
+ lang : str
+ The language code used to identify the stopwords file.
+
+ Returns:
+ df : pandas DataFrame
+ A DataFrame containing stopwords loaded from the file.
  """
  lexicon_dir = os.path.join(os.getcwd(), "lexicons")
  file_path = os.path.join(lexicon_dir, f"stop_words_{lang.lower()}.csv")
@@ -150,11 +265,21 @@ def load_stopwords_df(lang):
 
  return df
 
-
-
- def add_stopwords(lang:str, new_stopwords:list, lower:bool = True):
+ def add_stopwords(lang: str, new_stopwords: list, lower: bool = True) -> pd.DataFrame:
  """
  Add a list of stopwords to an existing file. It removes duplicates.
+
+ Parameters:
+ lang : str
+ The language code used to identify the stopwords file.
+ new_stopwords : list of str
+ The list of stopwords to add.
+ lower : bool, optional
+ Whether to convert the new stopwords to lowercase before adding. Default is True.
+
+ Returns:
+ new_df : pandas DataFrame
+ A DataFrame containing the updated list of stopwords.
  """
  df = load_stopwords_df(lang)
  init_size = len(df.iloc[:, 0].unique()) # Selecting the first column
@@ -173,13 +298,21 @@ def add_stopwords(lang:str, new_stopwords:list, lower:bool = True):
  lexicon_dir = os.path.join(os.getcwd(), "lexicons")
  file_path = os.path.join(lexicon_dir, f"stop_words_{lang.lower()}.csv")
  new_df.to_csv(file_path, encoding="utf-8", index=False)
-
-
  return new_df
 
- def remove_stopwords(lang:str, stopwords:list):
+ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
  """
  Remove stopwords from an existing file.
+
+ Parameters:
+ lang : str
+ The language code used to identify the stopwords file.
+ stopwords : list of str
+ The list of stopwords to remove.
+
+ Returns:
+ df : pandas DataFrame
+ A DataFrame containing the updated list of stopwords after removal.
  """
  df = load_stopwords_df(lang)
  init_size = len(df.iloc[:, 0].unique()) # Selecting the first column
@@ -193,14 +326,24 @@ def remove_stopwords(lang:str, stopwords:list):
  return df
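
A sketch of the round trip; both helpers read and write lexicons/stop_words_<lang>.csv under the current working directory:

add_stopwords("en", ["RT", "via"], lower=True)   # appends 'rt' and 'via', removes duplicates
df_sw = remove_stopwords("en", ["via"])          # drops 'via' again
print(df_sw.iloc[:, 0].tolist())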
 
 
- def keep_alphanum_char(text:str, replace:str = ''):
+ def keep_alphanum_char(text: str, replace: str = '') -> str:
  """
- Replace all non-alphanumeric characters
+ Replace all non-alphanumeric characters in a text string.
+
+ Parameters:
+ text : str
+ The input text string.
+ replace : str, optional
+ The string to replace non-alphanumeric characters with. Default is an empty string.
+
+ Returns:
+ cleaned_text : str
+ The input text string with non-alphanumeric characters replaced.
  """
  return re.sub("[^a-zA-Z0-9]", replace, text)
 
 
- def substitute_punctuations_with_white_space(text):
+ def substitute_punctuations_with_white_space(text : str) -> str:
  """
  Substitute punctuations with white spaces in the input string.
 
@@ -213,7 +356,28 @@ def substitute_punctuations_with_white_space(text):
  text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
  return text
 
- def translate_wt_libre(text, source, target, filename, dir_json, url = "http://127.0.0.1:5000/translate"):
+ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+ """
+ Translate text using LibreTranslate service.
+
+ Parameters:
+ text : str
+ The text to be translated.
+ source : str
+ The source language code.
+ target : str
+ The target language code.
+ filename : str
+ The filename to save the translation result.
+ dir_json : str
+ The directory to save the translation result JSON file.
+ url : str, optional
+ The URL of the LibreTranslate service. Default is "http://127.0.0.1:5000/translate".
+
+ Returns:
+ json_data : dict
+ The translation result in JSON format.
+ """
  headers = {"Content-Type": "application/json"}
  payload = {
  "q": text,
@@ -231,7 +395,28 @@ def translate_wt_libre(text, source, target, filename, dir_json, url = "http://1
  write_json(json_data, dir_json , str(filename))
  return json_data
 
- def translate_batch(batch_text, source, target, filename, dir_json, url = "http://127.0.0.1:5000/translate"):
+ def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+ """
+ Translate a batch of texts using LibreTranslate service.
+
+ Parameters:
+ batch_text : list of str
+ The list of texts to be translated.
+ source : str
+ The source language code.
+ target : str
+ The target language code.
+ filename : str
+ The filename to save the translation results.
+ dir_json : str
+ The directory to save the translation result JSONL file.
+ url : str, optional
+ The URL of the LibreTranslate service. Default is "http://127.0.0.1:5000/translate".
+
+ Returns:
+ json_results : list of dict
+ The translation results as a list of dictionaries containing 'translated_text' and 'clean_text'.
+ """
  headers = {"Content-Type": "application/json"}
  payload = {
  "q": batch_text,
@@ -253,7 +438,24 @@ def translate_batch(batch_text, source, target, filename, dir_json, url = "http:
  write_jsonl(json_results, dir_json , str(filename))
  return json_results
 
- def translate(text, source, target, url = "http://127.0.0.1:5000/translate"):
+ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+ """
+ Translate text using LibreTranslate service.
+
+ Parameters:
+ text : str
+ The text to be translated.
+ source : str
+ The source language code.
+ target : str
+ The target language code.
+ url : str, optional
+ The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+
+ Returns:
+ translatedText : str
+ The translated text.
+ """
  headers = {"Content-Type": "application/json"}
  payload = {
  "q": text,
@@ -268,7 +470,24 @@ def translate(text, source, target, url = "http://127.0.0.1:5000/translate"):
  translatedText = json_data.get("translatedText", "")
  return translatedText
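
A usage sketch, assuming a LibreTranslate server is actually listening on the default URL (the service is not bundled with this package):

# e.g. started separately with: libretranslate --host 127.0.0.1 --port 5000
print(translate("Bonjour tout le monde", source="fr", target="en"))
# "Hello everyone" (exact wording depends on the installed translation model)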
 
- def translate_row(df, col, source="auto", target = "en"):
+ def translate_row(df: pd.DataFrame, col: str, source: str = "auto", target: str = "en") -> pd.DataFrame:
+ """
+ Translate the text in a specific column of a DataFrame.
+
+ Parameters:
+ df : pandas DataFrame
+ The DataFrame containing the text to be translated.
+ col : str
+ The name of the column containing the text to be translated.
+ source : str, optional
+ The source language code. Default is "auto".
+ target : str, optional
+ The target language code. Default is "en" (English).
+
+ Returns:
+ df : pandas DataFrame
+ The DataFrame with an additional column containing the translated text.
+ """
  translations =[]
  for i, row in df.iterrows():
  txt_to_translate = row[col].replace(' | ', ', ')
@@ -281,27 +500,63 @@ def translate_row(df, col, source="auto", target = "en"):
  # METRICS
  ###################################################################
 
- def cosine_similarity(a, b):
+ def cosine_similarity(a: np.array, b: np.array) -> float:
  """
- calculate cosine similarity between two vectors
+ Calculate the cosine similarity between two vectors.
+
+ Parameters:
+ a : numpy array
+ The first vector.
+ b : numpy array
+ The second vector.
+
+ Returns:
+ similarity : float
+ The cosine similarity between the two vectors.
  """
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
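
A tiny worked check of the formula a·b / (||a|| ||b||):

import numpy as np

a = np.array([1.0, 0.0])
b = np.array([1.0, 1.0])
print(cosine_similarity(a, a))   # 1.0 (identical direction)
print(cosine_similarity(a, b))   # 0.7071... = 1/sqrt(2)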
-
- def approximate_tokens(text):
+
+ def approximate_tokens(text: str) -> int:
  """
- Approximate the number of tokens
+ Approximate the number of tokens in a text.
+
+ Parameters:
+ text : str
+ The input text.
+
+ Returns:
+ num_tokens : int
+ The approximate number of tokens in the text.
  """
  return len(text.split(' '))
 
- def approximate_unique_tokens(text):
+ def approximate_unique_tokens(text: str) -> int:
  """
- Approximate the number of distinct tokens
+ Approximate the number of distinct tokens in a text.
+
+ Parameters:
+ text : str
+ The input text.
+
+ Returns:
+ num_unique_tokens : int
+ The approximate number of distinct tokens in the text.
  """
- return len(list(set(text.split(' '))))
+ return len(set(text.split(' ')))
 
- def count_word_occurrences(text, word):
+ def count_word_occurrences(text: str, word: str) -> int:
  """
- Count word occurences
+ Count the occurrences of a word in a text.
+
+ Parameters:
+ text : str
+ The input text.
+ word : str
+ The word to count occurrences of.
+
+ Returns:
+ occurrences : int
+ The number of occurrences of the word in the text.
  """
  # Convert both text and word to lowercase for case-insensitive matching
  word_lower = word.lower()
@@ -312,8 +567,10 @@ def count_word_occurrences(text, word):
  return occurrences
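
Expected behaviour, given the case-folding visible above (the counting itself sits in the elided lines):

print(count_word_occurrences("The cat sat on the mat", "the"))   # 2, matching is case-insensitive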
 
 
- def chi2_per_category(lst_text, lst_categorie, col_cat, n_words = 10, p_value_limit=0.95, min_freq=3):
+ def chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
  """
+ Calculate Chi-squared (Chi2) statistics per category based on the provided texts and corresponding categories.
+
  Parameters:
  lst_text : list
  List of texts for which Chi2 will be calculated.
@@ -353,7 +610,34 @@ def chi2_per_category(lst_text, lst_categorie, col_cat, n_words = 10, p_value_li
  df_chi = df_chi.reset_index(drop=True)
  return df_chi
 
- def word_frequency_per_categorie(df, col_text, col_cat, ngram_range=(1, 1), stop_words=[], n_words = 20, min_freq=3):
+ def word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+ """
+ Calculate word frequency per category.
+
+ Parameters:
+ df : pandas DataFrame
+ DataFrame containing text data and corresponding categories.
+ col_text : str
+ Name of the column containing the text data.
+ col_cat : str
+ Name of the column containing the categories.
+ ngram_range : tuple, optional
+ The range for n-grams. Default is (1, 1) for unigrams.
+ stop_words : list, optional
+ List of stopwords to be ignored during frequency calculation. Default is an empty list.
+ n_words : int, optional
+ Number of top words to display per category. Default is 20.
+ min_freq : int, optional
+ Minimum frequency threshold for word occurrences per category. Default is 3.
+
+ Returns:
+ DataFrame
+ DataFrame containing word frequencies per category.
+
+ Description:
+ This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+ It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+ """
  count_vectorizer = CountVectorizer(token_pattern=r'[^\s]+', ngram_range=ngram_range, stop_words=stop_words)
  X_train_count = count_vectorizer.fit_transform(df[col_text].to_list())
  X_names_count = count_vectorizer.get_feature_names_out()
@@ -370,10 +654,29 @@ def word_frequency_per_categorie(df, col_text, col_cat, ngram_range=(1, 1), stop
  return df_count
 
 
- def top_items_per_category(df, col_lst ="hashtags", col_cat = "soft_topic", col_id = "tweet_id", n_items= 10):
+ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat: str = "soft_topic", col_id: str = "tweet_id", n_items: int = 10) -> pd.DataFrame:
  """
- Take a dataframe with a column containing lists of tokens (ex hashtags) and count their occurences grouped by a category.
- For instance : count the most used hashtags per topic, metric will be a volume of tweets
+ Count the occurrences of items (e.g., hashtags) per category and select the top items per category.
+
+ Parameters:
+ df : pandas DataFrame
+ DataFrame containing data.
+ col_lst : str, optional
+ Name of the column containing lists of items (e.g., hashtags). Default is "hashtags".
+ col_cat : str, optional
+ Name of the column containing categories. Default is "soft_topic".
+ col_id : str, optional
+ Name of the column containing unique identifiers. Default is "tweet_id".
+ n_items : int, optional
+ Number of top items to select per category. Default is 10.
+
+ Returns:
+ DataFrame
+ DataFrame containing the top items per category.
+
+ Description:
+ This function takes a DataFrame with a column containing lists of tokens (e.g., hashtags) and counts their occurrences grouped by a category.
+ It then selects the most frequently occurring items per category based on the provided metric (e.g., volume of tweets).
  """
  df_count = (df[[col_cat, col_id, col_lst]].explode(col_lst)
  .groupby([col_cat, col_lst], group_keys=False)
@@ -388,8 +691,31 @@ def top_items_per_category(df, col_lst ="hashtags", col_cat = "soft_topic", col_
  )
  return df_count
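
A usage sketch on a toy frame (column names follow the defaults; data is invented):

import pandas as pd

df = pd.DataFrame({
    "soft_topic": [0, 0, 1],
    "tweet_id": ["t1", "t2", "t3"],
    "hashtags": [["#ai", "#nlp"], ["#ai"], ["#sport"]],
})
print(top_items_per_category(df, n_items=2))
# topic 0 -> #ai (2 tweets), #nlp (1); topic 1 -> #sport (1)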
 
- def topic_representation(df_processed_data, col_topic, col_id, col_engagement, col_user_id, metrics):
+ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
+ """
+ Calculate the representation of topics in a processed DataFrame.
+
+ Parameters:
+ df_processed_data : pandas DataFrame
+ DataFrame containing processed data.
+ col_topic : str
+ Name of the column containing topic labels.
+ col_id : str
+ Name of the column containing unique identifiers.
+ col_engagement : str
+ Name of the column containing engagement metrics.
+ col_user_id : str
+ Name of the column containing user identifiers.
+ metrics : dict
+ Dictionary containing additional metrics to aggregate.
+
+ Returns:
+ DataFrame
+ DataFrame containing the representation of topics.
 
+ Description:
+ This function aggregates various metrics for each topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+ """
  # make sure the metric columns are complete and typed as float
  # df_processed_data[metrics]=df_processed_data[metrics].fillna(0).astype(float)
 
@@ -436,7 +762,31 @@ def topic_representation(df_processed_data, col_topic, col_id, col_engagement, c
  df_distrib_all[col_topic]=df_distrib_all[col_topic].astype(str)
  return df_distrib_all
 
- def generic_representation(df_processed_data, col_gb, col_id, col_engagement, col_user_id = None, metrics={}):
+ def generic_representation(df_processed_data: pd.DataFrame, col_gb: str, col_id: str, col_engagement: str, col_user_id: str = None, metrics: dict = {}) -> pd.DataFrame:
+ """
+ Calculate a generic representation of data based on grouping by a specified column.
+
+ Parameters:
+ df_processed_data : pandas DataFrame
+ DataFrame containing processed data.
+ col_gb : str
+ Name of the column to group by.
+ col_id : str
+ Name of the column containing unique identifiers.
+ col_engagement : str
+ Name of the column containing engagement metrics.
+ col_user_id : str, optional
+ Name of the column containing user identifiers. Default is None.
+ metrics : dict, optional
+ Dictionary containing additional metrics to aggregate. Default is an empty dictionary.
+
+ Returns:
+ DataFrame
+ DataFrame containing the generic representation of data.
+
+ Description:
+ This function aggregates various metrics for each group, including verbatim counts, engagement sums, and any additional metrics provided in the `metrics` parameter. It also computes derived metrics such as verbatims per user and engagement per verbatim. Finally, it calculates percentages for verbatims, engagements, and users (if applicable) within each group.
+ """
  # create a dictionary holding the aggregations
  metrics_dict = dict()
  metrics_dict['verbatims']=(col_id,'nunique')
@@ -460,7 +810,23 @@ def generic_representation(df_processed_data, col_gb, col_id, col_engagement, co
 
  return df_distrib_all
 
- def create_frequency_table(df, col):
+ def create_frequency_table(df: pd.DataFrame, col: str) -> pd.DataFrame:
+ """
+ Create a frequency table for a given column in a DataFrame.
+
+ Parameters:
+ df : pandas DataFrame
+ DataFrame containing the data.
+ col : str
+ Name of the column for which the frequency table is to be created.
+
+ Returns:
+ pandas DataFrame
+ DataFrame containing the frequency table.
+
+ Description:
+ This function generates a frequency table for the specified column in the DataFrame. It sorts the DataFrame by the specified column in descending order, calculates the rank of each entry, and assigns dense ranks both ascending and descending.
+ """
  df_frequency=(df.sort_values(col, ascending=False)
  .reset_index(drop=True)
  .reset_index()
@@ -475,9 +841,22 @@ def create_frequency_table(df, col):
  # SAMPLING
  ###################################################################
 
- def calculate_sample(len_df, n_rows):
+ def calculate_sample(len_df: int, n_rows: float) -> int:
  """
- Percentage conversion to number of rows
+ Convert a percentage to the number of rows to sample.
+
+ Parameters:
+ len_df : int
+ Length of the DataFrame.
+ n_rows : float
+ Number of rows to sample. If less than or equal to 1, it's treated as a percentage.
+
+ Returns:
+ int
+ Number of rows to sample.
+
+ Description:
+ This function converts a percentage of the DataFrame length into a number of rows to sample. If `n_rows` is between 0 and 1, it's treated as a percentage and converted into an integer representing the top `n_rows` percentage of the DataFrame length. If `n_rows` is greater than 1 or equal to 0, it's treated as an absolute number of rows.
+ """
  if 0 < n_rows <= 1 :
  top_rows = int(n_rows * len_df)
@@ -488,12 +867,26 @@ def calculate_sample(len_df, n_rows):
  else :
  print("ERROR - invalid sampling parameter")
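
The two branches in practice (the absolute-count branch lives in the elided lines above):

print(calculate_sample(10000, 0.25))   # 2500 rows, treated as a percentage
print(calculate_sample(10000, 500))    # 500 rows, treated as an absolute count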
 
- def sampling_by_engagement(df, col_engagement, top_rows=0.3, sample_size=0.5):
+ def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: float = 0.3, sample_size: float = 0.5) -> pd.DataFrame:
  """
- Create a sample dataset by keeping a part of the top publications:
- - sample_size : final size of the sample. Ex : 1000 rows from an original dataset of 100000 rows
- - top_rows : number of "most engaging" rows to keep
- Values could be either an integer or a float between 0 and 1 (= sample a percentage)
+ Create a sample dataset by keeping a part of the top publications based on engagement metrics.
+
+ Parameters:
+ df : pandas.DataFrame
+ The original DataFrame.
+ col_engagement : str
+ The column name containing the engagement metrics.
+ top_rows : float, optional
+ The number of "most engaging" rows to keep. Values could be either an integer or a float between 0 and 1 (= sample a percentage). Default is 0.3.
+ sample_size : float, optional
+ The final size of the sample. Ex: 1000 rows from an original dataset of 100000 rows. Values could be either an integer or a float between 0 and 1 (= sample a percentage). Default is 0.5.
+
+ Returns:
+ pandas.DataFrame
+ The sampled DataFrame.
+
+ Description:
+ This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness.
  """
 
  sample_rows = calculate_sample(len(df), sample_size)
@@ -514,9 +907,28 @@ def sampling_by_engagement(df, col_engagement, top_rows=0.3, sample_size=0.5):
  else:
  return df
 
- def sample_most_engaging_posts(df, col_topic, col_engagement, sample_size= 0.1, min_size=10):
+ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement: str, sample_size: float = 0.1, min_size: int = 10) -> pd.DataFrame:
  """
- "Stratified sample" of the most engaging content per topic. Returns a minimun number of items per group.
+ Perform a "stratified sample" of the most engaging content per topic, ensuring a minimum number of items per group.
+
+ Parameters:
+ df : pandas.DataFrame
+ The DataFrame containing the data.
+ col_topic : str
+ The column name containing the topic information.
+ col_engagement : str
+ The column name containing the engagement metrics.
+ sample_size : float, optional
+ The size of the sample relative to the total data. Default is 0.1 (10%).
+ min_size : int, optional
+ The minimum number of items to retain per group. Default is 10.
+
+ Returns:
+ pandas.DataFrame
+ The sampled DataFrame.
+
+ Description:
+ This function performs a "stratified sample" of the most engaging content per topic. It sorts the data by engagement metrics within each topic group, and then takes a sample of `sample_size` proportion from each group. If a group has fewer than `min_size` items, it retains all items in that group.
  """
  df = (df.groupby(col_topic, group_keys=False)
  .apply(lambda x: x.sort_values(by=col_engagement, ascending=False)
@@ -532,10 +944,38 @@ def sample_most_engaging_posts(df, col_topic, col_engagement, sample_size= 0.1,
  def get_lang_detector(nlp, name):
  return LanguageDetector(seed=42) # We use the seed 42
 
- def TM_nlp_process(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True, join_list = False):
+ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
  """
- Spacy implementation for topic modeling
-
+ Perform natural language processing tasks using spaCy for topic modeling.
+
+ Parameters:
+ nlp : spacy.Language
+ The spaCy language model.
+ df : pandas.DataFrame
+ The DataFrame containing the text data.
+ col_text : str
+ The name of the column containing the text data.
+ col_lemma : str
+ The name of the column to store the lemmatized text data.
+ pos_to_keep : list
+ A list of part-of-speech tags to keep during lemmatization.
+ stopwords : list
+ A list of stopwords to remove during processing.
+ batch_size : int, optional
+ The batch size for spaCy processing. Default is 100.
+ n_process : int, optional
+ The number of processes for parallel processing. Default is 1.
+ stats : bool, optional
+ Whether to compute and store additional statistics. Default is True.
+ join_list : bool, optional
+ Whether to join the lemmas into a single string. Default is False.
+
+ Returns:
+ pandas.DataFrame
+ The DataFrame with processed text data.
+
+ Description:
+ This function utilizes spaCy for natural language processing tasks such as lemmatization, emoji extraction, and token counting. It processes the text data in the DataFrame and returns the DataFrame with additional columns for lemmatized text, emoji counts, token counts, and more.
  """
  all_lemmas=[]
  tokens_counts=[]
@@ -587,17 +1027,19 @@ def TM_nlp_process(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_s
  return df
 
 
- def load_spacy_model(model, disable_components=["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect=False, emoji=False):
+ def load_spacy_model(model: str, disable_components: list = ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect: bool = False, emoji: bool = False) -> spacy.language.Language:
  """
+ Load a spaCy model with optional configurations.
+
  Parameters:
- model : str
- Name of the spaCy model to load.
- disable_components : list, optional
- List of spaCy components to disable. Default is ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"].
- lang_detect : bool, optional
- Flag indicating whether language detection should be enabled. Default is False.
- emoji : bool, optional
- Flag indicating whether to include the emoji component in the spaCy pipeline. Default is False.
+ model : str
+ Name of the spaCy model to load.
+ disable_components : list, optional
+ List of spaCy components to disable. Default is ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"].
+ lang_detect : bool, optional
+ Flag indicating whether language detection should be enabled. Default is False.
+ emoji : bool, optional
+ Flag indicating whether to include the emoji component in the spaCy pipeline. Default is False.
 
  Returns:
  nlp : spacy.language.Language
@@ -611,7 +1053,6 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
  If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
  language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
  will be included in the spaCy pipeline.
-
  """
  if torch.cuda.is_available():
 
@@ -631,10 +1072,25 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
 
  return nlp
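
A loading sketch (assumes the en_core_web_sm model has been downloaded beforehand via python -m spacy download en_core_web_sm):

nlp = load_spacy_model("en_core_web_sm", lang_detect=True, emoji=True)
df = spacy_langdetect(nlp, df, col_text="text")   # uses the language-detection pipe enabled above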
 
- def get_labels(nlp, pipe_step="ner", explanations=False):
- """ Return labels associated to a pipeline step and explanations
- Available names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner']
-
+ def get_labels(nlp: spacy.language.Language, pipe_step: str = "ner", explanations: bool = False) -> pd.DataFrame:
+ """
+ Return labels associated with a pipeline step and optionally provide explanations.
+
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline.
+ pipe_step : str, optional
+ The pipeline step for which labels are retrieved. Default is "ner".
+ explanations : bool, optional
+ Flag indicating whether to include explanations for the labels. Default is False.
+
+ Returns:
+ DataFrame
+ DataFrame containing the labels associated with the specified pipeline step.
+
+ Description:
+ This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline.
+ It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
  """
  pipe_details=nlp.get_pipe(pipe_step)
  labels=list(pipe_details.labels)
@@ -646,9 +1102,30 @@ def get_labels(nlp, pipe_step="ner", explanations=False):
  return df
 
 
- def spacy_langdetect(nlp, df, col_text, batch_size=100, n_process=1):
+ def spacy_langdetect(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
  """
- Detect language and returns a score
+ Detect language and return a score.
+
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline with language detection enabled.
+ df : pd.DataFrame
+ DataFrame containing the text data to analyze.
+ col_text : str
+ The name of the column containing the text data.
+ batch_size : int, optional
+ The batch size for processing texts. Default is 100.
+ n_process : int, optional
+ The number of processes to use for language detection. Default is 1.
+
+ Returns:
+ pd.DataFrame
+ DataFrame containing the detected languages and their scores.
+
+ Description:
+ This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.
+ It returns a DataFrame containing the detected languages and their scores, which indicate the confidence level
+ of the language detection for each text.
  """
  text=list(df[col_text].astype('unicode').values)
 
@@ -662,9 +1139,32 @@ def spacy_langdetect(nlp, df, col_text, batch_size=100, n_process=1):
 
  return df
 
- def extract_noun_chunks(nlp, df, col_text, batch_size=100, n_process=1, stats=False):
+ def extract_noun_chunks(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
  """
- Spacy implementation to extract noun chunks
+ Spacy implementation to extract noun chunks.
+
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline.
+ df : pd.DataFrame
+ DataFrame containing the text data to analyze.
+ col_text : str
+ The name of the column containing the text data.
+ batch_size : int, optional
+ The batch size for processing texts. Default is 100.
+ n_process : int, optional
+ The number of processes to use for text processing. Default is 1.
+ stats : bool, optional
+ Flag indicating whether to compute statistics about the noun chunks. Default is False.
+
+ Returns:
+ pd.DataFrame
+ DataFrame containing the extracted noun chunks and their statistics if enabled.
+
+ Description:
+ This function utilizes spaCy's noun chunk extraction capabilities to extract noun chunks from text data in a DataFrame.
+ It returns a DataFrame containing the extracted noun chunks for each text. Optionally, it can compute statistics such
+ as the count of noun chunks and unique noun chunks if the 'stats' parameter is set to True.
  """
  all_chunks = []
  all_unique_chunks =[]
@@ -689,10 +1189,32 @@ def extract_noun_chunks(nlp, df, col_text, batch_size=100, n_process=1, stats=Fa
  df['unique_noun_chunks_count']=unique_chunks_count
  return df
 
- def extract_emojis(nlp, df, col_text, batch_size=100, n_process=1, stats=True):
+ def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
  """
  Spacy implementation to extract emojis
 
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline.
+ df : pd.DataFrame
+ DataFrame containing the text data to analyze.
+ col_text : str
+ The name of the column containing the text data.
+ batch_size : int, optional
+ The batch size for processing texts. Default is 100.
+ n_process : int, optional
+ The number of processes to use for text processing. Default is 1.
+ stats : bool, optional
+ Flag indicating whether to compute statistics about the emojis. Default is True.
+
+ Returns:
+ pd.DataFrame
+ DataFrame containing the extracted emojis and their statistics if enabled.
+
+ Description:
+ This function utilizes spaCy's emoji detection capabilities to extract emojis from text data in a DataFrame.
+ It returns a DataFrame containing the extracted emojis for each text. Optionally, it can compute statistics such
+ as the count of emojis and unique emojis if the 'stats' parameter is set to True.
  """
  all_emojis=[]
  all_unique_emojis=[]
@@ -720,9 +1242,33 @@ def extract_emojis(nlp, df, col_text, batch_size=100, n_process=1, stats=True):
 
  return df
 
- def split_n_sentences(nlp, df, col_text, n_sentences=1, batch_size=100, n_process=1, stats=False):
+ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
  """
  Split a text into chunks of n sentences
+
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline.
+ df : pd.DataFrame
+ DataFrame containing the text data to split.
+ col_text : str
+ The name of the column containing the text data.
+ n_sentences : int, optional
+ The number of sentences to group together. Default is 1.
+ batch_size : int, optional
+ The batch size for processing texts. Default is 100.
+ n_process : int, optional
+ The number of processes to use for text processing. Default is 1.
+ stats : bool, optional
+ Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+ Returns:
+ pd.DataFrame
+ DataFrame containing the split sentences.
+
+ Description:
+ This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
+ Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
  """
 
  text=list(df[col_text].astype('unicode').values)
@@ -753,12 +1299,32 @@ def split_n_sentences(nlp, df, col_text, n_sentences=1, batch_size=100, n_proces
  return df
 
 
- def spacy_NER(nlp, df, col_text, entities_to_keep=['PERSON','ORG'], explode= True):
+ def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True) -> pd.DataFrame:
  """
  Spacy implementation of NER.
  To define entities type to keep, call get_labels(nlp, pipe_step="ner", explanations=False)
- explode = False means it return 1 list of entities per document
+ explode = False means it returns 1 list of entities per document
  explode = True means it returns 1 entity per row
+
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline.
+ df : pd.DataFrame
+ DataFrame containing the text data.
+ col_text : str
+ The name of the column containing the text data.
+ entities_to_keep : list, optional
+ List of entity types to keep. Default is ['PERSON','ORG'].
+ explode : bool, optional
+ Flag indicating whether to explode the DataFrame to have one entity per row. Default is True.
+
+ Returns:
+ pd.DataFrame
+ DataFrame containing the NER information.
+
+ Description:
+ This function performs Named Entity Recognition (NER) using spaCy on a DataFrame with text data. It extracts entities of the specified types
+ and stores the NER information in separate columns. If 'explode' is set to True, it returns one entity per row in the DataFrame.
  """
  # Create columns to store the NER information
  df['NER_type'] = None
@@ -797,10 +1363,38 @@ def spacy_NER(nlp, df, col_text, entities_to_keep=['PERSON','ORG'], explode= Tru
  return df
 
 
- def tokenize(nlp, df, col_text, col_tokens, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True):
+ def tokenize(nlp, df: pd.DataFrame, col_text: str, col_tokens: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
  """
  Spacy implementation to tokenize text
 
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline.
+ df : pd.DataFrame
+ DataFrame containing the text data.
+ col_text : str
+ The name of the column containing the text data.
+ col_tokens : str
+ The name of the column to store the tokenized text.
+ pos_to_keep : list
+ List of POS tags to keep.
+ stopwords : list
+ List of stopwords to exclude from tokens.
+ batch_size : int, optional
+ Batch size for processing. Default is 100.
+ n_process : int, optional
+ Number of processes for parallel processing. Default is 1.
+ stats : bool, optional
+ Flag indicating whether to calculate and store statistics. Default is True.
+
+ Returns:
+ pd.DataFrame
+ DataFrame containing the tokenized text.
+
+ Description:
+ This function tokenizes text using spaCy and stores the tokens in a new column in the DataFrame.
+ It allows filtering tokens based on POS tags and stopwords. If 'stats' is set to True, it calculates
+ and stores token counts.
  """
  all_tokens=[]
  tokens_counts=[]
@@ -832,10 +1426,40 @@ def tokenize(nlp, df, col_text, col_tokens, pos_to_keep, stopwords, batch_size=1
  return df
 
 
- def lemmatize(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True, join_list = False):
+ def lemmatize(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
  """
  Spacy implementation to lemmatize text
 
+ Parameters:
+ nlp : spacy.language.Language
+ The spaCy language processing pipeline.
+ df : pd.DataFrame
+ DataFrame containing the text data.
+ col_text : str
+ The name of the column containing the text data.
+ col_lemma : str
+ The name of the column to store the lemmatized text.
+ pos_to_keep : list
+ List of POS tags to keep.
+ stopwords : list
+ List of stopwords to exclude from lemmas.
+ batch_size : int, optional
+ Batch size for processing. Default is 100.
+ n_process : int, optional
+ Number of processes for parallel processing. Default is 1.
+ stats : bool, optional
+ Flag indicating whether to calculate and store statistics. Default is True.
+ join_list : bool, optional
+ Flag indicating whether to join the lemmas into a single string. Default is False.
+
+ Returns:
+ pd.DataFrame
+ DataFrame containing the lemmatized text.
+
+ Description:
+ This function lemmatizes text using spaCy and stores the lemmatized text in a new column in the DataFrame.
+ It allows filtering lemmas based on POS tags and stopwords. If 'stats' is set to True, it calculates
+ and stores token counts.
  """
  all_lemmas=[]
  tokens_counts=[]
@@ -871,12 +1495,11 @@ def lemmatize(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=1
  return df
 
 
-
  ####################################################################
  # VECTORISATION
  ####################################################################
 
- def count_vectorize(lst_text):
+ def count_vectorize(lst_text: list) -> tuple:
  """
  Parameters:
  lst_text : list
@@ -905,8 +1528,8 @@ def count_vectorize(lst_text):
 
  return count_vectorizer, features, features_names, vocabulary
 
- def tfidf_vectorize(lst_text, analyzer='word', max_df=1.0, max_features=None,
- min_df=1, use_idf=True, ngram_range=(1,1), stop_words=None):
+ def tfidf_vectorize(lst_text: list, analyzer: str = 'word', max_df: float = 1.0, max_features: int = None,
+ min_df: float = 1, use_idf: bool = True, ngram_range: tuple = (1, 1), stop_words: list = None) -> tuple:
  """
  Parameters:
  lst_text : list
@@ -959,15 +1582,29 @@ def tfidf_vectorize(lst_text, analyzer='word', max_df=1.0, max_features=None,
 
  return tfidf_vectorizer, features, features_names, vocabulary
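
A usage sketch; the four return values mirror count_vectorize (data invented):

texts = ["the cat sat", "the dog barked"]
vectorizer, features, names, vocab = tfidf_vectorize(texts, ngram_range=(1, 2), min_df=1)
print(features.shape)   # (2, n_features) sparse TF-IDF matrix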
 
- def SF_vectorize(lst_text, model_name):
+ def SF_vectorize(lst_text: list, model_name: str) -> np.array:
  """
- Vectorize text using Sentence Transformers
+ Vectorize text using Sentence Transformers.
+
+ Parameters:
+ lst_text : list
+ List of texts to be vectorized.
+ model_name : str
+ Name of the Sentence Transformers model to be used.
+
+ Returns:
+ features : numpy.ndarray
+ Encoded features of the input texts.
+
+ Description:
+ This function vectorizes a list of texts using Sentence Transformers. It encodes the texts into fixed-size
+ vectors of features using the specified model. The function returns the encoded features as a numpy array.
  """
  model = SentenceTransformer(model_name)
  features = model.encode(lst_text)
  return features
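
A sketch with a small public checkpoint (downloaded on first call; the model name is an example, not a package default):

embeddings = SF_vectorize(["hello world", "bonjour"], "all-MiniLM-L6-v2")
print(embeddings.shape)   # (2, 384) for this particular model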
 
- def load_HF_embeddings(model_name, encode_kwargs={'batch_size':32}, model_kwargs={'device': 'cuda:0'}):
+ def load_HF_embeddings(model_name : str, encode_kwargs : dict ={'batch_size':32}, model_kwargs : dict ={'device': 'cuda:0'}):
  """
  create a HuggingFace encoder
  """
@@ -987,20 +1624,25 @@ def HF_vectorize(HF_encoder, lst_txt):
 
  return embeddings
 
- def encode_chunked_files(chunk_files_paths, HF_encoder, cols, col_text, path_embedded_chunks, reencode = False):
+ def encode_chunked_files(chunk_files_paths: list,
+ HF_encoder,
+ cols: list,
+ col_text: str,
+ path_embedded_chunks: str,
+ reencode: bool = False) -> list:
  """
  Encode text from files and save the results in another pickle file.
 
  Parameters:
- chunk_files_paths (list): List of file paths containing documents.
+ chunk_files_paths (List[str]): List of file paths containing documents.
  HF_encoder (Encoder): Encoder object for text vectorization.
- cols (list): Columns to keep in the resulting DataFrame.
+ cols (List[str]): Columns to keep in the resulting DataFrame.
  col_text (str): Column containing text data in the DataFrame.
  path_embedded_chunks (str): Path to save the embedded chunks.
- reencode (bool): Whether to re-encode files even if they already exist.
+ reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
 
  Returns:
- list: List of paths for newly created files.
+ List[str]: List of paths for newly created files.
  """
  new_file_paths=[]
  for file in tqdm(chunk_files_paths, total=len(chunk_files_paths), desc="Encoding text from files"):
@@ -1032,9 +1674,16 @@ def encode_chunked_files(chunk_files_paths, HF_encoder, cols, col_text, path_emb
  # SCALING FEATURES
  ####################################################################
 
- def scaling_features(features, method="standard"):
+ def scaling_features(features: list, method: str = "standard") -> list:
  """
- Scale features if metho
+ Scale features using either standardization or min-max scaling.
+
+ Parameters:
+ features (Union[List[List[float]], List[float]]): List of features to scale.
+ method (str, optional): Method of scaling, either "standard" for standardization or "min-max" for min-max scaling. Defaults to "standard".
+
+ Returns:
+ Union[List[List[float]], List[float]]: Scaled features.
  """
  try:
  if method=="standard":