opsci-toolbox 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -0
- opsci_toolbox/helpers/common.py +557 -207
- opsci_toolbox/helpers/cv.py +298 -123
- opsci_toolbox/helpers/dataviz.py +875 -191
- opsci_toolbox/helpers/dates.py +55 -8
- opsci_toolbox/helpers/nlp.py +746 -97
- opsci_toolbox/helpers/nlp_cuml.py +166 -57
- opsci_toolbox/helpers/sna.py +101 -10
- opsci_toolbox/helpers/surreaction.py +58 -16
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/METADATA +3 -2
- opsci_toolbox-0.0.7.dist-info/RECORD +21 -0
- opsci_toolbox-0.0.5.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -33,15 +33,48 @@ from transformers import TextClassificationPipeline, AutoModelForSequenceClassif
 # CLEANING
 ####################################################################
 
-def filter_by_query(df, col_text, query, ignore_case=True, ignore_accent=True, match_word=False):
+def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
+    """
+    Filter DataFrame rows by a query on a specific text column.
+
+    Parameters:
+    df : pandas DataFrame
+        The DataFrame to filter.
+    col_text : str
+        The name of the column containing text data to query.
+    query : str
+        The query string to filter the DataFrame.
+    ignore_case : bool, optional
+        Whether to ignore case sensitivity. Default is True.
+    ignore_accent : bool, optional
+        Whether to ignore accents. Default is True.
+    match_word : bool, optional
+        Whether to match the whole word. Default is False.
+
+    Returns:
+    df_filtered : pandas DataFrame
+        The filtered DataFrame.
+    """
     eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
     df[col_text] = df[df[col_text].apply(eldar_query)]
     df=df.reset_index(drop=True)
     return df
 
-def TM_clean_text(df, col, col_clean):
+def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     """
-    Generic cleaning process for topic modeling
+    Generic cleaning process for topic modeling.
+
+    Parameters:
+    df : pandas DataFrame
+        The DataFrame containing text data.
+    col : str
+        The name of the column containing the original text data.
+    col_clean : str
+        The name of the column to store the cleaned text data.
+
+    Returns:
+    df : pandas DataFrame
+        The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
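The annotated signature above is enough to sketch how filter_by_query is called. A minimal usage sketch, assuming opsci_toolbox 0.0.7 is installed; the sample DataFrame and the eldar-style query string are illustrative only:

import pandas as pd
from opsci_toolbox.helpers.nlp import filter_by_query  # module path taken from the file list above

df = pd.DataFrame({"text": ["electric cars are great", "I prefer bikes", "cars and bikes"]})
# eldar-style boolean query; case and accents are ignored by default
df_filtered = filter_by_query(df, col_text="text", query='("cars" AND "electric")')
print(df_filtered)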
@@ -49,13 +82,35 @@ def TM_clean_text(df, col, col_clean):
     # df = df.loc[(df[col_clean] != ""), :]
     return df
 
-def extract_insta_shortcode(url):
+def extract_insta_shortcode(url: str) -> str:
+    """
+    Extracts the shortcode from an Instagram URL.
+
+    Parameters:
+    url : str
+        The Instagram URL containing the shortcode.
+
+    Returns:
+    shortcode : str
+        The extracted shortcode.
+    """
     pattern =r'(?:https?:\/\/)?(?:www\.)?instagram\.com\/(?:p|reel|tv|stories)\/([a-zA-Z0-9_-]+)\/?'
 
     shortcode = re.findall(pattern, url)
     return shortcode[0]
 
-def remove_emojis(text):
+def remove_emojis(text: str) -> str:
+    """
+    Removes emojis and their textual representations from a text string.
+
+    Parameters:
+    text : str
+        The input text string containing emojis.
+
+    Returns:
+    text_no_emojis : str
+        The input text string with emojis and their textual representations removed.
+    """
     # Convert emojis to their textual representations
     text_no_emojis = emoji.demojize(text)
 
@@ -64,24 +119,56 @@ def remove_emojis(text):
 
     return text_no_emojis
 
-def extract_urls_from_text(text):
-    """
+def extract_urls_from_text(text: str) -> list:
+    """
+    Extracts URLs from a text string.
+
+    Parameters:
+    text : str
+        The input text string containing URLs.
+
+    Returns:
+    urls : list of str
+        A list of URLs extracted from the input text.
+    """
     extractor = URLExtract()
     urls = extractor.find_urls(text)
     return urls
 
-def extract_hashtags(text, lower=True):
+def extract_hashtags(text: str, lower: bool = True) -> list:
     '''
-
+    Extracts hashtags from the text using a regular expression.
+
+    Parameters:
+    text : str
+        The input text string containing hashtags.
+    lower : bool, optional
+        Whether to convert extracted hashtags to lowercase. Default is True.
+
+    Returns:
+    hashtags : list of str
+        A list of hashtags extracted from the input text.
     '''
     hashtags = re.findall(r'\B#\w+', text)
     if lower :
         hashtags= [h.lower() for h in hashtags]
     return hashtags
 
-def extract_mentions(text, mention_char='@', lower=False):
+def extract_mentions(text: str, mention_char: str = '@', lower: bool = False) -> list:
     '''
-
+    Extracts mentions from the text using a regular expression.
+
+    Parameters:
+    text : str
+        The input text string containing mentions.
+    mention_char : str, optional
+        The character used to indicate mentions. Default is '@'.
+    lower : bool, optional
+        Whether to convert extracted mentions to lowercase. Default is False.
+
+    Returns:
+    mentions : list of str
+        A list of mentions extracted from the input text.
     '''
     pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))" + re.escape(mention_char) + r"([A-Za-z0-9_]{4,15})"
 
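The hashtag and mention regexes shown in this hunk can be exercised on their own. A small self-contained sketch using only the standard library; the sample text is made up:

import re

text = "Big thanks to @opsci_team for the new release! #NLP #OpenSource"

hashtags = [h.lower() for h in re.findall(r'\B#\w+', text)]
print(hashtags)  # ['#nlp', '#opensource']

mention_char = '@'
pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))" + re.escape(mention_char) + r"([A-Za-z0-9_]{4,15})"
print(re.findall(pattern, text))  # ['opsci_team']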
@@ -90,16 +177,36 @@ def extract_mentions(text, mention_char='@', lower=False):
         mentions = [mention.lower() for mention in mentions]
     return mentions
 
-def remove_extra_spaces(text):
+def remove_extra_spaces(text: str) -> str:
     """
-
+    Removes extra spaces from a text string.
+
+    Parameters:
+    text : str
+        The input text string with extra spaces.
+
+    Returns:
+    cleaned_text : str
+        The input text string with extra spaces removed.
     """
     cleaned_text = re.sub(r'\s+', ' ', text)
     return cleaned_text.strip()
 
-def remove_characters(text: str, start_indices: list, end_indices: list):
+def remove_characters(text: str, start_indices: list, end_indices: list) -> str:
     """
-    Remove
+    Remove characters from a text string using lists of start and end indices.
+
+    Parameters:
+    text : str
+        The input text string.
+    start_indices : list of int
+        A list of start indices indicating the positions from which characters should be removed.
+    end_indices : list of int
+        A list of end indices indicating the positions up to which characters should be removed.
+
+    Returns:
+    result : str
+        The input text string with characters removed based on the specified indices.
     """
     if start_indices is None or len(start_indices) <1:
         return text
@@ -123,9 +230,17 @@ def remove_characters(text: str, start_indices: list, end_indices: list):
     return result
 
 
-def load_stopwords_df(lang):
+def load_stopwords_df(lang: str) -> pd.DataFrame:
     """
     Load a CSV file without header containing stopwords. If the file doesn't exist, it creates an empty file.
+
+    Parameters:
+    lang : str
+        The language code used to identify the stopwords file.
+
+    Returns:
+    df : pandas DataFrame
+        A DataFrame containing stopwords loaded from the file.
     """
     lexicon_dir = os.path.join(os.getcwd(), "lexicons")
     file_path = os.path.join(lexicon_dir, f"stop_words_{lang.lower()}.csv")
@@ -150,11 +265,21 @@ def load_stopwords_df(lang):
 
     return df
 
-
-
-def add_stopwords(lang:str, new_stopwords:list, lower:bool = True):
+def add_stopwords(lang: str, new_stopwords: list, lower: bool = True) -> pd.DataFrame:
     """
     Add a list of stopwords to an existing file. It removes duplicates.
+
+    Parameters:
+    lang : str
+        The language code used to identify the stopwords file.
+    new_stopwords : list of str
+        The list of stopwords to add.
+    lower : bool, optional
+        Whether to convert the new stopwords to lowercase before adding. Default is True.
+
+    Returns:
+    new_df : pandas DataFrame
+        A DataFrame containing the updated list of stopwords.
     """
     df = load_stopwords_df(lang)
     init_size = len(df.iloc[:, 0].unique()) # Selecting the first column
@@ -173,13 +298,21 @@ def add_stopwords(lang:str, new_stopwords:list, lower:bool = True):
     lexicon_dir = os.path.join(os.getcwd(), "lexicons")
     file_path = os.path.join(lexicon_dir, f"stop_words_{lang.lower()}.csv")
     new_df.to_csv(file_path, encoding="utf-8", index=False)
-
-
     return new_df
 
-def remove_stopwords(lang:str, stopwords:list):
+def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
     """
     Remove stopwords from an existing file.
+
+    Parameters:
+    lang : str
+        The language code used to identify the stopwords file.
+    stopwords : list of str
+        The list of stopwords to remove.
+
+    Returns:
+    df : pandas DataFrame
+        A DataFrame containing the updated list of stopwords after removal.
     """
     df = load_stopwords_df(lang)
     init_size = len(df.iloc[:, 0].unique()) # Selecting the first column
@@ -193,14 +326,24 @@ def remove_stopwords(lang:str, stopwords:list):
     return df
 
 
-def keep_alphanum_char(text:str, replace:str = ''):
+def keep_alphanum_char(text: str, replace: str = '') -> str:
     """
-    Replace all non-alphanumeric characters
+    Replace all non-alphanumeric characters in a text string.
+
+    Parameters:
+    text : str
+        The input text string.
+    replace : str, optional
+        The string to replace non-alphanumeric characters with. Default is an empty string.
+
+    Returns:
+    cleaned_text : str
+        The input text string with non-alphanumeric characters replaced.
     """
     return re.sub("[^a-zA-Z0-9]", replace, text)
 
 
-def substitute_punctuations_with_white_space(text):
+def substitute_punctuations_with_white_space(text : str) -> str:
     """
     Substitute punctuations with white spaces in the input string.
 
@@ -213,7 +356,28 @@ def substitute_punctuations_with_white_space(text):
     text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
     return text
 
-def translate_wt_libre(text, source, target, filename, dir_json, url = "http://127.0.0.1:5000/translate"):
+def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+    """
+    Translate text using LibreTranslate service.
+
+    Parameters:
+    text : str
+        The text to be translated.
+    source : str
+        The source language code.
+    target : str
+        The target language code.
+    filename : str
+        The filename to save the translation result.
+    dir_json : str
+        The directory to save the translation result JSON file.
+    url : str, optional
+        The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+
+    Returns:
+    json_data : dict
+        The translation result in JSON format.
+    """
     headers = {"Content-Type": "application/json"}
     payload = {
         "q": text,
@@ -231,7 +395,28 @@ def translate_wt_libre(text, source, target, filename, dir_json, url = "http://1
     write_json(json_data, dir_json , str(filename))
     return json_data
 
-def translate_batch(batch_text, source, target, filename, dir_json, url = "http://127.0.0.1:5000/translate"):
+def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+    """
+    Translate a batch of texts using LibreTranslate service.
+
+    Parameters:
+    batch_text : list of str
+        The list of texts to be translated.
+    source : str
+        The source language code.
+    target : str
+        The target language code.
+    filename : str
+        The filename to save the translation results.
+    dir_json : str
+        The directory to save the translation result JSONL file.
+    url : str, optional
+        The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+
+    Returns:
+    json_results : list of dict
+        The translation results as a list of dictionaries containing 'translated_text' and 'clean_text'.
+    """
     headers = {"Content-Type": "application/json"}
     payload = {
         "q": batch_text,
@@ -253,7 +438,24 @@ def translate_batch(batch_text, source, target, filename, dir_json, url = "http:
     write_jsonl(json_results, dir_json , str(filename))
     return json_results
 
-def translate(text, source, target, url = "http://127.0.0.1:5000/translate"):
+def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+    """
+    Translate text using LibreTranslate service.
+
+    Parameters:
+    text : str
+        The text to be translated.
+    source : str
+        The source language code.
+    target : str
+        The target language code.
+    url : str, optional
+        The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+
+    Returns:
+    translatedText : str
+        The translated text.
+    """
     headers = {"Content-Type": "application/json"}
     payload = {
         "q": text,
@@ -268,7 +470,24 @@ def translate(text, source, target, url = "http://127.0.0.1:5000/translate"):
     translatedText = json_data.get("translatedText", "")
     return translatedText
 
-def translate_row(df, col, source="auto", target = "en"):
+def translate_row(df: pd.DataFrame, col: str, source: str = "auto", target: str = "en") -> pd.DataFrame:
+    """
+    Translate the text in a specific column of a DataFrame.
+
+    Parameters:
+    df : pandas DataFrame
+        The DataFrame containing the text to be translated.
+    col : str
+        The name of the column containing the text to be translated.
+    source : str, optional
+        The source language code. Default is "auto".
+    target : str, optional
+        The target language code. Default is "en" (English).
+
+    Returns:
+    df : pandas DataFrame
+        The DataFrame with an additional column containing the translated text.
+    """
    translations =[]
    for i, row in df.iterrows():
        txt_to_translate = row[col].replace(' | ', ', ')
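The translate helpers in these hunks all wrap the same HTTP call: a JSON payload with a "q" field posted to a LibreTranslate endpoint, whose "translatedText" field is read back. A minimal sketch of that request, assuming a LibreTranslate instance is running locally; the "source"/"target" payload keys follow the LibreTranslate API and are not themselves visible in the diff:

import requests

url = "http://127.0.0.1:5000/translate"
payload = {"q": "Bonjour tout le monde", "source": "fr", "target": "en"}  # "source"/"target" keys assumed
headers = {"Content-Type": "application/json"}

response = requests.post(url, json=payload, headers=headers)
print(response.json().get("translatedText", ""))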
@@ -281,27 +500,63 @@ def translate_row(df, col, source="auto", target = "en"):
 # METRICS
 ###################################################################
 
-def cosine_similarity(a, b):
+def cosine_similarity(a: np.array, b: np.array) -> float:
     """
-
+    Calculate the cosine similarity between two vectors.
+
+    Parameters:
+    a : numpy array
+        The first vector.
+    b : numpy array
+        The second vector.
+
+    Returns:
+    similarity : float
+        The cosine similarity between the two vectors.
     """
     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
-
-def approximate_tokens(text):
+
+def approximate_tokens(text: str) -> int:
     """
-    Approximate the number of tokens
+    Approximate the number of tokens in a text.
+
+    Parameters:
+    text : str
+        The input text.
+
+    Returns:
+    num_tokens : int
+        The approximate number of tokens in the text.
     """
     return len(text.split(' '))
 
-def approximate_unique_tokens(text):
+def approximate_unique_tokens(text: str) -> int:
     """
-    Approximate the number of distinct tokens
+    Approximate the number of distinct tokens in a text.
+
+    Parameters:
+    text : str
+        The input text.
+
+    Returns:
+    num_unique_tokens : int
+        The approximate number of distinct tokens in the text.
     """
-    return len(
+    return len(set(text.split(' ')))
 
-def count_word_occurrences(text, word):
+def count_word_occurrences(text: str, word: str) -> int:
     """
-    Count word
+    Count the occurrences of a word in a text.
+
+    Parameters:
+    text : str
+        The input text.
+    word : str
+        The word to count occurrences of.
+
+    Returns:
+    occurrences : int
+        The number of occurrences of the word in the text.
     """
     # Convert both text and word to lowercase for case-insensitive matching
     word_lower = word.lower()
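The cosine_similarity added here is the plain numpy formula np.dot(a, b) / (||a|| * ||b||). A tiny worked example:

import numpy as np

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(round(float(similarity), 3))  # 1 / (sqrt(2) * sqrt(2)) = 0.5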
@@ -312,8 +567,10 @@ def count_word_occurrences(text, word):
     return occurrences
 
 
-def chi2_per_category(lst_text, lst_categorie, col_cat, n_words = 10, p_value_limit=0.95, min_freq=3):
+def chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
     """
+    Calculate Chi-squared (Chi2) statistics per category based on the provided texts and corresponding categories.
+
     Parameters:
     lst_text : list
         List of texts for which Chi2 will be calculated.
@@ -353,7 +610,34 @@ def chi2_per_category(lst_text, lst_categorie, col_cat, n_words = 10, p_value_li
     df_chi.reset_index(drop=True)
     return df_chi
 
-def word_frequency_per_categorie(df, col_text, col_cat, ngram_range=(1, 1), stop_words=[], n_words = 20, min_freq=3):
+def word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+    """
+    Calculate word frequency per category.
+
+    Parameters:
+    df : pandas DataFrame
+        DataFrame containing text data and corresponding categories.
+    col_text : str
+        Name of the column containing the text data.
+    col_cat : str
+        Name of the column containing the categories.
+    ngram_range : tuple, optional
+        The range for n-grams. Default is (1, 1) for unigrams.
+    stop_words : list, optional
+        List of stopwords to be ignored during frequency calculation. Default is an empty list.
+    n_words : int, optional
+        Number of top words to display per category. Default is 20.
+    min_freq : int, optional
+        Minimum frequency threshold for word occurrences per category. Default is 3.
+
+    Returns:
+    DataFrame
+        DataFrame containing word frequencies per category.
+
+    Description:
+    This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+    It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+    """
     count_vectorizer = CountVectorizer(token_pattern=r'[^\s]+', ngram_range=ngram_range, stop_words=stop_words)
     X_train_count = count_vectorizer.fit_transform(df[col_text].to_list())
     X_names_count = count_vectorizer.get_feature_names_out()
@@ -370,10 +654,29 @@ def word_frequency_per_categorie(df, col_text, col_cat, ngram_range=(1, 1), stop
     return df_count
 
 
-def top_items_per_category(df, col_lst ="hashtags", col_cat = "soft_topic", col_id = "tweet_id", n_items= 10):
+def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat: str = "soft_topic", col_id: str = "tweet_id", n_items: int = 10) -> pd.DataFrame:
     """
-
-
+    Count the occurrences of items (e.g., hashtags) per category and select the top items per category.
+
+    Parameters:
+    df : pandas DataFrame
+        DataFrame containing data.
+    col_lst : str, optional
+        Name of the column containing lists of items (e.g., hashtags). Default is "hashtags".
+    col_cat : str, optional
+        Name of the column containing categories. Default is "soft_topic".
+    col_id : str, optional
+        Name of the column containing unique identifiers. Default is "tweet_id".
+    n_items : int, optional
+        Number of top items to select per category. Default is 10.
+
+    Returns:
+    DataFrame
+        DataFrame containing the top items per category.
+
+    Description:
+    This function takes a DataFrame with a column containing lists of tokens (e.g., hashtags) and counts their occurrences grouped by a category.
+    It then selects the most frequently occurring items per category based on the provided metric (e.g., volume of tweets).
     """
     df_count = (df[[col_cat, col_id, col_lst]].explode(col_lst)
                 .groupby([col_cat, col_lst], group_keys=False)
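top_items_per_category relies on the explode-then-groupby pattern visible in the context lines above. A self-contained pandas sketch of that pattern; the library's exact aggregation chain is truncated in this diff, so the tail of the chain below is an assumption:

import pandas as pd

df = pd.DataFrame({
    "soft_topic": [0, 0, 1],
    "tweet_id": ["t1", "t2", "t3"],
    "hashtags": [["#ai", "#nlp"], ["#ai"], ["#data"]],
})

counts = (df[["soft_topic", "tweet_id", "hashtags"]]
          .explode("hashtags")
          .groupby(["soft_topic", "hashtags"])["tweet_id"]
          .nunique()
          .reset_index(name="count"))

top_items = (counts.sort_values(["soft_topic", "count"], ascending=[True, False])
             .groupby("soft_topic")
             .head(2))
print(top_items)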
@@ -388,8 +691,31 @@ def top_items_per_category(df, col_lst ="hashtags", col_cat = "soft_topic", col_
                 )
     return df_count
 
-def topic_representation(df_processed_data, col_topic, col_id, col_engagement, col_user_id, metrics):
+def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
+    """
+    Calculate the representation of topics in a processed DataFrame.
+
+    Parameters:
+    df_processed_data : pandas DataFrame
+        DataFrame containing processed data.
+    col_topic : str
+        Name of the column containing topic labels.
+    col_id : str
+        Name of the column containing unique identifiers.
+    col_engagement : str
+        Name of the column containing engagement metrics.
+    col_user_id : str
+        Name of the column containing user identifiers.
+    metrics : dict
+        Dictionary containing additional metrics to aggregate.
+
+    Returns:
+    DataFrame
+        DataFrame containing the representation of topics.
 
+    Description:
+    This function aggregates various metrics for each topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+    """
     #on s'assure que les colonnes de métriques soient bien complètes et en float
     # df_processed_data[metrics]=df_processed_data[metrics].fillna(0).astype(float)
 
@@ -436,7 +762,31 @@ def topic_representation(df_processed_data, col_topic, col_id, col_engagement, c
     df_distrib_all[col_topic]=df_distrib_all[col_topic].astype(str)
     return df_distrib_all
 
-def generic_representation(df_processed_data, col_gb, col_id, col_engagement, col_user_id = None, metrics={}):
+def generic_representation(df_processed_data: pd.DataFrame, col_gb: str, col_id: str, col_engagement: str, col_user_id: str = None, metrics: dict = {}) -> pd.DataFrame:
+    """
+    Calculate a generic representation of data based on grouping by a specified column.
+
+    Parameters:
+    df_processed_data : pandas DataFrame
+        DataFrame containing processed data.
+    col_gb : str
+        Name of the column to group by.
+    col_id : str
+        Name of the column containing unique identifiers.
+    col_engagement : str
+        Name of the column containing engagement metrics.
+    col_user_id : str, optional
+        Name of the column containing user identifiers. Default is None.
+    metrics : dict, optional
+        Dictionary containing additional metrics to aggregate. Default is an empty dictionary.
+
+    Returns:
+    DataFrame
+        DataFrame containing the generic representation of data.
+
+    Description:
+    This function aggregates various metrics for each group, including verbatim counts, engagement sums, and any additional metrics provided in the `metrics` parameter. It also computes derived metrics such as verbatims per user and engagement per verbatim. Finally, it calculates percentages for verbatims, engagements, and users (if applicable) within each group.
+    """
     #on crée un dictionnaire contenant les agrégations
     metrics_dict = dict()
     metrics_dict['verbatims']=(col_id,'nunique')
@@ -460,7 +810,23 @@ def generic_representation(df_processed_data, col_gb, col_id, col_engagement, co
 
     return df_distrib_all
 
-def create_frequency_table(df, col):
+def create_frequency_table(df: pd.DataFrame, col: str) -> pd.DataFrame:
+    """
+    Create a frequency table for a given column in a DataFrame.
+
+    Parameters:
+    df : pandas DataFrame
+        DataFrame containing the data.
+    col : str
+        Name of the column for which the frequency table is to be created.
+
+    Returns:
+    pandas DataFrame
+        DataFrame containing the frequency table.
+
+    Description:
+    This function generates a frequency table for the specified column in the DataFrame. It sorts the DataFrame by the specified column in descending order, calculates the rank of each entry, and assigns dense ranks both ascending and descending.
+    """
     df_frequency=(df.sort_values(col, ascending=False)
                   .reset_index(drop=True)
                   .reset_index()
@@ -475,9 +841,22 @@ def create_frequency_table(df, col):
 # SAMPLING
 ###################################################################
 
-def calculate_sample(len_df, n_rows):
+def calculate_sample(len_df: int, n_rows: float) -> int:
     """
-
+    Convert a percentage to the number of rows to sample.
+
+    Parameters:
+    len_df : int
+        Length of the DataFrame.
+    n_rows : float
+        Number of rows to sample. If less than or equal to 1, it's treated as a percentage.
+
+    Returns:
+    int
+        Number of rows to sample.
+
+    Description:
+    This function converts a percentage of the DataFrame length into a number of rows to sample. If `n_rows` is between 0 and 1, it's treated as a percentage and converted into an integer representing the top `n_rows` percentage of the DataFrame length. If `n_rows` is greater than 1 or equal to 0, it's treated as an absolute number of rows.
     """
     if 0 < n_rows <= 1 :
         top_rows = int(n_rows * len_df)
@@ -488,12 +867,26 @@ def calculate_sample(len_df, n_rows):
     else :
         print("ERREUR - paramètre du sampling incorrect")
 
-def sampling_by_engagement(df, col_engagement, top_rows=0.3, sample_size=0.5):
+def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: float = 0.3, sample_size: float = 0.5) -> pd.DataFrame:
     """
-    Create a sample dataset by keeping a part of the top publications
-
-
-
+    Create a sample dataset by keeping a part of the top publications based on engagement metrics.
+
+    Parameters:
+    df : pandas.DataFrame
+        The original DataFrame.
+    col_engagement : str
+        The column name containing the engagement metrics.
+    top_rows : float, optional
+        The number of "most engaging" rows to keep. Values could be either an integer or a float between 0 and 1 (= sample a percentage). Default is 0.3.
+    sample_size : float, optional
+        The final size of the sample. Ex: 1000 rows from an original dataset of 100000 rows. Values could be either an integer or a float between 0 and 1 (= sample a percentage). Default is 0.5.
+
+    Returns:
+    pandas.DataFrame
+        The sampled DataFrame.
+
+    Description:
+    This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness.
     """
 
     sample_rows = calculate_sample(len(df), sample_size)
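The sampling rule described in calculate_sample's new docstring (fractions are percentages, larger values are absolute counts) can be restated in a few lines. A simplified sketch, not the library's exact code, which also prints an error message for invalid values:

def calculate_sample_sketch(len_df: int, n_rows: float) -> int:
    # Fractions between 0 and 1 are read as a percentage of the DataFrame length,
    # anything larger is read as an absolute number of rows.
    if 0 < n_rows <= 1:
        return int(n_rows * len_df)
    return int(n_rows)

print(calculate_sample_sketch(100000, 0.5))   # 50000
print(calculate_sample_sketch(100000, 1000))  # 1000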
@@ -514,9 +907,28 @@ def sampling_by_engagement(df, col_engagement, top_rows=0.3, sample_size=0.5):
     else:
         return df
 
-def sample_most_engaging_posts(df, col_topic, col_engagement, sample_size= 0.1, min_size=10):
+def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement: str, sample_size: float = 0.1, min_size: int = 10) -> pd.DataFrame:
     """
-    "
+    Perform a "stratified sample" of the most engaging content per topic, ensuring a minimum number of items per group.
+
+    Parameters:
+    df : pandas.DataFrame
+        The DataFrame containing the data.
+    col_topic : str
+        The column name containing the topic information.
+    col_engagement : str
+        The column name containing the engagement metrics.
+    sample_size : float, optional
+        The size of the sample relative to the total data. Default is 0.1 (10%).
+    min_size : int, optional
+        The minimum number of items to retain per group. Default is 10.
+
+    Returns:
+    pandas.DataFrame
+        The sampled DataFrame.
+
+    Description:
+    This function performs a "stratified sample" of the most engaging content per topic. It sorts the data by engagement metrics within each topic group, and then takes a sample of `sample_size` proportion from each group. If a group has fewer than `min_size` items, it retains all items in that group.
     """
     df = (df.groupby(col_topic, group_keys=False)
           .apply(lambda x: x.sort_values(by=col_engagement, ascending=False)
@@ -532,10 +944,38 @@ def sample_most_engaging_posts(df, col_topic, col_engagement, sample_size= 0.1,
 def get_lang_detector(nlp, name):
     return LanguageDetector(seed=42)  # We use the seed 42
 
-def TM_nlp_process(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True, join_list = False):
+def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
-
-
+    Perform natural language processing tasks using spaCy for topic modeling.
+
+    Parameters:
+    nlp : spacy.Language
+        The spaCy language model.
+    df : pandas.DataFrame
+        The DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    col_lemma : str
+        The name of the column to store the lemmatized text data.
+    pos_to_keep : list
+        A list of part-of-speech tags to keep during lemmatization.
+    stopwords : list
+        A list of stopwords to remove during processing.
+    batch_size : int, optional
+        The batch size for spaCy processing. Default is 100.
+    n_process : int, optional
+        The number of processes for parallel processing. Default is 1.
+    stats : bool, optional
+        Whether to compute and store additional statistics. Default is True.
+    join_list : bool, optional
+        Whether to join the lemmas into a single string. Default is False.
+
+    Returns:
+    pandas.DataFrame
+        The DataFrame with processed text data.
+
+    Description:
+    This function utilizes spaCy for natural language processing tasks such as lemmatization, emoji extraction, and token counting. It processes the text data in the DataFrame and returns the DataFrame with additional columns for lemmatized text, emoji counts, token counts, and more.
     """
     all_lemmas=[]
     tokens_counts=[]
@@ -587,17 +1027,19 @@ def TM_nlp_process(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_s
     return df
 
 
-def load_spacy_model(model,
+def load_spacy_model(model: str, disable_components: list = ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect: bool = False, emoji: bool = False) -> spacy.language.Language:
     """
+    Load a spaCy model with optional configurations.
+
     Parameters:
-
-
-
-
-
-
-
-
+    model : str
+        Name of the spaCy model to load.
+    disable_components : list, optional
+        List of spaCy components to disable. Default is ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"].
+    lang_detect : bool, optional
+        Flag indicating whether language detection should be enabled. Default is False.
+    emoji : bool, optional
+        Flag indicating whether to include the emoji component in the spaCy pipeline. Default is False.
 
     Returns:
     nlp : spacy.language.Language
@@ -611,7 +1053,6 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
     If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
     language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
     will be included in the spaCy pipeline.
-
     """
     if torch.cuda.is_available():
 
@@ -631,10 +1072,25 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
 
     return nlp
 
-def get_labels(nlp, pipe_step="ner", explanations=False):
-    """
-
-
+def get_labels(nlp: spacy.language.Language, pipe_step: str = "ner", explanations: bool = False) -> pd.DataFrame:
+    """
+    Return labels associated with a pipeline step and optionally provide explanations.
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    pipe_step : str, optional
+        The pipeline step for which labels are retrieved. Default is "ner".
+    explanations : bool, optional
+        Flag indicating whether to include explanations for the labels. Default is False.
+
+    Returns:
+    DataFrame
+        DataFrame containing the labels associated with the specified pipeline step.
+
+    Description:
+    This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline.
+    It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
     """
     pipe_details=nlp.get_pipe(pipe_step)
     labels=list(pipe_details.labels)
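get_labels is built on the two spaCy calls visible in the context lines above (nlp.get_pipe(...).labels). A sketch of the same lookup with plain spaCy, assuming en_core_web_sm has been downloaded; the library additionally wraps the result in a DataFrame:

import spacy

nlp = spacy.load("en_core_web_sm")   # assumes the model has been downloaded
ner = nlp.get_pipe("ner")            # same call as in the context lines above
labels = list(ner.labels)
explained = {label: spacy.explain(label) for label in labels}
print(list(explained.items())[:3])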
@@ -646,9 +1102,30 @@ def get_labels(nlp, pipe_step="ner", explanations=False):
     return df
 
 
-def spacy_langdetect(nlp, df, col_text, batch_size=100, n_process=1):
+def spacy_langdetect(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
     """
-    Detect language and
+    Detect language and return a score.
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline with language detection enabled.
+    df : pd.DataFrame
+        DataFrame containing the text data to analyze.
+    col_text : str
+        The name of the column containing the text data.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for language detection. Default is 1.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the detected languages and their scores.
+
+    Description:
+    This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.
+    It returns a DataFrame containing the detected languages and their scores, which indicate the confidence level
+    of the language detection for each text.
     """
     text=list(df[col_text].astype('unicode').values)
 
@@ -662,9 +1139,32 @@ def spacy_langdetect(nlp, df, col_text, batch_size=100, n_process=1):
 
     return df
 
-def extract_noun_chunks(nlp, df, col_text, batch_size=100, n_process=1, stats=False):
+def extract_noun_chunks(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
     """
-    Spacy implementation to extract noun chunks
+    Spacy implementation to extract noun chunks.
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data to analyze.
+    col_text : str
+        The name of the column containing the text data.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for text processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the noun chunks. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the extracted noun chunks and their statistics if enabled.
+
+    Description:
+    This function utilizes spaCy's noun chunk extraction capabilities to extract noun chunks from text data in a DataFrame.
+    It returns a DataFrame containing the extracted noun chunks for each text. Optionally, it can compute statistics such
+    as the count of noun chunks and unique noun chunks if the 'stats' parameter is set to True.
     """
     all_chunks = []
     all_unique_chunks =[]
@@ -689,10 +1189,32 @@ def extract_noun_chunks(nlp, df, col_text, batch_size=100, n_process=1, stats=Fa
         df['unique_noun_chunks_count']=unique_chunks_count
     return df
 
-def extract_emojis(nlp, df, col_text, batch_size=100, n_process=1, stats=True):
+def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
     """
     Spacy implementation to extract emojis
 
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data to analyze.
+    col_text : str
+        The name of the column containing the text data.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for text processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the emojis. Default is True.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the extracted emojis and their statistics if enabled.
+
+    Description:
+    This function utilizes spaCy's emoji detection capabilities to extract emojis from text data in a DataFrame.
+    It returns a DataFrame containing the extracted emojis for each text. Optionally, it can compute statistics such
+    as the count of emojis and unique emojis if the 'stats' parameter is set to True.
     """
     all_emojis=[]
     all_unique_emojis=[]
@@ -720,9 +1242,33 @@ def extract_emojis(nlp, df, col_text, batch_size=100, n_process=1, stats=True):
 
     return df
 
-def split_n_sentences(nlp, df, col_text, n_sentences=1, batch_size=100, n_process=1, stats=False):
+def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
     """
     Split a text into chunks of n sentences
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data to split.
+    col_text : str
+        The name of the column containing the text data.
+    n_sentences : int, optional
+        The number of sentences to group together. Default is 1.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for text processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the split sentences.
+
+    Description:
+    This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
+    Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
     """
 
     text=list(df[col_text].astype('unicode').values)
@@ -753,12 +1299,32 @@ def split_n_sentences(nlp, df, col_text, n_sentences=1, batch_size=100, n_proces
     return df
 
 
-def spacy_NER(nlp, df, col_text, entities_to_keep=['PERSON','ORG'], explode= True):
+def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True) -> pd.DataFrame:
     """
     Spacy implementation of NER.
     To define entities type to keep, call get_labels(nlp, pipe_step="ner", explanations=False)
-    explode = False means it
+    explode = False means it returns 1 list of entities per document
     explode = True means it returns 1 entity per row
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    entities_to_keep : list, optional
+        List of entity types to keep. Default is ['PERSON','ORG'].
+    explode : bool, optional
+        Flag indicating whether to explode the DataFrame to have one entity per row. Default is True.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the NER information.
+
+    Description:
+    This function performs Named Entity Recognition (NER) using spaCy on a DataFrame with text data. It extracts entities of the specified types
+    and stores the NER information in separate columns. If 'explode' is set to True, it returns one entity per row in the DataFrame.
     """
     # Create columns to store the NER information
     df['NER_type'] = None
@@ -797,10 +1363,38 @@ def spacy_NER(nlp, df, col_text, entities_to_keep=['PERSON','ORG'], explode= Tru
     return df
 
 
-def tokenize(nlp, df, col_text, col_tokens, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True):
+def tokenize(nlp, df: pd.DataFrame, col_text: str, col_tokens: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
     """
     Spacy implementation to tokenize text
 
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    col_tokens : str
+        The name of the column to store the tokenized text.
+    pos_to_keep : list
+        List of POS tags to keep.
+    stopwords : list
+        List of stopwords to exclude from tokens.
+    batch_size : int, optional
+        Batch size for processing. Default is 100.
+    n_process : int, optional
+        Number of processes for parallel processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to calculate and store statistics. Default is True.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the tokenized text.
+
+    Description:
+    This function tokenizes text using spaCy and stores the tokens in a new column in the DataFrame.
+    It allows filtering tokens based on POS tags and stopwords. If 'stats' is set to True, it calculates
+    and stores token counts.
     """
     all_tokens=[]
     tokens_counts=[]
@@ -832,10 +1426,40 @@ def tokenize(nlp, df, col_text, col_tokens, pos_to_keep, stopwords, batch_size=1
     return df
 
 
-def lemmatize(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True, join_list = False):
+def lemmatize(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
     Spacy implementation to lemmatize text
 
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    col_lemma : str
+        The name of the column to store the lemmatized text.
+    pos_to_keep : list
+        List of POS tags to keep.
+    stopwords : list
+        List of stopwords to exclude from lemmas.
+    batch_size : int, optional
+        Batch size for processing. Default is 100.
+    n_process : int, optional
+        Number of processes for parallel processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to calculate and store statistics. Default is True.
+    join_list : bool, optional
+        Flag indicating whether to join the lemmas into a single string. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the lemmatized text.
+
+    Description:
+    This function lemmatizes text using spaCy and stores the lemmatized text in a new column in the DataFrame.
+    It allows filtering lemmas based on POS tags and stopwords. If 'stats' is set to True, it calculates
+    and stores token counts.
    """
    all_lemmas=[]
    tokens_counts=[]
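tokenize, lemmatize and TM_nlp_process all iterate over nlp.pipe() batches and filter tokens by POS tag and stopword list, as their new docstrings describe. An illustrative sketch of that loop with plain spaCy; the exact filtering logic inside the helpers may differ:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the model has been downloaded
texts = ["The cats are running fast.", "Dogs were barking loudly."]
pos_to_keep = ["NOUN", "VERB"]
stopwords = ["be"]

all_lemmas = []
for doc in nlp.pipe(texts, batch_size=100, n_process=1):
    lemmas = [tok.lemma_.lower() for tok in doc
              if tok.pos_ in pos_to_keep and tok.lemma_.lower() not in stopwords]
    all_lemmas.append(lemmas)
print(all_lemmas)  # e.g. [['cat', 'run'], ['dog', 'bark']]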
@@ -871,12 +1495,11 @@ def lemmatize(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=1
     return df
 
 
-
 ####################################################################
 # VECTORISATION
 ####################################################################
 
-def count_vectorize(lst_text):
+def count_vectorize(lst_text: list) -> tuple:
     """
     Parameters:
     lst_text : list
@@ -905,8 +1528,8 @@ def count_vectorize(lst_text):
 
     return count_vectorizer, features, features_names, vocabulary
 
-def tfidf_vectorize(lst_text, analyzer='word', max_df=1.0, max_features=None,
-                    min_df=1, use_idf=True, ngram_range=(1,1), stop_words=None):
+def tfidf_vectorize(lst_text: list, analyzer: str = 'word', max_df: float = 1.0, max_features: int = None,
+                    min_df: float = 1, use_idf: bool = True, ngram_range: tuple = (1, 1), stop_words: list = None) -> tuple:
     """
     Parameters:
     lst_text : list
@@ -959,15 +1582,29 @@ def tfidf_vectorize(lst_text, analyzer='word', max_df=1.0, max_features=None,
 
     return tfidf_vectorizer, features, features_names, vocabulary
 
-def SF_vectorize(lst_text, model_name):
+def SF_vectorize(lst_text: list, model_name: str) -> np.array:
     """
-    Vectorize text using Sentence Transformers
+    Vectorize text using Sentence Transformers.
+
+    Parameters:
+    lst_text : list
+        List of texts to be vectorized.
+    model_name : str
+        Name of the Sentence Transformers model to be used.
+
+    Returns:
+    features : numpy.ndarray
+        Encoded features of the input texts.
+
+    Description:
+    This function vectorizes a list of texts using Sentence Transformers. It encodes the texts into fixed-size
+    vectors of features using the specified model. The function returns the encoded features as a numpy array.
     """
     model = SentenceTransformer(model_name)
     features = model.encode(lst_text)
     return features
 
-def load_HF_embeddings(model_name, encode_kwargs={'batch_size':32}, model_kwargs={'device': 'cuda:0'}):
+def load_HF_embeddings(model_name : str, encode_kwargs : dict ={'batch_size':32}, model_kwargs : dict ={'device': 'cuda:0'}):
     """
     create a HugginFace encoder
     """
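SF_vectorize's three-line body is shown in full above. A usage sketch with an arbitrary Sentence Transformers checkpoint; the model name below is only an example:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")   # example checkpoint, any ST model name works
features = model.encode(["first document", "second document"])
print(features.shape)   # (2, embedding dimension of the chosen model)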
@@ -987,20 +1624,25 @@ def HF_vectorize(HF_encoder, lst_txt):
 
     return embeddings
 
-def encode_chunked_files(chunk_files_paths
+def encode_chunked_files(chunk_files_paths: list,
+                         HF_encoder,
+                         cols: list,
+                         col_text: str,
+                         path_embedded_chunks: str,
+                         reencode: bool = False) -> list:
     """
     Encode text from files and save the results in another pickle file.
 
     Parameters:
-    chunk_files_paths (
+    chunk_files_paths (List[str]): List of file paths containing documents.
     HF_encoder (Encoder): Encoder object for text vectorization.
-    cols (
+    cols (List[str]): Columns to keep in the resulting DataFrame.
     col_text (str): Column containing text data in the DataFrame.
     path_embedded_chunks (str): Path to save the embedded chunks.
-    reencode (bool): Whether to re-encode files even if they already exist.
+    reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
 
     Returns:
-
+    List[str]: List of paths for newly created files.
     """
     new_file_paths=[]
     for file in tqdm(chunk_files_paths, total=len(chunk_files_paths), desc="Encoding text from files"):
@@ -1032,9 +1674,16 @@ def encode_chunked_files(chunk_files_paths, HF_encoder, cols, col_text, path_emb
 # SCALING FEATURES
 ####################################################################
 
-def scaling_features(features, method="standard"):
+def scaling_features(features: list, method: str = "standard") -> list:
     """
-    Scale features
+    Scale features using either standardization or min-max scaling.
+
+    Parameters:
+    features (Union[List[List[float]], List[float]]): List of features to scale.
+    method (str, optional): Method of scaling, either "standard" for standardization or "min-max" for min-max scaling. Defaults to "standard".
+
+    Returns:
+    Union[List[List[float]], List[float]]: Scaled features.
     """
     try:
         if method=="standard":