opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +120 -21
- opsci_toolbox/apis/webscraping.py +186 -59
- opsci_toolbox/apis/youtube_helpers.py +103 -16
- opsci_toolbox/helpers/common.py +368 -254
- opsci_toolbox/helpers/cv.py +50 -60
- opsci_toolbox/helpers/dataviz.py +255 -184
- opsci_toolbox/helpers/dates.py +17 -18
- opsci_toolbox/helpers/nlp.py +154 -114
- opsci_toolbox/helpers/nlp_cuml.py +389 -36
- opsci_toolbox/helpers/sna.py +509 -0
- opsci_toolbox/helpers/sql.py +53 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA +14 -9
- opsci_toolbox-0.0.8.dist-info/RECORD +22 -0
- opsci_toolbox-0.0.7.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -15,7 +15,8 @@ import spacy
 from spacy.language import Language
 from spacy_language_detection import LanguageDetector
 from spacymoji import Emoji
-from langchain.embeddings import HuggingFaceEmbeddings
+# from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from sklearn.feature_selection import chi2
 from urlextract import URLExtract
 import ast
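
The only code change in this hunk is the relocation of `HuggingFaceEmbeddings` from the deprecated `langchain.embeddings` module to `langchain_community.embeddings`. A minimal sketch of the relocated import in use; the model name is illustrative, not taken from this package:

    from langchain_community.embeddings import HuggingFaceEmbeddings

    # Illustrative model choice; opsci_toolbox may pass a different model_name.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    doc_vectors = embeddings.embed_documents(["first verbatim", "second verbatim"])
    query_vector = embeddings.embed_query("a search query")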
@@ -27,17 +28,31 @@ from textacy.preprocessing.replace import urls
 from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+from bs4 import BeautifulSoup


 ####################################################################
 # CLEANING
 ####################################################################

+def remove_html_tags(text: str) -> str:
+    """
+    Remove HTML tags from the given text.
+
+    Parameters:
+    - text (str): The text containing HTML tags.
+
+    Returns:
+    - str: The text with HTML tags removed.
+    """
+    soup = BeautifulSoup(text, "html.parser")
+    return soup.get_text()
+
 def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
     """
     Filter DataFrame rows by a query on a specific text column.

-
+    Args:
     df : pandas DataFrame
         The DataFrame to filter.
     col_text : str
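
The new `remove_html_tags` helper delegates HTML stripping to BeautifulSoup. A quick usage sketch with an invented input string:

    remove_html_tags("<p>Hello <b>world</b>!</p>")
    # -> 'Hello world!'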
@@ -64,7 +79,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     """
     Generic cleaning process for topic modeling.

-
+    Args:
     df : pandas DataFrame
         The DataFrame containing text data.
     col : str
@@ -86,7 +101,7 @@ def extract_insta_shortcode(url: str) -> str:
     """
     Extracts the shortcode from an Instagram URL.

-
+    Args:
     url : str
         The Instagram URL containing the shortcode.

@@ -99,11 +114,25 @@ def extract_insta_shortcode(url: str) -> str:
     shortcode = re.findall(pattern, url)
     return shortcode[0]

+def remove_parentheses_content(text: str) -> str:
+    """
+    Remove content within parentheses from the given text.
+
+    Parameters:
+    - text (str): The text from which content within parentheses should be removed.
+
+    Returns:
+    - str: The text with content within parentheses removed.
+    """
+    # Using regular expression to find content between parentheses and removing it
+    result = re.sub(r'\([^)]*\)', '', text)
+    return result
+
 def remove_emojis(text: str) -> str:
     """
     Removes emojis and their textual representations from a text string.

-
+    Args:
     text : str
         The input text string containing emojis.

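
`remove_parentheses_content` drops each parenthesized span but keeps the surrounding whitespace, so a double space can remain where the parentheses were; chaining it with `remove_extra_spaces` (defined later in this module) cleans that up. The `[^)]*` pattern also stops at the first closing parenthesis, so nested parentheses are only partially removed. A small sketch with an invented input:

    remove_parentheses_content("Topic modeling (LDA, BERTopic) made simple")
    # -> 'Topic modeling  made simple'  (note the leftover double space)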
@@ -119,11 +148,56 @@ def remove_emojis(text: str) -> str:

     return text_no_emojis

+def extract_numbers(text: str) -> list:
+    """
+    Extracts all numeric values from a given text string and returns them as a list of floats.
+
+    Args:
+        text (str): The input string from which numbers are to be extracted.
+
+    Returns:
+        list: A list containing all the extracted numbers as floats.
+    """
+    # Define a regular expression pattern to match numbers
+    pattern = r'\d+\.?\d*'
+
+    # Use re.findall to find all matches of the pattern in the text
+    numbers = re.findall(pattern, text)
+
+    # Convert the extracted numbers from strings to floats
+    numbers = [float(num) for num in numbers]
+
+    return numbers
+
+def contains_question_mark(text: str) -> int:
+    """
+    Checks if a given text string contains a question mark.
+
+    Args:
+        text (str): The input string to be checked.
+
+    Returns:
+        int: Returns 1 if the text contains a question mark, otherwise 0.
+    """
+    return 1 if '?' in text else 0
+
+def contains_exclamation_mark(text: str) -> int:
+    """
+    Checks if a given text string contains an exclamation mark.
+
+    Args:
+        text (str): The input string to be checked.
+
+    Returns:
+        int: Returns 1 if the text contains an exclamation mark, otherwise 0.
+    """
+    return 1 if '!' in text else 0
+
 def extract_urls_from_text(text: str) -> list:
     """
     Extracts URLs from a text string.

-
+    Args:
     text : str
         The input text string containing URLs.

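
These three new helpers presumably feed the new metrics added further down in `topic_representation` (the "len_numbers", "interrogation" and "exclamation" columns): `extract_numbers` pulls numeric tokens as floats, while the two `contains_*` functions return 0/1 flags. A short usage sketch with invented inputs; note that the `\d+\.?\d*` pattern splits comma decimals such as "1,5" into two numbers:

    extract_numbers("2 posts scored 4.5 and 3")   # -> [2.0, 4.5, 3.0]
    contains_question_mark("Anyone tried this?")  # -> 1
    contains_exclamation_mark("Great thread")     # -> 0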
@@ -139,7 +213,7 @@ def extract_hashtags(text: str, lower: bool = True) -> list:
     '''
     Extracts hashtags from the text using a regular expression.

-
+    Args:
     text : str
         The input text string containing hashtags.
     lower : bool, optional
@@ -158,7 +232,7 @@ def extract_mentions(text: str, mention_char: str = '@', lower: bool = False) ->
     '''
     Extracts mentions from the text using a regular expression.

-
+    Args:
     text : str
         The input text string containing mentions.
     mention_char : str, optional
@@ -181,7 +255,7 @@ def remove_extra_spaces(text: str) -> str:
     """
     Removes extra spaces from a text string.

-
+    Args:
     text : str
         The input text string with extra spaces.

@@ -196,7 +270,7 @@ def remove_characters(text: str, start_indices: list, end_indices: list) -> str:
     """
     Remove characters from a text string using lists of start and end indices.

-
+    Args:
     text : str
         The input text string.
     start_indices : list of int
@@ -234,7 +308,7 @@ def load_stopwords_df(lang: str) -> pd.DataFrame:
     """
     Load a CSV file without header containing stopwords. If the file doesn't exist, it creates an empty file.

-
+    Args:
     lang : str
         The language code used to identify the stopwords file.

@@ -269,7 +343,7 @@ def add_stopwords(lang: str, new_stopwords: list, lower: bool = True) -> pd.Data
     """
     Add a list of stopwords to an existing file. It removes duplicates.

-
+    Args:
     lang : str
         The language code used to identify the stopwords file.
     new_stopwords : list of str
@@ -304,7 +378,7 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
     """
     Remove stopwords from an existing file.

-
+    Args:
     lang : str
         The language code used to identify the stopwords file.
     stopwords : list of str
@@ -330,7 +404,7 @@ def keep_alphanum_char(text: str, replace: str = '') -> str:
     """
     Replace all non-alphanumeric characters in a text string.

-
+    Args:
     text : str
         The input text string.
     replace : str, optional
@@ -347,7 +421,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
     """
     Substitute punctuations with white spaces in the input string.

-
+    Args:
     text (str): The input string.

     Returns:
@@ -360,7 +434,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
     """
     Translate text using LibreTranslate service.

-
+    Args:
     text : str
         The text to be translated.
     source : str
@@ -399,7 +473,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
     """
     Translate a batch of texts using LibreTranslate service.

-
+    Args:
     batch_text : list of str
         The list of texts to be translated.
     source : str
@@ -442,7 +516,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
     """
     Translate text using LibreTranslate service.

-
+    Args:
     text : str
         The text to be translated.
     source : str
@@ -474,7 +548,7 @@ def translate_row(df: pd.DataFrame, col: str, source: str = "auto", target: str
     """
     Translate the text in a specific column of a DataFrame.

-
+    Args:
     df : pandas DataFrame
         The DataFrame containing the text to be translated.
     col : str
@@ -504,7 +578,7 @@ def cosine_similarity(a: np.array, b: np.array) -> float:
     """
     Calculate the cosine similarity between two vectors.

-
+    Args:
     a : numpy array
         The first vector.
     b : numpy array
@@ -520,7 +594,7 @@ def approximate_tokens(text: str) -> int:
     """
     Approximate the number of tokens in a text.

-
+    Args:
     text : str
         The input text.

@@ -534,7 +608,7 @@ def approximate_unique_tokens(text: str) -> int:
     """
     Approximate the number of distinct tokens in a text.

-
+    Args:
     text : str
         The input text.

@@ -548,7 +622,7 @@ def count_word_occurrences(text: str, word: str) -> int:
     """
     Count the occurrences of a word in a text.

-
+    Args:
     text : str
         The input text.
     word : str
@@ -571,7 +645,7 @@ def chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words
     """
     Calculate Chi-squared (Chi2) statistics per category based on the provided texts and corresponding categories.

-
+    Args:
     lst_text : list
         List of texts for which Chi2 will be calculated.
     lst_categorie : list
@@ -614,7 +688,7 @@ def word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str,
     """
     Calculate word frequency per category.

-
+    Args:
     df : pandas DataFrame
         DataFrame containing text data and corresponding categories.
     col_text : str
|
     """
     Count the occurrences of items (e.g., hashtags) per category and select the top items per category.

-
+    Args:
     df : pandas DataFrame
         DataFrame containing data.
     col_lst : str, optional
|
     """
     Calculate the representation of topics in a processed DataFrame.

-
+    Args:
     df_processed_data : pandas DataFrame
         DataFrame containing processed data.
     col_topic : str
@@ -740,6 +814,9 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
     metrics_dict['unique_mentions']=("mentions", lambda x: len(set(mention for sublist in x for mention in sublist)))
     metrics_dict['verbatims_with_mentions']=("mentions_count", lambda x: (x > 0).sum() )
     metrics_dict['mentions_occurences']=("mentions_count", "sum")
+    metrics_dict['verbatims_with_numbers']= ("len_numbers", lambda x: (x > 0).sum())
+    metrics_dict['verbatims_with_interrogation']=("interrogation", "sum")
+    metrics_dict['verbatims_with_exclamation']=("exclamation", "sum")
     metrics_dict['topic_x']=("x", "mean")
     metrics_dict['topic_y']=("y", "mean")

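
The three new `metrics_dict` entries are pandas named-aggregation specs: a source column paired with an aggregation, the columns presumably being produced earlier from the new `extract_numbers` / `contains_question_mark` / `contains_exclamation_mark` helpers. A minimal sketch of how such pairs behave when unpacked into `groupby().agg()`; the DataFrame here is invented for illustration:

    import pandas as pd

    df = pd.DataFrame({"topic": ["a", "a", "b"],
                       "interrogation": [1, 0, 1],
                       "len_numbers": [0, 2, 1]})
    df.groupby("topic").agg(
        verbatims_with_interrogation=("interrogation", "sum"),
        verbatims_with_numbers=("len_numbers", lambda x: (x > 0).sum()),
    )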
@@ -757,6 +834,9 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
         .assign(percentage_verbatims_with_emoji = lambda x : x["verbatims_with_emoji"] / x["verbatims"])
         .assign(percentage_verbatims_with_hashtags = lambda x : x["verbatims_with_hashtags"] / x["verbatims"])
         .assign(percentage_verbatims_with_mentions = lambda x : x["verbatims_with_mentions"] / x["verbatims"])
+        .assign(percentage_verbatims_with_numbers = lambda x : x["verbatims_with_numbers"] / x["verbatims"])
+        .assign(percentage_verbatims_with_numbers = lambda x : x["verbatims_with_interrogation"] / x["verbatims"])
+        .assign(percentage_verbatims_with_numbers = lambda x : x["verbatims_with_exclamation"] / x["verbatims"])
         .reset_index())

     df_distrib_all[col_topic]=df_distrib_all[col_topic].astype(str)
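
Note that the three added `.assign` calls all write to the same keyword, `percentage_verbatims_with_numbers`, so each call overwrites the previous one and only the exclamation ratio survives under that name in the released code. A self-contained sketch of the same chain with distinct column names, which appears to be the intent (this is a hypothetical variant, not what 0.0.8 ships):

    import pandas as pd

    # Invented single-row frame, just to exercise the assign chain.
    df = pd.DataFrame({"verbatims": [10], "verbatims_with_numbers": [4],
                       "verbatims_with_interrogation": [2], "verbatims_with_exclamation": [1]})
    df = (df
          .assign(percentage_verbatims_with_numbers = lambda x: x["verbatims_with_numbers"] / x["verbatims"])
          .assign(percentage_verbatims_with_interrogation = lambda x: x["verbatims_with_interrogation"] / x["verbatims"])
          .assign(percentage_verbatims_with_exclamation = lambda x: x["verbatims_with_exclamation"] / x["verbatims"]))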
@@ -766,7 +846,7 @@ def generic_representation(df_processed_data: pd.DataFrame, col_gb: str, col_id:
     """
     Calculate a generic representation of data based on grouping by a specified column.

-
+    Args:
     df_processed_data : pandas DataFrame
         DataFrame containing processed data.
     col_gb : str
@@ -814,7 +894,7 @@ def create_frequency_table(df: pd.DataFrame, col: str) -> pd.DataFrame:
     """
     Create a frequency table for a given column in a DataFrame.

-
+    Args:
     df : pandas DataFrame
         DataFrame containing the data.
     col : str
@@ -845,7 +925,7 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
     """
     Convert a percentage to the number of rows to sample.

-
+    Args:
     len_df : int
         Length of the DataFrame.
     n_rows : float
@@ -855,8 +935,6 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
     int
         Number of rows to sample.

-    Description:
-    This function converts a percentage of the DataFrame length into a number of rows to sample. If `n_rows` is between 0 and 1, it's treated as a percentage and converted into an integer representing the top `n_rows` percentage of the DataFrame length. If `n_rows` is greater than 1 or equal to 0, it's treated as an absolute number of rows.
     """
     if 0 < n_rows <= 1 :
         top_rows = int(n_rows * len_df)
@@ -870,8 +948,9 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
 def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: float = 0.3, sample_size: float = 0.5) -> pd.DataFrame:
     """
     Create a sample dataset by keeping a part of the top publications based on engagement metrics.
+    This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness

-
+    Args:
     df : pandas.DataFrame
         The original DataFrame.
     col_engagement : str
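
The docstring sentence moved into the summary spells out the sampling strategy: sort by the engagement column, keep the top `top_rows` share, then sample the remainder up to `sample_size` and shuffle. A usage sketch; the column name and ratios are illustrative, not prescribed by the package:

    df_sample = sampling_by_engagement(df, col_engagement="engagements", top_rows=0.3, sample_size=0.5)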
@@ -885,8 +964,6 @@ def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: floa
     pandas.DataFrame
         The sampled DataFrame.

-    Description:
-    This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness.
     """

     sample_rows = calculate_sample(len(df), sample_size)
@@ -911,7 +988,7 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
     """
     Perform a "stratified sample" of the most engaging content per topic, ensuring a minimum number of items per group.

-
+    Args:
     df : pandas.DataFrame
         The DataFrame containing the data.
     col_topic : str
@@ -927,8 +1004,6 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
     pandas.DataFrame
         The sampled DataFrame.

-    Description:
-    This function performs a "stratified sample" of the most engaging content per topic. It sorts the data by engagement metrics within each topic group, and then takes a sample of `sample_size` proportion from each group. If a group has fewer than `min_size` items, it retains all items in that group.
     """
     df = (df.groupby(col_topic, group_keys=False)
           .apply(lambda x: x.sort_values(by=col_engagement, ascending=False)
@@ -948,7 +1023,7 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_
     """
     Perform natural language processing tasks using spaCy for topic modeling.

-
+    Args:
     nlp : spacy.Language
         The spaCy language model.
     df : pandas.DataFrame
@@ -974,8 +1049,6 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_
     pandas.DataFrame
         The DataFrame with processed text data.

-    Description:
-    This function utilizes spaCy for natural language processing tasks such as lemmatization, emoji extraction, and token counting. It processes the text data in the DataFrame and returns the DataFrame with additional columns for lemmatized text, emoji counts, token counts, and more.
     """
     all_lemmas=[]
     tokens_counts=[]
@@ -1029,9 +1102,15 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_

 def load_spacy_model(model: str, disable_components: list = ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect: bool = False, emoji: bool = False) -> spacy.language.Language:
     """
-    Load a spaCy model with optional configurations.
+    Load a spaCy model with optional configurations. This function loads a spaCy model with optional configurations such as disabling specific components, enabling emoji parsing,
+    and enabling language detection. It first loads the spaCy model specified by the 'model' parameter and then applies
+    additional configurations based on the provided flags.

-
+    If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
+    language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
+    will be included in the spaCy pipeline.
+
+    Args:
     model : str
         Name of the spaCy model to load.
     disable_components : list, optional
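
A usage sketch of the reorganized `load_spacy_model` entry point; the model name is an assumption (any installed spaCy package works), and the flags mirror the options documented above:

    nlp = load_spacy_model("fr_core_news_lg", lang_detect=True, emoji=True)
    doc = nlp("Un petit exemple de texte 🙂")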
@@ -1044,15 +1123,7 @@ def load_spacy_model(model: str, disable_components: list = ["transformer", "mor
     Returns:
     nlp : spacy.language.Language
         Loaded spaCy language processing pipeline.
-
-    Description:
-    This function loads a spaCy model with optional configurations such as disabling specific components, enabling emoji parsing,
-    and enabling language detection. It first loads the spaCy model specified by the 'model' parameter and then applies
-    additional configurations based on the provided flags.
-
-    If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
-    language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
-    will be included in the spaCy pipeline.
+
     """
     if torch.cuda.is_available():

@@ -1074,23 +1145,15 @@ def load_spacy_model(model: str, disable_components: list = ["transformer", "mor

 def get_labels(nlp: spacy.language.Language, pipe_step: str = "ner", explanations: bool = False) -> pd.DataFrame:
     """
-    Return labels associated with a pipeline step and optionally provide explanations.
-
-
-
-
-    pipe_step : str, optional
-        The pipeline step for which labels are retrieved. Default is "ner".
-    explanations : bool, optional
-        Flag indicating whether to include explanations for the labels. Default is False.
+    Return labels associated with a pipeline step and optionally provide explanations.This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline. It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
+    Args:
+        nlp : spacy.language.Language. The spaCy language processing pipeline.
+        pipe_step : str, optional. The pipeline step for which labels are retrieved. Default is "ner".
+        explanations : bool, optional. Flag indicating whether to include explanations for the labels. Default is False.

     Returns:
-        DataFrame
-
-
-    Description:
-    This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline.
-    It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
+        DataFrame : DataFrame containing the labels associated with the specified pipeline step.
+
     """
     pipe_details=nlp.get_pipe(pipe_step)
     labels=list(pipe_details.labels)
|

 def spacy_langdetect(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
     """
-    Detect language and return a score.
+    Detect language and return a score.This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.It returns a DataFrame containing the detected languages and their scores, which indicate the confidence levelof the language detection for each text.

-
+    Args:
     nlp : spacy.language.Language
         The spaCy language processing pipeline with language detection enabled.
     df : pd.DataFrame
|
     Returns:
         pd.DataFrame
             DataFrame containing the detected languages and their scores.
-
-    Description:
-    This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.
-    It returns a DataFrame containing the detected languages and their scores, which indicate the confidence level
-    of the language detection for each text.
+
     """
     text=list(df[col_text].astype('unicode').values)

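
A usage sketch of `spacy_langdetect` as documented above, assuming `nlp` was loaded with `lang_detect=True` and `df` holds a text column named "text" (both names are illustrative):

    df_lang = spacy_langdetect(nlp, df, col_text="text")
    # df_lang holds one detected language and confidence score per row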
@@ -1847,46 +1906,26 @@ def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selectio
                         p=None, cluster_selection_method='eom', prediction_data = True):

     """
-
-
-        The input data to be clustered.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        The number of samples in a neighborhood for a point to be considered a core point. If None, the value is set to min_cluster_size. Default is None.
-    p : int, optional
-        The Minkowski p-norm distance metric parameter. Default is None.
-    cluster_selection_method : {'eom', 'leaf', 'leaf_similar', 'eom_similar', 'tree', 'beagle'}, optional
-        The method used to select clusters from the condensed tree. Default is 'eom'.
-    prediction_data : bool, optional
-        Whether the data is prediction data or not. Default is True.
-
-    Returns:
-    clusterer : hdbscan.hdbscan_.HDBSCAN
-        HDBSCAN clusterer object.
-    labels : array, shape (n_samples,)
-        Cluster labels for each point. Noisy samples are given the label -1.
-    probabilities : array, shape (n_samples,)
-        The probability of each sample being an outlier.
-
-    Description:
-    This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
-    It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
-    probability of each sample being an outlier.
+    This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm. It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the probability of each sample being an outlier.
+    Args
+        embeddings : array-like or sparse matrix, shape (n_samples, n_features). The input data to be clustered.
+        algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is 'best'.
+        alpha : float, optional. Scaling factor determining the individual weight of the (unnormalized) density estimate. Default is 1.0.
+        cluster_selection_epsilon : float, optional. The epsilon value to specify a minimum cluster size. Default is 0.0.
+        approx_min_span_tree : bool, optional. Whether to compute an approximation of the minimum spanning tree. Default is True.
+        gen_min_span_tree : bool, optional. Whether to compute the minimum spanning tree. Default is True.
+        leaf_size : int, optional. Leaf size for the underlying KD-tree or Ball Tree. Default is 40.
+        metric : str or callable, optional. The metric to use for distance computation. Default is 'euclidean'.
+        min_cluster_size : int, optional. The minimum size of clusters; single linkage splits that produce smaller clusters than this will be considered points "falling out" of a cluster rather than a cluster splitting into two new clusters. Default is 5.
+        min_samples : int or None, optional. The number of samples in a neighborhood for a point to be considered a core point. If None, the value is set to min_cluster_size. Default is None.
+        p : int, optional. The Minkowski p-norm distance metric parameter. Default is None.
+        cluster_selection_method : {'eom', 'leaf', 'leaf_similar', 'eom_similar', 'tree', 'beagle'}, optional. The method used to select clusters from the condensed tree. Default is 'eom'.
+        prediction_data : bool, optional. Whether the data is prediction data or not. Default is True.
+
+    Returns:
+        clusterer : hdbscan.hdbscan_.HDBSCAN. HDBSCAN clusterer object.
+        labels : array, shape (n_samples,). Cluster labels for each point. Noisy samples are given the label -1.
+        probabilities : array, shape (n_samples,). The probability of each sample being an outlier.
     """
     clusterer = hdbscan.HDBSCAN(algorithm=algorithm,
                                 alpha=alpha,
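
The consolidated docstring now lists every HDBSCAN parameter inline. A usage sketch of the wrapper, assuming the keyword arguments documented above are accepted by the full signature (the diff view elides part of it); the embedding matrix here is random, purely for illustration:

    import numpy as np

    embeddings = np.random.rand(1000, 384)
    clusterer, labels, probabilities = hdbscan_clustering(embeddings, min_cluster_size=15, metric="euclidean")
    noise_share = (labels == -1).mean()  # HDBSCAN marks noise points with label -1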
@@ -2017,4 +2056,5 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
     results = {"label":label, "score" : float(proba.max()), col_text : text}
     print(results)
     write_json(results, dir_json , str(filename))
+
     return results