opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -2
- opsci_toolbox/apis/reddit.py +407 -0
- opsci_toolbox/apis/telegram.py +1125 -0
- opsci_toolbox/helpers/common.py +177 -5
- opsci_toolbox/helpers/dataviz.py +184 -26
- opsci_toolbox/helpers/dates.py +47 -1
- opsci_toolbox/helpers/gliner.py +88 -0
- opsci_toolbox/helpers/nlp.py +273 -15
- opsci_toolbox/helpers/nlp_cuml.py +44 -3
- opsci_toolbox/helpers/sna.py +1 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/METADATA +5 -2
- opsci_toolbox-0.0.14.dist-info/RECORD +26 -0
- opsci_toolbox-0.0.14.dist-info/dependency_links.txt +1 -0
- opsci_toolbox-0.0.12.dist-info/RECORD +0 -22
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -7,7 +7,7 @@ import os
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.manifold import TSNE
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
 from sklearn.cluster import AgglomerativeClustering
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
@@ -25,9 +25,10 @@ import requests
 import json
 from opsci_toolbox.helpers.common import write_json, write_pickle, load_pickle, create_dir, copy_file, write_jsonl
 from textacy.preprocessing.replace import urls
+from textacy.preprocessing.remove import brackets
 from eldar import Query
 import torch
-from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
 
 
@@ -93,10 +94,15 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bo
         The filtered DataFrame.
     """
     eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
-    df
+    df = df[df[col_text].apply(eldar_query)]
     df=df.reset_index(drop=True)
     return df
 
+def remove_trailing_dots(text):
+    if text.endswith('…'):
+        return text[:-3].strip()
+    return text
+
 def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     """
     Generic cleaning process for topic modeling.
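In 0.0.14, `filter_by_query` applies the eldar query to `col_text` as a boolean mask before resetting the index. A minimal usage sketch (the sample data and query string are illustrative, not from the package):

    import pandas as pd
    from opsci_toolbox.helpers.nlp import filter_by_query

    df = pd.DataFrame({"text": ["GPU prices are falling", "I love my cat", "walking the dog"]})

    # eldar-style boolean query: keeps only rows whose text matches
    filtered = filter_by_query(df, "text", '("cat" OR "dog") AND NOT "GPU"',
                               ignore_case=True, ignore_accent=True, match_word=True)
    print(filtered)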
@@ -114,12 +120,19 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
         The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(remove_rt)
+    df[col_clean] = df[col_clean].apply(remove_emoji)
+    df[col_clean] = df[col_clean].apply(remove_trailing_dots)
+    df[col_clean] = df[col_clean].apply(remove_html_tags)
+    df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
     df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
+    df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
     return df
 
+
+
 def extract_insta_shortcode(url: str) -> str:
     """
     Extracts the shortcode from an Instagram URL.
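The cleaning chain in `TM_clean_text` now also strips emojis, trailing ellipses, HTML tags, bracketed content and runs of hashtags before the existing URL and @-mention cleanup. A hedged usage sketch (column names are illustrative):

    import pandas as pd
    from opsci_toolbox.helpers.nlp import TM_clean_text

    df = pd.DataFrame({"message": ["RT @user: Great thread! 😀 #ai #nlp #llm"]})
    df = TM_clean_text(df, col="message", col_clean="message_clean")
    print(df["message_clean"].iloc[0])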
@@ -151,6 +164,39 @@ def remove_parentheses_content(text: str) -> str:
     result = re.sub(r'\([^)]*\)', '', text)
     return result
 
+def remove_hashtags(text: str) -> str:
+    """
+    Removes any hashtag from text.
+
+    Args:
+        text : str
+            The input text string to clean.
+
+    Returns:
+        result : str
+            The input text string with hashtags removed.
+    """
+    pattern = r'\B#\w+'
+    result = re.sub(pattern, '', text).strip()
+    return result
+
+def remove_multiple_hashtags(text: str) -> str:
+    """
+    Removes series of hashtags separated by spaces.
+
+    Args:
+        text : str
+            The input text string to clean.
+
+    Returns:
+        result : str
+            The input text string with series of hashtags removed.
+    """
+    pattern = r'(?:\B#\w+\s*){2,}'
+    result = re.sub(pattern, '', text).strip()
+    return result
+
+
 def remove_emojis(text: str) -> str:
     """
     Removes emojis and their textual representations from a text string.
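The two new helpers differ only in their pattern: `remove_hashtags` drops every hashtag, while `remove_multiple_hashtags` only drops runs of two or more consecutive hashtags (typically trailing hashtag blocks) and leaves isolated in-text hashtags in place. A standalone illustration of the two regexes:

    import re

    text = "Loving the new #GPU results #ai #nlp #llm"

    # pattern used by remove_hashtags: every hashtag goes
    print(re.sub(r'\B#\w+', '', text).strip())
    # 'Loving the new  results' (in TM_clean_text the leftover whitespace is later collapsed by remove_extra_spaces)

    # pattern used by remove_multiple_hashtags: only the trailing run of hashtags goes
    print(re.sub(r'(?:\B#\w+\s*){2,}', '', text).strip())
    # 'Loving the new #GPU results'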
@@ -171,6 +217,31 @@ def remove_emojis(text: str) -> str:
 
     return text_no_emojis
 
+def remove_emoji(string):
+    emoji_pattern = re.compile(
+        "["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002500-\U00002BEF"  # chinese char
+        u"\U00002702-\U000027B0"
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        u"\U0001f926-\U0001f937"
+        u"\U00010000-\U0010ffff"
+        u"\u2640-\u2642"
+        u"\u2600-\u2B55"
+        u"\u200d"
+        u"\u23cf"
+        u"\u23e9"
+        u"\u231a"
+        u"\ufe0f"  # dingbats
+        u"\u3030"
+        "]+", flags=re.UNICODE)
+    return emoji_pattern.sub(r'', string)
+
+
 def extract_numbers(text: str) -> list:
     """
     Extracts all numeric values from a given text string and returns them as a list of floats.
@@ -421,6 +492,23 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
     df.to_csv(file_path, encoding="utf-8", index=False)
     print("File saved -", file_path)
     return df
+
+def keep_valid_filename_chars(text: str, replace: str = '') -> str:
+    """
+    Replace all characters not typically allowed in filenames with a specified replacement string.
+
+    Args:
+        text : str
+            The input text string.
+        replace : str, optional
+            The string to replace invalid filename characters with. Default is an empty string.
+
+    Returns:
+        cleaned_text : str
+            The input text string with invalid filename characters replaced.
+    """
+    return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)
+
 
 
 def keep_alphanum_char(text: str, replace: str = '') -> str:
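`keep_valid_filename_chars` is a single regex substitution; note that the character class also covers the dot and control characters, so file extensions are stripped as well. A standalone illustration of the pattern with the default empty replacement:

    import re

    raw = 'report: Q3/2024 "draft"?.csv'
    print(re.sub(r'[.<>:"/\\|?*\x00-\x1F]', '', raw))
    # 'report Q32024 draftcsv'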
@@ -453,7 +541,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
     text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
     return text
 
-def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> dict:
     """
     Translate text using LibreTranslate service.
 
@@ -470,6 +558,8 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
             The directory to save the translation result JSON file.
         url : str, optional
             The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+        format_payload : str, optional
+            Possible values are html or text.
 
     Returns:
         json_data : dict
@@ -480,7 +570,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
         "q": text,
         "source": source,
         "target": target,
-        "format":
+        "format": format_payload,
         "api_key": ""
     }
 
@@ -492,7 +582,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
     write_json(json_data, dir_json , str(filename))
     return json_data
 
-def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> list:
     """
     Translate a batch of texts using LibreTranslate service.
 
@@ -509,6 +599,8 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
             The directory to save the translation result JSONL file.
         url : str, optional
             The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+        format_payload : str, optional
+            Possible values are html or text.
 
     Returns:
         json_results : list of dict
@@ -519,7 +611,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
         "q": batch_text,
         "source": source,
         "target": target,
-        "format":
+        "format": format_payload,
         "api_key": ""
     }
 
@@ -535,7 +627,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
     write_jsonl(json_results, dir_json , str(filename))
     return json_results
 
-def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> str:
     """
     Translate text using LibreTranslate service.
 
@@ -548,6 +640,8 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
            The target language code.
        url : str, optional
            The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+       format_payload : str, optional
+           Possible values are html or text.
 
    Returns:
        translatedText : str
@@ -558,7 +652,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
         "q": text,
         "source": source,
         "target": target,
-        "format":
+        "format": format_payload,
         "api_key": ""
     }
 
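All three LibreTranslate wrappers (`translate_wt_libre`, `translate_batch`, `translate`) now expose `format_payload`, which is forwarded as the `format` field of the POST body. A hedged usage sketch, assuming a LibreTranslate instance is running on the default local URL:

    from opsci_toolbox.helpers.nlp import translate

    # "text" sends the payload as plain text; "html" (the default) preserves markup
    translated = translate("Bonjour tout le monde", source="fr", target="en", format_payload="text")
    print(translated)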
@@ -788,7 +882,95 @@ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat:
     )
     return df_count
 
-def
+def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_chunk_id: str, col_engagement: str, col_user_id: str=None, metrics : dict =dict())-> pd.DataFrame:
+    """
+    Calculate the intermediate agregation of chunks per Post ID and topic
+
+    Args:
+        df : pandas DataFrame
+            DataFrame containing processed data.
+        col_id : str
+            Name of the column containing unique posts identifiers.
+        col_topic : str
+            Name of the column containing topic labels.
+        col_chunk_id : str
+            Name of the column containing unique sentences identifiers.
+        col_engagement : str
+            Name of the column containing engagement metrics.
+        col_user_id : str
+            Name of the column containing user identifiers.
+        metrics : dict
+            Dictionary containing additional metrics to aggregate.
+
+    Returns:
+        DataFrame
+            DataFrame containing the agregated posts per topic
+
+    Description:
+        This function aggregates various metrics for each post and topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+    """
+    metrics_dict = dict()
+    # metrics_dict[col_id]=(col_id,'first')
+    metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
+    metrics_dict[col_engagement]=(col_engagement,'first')
+
+    if col_user_id:
+        metrics_dict[col_user_id]=(col_user_id,"first")
+    if "sentiment" in df.columns:
+        metrics_dict["sentiment"] = ("sentiment", "mean")
+    if "sentiment_score" in df.columns:
+        metrics_dict["sentiment_score"] = ("sentiment_score", "mean")
+
+    metrics_dict["tokens_count"] = ("tokens_count", "sum")
+    metrics_dict["lemmas_count"] = ("lemmas_count", "sum")
+    metrics_dict["emojis_count"] = ("emojis_count", "sum")
+    metrics_dict["unique_emojis"] = ("unique_emojis", lambda x: set(emoji for sublist in x for emoji in sublist))
+    metrics_dict["unique_emojis_count"] = ("unique_emojis", len)
+    metrics_dict["hashtags"] = ("hashtags", lambda x: list(hashtag for sublist in x for hashtag in sublist))
+    metrics_dict["hashtags_count"] = ("hashtags_count", "sum")
+    metrics_dict["mentions"] = ("mentions", lambda x: list(mention for sublist in x for mention in sublist))
+    metrics_dict["mentions_count"] = ("mentions_count", "sum")
+    metrics_dict["extracted_urls_from_text"] = ("extracted_urls_from_text", lambda x: list(url for sublist in x for url in sublist))
+    metrics_dict["domain"] = ("domain", lambda x: list(domain for sublist in x for domain in sublist))
+    metrics_dict["len_numbers"] = ("len_numbers", "sum")
+    metrics_dict["interrogation"] = ("interrogation", "sum")
+    metrics_dict["exclamation"] = ("exclamation", "sum")
+    metrics_dict["x"] = ("x", "mean")
+    metrics_dict["y"] = ("y", "mean")
+
+    metrics_dict.update(metrics)
+
+    df_gb = df.groupby([col_id, col_topic]).agg(**metrics_dict).reset_index()
+    df_gb[col_topic]=df_gb[col_topic].astype(str)
+
+    return df_gb
+
+def sentiment_to_category(sentiment : float, boundaries : list = [-1.0, -0.5, 0.5, 1.0], labels :list = ['negative', 'neutral', 'positive']) -> str:
+    """
+    Assign a sentiment category to a sentiment score.
+
+    Args:
+        sentiment : float
+            sentiment score
+        boundaries : list
+            list of boundaries for each category
+        labels : list
+            list of labels for each category
+
+    Returns:
+        str
+            category label
+
+    Description:
+        This function assigns a sentiment category to a sentiment score based on a list of boundaries and labels. If the sentiment score is outside the boundaries, it is assigned to the last category.
+    """
+    for i in range(len(boundaries) - 1):
+        if boundaries[i] <= sentiment < boundaries[i + 1]:
+            return labels[i]
+    return labels[-1]
+
+
+def topic_representation(df: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
     """
     Calculate the representation of topics in a processed DataFrame.
 
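`sentiment_to_category` is a plain threshold lookup: with the default boundaries [-1.0, -0.5, 0.5, 1.0] a score falls into the negative / neutral / positive bands, and anything outside the boundaries gets the last label. A quick check of the mapping:

    from opsci_toolbox.helpers.nlp import sentiment_to_category

    for score in (-0.8, 0.0, 0.75, 1.0):
        print(score, "->", sentiment_to_category(score))
    # -0.8 -> negative, 0.0 -> neutral, 0.75 -> positive, 1.0 -> positive (falls through to the last label)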
@@ -822,11 +1004,15 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
     metrics_dict['engagements']=(col_engagement,'sum')
     if col_user_id:
         metrics_dict["users"]=(col_user_id,"nunique")
+    panel_cols = [col for col in df.columns if col[:6] == 'panel_']
+    if len(panel_cols)>0:
+        for panel_col in panel_cols:
+            metrics_dict[panel_col+'_verbatims'] = (panel_col, "sum")
+            metrics_dict[panel_col+'_users'] = (col_user_id, lambda x : x[df[panel_col]].nunique())
+            metrics_dict[panel_col+'_engagements'] = (col_engagement, lambda x : x[df[panel_col]].sum())
 
     metrics_dict.update(metrics)
 
-    print(metrics_dict)
-
     metrics_dict['avg_word_count']=("tokens_count", lambda x: round(x.mean(),2))
     metrics_dict['verbatims_with_emoji']=("emojis_count", lambda x: (x > 0).sum() )
     metrics_dict['emojis_occurences']=("emojis_count", "sum")
|
|
843
1029
|
metrics_dict['topic_x']=("x", "mean")
|
844
1030
|
metrics_dict['topic_y']=("y", "mean")
|
845
1031
|
|
846
|
-
|
847
1032
|
# on produit la représentation des topics finale
|
848
|
-
df_distrib_all = (
|
1033
|
+
df_distrib_all = (df.groupby(col_topic)
|
849
1034
|
.agg(**metrics_dict)
|
850
1035
|
.sort_values(by="verbatims", ascending=False)
|
851
1036
|
.assign(engagement_per_verbatims = lambda x : x["engagements"] / x["verbatims"])
|
@@ -1101,10 +1286,12 @@ def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "l
             NER_type.append(ent.label_)
             NER_text.append(ent.text)
 
+
         record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
         all_records.append(record)
 
 
+
     df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
 
     return df
@@ -1819,6 +2006,75 @@ def encode_chunked_files(chunk_files_paths: list,
 
     return new_file_paths
 
+####################################################################
+# ENCODING FEATURES
+####################################################################
+
+def encode_labels(data_to_encode: np.ndarray) -> tuple:
+    """
+    Encodes a list of labels using a LabelEncoder.
+
+    Args:
+    - data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type,
+      but strings or integers are typical.
+
+    Returns:
+    - Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array
+      of encoded labels.
+    """
+    label_encoder = LabelEncoder()
+    label_encoder.fit(data_to_encode)
+    encoded_labels = label_encoder.transform(data_to_encode)
+    return label_encoder, encoded_labels
+
+
+def encode_new_labels(label_encoder : LabelEncoder, data_to_encode : np.ndarray) -> np.ndarray:
+    """
+    Encodes a list of new labels using an already fitted LabelEncoder.
+
+    Args:
+    - label_encoder (LabelEncoder): A pre-fitted LabelEncoder instance.
+    - data_to_encode (List[Union[str, int]]): The list of new labels to encode using the pre-fitted encoder.
+
+    Returns:
+    - np.ndarray: A numpy array of encoded labels.
+    """
+    encoded_labels = label_encoder.transform(data_to_encode)
+    return encoded_labels
+
+def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
+    """
+    One-hot encodes a list of categorical values using OneHotEncoder.
+
+    Args:
+    - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
+      any hashable type, typically strings or integers.
+
+    Returns:
+    - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
+      of one-hot encoded values.
+    """
+    one_hot_encoder = OneHotEncoder(sparse=False)
+    data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1)  # Reshape for OneHotEncoder
+    one_hot_encoder.fit(data_to_encode_reshaped)
+    encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+    return one_hot_encoder, encoded_array
+
+
+def one_hot_encode_new_data(one_hot_encoder: OneHotEncoder, data_to_encode: np.ndarray) -> np.ndarray:
+    """
+    One-hot encodes a list of new categorical values using an already fitted OneHotEncoder.
+
+    Args:
+    - one_hot_encoder (OneHotEncoder): A pre-fitted OneHotEncoder instance.
+    - data_to_encode (List[Union[str, int]]): The list of new categorical values to encode using the pre-fitted encoder.
+
+    Returns:
+    - np.ndarray: A numpy array of one-hot encoded values.
+    """
+    data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1)  # Reshape for OneHotEncoder
+    encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+    return encoded_array
 
 ####################################################################
 # SCALING FEATURES
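The new encoding helpers are thin wrappers around scikit-learn's LabelEncoder and OneHotEncoder; the fit-time functions return the fitted encoder so it can be reused on unseen data via the `*_new_*` variants. A usage sketch (note that `OneHotEncoder(sparse=False)` targets scikit-learn versions where the `sparse` keyword still exists; newer releases renamed it to `sparse_output`):

    import numpy as np
    from opsci_toolbox.helpers.nlp import encode_labels, encode_new_labels

    labels = np.array(["positive", "negative", "neutral", "positive"])
    label_encoder, encoded = encode_labels(labels)
    print(encoded)                                                   # [2 0 1 2] (classes sorted alphabetically)
    print(encode_new_labels(label_encoder, np.array(["neutral"])))   # [1]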
@@ -2130,13 +2386,15 @@ def check_gpu():
 def HF_load_model(model_checkpoint):
     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
     model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+    config = AutoConfig.from_pretrained(model_checkpoint)
     if torch.cuda.is_available():
         model.cuda()
-    return model, tokenizer
+    return model, tokenizer, config
 
 def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
     """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
     file_path= os.path.join(dir_json , str(filename)+'.json')
+    results = {}
     if not os.path.exists(file_path):
         with torch.no_grad():
             inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
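`HF_load_model` now also returns the checkpoint's AutoConfig, so callers can recover label mappings without reloading anything. A hedged sketch (the checkpoint name is only an example):

    from opsci_toolbox.helpers.nlp import HF_load_model

    model, tokenizer, config = HF_load_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
    print(config.id2label)   # e.g. {0: 'negative', 1: 'neutral', 2: 'positive'}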
opsci_toolbox/helpers/nlp_cuml.py
CHANGED
@@ -46,6 +46,44 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
 
+
+def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
+                                     n_neighbors: int = 5,
+                                     n_components: int = 3,
+                                     min_dist: float = 0.0,
+                                     metric: str = "cosine",
+                                     spread: float = 1.0,
+                                     learning_rate: float = 1.0,
+                                     n_epochs:int = 300,
+                                     y: np.ndarray = None,
+                                     convert_dtype: bool = False
+                                     ) -> tuple:
+    """
+    Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+    Args:
+        embeddings (np.ndarray): The input embeddings to be reduced.
+        n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+        n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+        min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+        metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+        spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+    Returns:
+        reducer (UMAP): The UMAP reducer object.
+        reduced_embeddings (np.ndarray): The reduced embeddings.
+    """
+    reducer = UMAP(n_neighbors=n_neighbors,
+                   n_components=n_components,
+                   min_dist=min_dist,
+                   metric=metric,
+                   spread = spread,
+                   n_epochs=n_epochs,
+                   learning_rate=learning_rate).fit(X = embeddings, y = y, convert_dtype = convert_dtype)
+
+    reduced_embeddings = reducer.transform(embeddings)
+    return reducer, reduced_embeddings
+
 def transform_with_cuml_UMAP(reducer,
                              new_embeddings: np.ndarray) -> np.ndarray:
     """
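`supervised_reduce_with_cuml_UMAP` is the same cuML UMAP wrapper as `reduce_with_cuml_UMAP`, but it forwards target labels `y` to `fit`, enabling (semi-)supervised dimensionality reduction. A hedged sketch, assuming a RAPIDS/cuML environment; the random data stands in for real sentence embeddings:

    import numpy as np
    from opsci_toolbox.helpers.nlp import encode_labels
    from opsci_toolbox.helpers.nlp_cuml import supervised_reduce_with_cuml_UMAP

    embeddings = np.random.rand(1000, 384).astype(np.float32)
    topics = np.random.choice(["tech", "sport", "politics"], size=1000)

    _, y = encode_labels(topics)
    reducer, reduced = supervised_reduce_with_cuml_UMAP(embeddings, n_components=2, y=y, convert_dtype=True)
    print(reduced.shape)   # (1000, 2)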
@@ -384,7 +422,6 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
     # Initialize cuML's CountVectorizer
     count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
 
-    print(type(gdf[col_text]))
     # Fit and transform the text data
     X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
     X_names_count = count_vectorizer.get_feature_names()
@@ -402,13 +439,17 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
         df_count_tmp = df_count_tmp.head(n_words)
         if min_freq:
             df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
-
+
+        df_count_tmp['word'] = df_count_tmp['word'].astype(str)
         # Concatenate the result to the main DataFrame
         df_count = cudf.concat([df_count, df_count_tmp])
 
     # Convert the result back to pandas DataFrame
     return df_count.to_pandas()
 
+
+
+
 # def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
 
 #     # Convert input lists to cuDF Series
@@ -588,7 +629,7 @@ def cudf_encode_chunked_files(chunk_files_paths: list,
         current_df = cudf_read_parquet(file)
 
         text_list = current_df[col_text].to_arrow().to_pylist()
-
+
         # text vectorization
         embeddings = HF_encoder.embed_documents(text_list)
 
opsci_toolbox/helpers/sna.py
CHANGED
@@ -421,6 +421,7 @@ def select_top_nodes_by_degrees(G: nx.Graph, degree_type : str = "degree", N : i
     return subgraph
 
 
+
 def scale_size(G, size_attribute, min_node_size = 10, max_node_size = 100):
     """
     Scale the sizes of nodes in a graph based on a specified attribute.
{opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.
+Version: 0.0.14
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -41,13 +41,16 @@ Requires-Dist: spacy-language-detection ==0.2.1
 Requires-Dist: spacymoji ==3.1.0
 Requires-Dist: supervision ==0.21.0
 Requires-Dist: textacy ==0.13.0
-Requires-Dist: torch
+Requires-Dist: torch >=2.4.0
 Requires-Dist: tqdm >=4.66.2
 Requires-Dist: trafilatura ==1.7.0
 Requires-Dist: transformers ==4.38.2
 Requires-Dist: umap-learn ==0.5.5
 Requires-Dist: urlextract ==1.9.0
 Requires-Dist: wordcloud ==1.9.3
+Requires-Dist: Unidecode ==1.3.8
+Requires-Dist: kaleido ==0.2.1
+Requires-Dist: gliner ==0.2.8
 
 UNKNOWN
 
opsci_toolbox-0.0.14.dist-info/RECORD
ADDED
@@ -0,0 +1,26 @@
+opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
+opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
+opsci_toolbox/apis/telegram.py,sha256=IJYXMvXzA2R2Z7ywKJiny38pd-ryHK4jPxVG2Nj_dms,45676
+opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
+opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
+opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/helpers/common.py,sha256=dlP6TnRggZsnPksgo7LPH7IghU_t9LFz42eMEzzg99o,53323
+opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
+opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
+opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
+opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
+opsci_toolbox/helpers/nlp.py,sha256=hXnP6rUkUzyurJ5O_fNUxqT2MZK3poC21L9zy6oa22c,102551
+opsci_toolbox/helpers/nlp_cuml.py,sha256=OBCRkaHibuyvJ8LQAE2EC7_J0KPe7Kf-ayN2jyxDlKg,30709
+opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
+opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
+opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+opsci_toolbox-0.0.14.dist-info/METADATA,sha256=X2EgVw8JlZLdgnrN1nOP6aZRs1WyztbkCkN4UKkuTLE,1727
+opsci_toolbox-0.0.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.14.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+opsci_toolbox-0.0.14.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.14.dist-info/RECORD,,
opsci_toolbox-0.0.14.dist-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+https://download.pytorch.org/whl/cu124
@@ -1,22 +0,0 @@
|
|
1
|
-
opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
|
4
|
-
opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
|
5
|
-
opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
|
6
|
-
opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
|
8
|
-
opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
|
9
|
-
opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
|
10
|
-
opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
|
11
|
-
opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
|
12
|
-
opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
|
13
|
-
opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
|
14
|
-
opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
|
15
|
-
opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
|
16
|
-
opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
|
18
|
-
opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
|
19
|
-
opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
|
20
|
-
opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
21
|
-
opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
|
22
|
-
opsci_toolbox-0.0.12.dist-info/RECORD,,
|
{opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/WHEEL
File without changes
{opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/top_level.txt
File without changes