opsci-toolbox 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/reddit.py +399 -0
- opsci_toolbox/apis/telegram.py +1035 -0
- opsci_toolbox/apis/webscraping.py +75 -0
- opsci_toolbox/helpers/common.py +176 -4
- opsci_toolbox/helpers/dataviz.py +184 -26
- opsci_toolbox/helpers/dates.py +46 -0
- opsci_toolbox/helpers/gliner.py +88 -0
- opsci_toolbox/helpers/nlp.py +256 -8
- opsci_toolbox/helpers/nlp_cuml.py +3 -3
- opsci_toolbox/helpers/sna.py +1 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/METADATA +4 -1
- opsci_toolbox-0.0.13.dist-info/RECORD +25 -0
- opsci_toolbox-0.0.11.dist-info/RECORD +0 -22
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/gliner.py
ADDED

@@ -0,0 +1,88 @@
+from gliner import GLiNER
+
+
+def load_gliner_model(model_name : str, map_location="cpu") -> GLiNER:
+    """
+    Load a GLiNER named entity recognition (NER) model.
+
+    Args:
+        model_name: The model name to load.
+        map_location: The device to load the model on. Possible values are "cpu" or "cuda".
+
+    Returns:
+        The loaded GLiNER model.
+
+    """
+    model = GLiNER.from_pretrained(model_name, map_location=map_location)
+    return model
+
+def gliner_predict(model : GLiNER, text : str, labels : list, threshold : float = 0.5) -> list:
+    """
+    Predicts entities in a single text using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: The text to predict entities from.
+        labels: A list of entity labels to predict.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities.
+
+    """
+    entities = model.predict_entities(text, labels, threshold=threshold)
+    return entities
+
+def gliner_batch_predict(model : GLiNER, text : list, labels : list, threshold : float = 0.5) -> list:
+    """
+    Batch inference. Predicts entities using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: A list of texts to predict entities from.
+        labels: A list of entity labels to predict.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities per input text.
+
+    """
+    entities = model.batch_predict_entities(text, labels, threshold=threshold)
+    return entities
+
+
+def parse_predictions(predictions : list) -> tuple:
+    """
+    Parse the predictions generated by a GLiNER named entity recognition (NER) model for batch processing.
+
+    Args:
+        predictions (list): A list of prediction lists. Each prediction is a dictionary containing the following keys:
+            - "start" (int): The starting index of the predicted entity in the input text.
+            - "end" (int): The ending index of the predicted entity in the input text.
+            - "text" (str): The predicted entity text.
+            - "label" (str): The predicted entity label.
+            - "score" (float): The confidence score of the prediction.
+
+    Returns:
+        tuple: A tuple containing lists of the extracted information from the predictions. The tuple contains the following lists:
+            - starts (list): A list of lists, where each inner list contains the starting indices of the predicted entities.
+            - ends (list): A list of lists, where each inner list contains the ending indices of the predicted entities.
+            - texts (list): A list of lists, where each inner list contains the predicted entity texts.
+            - labels (list): A list of lists, where each inner list contains the predicted entity labels.
+            - scores (list): A list of lists, where each inner list contains the confidence scores of the predictions.
+    """
+    starts, ends, texts, labels, scores = [], [], [], [], []
+    for prediction in predictions:
+        start, end, text, label, score = [], [], [], [], []
+        for item in prediction:
+            start.append(item.get("start"))
+            end.append(item.get("end"))
+            text.append(item.get("text"))
+            label.append(item.get("label"))
+            score.append(item.get("score"))
+        starts.append(start)
+        ends.append(end)
+        texts.append(text)
+        labels.append(label)
+        scores.append(score)
+    return starts, ends, texts, labels, scores
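A minimal sketch of how these helpers chain together for batch NER over a DataFrame column; the checkpoint name and label set below are illustrative choices, not defaults shipped with the package:

```python
import pandas as pd
from opsci_toolbox.helpers.gliner import load_gliner_model, gliner_batch_predict, parse_predictions

texts = ["Emmanuel Macron met Olaf Scholz in Berlin.",
         "The toolbox was released on PyPI last week."]
labels = ["person", "organisation", "location"]          # illustrative label set

model = load_gliner_model("urchade/gliner_multi-v2.1")    # any GLiNER checkpoint
predictions = gliner_batch_predict(model, texts, labels, threshold=0.5)
starts, ends, ents, ent_labels, scores = parse_predictions(predictions)

# one row per input text, with list-valued entity columns
df = pd.DataFrame({"text": texts, "entities": ents, "labels": ent_labels, "scores": scores})
print(df)
```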
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -25,9 +25,10 @@ import requests
 import json
 from opsci_toolbox.helpers.common import write_json, write_pickle, load_pickle, create_dir, copy_file, write_jsonl
 from textacy.preprocessing.replace import urls
+from textacy.preprocessing.remove import brackets
 from eldar import Query
 import torch
-from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
 
 
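The new textacy import backs the bracket-stripping step added to TM_clean_text below, and AutoConfig supports the extended HF_load_model further down. For reference, textacy's brackets() drops a bracketed group together with its content (the sample string is illustrative):

```python
from textacy.preprocessing.remove import brackets

print(brackets("Breaking news [video] from the summit (live)"))
# -> "Breaking news  from the summit "
```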
@@ -97,6 +98,11 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool
     df=df.reset_index(drop=True)
     return df
 
+def remove_trailing_dots(text):
+    if text.endswith('…'):
+        return text[:-3].strip()
+    return text
+
 def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     """
     Generic cleaning process for topic modeling.
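Note that the helper tests for the single-character ellipsis '…' but then slices off three characters. A more defensive variant, shown here only as a sketch and not part of the package, would handle both spellings explicitly:

```python
def strip_trailing_ellipsis(text: str) -> str:
    """Remove a trailing ellipsis, whether written as '…' or '...'."""
    if text.endswith("…"):
        return text[:-1].strip()
    if text.endswith("..."):
        return text[:-3].strip()
    return text

assert strip_trailing_ellipsis("truncated post…") == "truncated post"
assert strip_trailing_ellipsis("truncated post...") == "truncated post"
```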
@@ -114,12 +120,19 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
         The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(remove_rt)
-    df[col_clean] = df[
+    df[col_clean] = df[col_clean].apply(remove_emoji)
+    df[col_clean] = df[col_clean].apply(remove_trailing_dots)
+    df[col_clean] = df[col_clean].apply(remove_html_tags)
+    df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
+    df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
+    df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
     return df
 
+
+
 def extract_insta_shortcode(url: str) -> str:
     """
     Extracts the shortcode from an Instagram URL.
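The cleaning pipeline now chains RT removal, emoji, trailing-ellipsis, HTML, bracket and URL stripping, mention filtering and hashtag-run removal before whitespace normalisation. A hedged usage sketch (column names and sample posts are illustrative):

```python
import pandas as pd
from opsci_toolbox.helpers.nlp import TM_clean_text

df = pd.DataFrame({"message": [
    "RT @user: great thread 🔥 https://example.com #ai #nlp #data",
    "Check this out… <b>wow</b> [link in bio]",
]})
df = TM_clean_text(df, col="message", col_clean="message_clean")
print(df["message_clean"].tolist())
```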
@@ -151,6 +164,39 @@ def remove_parentheses_content(text: str) -> str:
     result = re.sub(r'\([^)]*\)', '', text)
     return result
 
+def remove_hashtags(text: str) -> str:
+    """
+    Removes any hashtag from text.
+
+    Args:
+        text : str
+            The input text string to clean.
+
+    Returns:
+        result : str
+            The input text string with hashtags removed.
+    """
+    pattern = r'\B#\w+'
+    result = re.sub(pattern, '', text).strip()
+    return result
+
+def remove_multiple_hashtags(text: str) -> str:
+    """
+    Removes series of hashtags separated by spaces.
+
+    Args:
+        text : str
+            The input text string to clean.
+
+    Returns:
+        result : str
+            The input text string with series of hashtags removed.
+    """
+    pattern = r'(?:\B#\w+\s*){2,}'
+    result = re.sub(pattern, '', text).strip()
+    return result
+
+
 def remove_emojis(text: str) -> str:
     """
     Removes emojis and their textual representations from a text string.
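The two regexes differ only in the `{2,}` quantifier: remove_hashtags strips every hashtag, while remove_multiple_hashtags (the one wired into TM_clean_text) only strips runs of two or more consecutive hashtags and keeps isolated ones. A small illustration with a made-up string:

```python
import re

text = "Love this #sunset photo #travel #photography #nature"

print(re.sub(r'\B#\w+', '', text).strip())
# -> 'Love this  photo'            (every hashtag removed)

print(re.sub(r'(?:\B#\w+\s*){2,}', '', text).strip())
# -> 'Love this #sunset photo'     (only the trailing run of hashtags removed)
```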
@@ -171,6 +217,31 @@ def remove_emojis(text: str) -> str:
 
     return text_no_emojis
 
+def remove_emoji(string):
+    emoji_pattern = re.compile(
+        "["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002500-\U00002BEF"  # chinese char
+        u"\U00002702-\U000027B0"
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        u"\U0001f926-\U0001f937"
+        u"\U00010000-\U0010ffff"
+        u"\u2640-\u2642"
+        u"\u2600-\u2B55"
+        u"\u200d"
+        u"\u23cf"
+        u"\u23e9"
+        u"\u231a"
+        u"\ufe0f"  # dingbats
+        u"\u3030"
+        "]+", flags=re.UNICODE)
+    return emoji_pattern.sub(r'', string)
+
+
 def extract_numbers(text: str) -> list:
     """
     Extracts all numeric values from a given text string and returns them as a list of floats.
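Unlike remove_emojis above, which also handles textual emoji representations, this variant works purely on Unicode code-point ranges. A quick check with an illustrative string:

```python
from opsci_toolbox.helpers.nlp import remove_emoji

print(remove_emoji("Great launch 🚀🔥, see you tomorrow ☀️"))
# -> 'Great launch , see you tomorrow '
```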
@@ -421,6 +492,23 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
     df.to_csv(file_path, encoding="utf-8", index=False)
     print("File saved -", file_path)
     return df
+
+def keep_valid_filename_chars(text: str, replace: str = '') -> str:
+    """
+    Replace all characters not typically allowed in filenames with a specified replacement string.
+
+    Args:
+        text : str
+            The input text string.
+        replace : str, optional
+            The string to replace invalid filename characters with. Default is an empty string.
+
+    Returns:
+        cleaned_text : str
+            The input text string with invalid filename characters replaced.
+    """
+    return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)
+
 
 
 def keep_alphanum_char(text: str, replace: str = '') -> str:
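Note that the character class also includes the dot, so file extensions are rewritten as well. A quick illustration (the sample name is made up):

```python
from opsci_toolbox.helpers.nlp import keep_valid_filename_chars

print(keep_valid_filename_chars('report: "Q1/Q2" results?.parquet', replace="_"))
# -> 'report_ _Q1_Q2_ results__parquet'
```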
@@ -788,7 +876,95 @@ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat:
     )
     return df_count
 
-def
+def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_chunk_id: str, col_engagement: str, col_user_id: str=None, metrics : dict =dict())-> pd.DataFrame:
+    """
+    Calculate the intermediate aggregation of chunks per post ID and topic.
+
+    Args:
+        df : pandas DataFrame
+            DataFrame containing processed data.
+        col_id : str
+            Name of the column containing unique post identifiers.
+        col_topic : str
+            Name of the column containing topic labels.
+        col_chunk_id : str
+            Name of the column containing unique sentence identifiers.
+        col_engagement : str
+            Name of the column containing engagement metrics.
+        col_user_id : str
+            Name of the column containing user identifiers.
+        metrics : dict
+            Dictionary containing additional metrics to aggregate.
+
+    Returns:
+        DataFrame
+            DataFrame containing the aggregated posts per topic.
+
+    Description:
+        This function aggregates various metrics for each post and topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+    """
+    metrics_dict = dict()
+    # metrics_dict[col_id]=(col_id,'first')
+    metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
+    metrics_dict[col_engagement]=(col_engagement,'first')
+
+    if col_user_id:
+        metrics_dict[col_user_id]=(col_user_id,"first")
+    if "sentiment" in df.columns:
+        metrics_dict["sentiment"] = ("sentiment", "mean")
+    if "sentiment_score" in df.columns:
+        metrics_dict["sentiment_score"] = ("sentiment_score", "mean")
+
+    metrics_dict["tokens_count"] = ("tokens_count", "sum")
+    metrics_dict["lemmas_count"] = ("lemmas_count", "sum")
+    metrics_dict["emojis_count"] = ("emojis_count", "sum")
+    metrics_dict["unique_emojis"] = ("unique_emojis", lambda x: set(emoji for sublist in x for emoji in sublist))
+    metrics_dict["unique_emojis_count"] = ("unique_emojis", len)
+    metrics_dict["hashtags"] = ("hashtags", lambda x: list(hashtag for sublist in x for hashtag in sublist))
+    metrics_dict["hashtags_count"] = ("hashtags_count", "sum")
+    metrics_dict["mentions"] = ("mentions", lambda x: list(mention for sublist in x for mention in sublist))
+    metrics_dict["mentions_count"] = ("mentions_count", "sum")
+    metrics_dict["extracted_urls_from_text"] = ("extracted_urls_from_text", lambda x: list(url for sublist in x for url in sublist))
+    metrics_dict["domain"] = ("domain", lambda x: list(domain for sublist in x for domain in sublist))
+    metrics_dict["len_numbers"] = ("len_numbers", "sum")
+    metrics_dict["interrogation"] = ("interrogation", "sum")
+    metrics_dict["exclamation"] = ("exclamation", "sum")
+    metrics_dict["x"] = ("x", "mean")
+    metrics_dict["y"] = ("y", "mean")
+
+    metrics_dict.update(metrics)
+
+    df_gb = df.groupby([col_id, col_topic]).agg(**metrics_dict).reset_index()
+    df_gb[col_topic]=df_gb[col_topic].astype(str)
+
+    return df_gb
+
+def sentiment_to_category(sentiment : float, boundaries : list = [-1.0, -0.5, 0.5, 1.0], labels :list = ['negative', 'neutral', 'positive']) -> str:
+    """
+    Assign a sentiment category to a sentiment score.
+
+    Args:
+        sentiment : float
+            Sentiment score.
+        boundaries : list
+            List of boundaries for each category.
+        labels : list
+            List of labels for each category.
+
+    Returns:
+        str
+            Category label.
+
+    Description:
+        This function assigns a sentiment category to a sentiment score based on a list of boundaries and labels. If the sentiment score is outside the boundaries, it is assigned to the last category.
+    """
+    for i in range(len(boundaries) - 1):
+        if boundaries[i] <= sentiment < boundaries[i + 1]:
+            return labels[i]
+    return labels[-1]
+
+
+def topic_representation(df: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
     """
     Calculate the representation of topics in a processed DataFrame.
 
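topic_aggregate_chunks rolls sentence-level chunks back up to one row per post and topic before topic_representation aggregates per topic; sentiment_to_category then buckets a score against explicit boundaries, falling back to the last label outside the grid. A minimal check of the default boundaries:

```python
from opsci_toolbox.helpers.nlp import sentiment_to_category

print([sentiment_to_category(s) for s in (-0.8, 0.1, 0.7)])
# -> ['negative', 'neutral', 'positive']
```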
@@ -822,11 +998,15 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
     metrics_dict['engagements']=(col_engagement,'sum')
     if col_user_id:
         metrics_dict["users"]=(col_user_id,"nunique")
+    panel_cols = [col for col in df.columns if col[:6] == 'panel_']
+    if len(panel_cols)>0:
+        for panel_col in panel_cols:
+            metrics_dict[panel_col+'_verbatims'] = (panel_col, "sum")
+            metrics_dict[panel_col+'_users'] = (col_user_id, lambda x : x[df[panel_col]].nunique())
+            metrics_dict[panel_col+'_engagements'] = (col_engagement, lambda x : x[df[panel_col]].sum())
 
     metrics_dict.update(metrics)
 
-    print(metrics_dict)
-
     metrics_dict['avg_word_count']=("tokens_count", lambda x: round(x.mean(),2))
     metrics_dict['verbatims_with_emoji']=("emojis_count", lambda x: (x > 0).sum() )
     metrics_dict['emojis_occurences']=("emojis_count", "sum")
@@ -843,9 +1023,8 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
     metrics_dict['topic_x']=("x", "mean")
     metrics_dict['topic_y']=("y", "mean")
 
-
     # on produit la représentation des topics finale
-    df_distrib_all = (
+    df_distrib_all = (df.groupby(col_topic)
                       .agg(**metrics_dict)
                       .sort_values(by="verbatims", ascending=False)
                       .assign(engagement_per_verbatims = lambda x : x["engagements"] / x["verbatims"])
@@ -1042,6 +1221,73 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
 def get_lang_detector(nlp, name):
     return LanguageDetector(seed=42)  # We use the seed 42
 
+def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "lemmatized_text", pos_to_keep: list = ["VERB","NOUN","ADJ", "ADV", "PROPN"], entities_to_keep: list = ['PERSON','ORG', 'LOC'], stopwords: list = [], batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
+    """
+    Perform natural language processing tasks using spaCy for the PR Army project.
+    Its main tasks are lemmatization and named entity recognition (NER).
+
+    Args:
+        nlp : spacy.Language
+            The spaCy language model.
+        df : pandas.DataFrame
+            The DataFrame containing the text data.
+        col_text : str
+            The name of the column containing the text data.
+        col_lemma : str
+            The name of the column to store the lemmatized text data.
+        pos_to_keep : list
+            A list of part-of-speech tags to keep during lemmatization.
+        entities_to_keep : list
+            A list of NER tags to keep.
+        stopwords : list
+            A list of stopwords to remove during processing.
+        batch_size : int, optional
+            The batch size for spaCy processing. Default is 100.
+        n_process : int, optional
+            The number of processes for parallel processing. Default is 1.
+
+    Returns:
+        pandas.DataFrame
+            The DataFrame with processed text data.
+
+    """
+    all_records = []
+    text=list(df[col_text].astype('unicode').values)
+
+    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "NLP Process"):
+        NER_type = []
+        NER_text = []
+
+        ### LEMMATIZATION
+
+        if len(pos_to_keep)>0 and len(stopwords)>0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep)>0 and len(stopwords) < 1:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep) < 1 and len(stopwords) > 0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords]
+        else :
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space)]
+
+        ### NER
+        if len(entities_to_keep)>0:
+            for ent in doc.ents:
+                if ent.label_ in entities_to_keep:
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+
+        else:
+            for ent in doc.ents:
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+
+        record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
+        all_records.append(record)
+
+
+    df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
+
+    return df
+
 def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
     Perform natural language processing tasks using spaCy for topic modeling.
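A hedged usage sketch, assuming a spaCy pipeline with a NER component is installed; the model name, column name and entity label set below are illustrative (label names depend on the pipeline, e.g. PER/ORG/LOC for the French models):

```python
import spacy
import pandas as pd
from opsci_toolbox.helpers.nlp import PRarmy_nlp_process

nlp = spacy.load("fr_core_news_lg")                     # any pipeline with NER
df = pd.DataFrame({"text": ["Emmanuel Macron a rencontré Olaf Scholz à Berlin."]})

df = PRarmy_nlp_process(
    nlp, df, col_text="text",
    pos_to_keep=["VERB", "NOUN", "ADJ", "PROPN"],
    entities_to_keep=["PER", "ORG", "LOC"],
)
print(df[["NER_type", "NER_text", "lemmatized_text"]])
```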
@@ -2063,13 +2309,15 @@ def check_gpu():
 def HF_load_model(model_checkpoint):
     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
     model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+    config = AutoConfig.from_pretrained(model_checkpoint)
     if torch.cuda.is_available():
         model.cuda()
-    return model, tokenizer
+    return model, tokenizer, config
 
 def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
     """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
     file_path= os.path.join(dir_json , str(filename)+'.json')
+    results = {}
     if not os.path.exists(file_path):
         with torch.no_grad():
             inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
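Because HF_load_model now also returns the model config, call sites that previously unpacked two values need a third target; the config gives direct access to the label mapping. A sketch (the checkpoint name is illustrative):

```python
from opsci_toolbox.helpers.nlp import HF_load_model

model, tokenizer, config = HF_load_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
print(config.id2label)   # label-id to name mapping, available without reloading the checkpoint
```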
opsci_toolbox/helpers/nlp_cuml.py
CHANGED

@@ -384,7 +384,6 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
     # Initialize cuML's CountVectorizer
     count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
 
-    print(type(gdf[col_text]))
     # Fit and transform the text data
     X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
     X_names_count = count_vectorizer.get_feature_names()

@@ -402,7 +401,8 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
         df_count_tmp = df_count_tmp.head(n_words)
         if min_freq:
             df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
-
+
+        df_count_tmp['word'] = df_count_tmp['word'].astype(str)
         # Concatenate the result to the main DataFrame
         df_count = cudf.concat([df_count, df_count_tmp])
 

@@ -588,7 +588,7 @@ def cudf_encode_chunked_files(chunk_files_paths: list,
         current_df = cudf_read_parquet(file)
 
         text_list = current_df[col_text].to_arrow().to_pylist()
-
+
         # text vectorization
         embeddings = HF_encoder.embed_documents(text_list)
 
opsci_toolbox/helpers/sna.py
CHANGED
@@ -421,6 +421,7 @@ def select_top_nodes_by_degrees(G: nx.Graph, degree_type : str = "degree", N : int
     return subgraph
 
 
+
 def scale_size(G, size_attribute, min_node_size = 10, max_node_size = 100):
     """
     Scale the sizes of nodes in a graph based on a specified attribute.
{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.11
+Version: 0.0.13
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard

@@ -48,6 +48,9 @@ Requires-Dist: transformers ==4.38.2
 Requires-Dist: umap-learn ==0.5.5
 Requires-Dist: urlextract ==1.9.0
 Requires-Dist: wordcloud ==1.9.3
+Requires-Dist: Unidecode ==1.3.8
+Requires-Dist: kaleido ==0.2.1
+Requires-Dist: gliner ==0.2.8
 
 UNKNOWN
 
opsci_toolbox-0.0.13.dist-info/RECORD
ADDED

@@ -0,0 +1,25 @@
+opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
+opsci_toolbox/apis/reddit.py,sha256=zhK2CY9CkCezNcekQFdv1So3NmHHYxB7-tgMVErHOGI,15763
+opsci_toolbox/apis/telegram.py,sha256=GKDLpZg1fc9D_PGCgi9pfTaW7Jjm_2luQ-2trXTr38A,42208
+opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
+opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
+opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/helpers/common.py,sha256=ZGjWIPEpqr-gIYjkfsS97PmCtQWHa_iF8tBbVxrQsOQ,53321
+opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
+opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
+opsci_toolbox/helpers/dates.py,sha256=CxbXSo61GPZ2L37PV0ujvp78vwl0DoBq7t0nkk9qHp8,4751
+opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
+opsci_toolbox/helpers/nlp.py,sha256=I72F32ieofZaCIkjZ9kqpiJLktfRoM7mMhzzxyXDQ3I,99316
+opsci_toolbox/helpers/nlp_cuml.py,sha256=CGyThKNgo6fdFPV-iooPG0oNrzA__Hvv08t_sdEp3BE,28919
+opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
+opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
+opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+opsci_toolbox-0.0.13.dist-info/METADATA,sha256=G_JhKg5tmYPkRUhAN2Uj9B6orX7x3TKWqIOKU_TjeIA,1727
+opsci_toolbox-0.0.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.13.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.13.dist-info/RECORD,,
opsci_toolbox-0.0.11.dist-info/RECORD
DELETED

@@ -1,22 +0,0 @@
-opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
-opsci_toolbox/apis/webscraping.py,sha256=Gz3hOfhOHUpwHU1Pzj3mB2WdBAcKa2WisYBHMi3lcVE,18343
-opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
-opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
-opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
-opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
-opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
-opsci_toolbox/helpers/nlp.py,sha256=baq4BsSgeLBgToPOU5RTmDA80dFJwH9xf0jppuAVseU,88947
-opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
-opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
-opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
-opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
-opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
-opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.11.dist-info/METADATA,sha256=5h-cfwhi31VKlzrOfdAeZuoKTLB1iyDIA4qqsz-bZGQ,1633
-opsci_toolbox-0.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-opsci_toolbox-0.0.11.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
-opsci_toolbox-0.0.11.dist-info/RECORD,,
{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/WHEEL
File without changes

{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/top_level.txt
File without changes