opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ import os
  from sklearn.decomposition import TruncatedSVD
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
  from sklearn.manifold import TSNE
- from sklearn.preprocessing import StandardScaler, MinMaxScaler
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
  from sklearn.cluster import AgglomerativeClustering
  from sentence_transformers import SentenceTransformer
  from tqdm import tqdm
@@ -25,9 +25,10 @@ import requests
  import json
  from opsci_toolbox.helpers.common import write_json, write_pickle, load_pickle, create_dir, copy_file, write_jsonl
  from textacy.preprocessing.replace import urls
+ from textacy.preprocessing.remove import brackets
  from eldar import Query
  import torch
- from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+ from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
  from bs4 import BeautifulSoup
@@ -93,10 +94,15 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bo
          The filtered DataFrame.
      """
      eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
-     df[col_text] = df[df[col_text].apply(eldar_query)]
+     df = df[df[col_text].apply(eldar_query)]
      df=df.reset_index(drop=True)
      return df

+ def remove_trailing_dots(text):
+     if text.endswith('…'):
+         return text[:-3].strip()
+     return text
+
  def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
      """
      Generic cleaning process for topic modeling.
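The `filter_by_query` fix above stops overwriting the text column with the query mask and instead uses the mask to drop non-matching rows. A minimal sketch of the corrected pattern, with a toy DataFrame and an illustrative eldar query:

    import pandas as pd
    from eldar import Query

    df = pd.DataFrame({"text": ["new NLP model released", "weather update", "another model benchmark"]})
    eldar_query = Query('("model") AND NOT ("weather")', ignore_case=True, ignore_accent=True, match_word=True)

    mask = df["text"].apply(eldar_query)    # boolean Series, True where the query matches
    df = df[mask].reset_index(drop=True)    # keep matching rows, as filter_by_query now does
    print(df["text"].tolist())              # ['new NLP model released', 'another model benchmark']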
@@ -114,12 +120,19 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
          The DataFrame with cleaned text data.
      """
      df[col_clean] = df[col].apply(remove_rt)
+     df[col_clean] = df[col_clean].apply(remove_emoji)
+     df[col_clean] = df[col_clean].apply(remove_trailing_dots)
+     df[col_clean] = df[col_clean].apply(remove_html_tags)
+     df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
      df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
      df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
+     df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
      df[col_clean] = df[col_clean].apply(remove_extra_spaces)
      # df = df.loc[(df[col_clean] != ""), :]
      return df

+
+
  def extract_insta_shortcode(url: str) -> str:
      """
      Extracts the shortcode from an Instagram URL.
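`TM_clean_text` now chains several of the new helpers (emoji stripping, trailing-ellipsis removal, HTML tag removal, bracket removal via textacy, and hashtag-run removal) before the existing URL, mention and whitespace cleanup. A hedged usage sketch; the import path `opsci_toolbox.helpers.nlp` is inferred from the RECORD at the end of this diff and the toy data is illustrative:

    import pandas as pd
    from opsci_toolbox.helpers.nlp import TM_clean_text  # assumed module path

    df = pd.DataFrame({"text": [
        "RT @user: big release!! 🎉 https://example.com #nlp #opensource #python",
        "Interesting thread (context) with <b>markup</b> …",
    ]})
    df = TM_clean_text(df, col="text", col_clean="text_clean")
    print(df["text_clean"].tolist())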
@@ -151,6 +164,39 @@ def remove_parentheses_content(text: str) -> str:
      result = re.sub(r'\([^)]*\)', '', text)
      return result

+ def remove_hashtags(text: str) -> str:
+     """
+     Removes any hashtag from text.
+
+     Args:
+         text : str
+             The input text string to clean.
+
+     Returns:
+         result : str
+             The input text string with hashtags removed.
+     """
+     pattern = r'\B#\w+'
+     result = re.sub(pattern, '', text).strip()
+     return result
+
+ def remove_multiple_hashtags(text: str) -> str:
+     """
+     Removes series of hashtags separated by spaces.
+
+     Args:
+         text : str
+             The input text string to clean.
+
+     Returns:
+         result : str
+             The input text string with series of hashtags removed.
+     """
+     pattern = r'(?:\B#\w+\s*){2,}'
+     result = re.sub(pattern, '', text).strip()
+     return result
+
+
  def remove_emojis(text: str) -> str:
      """
      Removes emojis and their textual representations from a text string.
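The two helpers differ in scope: `remove_hashtags` drops every hashtag, while `remove_multiple_hashtags` only drops runs of two or more consecutive hashtags (the variant used by `TM_clean_text`, so a single inline tag survives cleaning). Re-declared locally here for a self-contained check; the bodies mirror the diff above:

    import re

    def remove_hashtags(text: str) -> str:
        return re.sub(r'\B#\w+', '', text).strip()

    def remove_multiple_hashtags(text: str) -> str:
        return re.sub(r'(?:\B#\w+\s*){2,}', '', text).strip()

    print(remove_hashtags("I love #python"))                          # 'I love'
    print(remove_multiple_hashtags("I love #python"))                 # 'I love #python'
    print(remove_multiple_hashtags("great match #win #team #sport"))  # 'great match'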
@@ -171,6 +217,31 @@ def remove_emojis(text: str) -> str:
      return text_no_emojis

+ def remove_emoji(string):
+     emoji_pattern = re.compile(
+         "["
+         u"\U0001F600-\U0001F64F"  # emoticons
+         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+         u"\U0001F680-\U0001F6FF"  # transport & map symbols
+         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+         u"\U00002500-\U00002BEF"  # chinese char
+         u"\U00002702-\U000027B0"
+         u"\U00002702-\U000027B0"
+         u"\U000024C2-\U0001F251"
+         u"\U0001f926-\U0001f937"
+         u"\U00010000-\U0010ffff"
+         u"\u2640-\u2642"
+         u"\u2600-\u2B55"
+         u"\u200d"
+         u"\u23cf"
+         u"\u23e9"
+         u"\u231a"
+         u"\ufe0f"  # dingbats
+         u"\u3030"
+         "]+", flags=re.UNICODE)
+     return emoji_pattern.sub(r'', string)
+
+
  def extract_numbers(text: str) -> list:
      """
      Extracts all numeric values from a given text string and returns them as a list of floats.
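`remove_emoji` complements the existing `remove_emojis` helper with a single compiled character-class pattern. A quick check, assuming the function is importable from `opsci_toolbox.helpers.nlp` (path inferred, not shown in this hunk):

    from opsci_toolbox.helpers.nlp import remove_emoji  # assumed module path

    print(remove_emoji("Launch day 🚀🔥 let's go ✨"))
    # -> 'Launch day  let's go '  (emoji are removed; surrounding spaces are left in place)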
@@ -421,6 +492,23 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
      df.to_csv(file_path, encoding="utf-8", index=False)
      print("File saved -", file_path)
      return df
+
+ def keep_valid_filename_chars(text: str, replace: str = '') -> str:
+     """
+     Replace all characters not typically allowed in filenames with a specified replacement string.
+
+     Args:
+         text : str
+             The input text string.
+         replace : str, optional
+             The string to replace invalid filename characters with. Default is an empty string.
+
+     Returns:
+         cleaned_text : str
+             The input text string with invalid filename characters replaced.
+     """
+     return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)
+

  def keep_alphanum_char(text: str, replace: str = '') -> str:
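`keep_valid_filename_chars` strips characters that are problematic in filenames (including `.`, so extension dots are removed as well). Re-declared locally for a self-contained check; the substitution mirrors the diff above:

    import re

    def keep_valid_filename_chars(text: str, replace: str = '') -> str:
        return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)

    print(keep_valid_filename_chars('report: Q1/Q2 "draft".json'))  # 'report Q1Q2 draftjson'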
@@ -453,7 +541,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
      text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
      return text

- def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> dict:
      """
      Translate text using LibreTranslate service.
@@ -470,6 +558,8 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
              The directory to save the translation result JSON file.
          url : str, optional
              The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+         format_payload : str, optional
+             Possible values are html or text.

      Returns:
          json_data : dict
@@ -480,7 +570,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
          "q": text,
          "source": source,
          "target": target,
-         "format": "text",
+         "format": format_payload,
          "api_key": ""
          }
@@ -492,7 +582,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
      write_json(json_data, dir_json , str(filename))
      return json_data

- def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+ def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> list:
      """
      Translate a batch of texts using LibreTranslate service.
@@ -509,6 +599,8 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
              The directory to save the translation result JSONL file.
          url : str, optional
              The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+         format_payload : str, optional
+             Possible values are html or text.

      Returns:
          json_results : list of dict
@@ -519,7 +611,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
          "q": batch_text,
          "source": source,
          "target": target,
-         "format": "text",
+         "format": format_payload,
          "api_key": ""
          }
@@ -535,7 +627,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
      write_jsonl(json_results, dir_json , str(filename))
      return json_results

- def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> str:
      """
      Translate text using LibreTranslate service.
@@ -548,6 +640,8 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
              The target language code.
          url : str, optional
              The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+         format_payload : str, optional
+             Possible values are html or text.

      Returns:
          translatedText : str
@@ -558,7 +652,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
          "q": text,
          "source": source,
          "target": target,
-         "format": "text",
+         "format": format_payload,
          "api_key": ""
          }
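All three LibreTranslate wrappers (`translate_wt_libre`, `translate_batch`, `translate`) gain a `format_payload` argument and now default to "html" instead of the previously hard-coded "text", so markup in the input can be carried through translation. A hedged sketch of the payload these helpers build; the POST call itself is not shown in these hunks, and a locally running LibreTranslate instance is assumed:

    import requests

    def libre_translate(text, source, target, format_payload="html",
                        url="http://127.0.0.1:5000/translate"):
        payload = {
            "q": text,
            "source": source,
            "target": target,
            "format": format_payload,   # "html" keeps tags intact, "text" treats the input as plain text
            "api_key": "",
        }
        return requests.post(url, json=payload, timeout=30).json().get("translatedText")

    # libre_translate("<p>Bonjour le monde</p>", "fr", "en", format_payload="html")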
@@ -788,7 +882,95 @@ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat:
      )
      return df_count

- def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
+ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_chunk_id: str, col_engagement: str, col_user_id: str=None, metrics : dict =dict())-> pd.DataFrame:
+     """
+     Calculate the intermediate agregation of chunks per Post ID and topic
+
+     Args:
+         df : pandas DataFrame
+             DataFrame containing processed data.
+         col_id : str
+             Name of the column containing unique posts identifiers.
+         col_topic : str
+             Name of the column containing topic labels.
+         col_chunk_id : str
+             Name of the column containing unique sentences identifiers.
+         col_engagement : str
+             Name of the column containing engagement metrics.
+         col_user_id : str
+             Name of the column containing user identifiers.
+         metrics : dict
+             Dictionary containing additional metrics to aggregate.
+
+     Returns:
+         DataFrame
+             DataFrame containing the agregated posts per topic
+
+     Description:
+         This function aggregates various metrics for each post and topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+     """
+     metrics_dict = dict()
+     # metrics_dict[col_id]=(col_id,'first')
+     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
+     metrics_dict[col_engagement]=(col_engagement,'first')
+
+     if col_user_id:
+         metrics_dict[col_user_id]=(col_user_id,"first")
+     if "sentiment" in df.columns:
+         metrics_dict["sentiment"] = ("sentiment", "mean")
+     if "sentiment_score" in df.columns:
+         metrics_dict["sentiment_score"] = ("sentiment_score", "mean")
+
+     metrics_dict["tokens_count"] = ("tokens_count", "sum")
+     metrics_dict["lemmas_count"] = ("lemmas_count", "sum")
+     metrics_dict["emojis_count"] = ("emojis_count", "sum")
+     metrics_dict["unique_emojis"] = ("unique_emojis", lambda x: set(emoji for sublist in x for emoji in sublist))
+     metrics_dict["unique_emojis_count"] = ("unique_emojis", len)
+     metrics_dict["hashtags"] = ("hashtags", lambda x: list(hashtag for sublist in x for hashtag in sublist))
+     metrics_dict["hashtags_count"] = ("hashtags_count", "sum")
+     metrics_dict["mentions"] = ("mentions", lambda x: list(mention for sublist in x for mention in sublist))
+     metrics_dict["mentions_count"] = ("mentions_count", "sum")
+     metrics_dict["extracted_urls_from_text"] = ("extracted_urls_from_text", lambda x: list(url for sublist in x for url in sublist))
+     metrics_dict["domain"] = ("domain", lambda x: list(domain for sublist in x for domain in sublist))
+     metrics_dict["len_numbers"] = ("len_numbers", "sum")
+     metrics_dict["interrogation"] = ("interrogation", "sum")
+     metrics_dict["exclamation"] = ("exclamation", "sum")
+     metrics_dict["x"] = ("x", "mean")
+     metrics_dict["y"] = ("y", "mean")
+
+     metrics_dict.update(metrics)
+
+     df_gb = df.groupby([col_id, col_topic]).agg(**metrics_dict).reset_index()
+     df_gb[col_topic]=df_gb[col_topic].astype(str)
+
+     return df_gb
+
+ def sentiment_to_category(sentiment : float, boundaries : list = [-1.0, -0.5, 0.5, 1.0], labels :list = ['negative', 'neutral', 'positive']) -> str:
+     """
+     Assign a sentiment category to a sentiment score.
+
+     Args:
+         sentiment : float
+             sentiment score
+         boundaries : list
+             list of boundaries for each category
+         labels : list
+             list of labels for each category
+
+     Returns:
+         str
+             category label
+
+     Description:
+         This function assigns a sentiment category to a sentiment score based on a list of boundaries and labels. If the sentiment score is outside the boundaries, it is assigned to the last category.
+     """
+     for i in range(len(boundaries) - 1):
+         if boundaries[i] <= sentiment < boundaries[i + 1]:
+             return labels[i]
+     return labels[-1]
+
+
+ def topic_representation(df: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
      """
      Calculate the representation of topics in a processed DataFrame.
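`topic_aggregate_chunks` rolls sentence-level chunks back up to one row per (post, topic) pair before `topic_representation` computes topic-level statistics, and `sentiment_to_category` buckets a numeric score into labels using half-open intervals, with anything outside the boundaries falling back to the last label. Re-declared locally for a self-contained check of the bucketing; defaults mirror the diff above:

    def sentiment_to_category(sentiment, boundaries=[-1.0, -0.5, 0.5, 1.0],
                              labels=['negative', 'neutral', 'positive']):
        for i in range(len(boundaries) - 1):
            if boundaries[i] <= sentiment < boundaries[i + 1]:
                return labels[i]
        return labels[-1]

    print(sentiment_to_category(-0.8))  # 'negative'  (-1.0 <= s < -0.5)
    print(sentiment_to_category(0.1))   # 'neutral'   (-0.5 <= s < 0.5)
    print(sentiment_to_category(0.9))   # 'positive'  (0.5 <= s < 1.0)
    print(sentiment_to_category(1.0))   # 'positive'  (outside the intervals, last label)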
@@ -822,11 +1004,15 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
      metrics_dict['engagements']=(col_engagement,'sum')
      if col_user_id:
          metrics_dict["users"]=(col_user_id,"nunique")
+     panel_cols = [col for col in df.columns if col[:6] == 'panel_']
+     if len(panel_cols)>0:
+         for panel_col in panel_cols:
+             metrics_dict[panel_col+'_verbatims'] = (panel_col, "sum")
+             metrics_dict[panel_col+'_users'] = (col_user_id, lambda x : x[df[panel_col]].nunique())
+             metrics_dict[panel_col+'_engagements'] = (col_engagement, lambda x : x[df[panel_col]].sum())

      metrics_dict.update(metrics)

-     print(metrics_dict)
-
      metrics_dict['avg_word_count']=("tokens_count", lambda x: round(x.mean(),2))
      metrics_dict['verbatims_with_emoji']=("emojis_count", lambda x: (x > 0).sum() )
      metrics_dict['emojis_occurences']=("emojis_count", "sum")
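`topic_representation` now also reports per-panel breakdowns: for any boolean column whose name starts with `panel_`, it adds per-topic counts of verbatims, unique users and summed engagement restricted to the rows where the flag is True. A sketch of the expected input shape; column names are illustrative and only a subset of what the full function expects:

    import pandas as pd

    df = pd.DataFrame({
        "topic":        ["0", "0", "1"],
        "user_id":      ["u1", "u2", "u1"],
        "engagements":  [10, 5, 3],
        "tokens_count": [12, 8, 20],
        # boolean flags marking whether the author belongs to a given panel
        "panel_journalists": [True, False, True],
        "panel_politicians": [False, False, True],
    })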
@@ -843,9 +1029,8 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
      metrics_dict['topic_x']=("x", "mean")
      metrics_dict['topic_y']=("y", "mean")

-
      # on produit la représentation des topics finale
-     df_distrib_all = (df_processed_data.groupby(col_topic)
+     df_distrib_all = (df.groupby(col_topic)
                          .agg(**metrics_dict)
                          .sort_values(by="verbatims", ascending=False)
                          .assign(engagement_per_verbatims = lambda x : x["engagements"] / x["verbatims"])
@@ -1101,10 +1286,12 @@ def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "l
              NER_type.append(ent.label_)
              NER_text.append(ent.text)

+
          record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
          all_records.append(record)


+
      df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)

      return df
@@ -1819,6 +2006,75 @@ def encode_chunked_files(chunk_files_paths: list,
      return new_file_paths

+ ####################################################################
+ # ENCODING FEATURES
+ ####################################################################
+
+ def encode_labels(data_to_encode: np.ndarray) -> tuple:
+     """
+     Encodes a list of labels using a LabelEncoder.
+
+     Args:
+         - data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type,
+           but strings or integers are typical.
+
+     Returns:
+         - Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array
+           of encoded labels.
+     """
+     label_encoder = LabelEncoder()
+     label_encoder.fit(data_to_encode)
+     encoded_labels = label_encoder.transform(data_to_encode)
+     return label_encoder, encoded_labels
+
+
+ def encode_new_labels(label_encoder : LabelEncoder, data_to_encode : np.ndarray) -> np.ndarray:
+     """
+     Encodes a list of new labels using an already fitted LabelEncoder.
+
+     Args:
+         - label_encoder (LabelEncoder): A pre-fitted LabelEncoder instance.
+         - data_to_encode (List[Union[str, int]]): The list of new labels to encode using the pre-fitted encoder.
+
+     Returns:
+         - np.ndarray: A numpy array of encoded labels.
+     """
+     encoded_labels = label_encoder.transform(data_to_encode)
+     return encoded_labels
+
+ def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
+     """
+     One-hot encodes a list of categorical values using OneHotEncoder.
+
+     Args:
+         - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
+           any hashable type, typically strings or integers.
+
+     Returns:
+         - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
+           of one-hot encoded values.
+     """
+     one_hot_encoder = OneHotEncoder(sparse=False)
+     data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1)  # Reshape for OneHotEncoder
+     one_hot_encoder.fit(data_to_encode_reshaped)
+     encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+     return one_hot_encoder, encoded_array
+
+
+ def one_hot_encode_new_data(one_hot_encoder: OneHotEncoder, data_to_encode: np.ndarray) -> np.ndarray:
+     """
+     One-hot encodes a list of new categorical values using an already fitted OneHotEncoder.
+
+     Args:
+         - one_hot_encoder (OneHotEncoder): A pre-fitted OneHotEncoder instance.
+         - data_to_encode (List[Union[str, int]]): The list of new categorical values to encode using the pre-fitted encoder.
+
+     Returns:
+         - np.ndarray: A numpy array of one-hot encoded values.
+     """
+     data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1)  # Reshape for OneHotEncoder
+     encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+     return encoded_array

  ####################################################################
  # SCALING FEATURES
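The new ENCODING FEATURES block wraps scikit-learn's `LabelEncoder` and `OneHotEncoder` into fit/transform pairs, so an encoder fitted on training labels can be reused on new data. Note that `OneHotEncoder(sparse=False)` targets scikit-learn versions where the `sparse=` argument is still accepted (it was renamed `sparse_output` in 1.2 and dropped in 1.4). A self-contained check of the label-encoding pattern:

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    train_labels = np.array(["positive", "negative", "neutral", "positive"])
    encoder = LabelEncoder().fit(train_labels)          # what encode_labels does internally
    print(encoder.transform(train_labels))              # [2 0 1 2] (classes sorted alphabetically)
    print(encoder.transform(np.array(["neutral"])))     # [1], same mapping reused, as in encode_new_labels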
@@ -2130,13 +2386,15 @@ def check_gpu():
  def HF_load_model(model_checkpoint):
      tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
      model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+     config = AutoConfig.from_pretrained(model_checkpoint)
      if torch.cuda.is_available():
          model.cuda()
-     return model, tokenizer
+     return model, tokenizer, config
 
  def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
      """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
      file_path= os.path.join(dir_json , str(filename)+'.json')
+     results = {}
      if not os.path.exists(file_path):
          with torch.no_grad():
              inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
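`HF_load_model` now returns the model config alongside the model and tokenizer, which makes label mappings available without hard-coding them. A hedged usage sketch; the import path and the checkpoint name are illustrative:

    from opsci_toolbox.helpers.nlp import HF_load_model  # assumed module path

    model, tokenizer, config = HF_load_model("cardiffnlp/twitter-roberta-base-sentiment-latest")
    print(config.id2label)   # e.g. {0: 'negative', 1: 'neutral', 2: 'positive'}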
@@ -46,6 +46,44 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
      reduced_embeddings = reducer.transform(embeddings)
      return reducer, reduced_embeddings

+
+ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
+                                      n_neighbors: int = 5,
+                                      n_components: int = 3,
+                                      min_dist: float = 0.0,
+                                      metric: str = "cosine",
+                                      spread: float = 1.0,
+                                      learning_rate: float = 1.0,
+                                      n_epochs:int = 300,
+                                      y: np.ndarray = None,
+                                      convert_dtype: bool = False
+                                      ) -> tuple:
+     """
+     Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+     Args:
+         embeddings (np.ndarray): The input embeddings to be reduced.
+         n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+         n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+         min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+         metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+         spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+     Returns:
+         reducer (UMAP): The UMAP reducer object.
+         reduced_embeddings (np.ndarray): The reduced embeddings.
+     """
+     reducer = UMAP(n_neighbors=n_neighbors,
+                    n_components=n_components,
+                    min_dist=min_dist,
+                    metric=metric,
+                    spread = spread,
+                    n_epochs=n_epochs,
+                    learning_rate=learning_rate).fit(X = embeddings, y = y, convert_dtype = convert_dtype)
+
+     reduced_embeddings = reducer.transform(embeddings)
+     return reducer, reduced_embeddings
+
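`supervised_reduce_with_cuml_UMAP` exposes cuML UMAP's supervised mode: when `y` is provided, known labels guide the layout of the reduced space. A hedged sketch (a GPU with cuML installed is assumed; the data is a random placeholder):

    import numpy as np
    from cuml.manifold import UMAP  # the cuML dependency these helpers build on

    embeddings = np.random.rand(1000, 384).astype(np.float32)
    labels = np.random.randint(0, 5, size=1000).astype(np.int32)   # supervision signal

    reducer = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric="cosine").fit(X=embeddings, y=labels)
    reduced = reducer.transform(embeddings)
    print(reduced.shape)  # (1000, 2)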
  def transform_with_cuml_UMAP(reducer,
                               new_embeddings: np.ndarray) -> np.ndarray:
      """
@@ -384,7 +422,6 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
      # Initialize cuML's CountVectorizer
      count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)

-     print(type(gdf[col_text]))
      # Fit and transform the text data
      X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
      X_names_count = count_vectorizer.get_feature_names()
@@ -402,13 +439,17 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
          df_count_tmp = df_count_tmp.head(n_words)
          if min_freq:
              df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
-
+
+         df_count_tmp['word'] = df_count_tmp['word'].astype(str)
          # Concatenate the result to the main DataFrame
          df_count = cudf.concat([df_count, df_count_tmp])

      # Convert the result back to pandas DataFrame
      return df_count.to_pandas()

+
+
+
  # def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:

  #     # Convert input lists to cuDF Series
@@ -588,7 +629,7 @@ def cudf_encode_chunked_files(chunk_files_paths: list,
          current_df = cudf_read_parquet(file)

          text_list = current_df[col_text].to_arrow().to_pylist()
-
+
          # text vectorization
          embeddings = HF_encoder.embed_documents(text_list)
@@ -421,6 +421,7 @@ def select_top_nodes_by_degrees(G: nx.Graph, degree_type : str = "degree", N : i
      return subgraph


+
  def scale_size(G, size_attribute, min_node_size = 10, max_node_size = 100):
      """
      Scale the sizes of nodes in a graph based on a specified attribute.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: opsci-toolbox
- Version: 0.0.12
+ Version: 0.0.14
  Summary: a complete toolbox
  Home-page: UNKNOWN
  Author: Erwan Le Nagard
@@ -41,13 +41,16 @@ Requires-Dist: spacy-language-detection ==0.2.1
  Requires-Dist: spacymoji ==3.1.0
  Requires-Dist: supervision ==0.21.0
  Requires-Dist: textacy ==0.13.0
- Requires-Dist: torch ==2.0.1
+ Requires-Dist: torch >=2.4.0
  Requires-Dist: tqdm >=4.66.2
  Requires-Dist: trafilatura ==1.7.0
  Requires-Dist: transformers ==4.38.2
  Requires-Dist: umap-learn ==0.5.5
  Requires-Dist: urlextract ==1.9.0
  Requires-Dist: wordcloud ==1.9.3
+ Requires-Dist: Unidecode ==1.3.8
+ Requires-Dist: kaleido ==0.2.1
+ Requires-Dist: gliner ==0.2.8

  UNKNOWN
@@ -0,0 +1,26 @@
+ opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
+ opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
+ opsci_toolbox/apis/telegram.py,sha256=IJYXMvXzA2R2Z7ywKJiny38pd-ryHK4jPxVG2Nj_dms,45676
+ opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
+ opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
+ opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/helpers/common.py,sha256=dlP6TnRggZsnPksgo7LPH7IghU_t9LFz42eMEzzg99o,53323
+ opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
+ opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
+ opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
+ opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
+ opsci_toolbox/helpers/nlp.py,sha256=hXnP6rUkUzyurJ5O_fNUxqT2MZK3poC21L9zy6oa22c,102551
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=OBCRkaHibuyvJ8LQAE2EC7_J0KPe7Kf-ayN2jyxDlKg,30709
+ opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
+ opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
+ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+ opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+ opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+ opsci_toolbox-0.0.14.dist-info/METADATA,sha256=X2EgVw8JlZLdgnrN1nOP6aZRs1WyztbkCkN4UKkuTLE,1727
+ opsci_toolbox-0.0.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ opsci_toolbox-0.0.14.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+ opsci_toolbox-0.0.14.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.14.dist-info/RECORD,,
@@ -0,0 +1 @@
+ https://download.pytorch.org/whl/cu124
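The new dependency_links entry points at the PyTorch CUDA 12.4 wheel index, matching the relaxed `torch >=2.4.0` requirement above. Recent pip releases no longer process dependency_links, so when a CUDA build of torch is wanted the index usually has to be passed explicitly, e.g. `pip install opsci-toolbox==0.0.14 --extra-index-url https://download.pytorch.org/whl/cu124`.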
@@ -1,22 +0,0 @@
- opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
- opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
- opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
- opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
- opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
- opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
- opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
- opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
- opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
- opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
- opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
- opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
- opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
- opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
- opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.12.dist-info/RECORD,,