opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -25,9 +25,10 @@ import requests
  import json
  from opsci_toolbox.helpers.common import write_json, write_pickle, load_pickle, create_dir, copy_file, write_jsonl
  from textacy.preprocessing.replace import urls
+ from textacy.preprocessing.remove import brackets
  from eldar import Query
  import torch
- from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+ from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
  from bs4 import BeautifulSoup
@@ -97,6 +98,11 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bo
      df=df.reset_index(drop=True)
      return df

+ def remove_trailing_dots(text):
+     if text.endswith('…'):
+         return text[:-3].strip()
+     return text
+
  def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
      """
      Generic cleaning process for topic modeling.
@@ -114,12 +120,19 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
          The DataFrame with cleaned text data.
      """
      df[col_clean] = df[col].apply(remove_rt)
+     df[col_clean] = df[col_clean].apply(remove_emoji)
+     df[col_clean] = df[col_clean].apply(remove_trailing_dots)
+     df[col_clean] = df[col_clean].apply(remove_html_tags)
+     df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
      df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
      df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
+     df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
      df[col_clean] = df[col_clean].apply(remove_extra_spaces)
      # df = df.loc[(df[col_clean] != ""), :]
      return df

+
+
  def extract_insta_shortcode(url: str) -> str:
      """
      Extracts the shortcode from an Instagram URL.
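The expanded cleaning chain is easiest to see end to end on a small DataFrame. The sketch below assumes these helpers are exposed by `opsci_toolbox.helpers.nlp` (the module being diffed here); the sample text is illustrative only.

```python
# Minimal sketch of the 0.0.13 TM_clean_text chain, assuming the functions
# above live in opsci_toolbox.helpers.nlp.
import pandas as pd
from opsci_toolbox.helpers.nlp import TM_clean_text  # assumed import path

df = pd.DataFrame({"text": [
    "RT @user Great read! <b>Launch</b> 🚀 https://example.com #ai #nlp #data",
]})

# Writes the cleaned text into a new column: emojis, trailing ellipses, HTML
# tags, bracketed content, URLs, @mentions and hashtag series are stripped
# before topic modeling.
df = TM_clean_text(df, col="text", col_clean="text_clean")
print(df["text_clean"].iloc[0])
```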
@@ -151,6 +164,39 @@ def remove_parentheses_content(text: str) -> str:
      result = re.sub(r'\([^)]*\)', '', text)
      return result

+ def remove_hashtags(text: str) -> str:
+     """
+     Removes any hashtag from text.
+
+     Args:
+         text : str
+             The input text string to clean.
+
+     Returns:
+         result : str
+             The input text string with hashtags removed.
+     """
+     pattern = r'\B#\w+'
+     result = re.sub(pattern, '', text).strip()
+     return result
+
+ def remove_multiple_hashtags(text: str) -> str:
+     """
+     Removes series of hashtags separated by spaces.
+
+     Args:
+         text : str
+             The input text string to clean.
+
+     Returns:
+         result : str
+             The input text string with series of hashtags removed.
+     """
+     pattern = r'(?:\B#\w+\s*){2,}'
+     result = re.sub(pattern, '', text).strip()
+     return result
+
+
  def remove_emojis(text: str) -> str:
      """
      Removes emojis and their textual representations from a text string.
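The difference between the two new patterns is easiest to see on a sample string. The snippet below uses only the regexes shown above, so it runs without the package installed:

```python
import re

text = "New report #opensource available #ai #nlp #data"

# remove_hashtags: strips every individual hashtag
print(re.sub(r'\B#\w+', '', text).strip())
# -> "New report  available"

# remove_multiple_hashtags: only strips runs of 2+ hashtags; a lone hashtag survives
print(re.sub(r'(?:\B#\w+\s*){2,}', '', text).strip())
# -> "New report #opensource available"
```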
@@ -171,6 +217,31 @@ def remove_emojis(text: str) -> str:
      return text_no_emojis

+ def remove_emoji(string):
+     emoji_pattern = re.compile(
+         "["
+         u"\U0001F600-\U0001F64F" # emoticons
+         u"\U0001F300-\U0001F5FF" # symbols & pictographs
+         u"\U0001F680-\U0001F6FF" # transport & map symbols
+         u"\U0001F1E0-\U0001F1FF" # flags (iOS)
+         u"\U00002500-\U00002BEF" # chinese char
+         u"\U00002702-\U000027B0"
+         u"\U00002702-\U000027B0"
+         u"\U000024C2-\U0001F251"
+         u"\U0001f926-\U0001f937"
+         u"\U00010000-\U0010ffff"
+         u"\u2640-\u2642"
+         u"\u2600-\u2B55"
+         u"\u200d"
+         u"\u23cf"
+         u"\u23e9"
+         u"\u231a"
+         u"\ufe0f" # dingbats
+         u"\u3030"
+         "]+", flags=re.UNICODE)
+     return emoji_pattern.sub(r'', string)
+
+
  def extract_numbers(text: str) -> list:
      """
      Extracts all numeric values from a given text string and returns them as a list of floats.
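A quick usage sketch for the new `remove_emoji` helper; the import path is an assumption, the function is the one added above:

```python
# Usage sketch; assumes remove_emoji is exposed by opsci_toolbox.helpers.nlp.
from opsci_toolbox.helpers.nlp import remove_emoji

print(remove_emoji("Great launch 🚀🎉 see you tomorrow"))
# The rocket and party popper fall inside the U+1F680-U+1F6FF and
# U+1F300-U+1F5FF ranges above, so they are stripped; the rest of the
# text is left untouched.
```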
@@ -421,6 +492,23 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
      df.to_csv(file_path, encoding="utf-8", index=False)
      print("File saved -", file_path)
      return df
+
+ def keep_valid_filename_chars(text: str, replace: str = '') -> str:
+     """
+     Replace all characters not typically allowed in filenames with a specified replacement string.
+
+     Args:
+         text : str
+             The input text string.
+         replace : str, optional
+             The string to replace invalid filename characters with. Default is an empty string.
+
+     Returns:
+         cleaned_text : str
+             The input text string with invalid filename characters replaced.
+     """
+     return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)
+


  def keep_alphanum_char(text: str, replace: str = '') -> str:
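Note that the character class in `keep_valid_filename_chars` also covers the dot and control characters, which matters when the input already carries an extension. A self-contained illustration using only the regex shown above:

```python
import re

def keep_valid_filename_chars(text: str, replace: str = '') -> str:
    # same pattern as the function added above
    return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)

print(keep_valid_filename_chars('report: Q1/Q2 "draft".csv', replace='_'))
# -> 'report_ Q1_Q2 _draft__csv'  (the dot is replaced as well)
```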
@@ -788,7 +876,95 @@ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat:
      )
      return df_count

- def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
+ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_chunk_id: str, col_engagement: str, col_user_id: str=None, metrics : dict =dict())-> pd.DataFrame:
+     """
+     Calculate the intermediate agregation of chunks per Post ID and topic
+
+     Args:
+         df : pandas DataFrame
+             DataFrame containing processed data.
+         col_id : str
+             Name of the column containing unique posts identifiers.
+         col_topic : str
+             Name of the column containing topic labels.
+         col_chunk_id : str
+             Name of the column containing unique sentences identifiers.
+         col_engagement : str
+             Name of the column containing engagement metrics.
+         col_user_id : str
+             Name of the column containing user identifiers.
+         metrics : dict
+             Dictionary containing additional metrics to aggregate.
+
+     Returns:
+         DataFrame
+             DataFrame containing the agregated posts per topic
+
+     Description:
+         This function aggregates various metrics for each post and topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+     """
+     metrics_dict = dict()
+     # metrics_dict[col_id]=(col_id,'first')
+     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
+     metrics_dict[col_engagement]=(col_engagement,'first')
+
+     if col_user_id:
+         metrics_dict[col_user_id]=(col_user_id,"first")
+     if "sentiment" in df.columns:
+         metrics_dict["sentiment"] = ("sentiment", "mean")
+     if "sentiment_score" in df.columns:
+         metrics_dict["sentiment_score"] = ("sentiment_score", "mean")
+
+     metrics_dict["tokens_count"] = ("tokens_count", "sum")
+     metrics_dict["lemmas_count"] = ("lemmas_count", "sum")
+     metrics_dict["emojis_count"] = ("emojis_count", "sum")
+     metrics_dict["unique_emojis"] = ("unique_emojis", lambda x: set(emoji for sublist in x for emoji in sublist))
+     metrics_dict["unique_emojis_count"] = ("unique_emojis", len)
+     metrics_dict["hashtags"] = ("hashtags", lambda x: list(hashtag for sublist in x for hashtag in sublist))
+     metrics_dict["hashtags_count"] = ("hashtags_count", "sum")
+     metrics_dict["mentions"] = ("mentions", lambda x: list(mention for sublist in x for mention in sublist))
+     metrics_dict["mentions_count"] = ("mentions_count", "sum")
+     metrics_dict["extracted_urls_from_text"] = ("extracted_urls_from_text", lambda x: list(url for sublist in x for url in sublist))
+     metrics_dict["domain"] = ("domain", lambda x: list(domain for sublist in x for domain in sublist))
+     metrics_dict["len_numbers"] = ("len_numbers", "sum")
+     metrics_dict["interrogation"] = ("interrogation", "sum")
+     metrics_dict["exclamation"] = ("exclamation", "sum")
+     metrics_dict["x"] = ("x", "mean")
+     metrics_dict["y"] = ("y", "mean")
+
+     metrics_dict.update(metrics)
+
+     df_gb = df.groupby([col_id, col_topic]).agg(**metrics_dict).reset_index()
+     df_gb[col_topic]=df_gb[col_topic].astype(str)
+
+     return df_gb
+
+ def sentiment_to_category(sentiment : float, boundaries : list = [-1.0, -0.5, 0.5, 1.0], labels :list = ['negative', 'neutral', 'positive']) -> str:
+     """
+     Assign a sentiment category to a sentiment score.
+
+     Args:
+         sentiment : float
+             sentiment score
+         boundaries : list
+             list of boundaries for each category
+         labels : list
+             list of labels for each category
+
+     Returns:
+         str
+             category label
+
+     Description:
+         This function assigns a sentiment category to a sentiment score based on a list of boundaries and labels. If the sentiment score is outside the boundaries, it is assigned to the last category.
+     """
+     for i in range(len(boundaries) - 1):
+         if boundaries[i] <= sentiment < boundaries[i + 1]:
+             return labels[i]
+     return labels[-1]
+
+
+ def topic_representation(df: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
      """
      Calculate the representation of topics in a processed DataFrame.

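`sentiment_to_category` is a plain boundary lookup, so its behaviour is easy to check in isolation. The sketch below reproduces the function body added above with its default boundaries and labels:

```python
def sentiment_to_category(sentiment, boundaries=[-1.0, -0.5, 0.5, 1.0],
                          labels=['negative', 'neutral', 'positive']):
    # Same logic as above: the first matching half-open interval wins,
    # anything outside the boundaries falls into the last label.
    for i in range(len(boundaries) - 1):
        if boundaries[i] <= sentiment < boundaries[i + 1]:
            return labels[i]
    return labels[-1]

print(sentiment_to_category(-0.8))  # negative  ([-1.0, -0.5))
print(sentiment_to_category(0.1))   # neutral   ([-0.5, 0.5))
print(sentiment_to_category(0.7))   # positive  ([0.5, 1.0))
print(sentiment_to_category(1.0))   # positive  (outside the boundaries -> last label)
```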
@@ -822,11 +998,15 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
      metrics_dict['engagements']=(col_engagement,'sum')
      if col_user_id:
          metrics_dict["users"]=(col_user_id,"nunique")
+     panel_cols = [col for col in df.columns if col[:6] == 'panel_']
+     if len(panel_cols)>0:
+         for panel_col in panel_cols:
+             metrics_dict[panel_col+'_verbatims'] = (panel_col, "sum")
+             metrics_dict[panel_col+'_users'] = (col_user_id, lambda x : x[df[panel_col]].nunique())
+             metrics_dict[panel_col+'_engagements'] = (col_engagement, lambda x : x[df[panel_col]].sum())

      metrics_dict.update(metrics)

-     print(metrics_dict)
-
      metrics_dict['avg_word_count']=("tokens_count", lambda x: round(x.mean(),2))
      metrics_dict['verbatims_with_emoji']=("emojis_count", lambda x: (x > 0).sum() )
      metrics_dict['emojis_occurences']=("emojis_count", "sum")
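Judging from the aggregations (a "sum" on the column itself and boolean masking inside the lambdas), the `panel_*` columns appear to be expected as boolean membership flags on the input DataFrame; a column such as `panel_media` would then yield `panel_media_verbatims`, `panel_media_users` and `panel_media_engagements` in the output. A minimal sketch of that convention (column names are illustrative, not part of the package):

```python
# Hypothetical input: panel_* columns are boolean membership flags per post.
import pandas as pd

df = pd.DataFrame({
    "topic": ["T0", "T0", "T1"],
    "user_id": ["u1", "u2", "u1"],
    "engagements": [10, 5, 3],
    "panel_media": [True, False, True],   # illustrative panel flag
})

# Same detection logic as the added code above; for each such column,
# topic_representation derives the three extra panel metrics.
panel_cols = [col for col in df.columns if col[:6] == 'panel_']
print(panel_cols)  # ['panel_media']
```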
@@ -843,9 +1023,8 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
      metrics_dict['topic_x']=("x", "mean")
      metrics_dict['topic_y']=("y", "mean")

-
      # on produit la représentation des topics finale
-     df_distrib_all = (df_processed_data.groupby(col_topic)
+     df_distrib_all = (df.groupby(col_topic)
                        .agg(**metrics_dict)
                        .sort_values(by="verbatims", ascending=False)
                        .assign(engagement_per_verbatims = lambda x : x["engagements"] / x["verbatims"])
@@ -2130,13 +2309,15 @@ def check_gpu():
  def HF_load_model(model_checkpoint):
      tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
      model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+     config = AutoConfig.from_pretrained(model_checkpoint)
      if torch.cuda.is_available():
          model.cuda()
-     return model, tokenizer
+     return model, tokenizer, config

  def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
      """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
      file_path= os.path.join(dir_json , str(filename)+'.json')
+     results = {}
      if not os.path.exists(file_path):
          with torch.no_grad():
              inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
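`HF_load_model` now returns the checkpoint's config as a third element, so existing call sites need to unpack three values. A usage sketch; the import path and checkpoint name are examples, not prescribed by the package:

```python
from opsci_toolbox.helpers.nlp import HF_load_model  # assumed module location

# Callers now unpack three values instead of two.
model, tokenizer, config = HF_load_model("distilbert-base-uncased-finetuned-sst-2-english")
print(config.id2label)  # label mapping carried by the AutoConfig
```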
@@ -384,7 +384,6 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
      # Initialize cuML's CountVectorizer
      count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)

-     print(type(gdf[col_text]))
      # Fit and transform the text data
      X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
      X_names_count = count_vectorizer.get_feature_names()
@@ -402,7 +401,8 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
          df_count_tmp = df_count_tmp.head(n_words)
      if min_freq:
          df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
-
+
+     df_count_tmp['word'] = df_count_tmp['word'].astype(str)
      # Concatenate the result to the main DataFrame
      df_count = cudf.concat([df_count, df_count_tmp])

@@ -588,7 +588,7 @@ def cudf_encode_chunked_files(chunk_files_paths: list,
          current_df = cudf_read_parquet(file)

          text_list = current_df[col_text].to_arrow().to_pylist()
-
+
          # text vectorization
          embeddings = HF_encoder.embed_documents(text_list)

@@ -421,6 +421,7 @@ def select_top_nodes_by_degrees(G: nx.Graph, degree_type : str = "degree", N : i
      return subgraph


+
  def scale_size(G, size_attribute, min_node_size = 10, max_node_size = 100):
      """
      Scale the sizes of nodes in a graph based on a specified attribute.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: opsci-toolbox
- Version: 0.0.12
+ Version: 0.0.13
  Summary: a complete toolbox
  Home-page: UNKNOWN
  Author: Erwan Le Nagard
@@ -48,6 +48,9 @@ Requires-Dist: transformers ==4.38.2
  Requires-Dist: umap-learn ==0.5.5
  Requires-Dist: urlextract ==1.9.0
  Requires-Dist: wordcloud ==1.9.3
+ Requires-Dist: Unidecode ==1.3.8
+ Requires-Dist: kaleido ==0.2.1
+ Requires-Dist: gliner ==0.2.8

  UNKNOWN

@@ -1,22 +1,25 @@
  opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
+ opsci_toolbox/apis/reddit.py,sha256=zhK2CY9CkCezNcekQFdv1So3NmHHYxB7-tgMVErHOGI,15763
+ opsci_toolbox/apis/telegram.py,sha256=GKDLpZg1fc9D_PGCgi9pfTaW7Jjm_2luQ-2trXTr38A,42208
  opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
  opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
  opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
+ opsci_toolbox/helpers/common.py,sha256=ZGjWIPEpqr-gIYjkfsS97PmCtQWHa_iF8tBbVxrQsOQ,53321
  opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
- opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
- opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
- opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
- opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
- opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
+ opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
+ opsci_toolbox/helpers/dates.py,sha256=CxbXSo61GPZ2L37PV0ujvp78vwl0DoBq7t0nkk9qHp8,4751
+ opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
+ opsci_toolbox/helpers/nlp.py,sha256=I72F32ieofZaCIkjZ9kqpiJLktfRoM7mMhzzxyXDQ3I,99316
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=CGyThKNgo6fdFPV-iooPG0oNrzA__Hvv08t_sdEp3BE,28919
+ opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
  opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
  opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
  opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
  opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
- opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.12.dist-info/RECORD,,
+ opsci_toolbox-0.0.13.dist-info/METADATA,sha256=G_JhKg5tmYPkRUhAN2Uj9B6orX7x3TKWqIOKU_TjeIA,1727
+ opsci_toolbox-0.0.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ opsci_toolbox-0.0.13.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.13.dist-info/RECORD,,