opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,8 @@ import spacy
  from spacy.language import Language
  from spacy_language_detection import LanguageDetector
  from spacymoji import Emoji
- from langchain.embeddings import HuggingFaceEmbeddings
+ # from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain_community.embeddings import HuggingFaceEmbeddings
  from sklearn.feature_selection import chi2
  from urlextract import URLExtract
  import ast
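For orientation, a minimal sketch of how the migrated import is typically used; the model name below is an illustrative assumption, not something taken from this package:

from langchain_community.embeddings import HuggingFaceEmbeddings

# Hypothetical usage: the model name is chosen only for illustration
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector = embeddings.embed_query("an example sentence to embed")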
@@ -27,17 +28,31 @@ from textacy.preprocessing.replace import urls
  from eldar import Query
  import torch
  from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+ from bs4 import BeautifulSoup
 
 
  ####################################################################
  # CLEANING
  ####################################################################
 
+ def remove_html_tags(text: str) -> str:
+ """
+ Remove HTML tags from the given text.
+
+ Parameters:
+ - text (str): The text containing HTML tags.
+
+ Returns:
+ - str: The text with HTML tags removed.
+ """
+ soup = BeautifulSoup(text, "html.parser")
+ return soup.get_text()
+
  def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
  """
  Filter DataFrame rows by a query on a specific text column.
 
- Parameters:
+ Args:
  df : pandas DataFrame
  The DataFrame to filter.
  col_text : str
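A quick usage sketch of the new remove_html_tags helper introduced above; the sample input is purely illustrative:

remove_html_tags("<p>Hello <b>world</b>!</p>")  # -> "Hello world!"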
@@ -64,7 +79,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
  """
  Generic cleaning process for topic modeling.
 
- Parameters:
+ Args:
  df : pandas DataFrame
  The DataFrame containing text data.
  col : str
@@ -86,7 +101,7 @@ def extract_insta_shortcode(url: str) -> str:
  """
  Extracts the shortcode from an Instagram URL.
 
- Parameters:
+ Args:
  url : str
  The Instagram URL containing the shortcode.
 
@@ -99,11 +114,25 @@ def extract_insta_shortcode(url: str) -> str:
  shortcode = re.findall(pattern, url)
  return shortcode[0]
 
+ def remove_parentheses_content(text: str) -> str:
+ """
+ Remove content within parentheses from the given text.
+
+ Parameters:
+ - text (str): The text from which content within parentheses should be removed.
+
+ Returns:
+ - str: The text with content within parentheses removed.
+ """
+ # Using regular expression to find content between parentheses and removing it
+ result = re.sub(r'\([^)]*\)', '', text)
+ return result
+
  def remove_emojis(text: str) -> str:
  """
  Removes emojis and their textual representations from a text string.
 
- Parameters:
+ Args:
  text : str
  The input text string containing emojis.
 
@@ -119,11 +148,56 @@ def remove_emojis(text: str) -> str:
 
  return text_no_emojis
 
+ def extract_numbers(text: str) -> list:
+ """
+ Extracts all numeric values from a given text string and returns them as a list of floats.
+
+ Args:
+ text (str): The input string from which numbers are to be extracted.
+
+ Returns:
+ list: A list containing all the extracted numbers as floats.
+ """
+ # Define a regular expression pattern to match numbers
+ pattern = r'\d+\.?\d*'
+
+ # Use re.findall to find all matches of the pattern in the text
+ numbers = re.findall(pattern, text)
+
+ # Convert the extracted numbers from strings to floats
+ numbers = [float(num) for num in numbers]
+
+ return numbers
+
+ def contains_question_mark(text: str) -> int:
+ """
+ Checks if a given text string contains a question mark.
+
+ Args:
+ text (str): The input string to be checked.
+
+ Returns:
+ int: Returns 1 if the text contains a question mark, otherwise 0.
+ """
+ return 1 if '?' in text else 0
+
+ def contains_exclamation_mark(text: str) -> int:
+ """
+ Checks if a given text string contains an exclamation mark.
+
+ Args:
+ text (str): The input string to be checked.
+
+ Returns:
+ int: Returns 1 if the text contains an exclamation mark, otherwise 0.
+ """
+ return 1 if '!' in text else 0
+
  def extract_urls_from_text(text: str) -> list:
  """
  Extracts URLs from a text string.
 
- Parameters:
+ Args:
  text : str
  The input text string containing URLs.
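Quick usage sketches for the three new helpers above, with illustrative inputs:

extract_numbers("2 items at 3.5 EUR")   # -> [2.0, 3.5]
contains_question_mark("Any update?")   # -> 1
contains_exclamation_mark("No update")  # -> 0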
 
@@ -139,7 +213,7 @@ def extract_hashtags(text: str, lower: bool = True) -> list:
  '''
  Extracts hashtags from the text using a regular expression.
 
- Parameters:
+ Args:
  text : str
  The input text string containing hashtags.
  lower : bool, optional
@@ -158,7 +232,7 @@ def extract_mentions(text: str, mention_char: str = '@', lower: bool = False) ->
  '''
  Extracts mentions from the text using a regular expression.
 
- Parameters:
+ Args:
  text : str
  The input text string containing mentions.
  mention_char : str, optional
@@ -181,7 +255,7 @@ def remove_extra_spaces(text: str) -> str:
  """
  Removes extra spaces from a text string.
 
- Parameters:
+ Args:
  text : str
  The input text string with extra spaces.
 
@@ -196,7 +270,7 @@ def remove_characters(text: str, start_indices: list, end_indices: list) -> str:
  """
  Remove characters from a text string using lists of start and end indices.
 
- Parameters:
+ Args:
  text : str
  The input text string.
  start_indices : list of int
@@ -234,7 +308,7 @@ def load_stopwords_df(lang: str) -> pd.DataFrame:
  """
  Load a CSV file without header containing stopwords. If the file doesn't exist, it creates an empty file.
 
- Parameters:
+ Args:
  lang : str
  The language code used to identify the stopwords file.
 
@@ -269,7 +343,7 @@ def add_stopwords(lang: str, new_stopwords: list, lower: bool = True) -> pd.Data
  """
  Add a list of stopwords to an existing file. It removes duplicates.
 
- Parameters:
+ Args:
  lang : str
  The language code used to identify the stopwords file.
  new_stopwords : list of str
@@ -304,7 +378,7 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
  """
  Remove stopwords from an existing file.
 
- Parameters:
+ Args:
  lang : str
  The language code used to identify the stopwords file.
  stopwords : list of str
@@ -330,7 +404,7 @@ def keep_alphanum_char(text: str, replace: str = '') -> str:
  """
  Replace all non-alphanumeric characters in a text string.
 
- Parameters:
+ Args:
  text : str
  The input text string.
  replace : str, optional
@@ -347,7 +421,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
  """
  Substitute punctuations with white spaces in the input string.
 
- Parameters:
+ Args:
  text (str): The input string.
 
  Returns:
@@ -360,7 +434,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
  """
  Translate text using LibreTranslate service.
 
- Parameters:
+ Args:
  text : str
  The text to be translated.
  source : str
@@ -399,7 +473,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
  """
  Translate a batch of texts using LibreTranslate service.
 
- Parameters:
+ Args:
  batch_text : list of str
  The list of texts to be translated.
  source : str
@@ -442,7 +516,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
  """
  Translate text using LibreTranslate service.
 
- Parameters:
+ Args:
  text : str
  The text to be translated.
  source : str
@@ -474,7 +548,7 @@ def translate_row(df: pd.DataFrame, col: str, source: str = "auto", target: str
  """
  Translate the text in a specific column of a DataFrame.
 
- Parameters:
+ Args:
  df : pandas DataFrame
  The DataFrame containing the text to be translated.
  col : str
@@ -504,7 +578,7 @@ def cosine_similarity(a: np.array, b: np.array) -> float:
  """
  Calculate the cosine similarity between two vectors.
 
- Parameters:
+ Args:
  a : numpy array
  The first vector.
  b : numpy array
@@ -520,7 +594,7 @@ def approximate_tokens(text: str) -> int:
  """
  Approximate the number of tokens in a text.
 
- Parameters:
+ Args:
  text : str
  The input text.
 
@@ -534,7 +608,7 @@ def approximate_unique_tokens(text: str) -> int:
  """
  Approximate the number of distinct tokens in a text.
 
- Parameters:
+ Args:
  text : str
  The input text.
 
@@ -548,7 +622,7 @@ def count_word_occurrences(text: str, word: str) -> int:
  """
  Count the occurrences of a word in a text.
 
- Parameters:
+ Args:
  text : str
  The input text.
  word : str
@@ -571,7 +645,7 @@ def chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words
  """
  Calculate Chi-squared (Chi2) statistics per category based on the provided texts and corresponding categories.
 
- Parameters:
+ Args:
  lst_text : list
  List of texts for which Chi2 will be calculated.
  lst_categorie : list
@@ -614,7 +688,7 @@ def word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str,
  """
  Calculate word frequency per category.
 
- Parameters:
+ Args:
  df : pandas DataFrame
  DataFrame containing text data and corresponding categories.
  col_text : str
@@ -658,7 +732,7 @@ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat:
  """
  Count the occurrences of items (e.g., hashtags) per category and select the top items per category.
 
- Parameters:
+ Args:
  df : pandas DataFrame
  DataFrame containing data.
  col_lst : str, optional
@@ -695,7 +769,7 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
  """
  Calculate the representation of topics in a processed DataFrame.
 
- Parameters:
+ Args:
  df_processed_data : pandas DataFrame
  DataFrame containing processed data.
  col_topic : str
@@ -740,6 +814,9 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
  metrics_dict['unique_mentions']=("mentions", lambda x: len(set(mention for sublist in x for mention in sublist)))
  metrics_dict['verbatims_with_mentions']=("mentions_count", lambda x: (x > 0).sum() )
  metrics_dict['mentions_occurences']=("mentions_count", "sum")
+ metrics_dict['verbatims_with_numbers']= ("len_numbers", lambda x: (x > 0).sum())
+ metrics_dict['verbatims_with_interrogation']=("interrogation", "sum")
+ metrics_dict['verbatims_with_exclamation']=("exclamation", "sum")
  metrics_dict['topic_x']=("x", "mean")
  metrics_dict['topic_y']=("y", "mean")
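The new entries follow the pandas named-aggregation convention of (source_column, aggfunc) pairs, presumably unpacked later into a grouped aggregation. A minimal sketch under that assumption, with hypothetical column names:

# Hypothetical illustration of how (column, aggfunc) pairs are consumed
import pandas as pd

df = pd.DataFrame({"topic": ["a", "a", "b"], "len_numbers": [0, 2, 1], "interrogation": [1, 0, 1]})
metrics_dict = {
    "verbatims_with_numbers": ("len_numbers", lambda x: (x > 0).sum()),
    "verbatims_with_interrogation": ("interrogation", "sum"),
}
print(df.groupby("topic").agg(**metrics_dict))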
 
@@ -757,6 +834,9 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
  .assign(percentage_verbatims_with_emoji = lambda x : x["verbatims_with_emoji"] / x["verbatims"])
  .assign(percentage_verbatims_with_hashtags = lambda x : x["verbatims_with_hashtags"] / x["verbatims"])
  .assign(percentage_verbatims_with_mentions = lambda x : x["verbatims_with_mentions"] / x["verbatims"])
+ .assign(percentage_verbatims_with_numbers = lambda x : x["verbatims_with_numbers"] / x["verbatims"])
+ .assign(percentage_verbatims_with_interrogation = lambda x : x["verbatims_with_interrogation"] / x["verbatims"])
+ .assign(percentage_verbatims_with_exclamation = lambda x : x["verbatims_with_exclamation"] / x["verbatims"])
  .reset_index())
 
  df_distrib_all[col_topic]=df_distrib_all[col_topic].astype(str)
@@ -766,7 +846,7 @@ def generic_representation(df_processed_data: pd.DataFrame, col_gb: str, col_id:
  """
  Calculate a generic representation of data based on grouping by a specified column.
 
- Parameters:
+ Args:
  df_processed_data : pandas DataFrame
  DataFrame containing processed data.
  col_gb : str
@@ -814,7 +894,7 @@ def create_frequency_table(df: pd.DataFrame, col: str) -> pd.DataFrame:
  """
  Create a frequency table for a given column in a DataFrame.
 
- Parameters:
+ Args:
  df : pandas DataFrame
  DataFrame containing the data.
  col : str
@@ -845,7 +925,7 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
  """
  Convert a percentage to the number of rows to sample.
 
- Parameters:
+ Args:
  len_df : int
  Length of the DataFrame.
  n_rows : float
@@ -855,8 +935,6 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
  int
  Number of rows to sample.
 
- Description:
- This function converts a percentage of the DataFrame length into a number of rows to sample. If `n_rows` is between 0 and 1, it's treated as a percentage and converted into an integer representing the top `n_rows` percentage of the DataFrame length. If `n_rows` is greater than 1 or equal to 0, it's treated as an absolute number of rows.
  """
  if 0 < n_rows <= 1 :
  top_rows = int(n_rows * len_df)
@@ -870,8 +948,9 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
  def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: float = 0.3, sample_size: float = 0.5) -> pd.DataFrame:
  """
  Create a sample dataset by keeping a part of the top publications based on engagement metrics.
+ This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness.
 
- Parameters:
+ Args:
  df : pandas.DataFrame
  The original DataFrame.
  col_engagement : str
@@ -885,8 +964,6 @@ def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: floa
  pandas.DataFrame
  The sampled DataFrame.
 
- Description:
- This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness.
  """
 
  sample_rows = calculate_sample(len(df), sample_size)
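A brief usage sketch of sampling_by_engagement, assuming a DataFrame with an 'engagements' column (the column name is illustrative):

# Keep the top 30% most engaging posts, then sample the rest down to 50% of the original size
df_sample = sampling_by_engagement(df, col_engagement="engagements", top_rows=0.3, sample_size=0.5)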
@@ -911,7 +988,7 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
  """
  Perform a "stratified sample" of the most engaging content per topic, ensuring a minimum number of items per group.
 
- Parameters:
+ Args:
  df : pandas.DataFrame
  The DataFrame containing the data.
  col_topic : str
@@ -927,8 +1004,6 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
  pandas.DataFrame
  The sampled DataFrame.
 
- Description:
- This function performs a "stratified sample" of the most engaging content per topic. It sorts the data by engagement metrics within each topic group, and then takes a sample of `sample_size` proportion from each group. If a group has fewer than `min_size` items, it retains all items in that group.
  """
  df = (df.groupby(col_topic, group_keys=False)
  .apply(lambda x: x.sort_values(by=col_engagement, ascending=False)
@@ -948,7 +1023,7 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_
  """
  Perform natural language processing tasks using spaCy for topic modeling.
 
- Parameters:
+ Args:
  nlp : spacy.Language
  The spaCy language model.
  df : pandas.DataFrame
@@ -974,8 +1049,6 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_
  pandas.DataFrame
  The DataFrame with processed text data.
 
- Description:
- This function utilizes spaCy for natural language processing tasks such as lemmatization, emoji extraction, and token counting. It processes the text data in the DataFrame and returns the DataFrame with additional columns for lemmatized text, emoji counts, token counts, and more.
  """
  all_lemmas=[]
  tokens_counts=[]
@@ -1029,9 +1102,15 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_
 
  def load_spacy_model(model: str, disable_components: list = ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect: bool = False, emoji: bool = False) -> spacy.language.Language:
  """
- Load a spaCy model with optional configurations.
+ Load a spaCy model with optional configurations. This function loads a spaCy model with optional configurations such as disabling specific components, enabling emoji parsing,
+ and enabling language detection. It first loads the spaCy model specified by the 'model' parameter and then applies
+ additional configurations based on the provided flags.
 
- Parameters:
+ If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
+ language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
+ will be included in the spaCy pipeline.
+
+ Args:
  model : str
  Name of the spaCy model to load.
  disable_components : list, optional
@@ -1044,15 +1123,7 @@ def load_spacy_model(model: str, disable_components: list = ["transformer", "mor
  Returns:
  nlp : spacy.language.Language
  Loaded spaCy language processing pipeline.
-
- Description:
- This function loads a spaCy model with optional configurations such as disabling specific components, enabling emoji parsing,
- and enabling language detection. It first loads the spaCy model specified by the 'model' parameter and then applies
- additional configurations based on the provided flags.
-
- If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
- language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
- will be included in the spaCy pipeline.
+
  """
  if torch.cuda.is_available():
 
@@ -1074,23 +1145,15 @@ def load_spacy_model(model: str, disable_components: list = ["transformer", "mor
 
  def get_labels(nlp: spacy.language.Language, pipe_step: str = "ner", explanations: bool = False) -> pd.DataFrame:
  """
- Return labels associated with a pipeline step and optionally provide explanations.
-
- Parameters:
- nlp : spacy.language.Language
- The spaCy language processing pipeline.
- pipe_step : str, optional
- The pipeline step for which labels are retrieved. Default is "ner".
- explanations : bool, optional
- Flag indicating whether to include explanations for the labels. Default is False.
+ Return labels associated with a pipeline step and optionally provide explanations. This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline. It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
+ Args:
+ nlp : spacy.language.Language. The spaCy language processing pipeline.
+ pipe_step : str, optional. The pipeline step for which labels are retrieved. Default is "ner".
+ explanations : bool, optional. Flag indicating whether to include explanations for the labels. Default is False.
 
  Returns:
- DataFrame
- DataFrame containing the labels associated with the specified pipeline step.
-
- Description:
- This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline.
- It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
+ DataFrame : DataFrame containing the labels associated with the specified pipeline step.
+
  """
  pipe_details=nlp.get_pipe(pipe_step)
  labels=list(pipe_details.labels)
@@ -1104,9 +1167,9 @@ def get_labels(nlp: spacy.language.Language, pipe_step: str = "ner", explanation
 
  def spacy_langdetect(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
  """
- Detect language and return a score.
+ Detect language and return a score. This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame. It returns a DataFrame containing the detected languages and their scores, which indicate the confidence level of the language detection for each text.
 
- Parameters:
+ Args:
  nlp : spacy.language.Language
  The spaCy language processing pipeline with language detection enabled.
  df : pd.DataFrame
@@ -1121,11 +1184,7 @@ def spacy_langdetect(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100
  Returns:
  pd.DataFrame
  DataFrame containing the detected languages and their scores.
-
- Description:
- This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.
- It returns a DataFrame containing the detected languages and their scores, which indicate the confidence level
- of the language detection for each text.
+
  """
  text=list(df[col_text].astype('unicode').values)
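A short usage sketch combining load_spacy_model and spacy_langdetect; the model name and column name are illustrative assumptions:

# Hypothetical end-to-end call: load a model with language detection enabled, then score a text column
nlp = load_spacy_model("fr_core_news_lg", lang_detect=True, emoji=False)
df_lang = spacy_langdetect(nlp, df, col_text="text", batch_size=100, n_process=1)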
 
@@ -1847,46 +1906,26 @@ def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selectio
  p=None, cluster_selection_method='eom', prediction_data = True):
 
  """
- Parameters:
- embeddings : array-like or sparse matrix, shape (n_samples, n_features)
- The input data to be clustered.
- algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional
- The algorithm to use for computation. Default is 'best'.
- alpha : float, optional
- Scaling factor determining the individual weight of the (unnormalized) density estimate. Default is 1.0.
- cluster_selection_epsilon : float, optional
- The epsilon value to specify a minimum cluster size. Default is 0.0.
- approx_min_span_tree : bool, optional
- Whether to compute an approximation of the minimum spanning tree. Default is True.
- gen_min_span_tree : bool, optional
- Whether to compute the minimum spanning tree. Default is True.
- leaf_size : int, optional
- Leaf size for the underlying KD-tree or Ball Tree. Default is 40.
- metric : str or callable, optional
- The metric to use for distance computation. Default is 'euclidean'.
- min_cluster_size : int, optional
- The minimum size of clusters; single linkage splits that produce smaller clusters than this will be considered points "falling out" of a cluster rather than a cluster splitting into two new clusters. Default is 5.
- min_samples : int or None, optional
- The number of samples in a neighborhood for a point to be considered a core point. If None, the value is set to min_cluster_size. Default is None.
- p : int, optional
- The Minkowski p-norm distance metric parameter. Default is None.
- cluster_selection_method : {'eom', 'leaf', 'leaf_similar', 'eom_similar', 'tree', 'beagle'}, optional
- The method used to select clusters from the condensed tree. Default is 'eom'.
- prediction_data : bool, optional
- Whether the data is prediction data or not. Default is True.
-
- Returns:
- clusterer : hdbscan.hdbscan_.HDBSCAN
- HDBSCAN clusterer object.
- labels : array, shape (n_samples,)
- Cluster labels for each point. Noisy samples are given the label -1.
- probabilities : array, shape (n_samples,)
- The probability of each sample being an outlier.
-
- Description:
- This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
- It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
- probability of each sample being an outlier.
+ This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm. It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the probability of each sample being an outlier.
+ Args:
+ embeddings : array-like or sparse matrix, shape (n_samples, n_features). The input data to be clustered.
+ algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is 'best'.
+ alpha : float, optional. Scaling factor determining the individual weight of the (unnormalized) density estimate. Default is 1.0.
+ cluster_selection_epsilon : float, optional. The epsilon value to specify a minimum cluster size. Default is 0.0.
+ approx_min_span_tree : bool, optional. Whether to compute an approximation of the minimum spanning tree. Default is True.
+ gen_min_span_tree : bool, optional. Whether to compute the minimum spanning tree. Default is True.
+ leaf_size : int, optional. Leaf size for the underlying KD-tree or Ball Tree. Default is 40.
+ metric : str or callable, optional. The metric to use for distance computation. Default is 'euclidean'.
+ min_cluster_size : int, optional. The minimum size of clusters; single linkage splits that produce smaller clusters than this will be considered points "falling out" of a cluster rather than a cluster splitting into two new clusters. Default is 5.
+ min_samples : int or None, optional. The number of samples in a neighborhood for a point to be considered a core point. If None, the value is set to min_cluster_size. Default is None.
+ p : int, optional. The Minkowski p-norm distance metric parameter. Default is None.
+ cluster_selection_method : {'eom', 'leaf', 'leaf_similar', 'eom_similar', 'tree', 'beagle'}, optional. The method used to select clusters from the condensed tree. Default is 'eom'.
+ prediction_data : bool, optional. Whether the data is prediction data or not. Default is True.
+
+ Returns:
+ clusterer : hdbscan.hdbscan_.HDBSCAN. HDBSCAN clusterer object.
+ labels : array, shape (n_samples,). Cluster labels for each point. Noisy samples are given the label -1.
+ probabilities : array, shape (n_samples,). The probability of each sample being an outlier.
  """
  clusterer = hdbscan.HDBSCAN(algorithm=algorithm,
  alpha=alpha,
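A minimal usage sketch for hdbscan_clustering, assuming embeddings is a NumPy array of document vectors (the variable name is illustrative):

# Hypothetical call with mostly default parameters; labels of -1 mark noise points
clusterer, labels, probabilities = hdbscan_clustering(embeddings, min_cluster_size=5, metric='euclidean')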
@@ -2017,4 +2056,5 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
  results = {"label":label, "score" : float(proba.max()), col_text : text}
  print(results)
  write_json(results, dir_json , str(filename))
+
  return results
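For context, a rough usage sketch of HF_sentiment_classifier based on the signature visible in this hunk; the checkpoint name, input text and output locations are illustrative assumptions:

# Hypothetical call; the checkpoint name is only an example
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
results = HF_sentiment_classifier(tokenizer, model, "I love this product", "text", "doc_0001", "./json_outputs")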