opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +120 -21
- opsci_toolbox/apis/webscraping.py +186 -59
- opsci_toolbox/apis/youtube_helpers.py +103 -16
- opsci_toolbox/helpers/common.py +368 -254
- opsci_toolbox/helpers/cv.py +50 -60
- opsci_toolbox/helpers/dataviz.py +255 -184
- opsci_toolbox/helpers/dates.py +17 -18
- opsci_toolbox/helpers/nlp.py +154 -114
- opsci_toolbox/helpers/nlp_cuml.py +389 -36
- opsci_toolbox/helpers/sna.py +509 -0
- opsci_toolbox/helpers/sql.py +53 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA +14 -9
- opsci_toolbox-0.0.8.dist-info/RECORD +22 -0
- opsci_toolbox-0.0.7.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -15,7 +15,8 @@ import spacy
 from spacy.language import Language
 from spacy_language_detection import LanguageDetector
 from spacymoji import Emoji
-from langchain.embeddings import HuggingFaceEmbeddings
+# from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from sklearn.feature_selection import chi2
 from urlextract import URLExtract
 import ast
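
The only code change in this hunk is the relocation of `HuggingFaceEmbeddings` from the deprecated `langchain.embeddings` module to `langchain_community.embeddings`. A minimal sketch of the relocated import in use; the model name is illustrative, not taken from this package:

    from langchain_community.embeddings import HuggingFaceEmbeddings

    # Illustrative model choice; opsci_toolbox may pass a different model_name.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    doc_vectors = embeddings.embed_documents(["first verbatim", "second verbatim"])
    query_vector = embeddings.embed_query("a search query")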
@@ -27,17 +28,31 @@ from textacy.preprocessing.replace import urls
 from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+from bs4 import BeautifulSoup


 ####################################################################
 # CLEANING
 ####################################################################

+def remove_html_tags(text: str) -> str:
+    """
+    Remove HTML tags from the given text.
+
+    Parameters:
+    - text (str): The text containing HTML tags.
+
+    Returns:
+    - str: The text with HTML tags removed.
+    """
+    soup = BeautifulSoup(text, "html.parser")
+    return soup.get_text()
+
 def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
     """
     Filter DataFrame rows by a query on a specific text column.

-
+    Args:
     df : pandas DataFrame
         The DataFrame to filter.
     col_text : str
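
The new `remove_html_tags` helper delegates HTML stripping to BeautifulSoup. A quick usage sketch with an invented input string:

    remove_html_tags("<p>Hello <b>world</b>!</p>")
    # -> 'Hello world!'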
@@ -64,7 +79,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     """
     Generic cleaning process for topic modeling.

-
+    Args:
     df : pandas DataFrame
         The DataFrame containing text data.
     col : str
@@ -86,7 +101,7 @@ def extract_insta_shortcode(url: str) -> str:
     """
     Extracts the shortcode from an Instagram URL.

-
+    Args:
     url : str
         The Instagram URL containing the shortcode.

@@ -99,11 +114,25 @@ def extract_insta_shortcode(url: str) -> str:
     shortcode = re.findall(pattern, url)
     return shortcode[0]

+def remove_parentheses_content(text: str) -> str:
+    """
+    Remove content within parentheses from the given text.
+
+    Parameters:
+    - text (str): The text from which content within parentheses should be removed.
+
+    Returns:
+    - str: The text with content within parentheses removed.
+    """
+    # Using regular expression to find content between parentheses and removing it
+    result = re.sub(r'\([^)]*\)', '', text)
+    return result
+
 def remove_emojis(text: str) -> str:
     """
     Removes emojis and their textual representations from a text string.

-
+    Args:
     text : str
         The input text string containing emojis.

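
`remove_parentheses_content` drops each parenthesized span but keeps the surrounding whitespace, so a double space can remain where the parentheses were; chaining it with `remove_extra_spaces` (defined later in this module) cleans that up. The `[^)]*` pattern also stops at the first closing parenthesis, so nested parentheses are only partially removed. A small sketch with an invented input:

    remove_parentheses_content("Topic modeling (LDA, BERTopic) made simple")
    # -> 'Topic modeling  made simple'  (note the leftover double space)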
@@ -119,11 +148,56 @@ def remove_emojis(text: str) -> str:

     return text_no_emojis

+def extract_numbers(text: str) -> list:
+    """
+    Extracts all numeric values from a given text string and returns them as a list of floats.
+
+    Args:
+        text (str): The input string from which numbers are to be extracted.
+
+    Returns:
+        list: A list containing all the extracted numbers as floats.
+    """
+    # Define a regular expression pattern to match numbers
+    pattern = r'\d+\.?\d*'
+
+    # Use re.findall to find all matches of the pattern in the text
+    numbers = re.findall(pattern, text)
+
+    # Convert the extracted numbers from strings to floats
+    numbers = [float(num) for num in numbers]
+
+    return numbers
+
+def contains_question_mark(text: str) -> int:
+    """
+    Checks if a given text string contains a question mark.
+
+    Args:
+        text (str): The input string to be checked.
+
+    Returns:
+        int: Returns 1 if the text contains a question mark, otherwise 0.
+    """
+    return 1 if '?' in text else 0
+
+def contains_exclamation_mark(text: str) -> int:
+    """
+    Checks if a given text string contains an exclamation mark.
+
+    Args:
+        text (str): The input string to be checked.
+
+    Returns:
+        int: Returns 1 if the text contains an exclamation mark, otherwise 0.
+    """
+    return 1 if '!' in text else 0
+
 def extract_urls_from_text(text: str) -> list:
     """
     Extracts URLs from a text string.

-
+    Args:
     text : str
         The input text string containing URLs.

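
These three new helpers presumably feed the new metrics added further down in `topic_representation` (the "len_numbers", "interrogation" and "exclamation" columns): `extract_numbers` pulls numeric tokens as floats, while the two `contains_*` functions return 0/1 flags. A short usage sketch with invented inputs; note that the `\d+\.?\d*` pattern splits comma decimals such as "1,5" into two numbers:

    extract_numbers("2 posts scored 4.5 and 3")   # -> [2.0, 4.5, 3.0]
    contains_question_mark("Anyone tried this?")  # -> 1
    contains_exclamation_mark("Great thread")     # -> 0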
@@ -139,7 +213,7 @@ def extract_hashtags(text: str, lower: bool = True) -> list:
     '''
     Extracts hashtags from the text using a regular expression.

-
+    Args:
     text : str
         The input text string containing hashtags.
     lower : bool, optional
@@ -158,7 +232,7 @@ def extract_mentions(text: str, mention_char: str = '@', lower: bool = False) ->
     '''
     Extracts mentions from the text using a regular expression.

-
+    Args:
     text : str
         The input text string containing mentions.
     mention_char : str, optional
@@ -181,7 +255,7 @@ def remove_extra_spaces(text: str) -> str:
     """
     Removes extra spaces from a text string.

-
+    Args:
     text : str
         The input text string with extra spaces.

@@ -196,7 +270,7 @@ def remove_characters(text: str, start_indices: list, end_indices: list) -> str:
     """
     Remove characters from a text string using lists of start and end indices.

-
+    Args:
     text : str
         The input text string.
     start_indices : list of int
@@ -234,7 +308,7 @@ def load_stopwords_df(lang: str) -> pd.DataFrame:
     """
     Load a CSV file without header containing stopwords. If the file doesn't exist, it creates an empty file.

-
+    Args:
     lang : str
         The language code used to identify the stopwords file.

@@ -269,7 +343,7 @@ def add_stopwords(lang: str, new_stopwords: list, lower: bool = True) -> pd.Data
     """
     Add a list of stopwords to an existing file. It removes duplicates.

-
+    Args:
     lang : str
         The language code used to identify the stopwords file.
     new_stopwords : list of str
@@ -304,7 +378,7 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
     """
     Remove stopwords from an existing file.

-
+    Args:
     lang : str
         The language code used to identify the stopwords file.
     stopwords : list of str
@@ -330,7 +404,7 @@ def keep_alphanum_char(text: str, replace: str = '') -> str:
     """
     Replace all non-alphanumeric characters in a text string.

-
+    Args:
     text : str
         The input text string.
     replace : str, optional
@@ -347,7 +421,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
     """
     Substitute punctuations with white spaces in the input string.

-
+    Args:
     text (str): The input string.

     Returns:
@@ -360,7 +434,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
     """
     Translate text using LibreTranslate service.

-
+    Args:
     text : str
         The text to be translated.
     source : str
@@ -399,7 +473,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
     """
     Translate a batch of texts using LibreTranslate service.

-
+    Args:
     batch_text : list of str
         The list of texts to be translated.
     source : str
@@ -442,7 +516,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
     """
     Translate text using LibreTranslate service.

-
+    Args:
     text : str
         The text to be translated.
     source : str
@@ -474,7 +548,7 @@ def translate_row(df: pd.DataFrame, col: str, source: str = "auto", target: str
     """
     Translate the text in a specific column of a DataFrame.

-
+    Args:
     df : pandas DataFrame
         The DataFrame containing the text to be translated.
     col : str
@@ -504,7 +578,7 @@ def cosine_similarity(a: np.array, b: np.array) -> float:
     """
     Calculate the cosine similarity between two vectors.

-
+    Args:
     a : numpy array
         The first vector.
     b : numpy array
@@ -520,7 +594,7 @@ def approximate_tokens(text: str) -> int:
     """
     Approximate the number of tokens in a text.

-
+    Args:
     text : str
         The input text.

@@ -534,7 +608,7 @@ def approximate_unique_tokens(text: str) -> int:
     """
     Approximate the number of distinct tokens in a text.

-
+    Args:
     text : str
         The input text.

@@ -548,7 +622,7 @@ def count_word_occurrences(text: str, word: str) -> int:
     """
     Count the occurrences of a word in a text.

-
+    Args:
     text : str
         The input text.
     word : str
@@ -571,7 +645,7 @@ def chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words
     """
     Calculate Chi-squared (Chi2) statistics per category based on the provided texts and corresponding categories.

-
+    Args:
     lst_text : list
         List of texts for which Chi2 will be calculated.
     lst_categorie : list
@@ -614,7 +688,7 @@ def word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str,
     """
     Calculate word frequency per category.

-
+    Args:
     df : pandas DataFrame
         DataFrame containing text data and corresponding categories.
     col_text : str
|
     """
     Count the occurrences of items (e.g., hashtags) per category and select the top items per category.

-
+    Args:
     df : pandas DataFrame
         DataFrame containing data.
     col_lst : str, optional
|
     """
     Calculate the representation of topics in a processed DataFrame.

-
+    Args:
     df_processed_data : pandas DataFrame
         DataFrame containing processed data.
     col_topic : str
@@ -740,6 +814,9 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
     metrics_dict['unique_mentions']=("mentions", lambda x: len(set(mention for sublist in x for mention in sublist)))
     metrics_dict['verbatims_with_mentions']=("mentions_count", lambda x: (x > 0).sum() )
     metrics_dict['mentions_occurences']=("mentions_count", "sum")
+    metrics_dict['verbatims_with_numbers']= ("len_numbers", lambda x: (x > 0).sum())
+    metrics_dict['verbatims_with_interrogation']=("interrogation", "sum")
+    metrics_dict['verbatims_with_exclamation']=("exclamation", "sum")
     metrics_dict['topic_x']=("x", "mean")
     metrics_dict['topic_y']=("y", "mean")

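
The three new `metrics_dict` entries are pandas named-aggregation specs: a source column paired with an aggregation, the columns presumably being produced earlier from the new `extract_numbers` / `contains_question_mark` / `contains_exclamation_mark` helpers. A minimal sketch of how such pairs behave when unpacked into `groupby().agg()`; the DataFrame here is invented for illustration:

    import pandas as pd

    df = pd.DataFrame({"topic": ["a", "a", "b"],
                       "interrogation": [1, 0, 1],
                       "len_numbers": [0, 2, 1]})
    df.groupby("topic").agg(
        verbatims_with_interrogation=("interrogation", "sum"),
        verbatims_with_numbers=("len_numbers", lambda x: (x > 0).sum()),
    )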
@@ -757,6 +834,9 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
         .assign(percentage_verbatims_with_emoji = lambda x : x["verbatims_with_emoji"] / x["verbatims"])
         .assign(percentage_verbatims_with_hashtags = lambda x : x["verbatims_with_hashtags"] / x["verbatims"])
         .assign(percentage_verbatims_with_mentions = lambda x : x["verbatims_with_mentions"] / x["verbatims"])
+        .assign(percentage_verbatims_with_numbers = lambda x : x["verbatims_with_numbers"] / x["verbatims"])
+        .assign(percentage_verbatims_with_numbers = lambda x : x["verbatims_with_interrogation"] / x["verbatims"])
+        .assign(percentage_verbatims_with_numbers = lambda x : x["verbatims_with_exclamation"] / x["verbatims"])
         .reset_index())

     df_distrib_all[col_topic]=df_distrib_all[col_topic].astype(str)
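
Note that the three added `.assign` calls all write to the same keyword, `percentage_verbatims_with_numbers`, so each call overwrites the previous one and only the exclamation ratio survives under that name in the released code. A self-contained sketch of the same chain with distinct column names, which appears to be the intent (this is a hypothetical variant, not what 0.0.8 ships):

    import pandas as pd

    # Invented single-row frame, just to exercise the assign chain.
    df = pd.DataFrame({"verbatims": [10], "verbatims_with_numbers": [4],
                       "verbatims_with_interrogation": [2], "verbatims_with_exclamation": [1]})
    df = (df
          .assign(percentage_verbatims_with_numbers = lambda x: x["verbatims_with_numbers"] / x["verbatims"])
          .assign(percentage_verbatims_with_interrogation = lambda x: x["verbatims_with_interrogation"] / x["verbatims"])
          .assign(percentage_verbatims_with_exclamation = lambda x: x["verbatims_with_exclamation"] / x["verbatims"]))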
@@ -766,7 +846,7 @@ def generic_representation(df_processed_data: pd.DataFrame, col_gb: str, col_id:
     """
     Calculate a generic representation of data based on grouping by a specified column.

-
+    Args:
     df_processed_data : pandas DataFrame
         DataFrame containing processed data.
     col_gb : str
@@ -814,7 +894,7 @@ def create_frequency_table(df: pd.DataFrame, col: str) -> pd.DataFrame:
     """
     Create a frequency table for a given column in a DataFrame.

-
+    Args:
     df : pandas DataFrame
         DataFrame containing the data.
     col : str
@@ -845,7 +925,7 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
     """
     Convert a percentage to the number of rows to sample.

-
+    Args:
     len_df : int
         Length of the DataFrame.
     n_rows : float
@@ -855,8 +935,6 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
     int
         Number of rows to sample.

-    Description:
-    This function converts a percentage of the DataFrame length into a number of rows to sample. If `n_rows` is between 0 and 1, it's treated as a percentage and converted into an integer representing the top `n_rows` percentage of the DataFrame length. If `n_rows` is greater than 1 or equal to 0, it's treated as an absolute number of rows.
     """
     if 0 < n_rows <= 1 :
         top_rows = int(n_rows * len_df)
@@ -870,8 +948,9 @@ def calculate_sample(len_df: int, n_rows: float) -> int:
 def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: float = 0.3, sample_size: float = 0.5) -> pd.DataFrame:
     """
     Create a sample dataset by keeping a part of the top publications based on engagement metrics.
+    This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness

-
+    Args:
     df : pandas.DataFrame
         The original DataFrame.
     col_engagement : str
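
The docstring sentence moved into the summary spells out the sampling strategy: sort by the engagement column, keep the top `top_rows` share, then sample the remainder up to `sample_size` and shuffle. A usage sketch; the column name and ratios are illustrative, not prescribed by the package:

    df_sample = sampling_by_engagement(df, col_engagement="engagements", top_rows=0.3, sample_size=0.5)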
@@ -885,8 +964,6 @@ def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: floa
     pandas.DataFrame
         The sampled DataFrame.

-    Description:
-    This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness.
     """

     sample_rows = calculate_sample(len(df), sample_size)
@@ -911,7 +988,7 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
     """
     Perform a "stratified sample" of the most engaging content per topic, ensuring a minimum number of items per group.

-
+    Args:
     df : pandas.DataFrame
         The DataFrame containing the data.
     col_topic : str
@@ -927,8 +1004,6 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
     pandas.DataFrame
         The sampled DataFrame.

-    Description:
-    This function performs a "stratified sample" of the most engaging content per topic. It sorts the data by engagement metrics within each topic group, and then takes a sample of `sample_size` proportion from each group. If a group has fewer than `min_size` items, it retains all items in that group.
     """
     df = (df.groupby(col_topic, group_keys=False)
           .apply(lambda x: x.sort_values(by=col_engagement, ascending=False)
@@ -948,7 +1023,7 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_
     """
     Perform natural language processing tasks using spaCy for topic modeling.

-
+    Args:
     nlp : spacy.Language
         The spaCy language model.
     df : pandas.DataFrame
@@ -974,8 +1049,6 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_
     pandas.DataFrame
         The DataFrame with processed text data.

-    Description:
-    This function utilizes spaCy for natural language processing tasks such as lemmatization, emoji extraction, and token counting. It processes the text data in the DataFrame and returns the DataFrame with additional columns for lemmatized text, emoji counts, token counts, and more.
     """
     all_lemmas=[]
     tokens_counts=[]
@@ -1029,9 +1102,15 @@ def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_

 def load_spacy_model(model: str, disable_components: list = ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect: bool = False, emoji: bool = False) -> spacy.language.Language:
     """
-    Load a spaCy model with optional configurations.
+    Load a spaCy model with optional configurations. This function loads a spaCy model with optional configurations such as disabling specific components, enabling emoji parsing,
+    and enabling language detection. It first loads the spaCy model specified by the 'model' parameter and then applies
+    additional configurations based on the provided flags.

-
+    If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
+    language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
+    will be included in the spaCy pipeline.
+
+    Args:
     model : str
         Name of the spaCy model to load.
     disable_components : list, optional
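
A usage sketch of the reorganized `load_spacy_model` entry point; the model name is an assumption (any installed spaCy package works), and the flags mirror the options documented above:

    nlp = load_spacy_model("fr_core_news_lg", lang_detect=True, emoji=True)
    doc = nlp("Un petit exemple de texte 🙂")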
@@ -1044,15 +1123,7 @@ def load_spacy_model(model: str, disable_components: list = ["transformer", "mor
     Returns:
     nlp : spacy.language.Language
         Loaded spaCy language processing pipeline.
-
-    Description:
-    This function loads a spaCy model with optional configurations such as disabling specific components, enabling emoji parsing,
-    and enabling language detection. It first loads the spaCy model specified by the 'model' parameter and then applies
-    additional configurations based on the provided flags.
-
-    If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
-    language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
-    will be included in the spaCy pipeline.
+
     """
     if torch.cuda.is_available():

@@ -1074,23 +1145,15 @@ def load_spacy_model(model: str, disable_components: list = ["transformer", "mor

 def get_labels(nlp: spacy.language.Language, pipe_step: str = "ner", explanations: bool = False) -> pd.DataFrame:
     """
-    Return labels associated with a pipeline step and optionally provide explanations.
-
-
-
-
-    pipe_step : str, optional
-        The pipeline step for which labels are retrieved. Default is "ner".
-    explanations : bool, optional
-        Flag indicating whether to include explanations for the labels. Default is False.
+    Return labels associated with a pipeline step and optionally provide explanations.This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline. It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
+    Args:
+        nlp : spacy.language.Language. The spaCy language processing pipeline.
+        pipe_step : str, optional. The pipeline step for which labels are retrieved. Default is "ner".
+        explanations : bool, optional. Flag indicating whether to include explanations for the labels. Default is False.

     Returns:
-        DataFrame
-
-
-    Description:
-    This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline.
-    It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
+        DataFrame : DataFrame containing the labels associated with the specified pipeline step.
+
     """
     pipe_details=nlp.get_pipe(pipe_step)
     labels=list(pipe_details.labels)
|

 def spacy_langdetect(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
     """
-    Detect language and return a score.
+    Detect language and return a score.This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.It returns a DataFrame containing the detected languages and their scores, which indicate the confidence levelof the language detection for each text.

-
+    Args:
     nlp : spacy.language.Language
         The spaCy language processing pipeline with language detection enabled.
     df : pd.DataFrame
|
     Returns:
         pd.DataFrame
             DataFrame containing the detected languages and their scores.
-
-    Description:
-    This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.
-    It returns a DataFrame containing the detected languages and their scores, which indicate the confidence level
-    of the language detection for each text.
+
     """
     text=list(df[col_text].astype('unicode').values)

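
A usage sketch of `spacy_langdetect` as documented above, assuming `nlp` was loaded with `lang_detect=True` and `df` holds a text column named "text" (both names are illustrative):

    df_lang = spacy_langdetect(nlp, df, col_text="text")
    # df_lang holds one detected language and confidence score per row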
@@ -1847,46 +1906,26 @@ def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selectio
                         p=None, cluster_selection_method='eom', prediction_data = True):

     """
-
-
-        The input data to be clustered.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        The number of samples in a neighborhood for a point to be considered a core point. If None, the value is set to min_cluster_size. Default is None.
-    p : int, optional
-        The Minkowski p-norm distance metric parameter. Default is None.
-    cluster_selection_method : {'eom', 'leaf', 'leaf_similar', 'eom_similar', 'tree', 'beagle'}, optional
-        The method used to select clusters from the condensed tree. Default is 'eom'.
-    prediction_data : bool, optional
-        Whether the data is prediction data or not. Default is True.
-
-    Returns:
-    clusterer : hdbscan.hdbscan_.HDBSCAN
-        HDBSCAN clusterer object.
-    labels : array, shape (n_samples,)
-        Cluster labels for each point. Noisy samples are given the label -1.
-    probabilities : array, shape (n_samples,)
-        The probability of each sample being an outlier.
-
-    Description:
-    This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
-    It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
-    probability of each sample being an outlier.
+    This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm. It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the probability of each sample being an outlier.
+    Args
+        embeddings : array-like or sparse matrix, shape (n_samples, n_features). The input data to be clustered.
+        algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is 'best'.
+        alpha : float, optional. Scaling factor determining the individual weight of the (unnormalized) density estimate. Default is 1.0.
+        cluster_selection_epsilon : float, optional. The epsilon value to specify a minimum cluster size. Default is 0.0.
+        approx_min_span_tree : bool, optional. Whether to compute an approximation of the minimum spanning tree. Default is True.
+        gen_min_span_tree : bool, optional. Whether to compute the minimum spanning tree. Default is True.
+        leaf_size : int, optional. Leaf size for the underlying KD-tree or Ball Tree. Default is 40.
+        metric : str or callable, optional. The metric to use for distance computation. Default is 'euclidean'.
+        min_cluster_size : int, optional. The minimum size of clusters; single linkage splits that produce smaller clusters than this will be considered points "falling out" of a cluster rather than a cluster splitting into two new clusters. Default is 5.
+        min_samples : int or None, optional. The number of samples in a neighborhood for a point to be considered a core point. If None, the value is set to min_cluster_size. Default is None.
+        p : int, optional. The Minkowski p-norm distance metric parameter. Default is None.
+        cluster_selection_method : {'eom', 'leaf', 'leaf_similar', 'eom_similar', 'tree', 'beagle'}, optional. The method used to select clusters from the condensed tree. Default is 'eom'.
+        prediction_data : bool, optional. Whether the data is prediction data or not. Default is True.
+
+    Returns:
+        clusterer : hdbscan.hdbscan_.HDBSCAN. HDBSCAN clusterer object.
+        labels : array, shape (n_samples,). Cluster labels for each point. Noisy samples are given the label -1.
+        probabilities : array, shape (n_samples,). The probability of each sample being an outlier.
     """
     clusterer = hdbscan.HDBSCAN(algorithm=algorithm,
                                 alpha=alpha,
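
The consolidated docstring now lists every HDBSCAN parameter inline. A usage sketch of the wrapper, assuming the keyword arguments documented above are accepted by the full signature (the diff view elides part of it); the embedding matrix here is random, purely for illustration:

    import numpy as np

    embeddings = np.random.rand(1000, 384)
    clusterer, labels, probabilities = hdbscan_clustering(embeddings, min_cluster_size=15, metric="euclidean")
    noise_share = (labels == -1).mean()  # HDBSCAN marks noise points with label -1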
@@ -2017,4 +2056,5 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
     results = {"label":label, "score" : float(proba.max()), col_text : text}
     print(results)
     write_json(results, dir_json , str(filename))
+
     return results