opsci-toolbox 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -0
- opsci_toolbox/helpers/common.py +557 -207
- opsci_toolbox/helpers/cv.py +298 -123
- opsci_toolbox/helpers/dataviz.py +875 -191
- opsci_toolbox/helpers/dates.py +55 -8
- opsci_toolbox/helpers/nlp.py +746 -97
- opsci_toolbox/helpers/nlp_cuml.py +166 -57
- opsci_toolbox/helpers/sna.py +101 -10
- opsci_toolbox/helpers/surreaction.py +58 -16
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/METADATA +3 -2
- opsci_toolbox-0.0.7.dist-info/RECORD +21 -0
- opsci_toolbox-0.0.5.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -33,15 +33,48 @@ from transformers import TextClassificationPipeline, AutoModelForSequenceClassif
 # CLEANING
 ####################################################################
 
-def filter_by_query(df, col_text, query, ignore_case=True, ignore_accent=True, match_word=False):
+def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
+    """
+    Filter DataFrame rows by a query on a specific text column.
+
+    Parameters:
+    df : pandas DataFrame
+        The DataFrame to filter.
+    col_text : str
+        The name of the column containing text data to query.
+    query : str
+        The query string to filter the DataFrame.
+    ignore_case : bool, optional
+        Whether to ignore case sensitivity. Default is True.
+    ignore_accent : bool, optional
+        Whether to ignore accents. Default is True.
+    match_word : bool, optional
+        Whether to match the whole word. Default is False.
+
+    Returns:
+    df_filtered : pandas DataFrame
+        The filtered DataFrame.
+    """
     eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
     df[col_text] = df[df[col_text].apply(eldar_query)]
     df=df.reset_index(drop=True)
     return df
 
-def TM_clean_text(df, col, col_clean):
+def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     """
-    Generic cleaning process for topic modeling
+    Generic cleaning process for topic modeling.
+
+    Parameters:
+    df : pandas DataFrame
+        The DataFrame containing text data.
+    col : str
+        The name of the column containing the original text data.
+    col_clean : str
+        The name of the column to store the cleaned text data.
+
+    Returns:
+    df : pandas DataFrame
+        The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
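The annotated signature above is enough to sketch how filter_by_query is called. A minimal usage sketch, assuming opsci_toolbox 0.0.7 is installed; the sample DataFrame and the eldar-style query string are illustrative only:

import pandas as pd
from opsci_toolbox.helpers.nlp import filter_by_query  # module path taken from the file list above

df = pd.DataFrame({"text": ["electric cars are great", "I prefer bikes", "cars and bikes"]})
# eldar-style boolean query; case and accents are ignored by default
df_filtered = filter_by_query(df, col_text="text", query='("cars" AND "electric")')
print(df_filtered)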
@@ -49,13 +82,35 @@ def TM_clean_text(df, col, col_clean):
     # df = df.loc[(df[col_clean] != ""), :]
     return df
 
-def extract_insta_shortcode(url):
+def extract_insta_shortcode(url: str) -> str:
+    """
+    Extracts the shortcode from an Instagram URL.
+
+    Parameters:
+    url : str
+        The Instagram URL containing the shortcode.
+
+    Returns:
+    shortcode : str
+        The extracted shortcode.
+    """
     pattern =r'(?:https?:\/\/)?(?:www\.)?instagram\.com\/(?:p|reel|tv|stories)\/([a-zA-Z0-9_-]+)\/?'
 
     shortcode = re.findall(pattern, url)
     return shortcode[0]
 
-def remove_emojis(text):
+def remove_emojis(text: str) -> str:
+    """
+    Removes emojis and their textual representations from a text string.
+
+    Parameters:
+    text : str
+        The input text string containing emojis.
+
+    Returns:
+    text_no_emojis : str
+        The input text string with emojis and their textual representations removed.
+    """
     # Convert emojis to their textual representations
     text_no_emojis = emoji.demojize(text)
 
@@ -64,24 +119,56 @@ def remove_emojis(text):
 
     return text_no_emojis
 
-def extract_urls_from_text(text):
-    """
+def extract_urls_from_text(text: str) -> list:
+    """
+    Extracts URLs from a text string.
+
+    Parameters:
+    text : str
+        The input text string containing URLs.
+
+    Returns:
+    urls : list of str
+        A list of URLs extracted from the input text.
+    """
     extractor = URLExtract()
     urls = extractor.find_urls(text)
     return urls
 
-def extract_hashtags(text, lower=True):
+def extract_hashtags(text: str, lower: bool = True) -> list:
     '''
-
+    Extracts hashtags from the text using a regular expression.
+
+    Parameters:
+    text : str
+        The input text string containing hashtags.
+    lower : bool, optional
+        Whether to convert extracted hashtags to lowercase. Default is True.
+
+    Returns:
+    hashtags : list of str
+        A list of hashtags extracted from the input text.
     '''
     hashtags = re.findall(r'\B#\w+', text)
     if lower :
         hashtags= [h.lower() for h in hashtags]
     return hashtags
 
-def extract_mentions(text, mention_char='@', lower=False):
+def extract_mentions(text: str, mention_char: str = '@', lower: bool = False) -> list:
     '''
-
+    Extracts mentions from the text using a regular expression.
+
+    Parameters:
+    text : str
+        The input text string containing mentions.
+    mention_char : str, optional
+        The character used to indicate mentions. Default is '@'.
+    lower : bool, optional
+        Whether to convert extracted mentions to lowercase. Default is False.
+
+    Returns:
+    mentions : list of str
+        A list of mentions extracted from the input text.
     '''
     pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))" + re.escape(mention_char) + r"([A-Za-z0-9_]{4,15})"
 
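The hashtag and mention regexes shown in this hunk can be exercised on their own. A small self-contained sketch using only the standard library; the sample text is made up:

import re

text = "Big thanks to @opsci_team for the new release! #NLP #OpenSource"

hashtags = [h.lower() for h in re.findall(r'\B#\w+', text)]
print(hashtags)  # ['#nlp', '#opensource']

mention_char = '@'
pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))" + re.escape(mention_char) + r"([A-Za-z0-9_]{4,15})"
print(re.findall(pattern, text))  # ['opsci_team']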
@@ -90,16 +177,36 @@ def extract_mentions(text, mention_char='@', lower=False):
         mentions = [mention.lower() for mention in mentions]
     return mentions
 
-def remove_extra_spaces(text):
+def remove_extra_spaces(text: str) -> str:
     """
-
+    Removes extra spaces from a text string.
+
+    Parameters:
+    text : str
+        The input text string with extra spaces.
+
+    Returns:
+    cleaned_text : str
+        The input text string with extra spaces removed.
     """
     cleaned_text = re.sub(r'\s+', ' ', text)
     return cleaned_text.strip()
 
-def remove_characters(text: str, start_indices: list, end_indices: list):
+def remove_characters(text: str, start_indices: list, end_indices: list) -> str:
     """
-    Remove
+    Remove characters from a text string using lists of start and end indices.
+
+    Parameters:
+    text : str
+        The input text string.
+    start_indices : list of int
+        A list of start indices indicating the positions from which characters should be removed.
+    end_indices : list of int
+        A list of end indices indicating the positions up to which characters should be removed.
+
+    Returns:
+    result : str
+        The input text string with characters removed based on the specified indices.
     """
     if start_indices is None or len(start_indices) <1:
         return text
@@ -123,9 +230,17 @@ def remove_characters(text: str, start_indices: list, end_indices: list):
     return result
 
 
-def load_stopwords_df(lang):
+def load_stopwords_df(lang: str) -> pd.DataFrame:
     """
     Load a CSV file without header containing stopwords. If the file doesn't exist, it creates an empty file.
+
+    Parameters:
+    lang : str
+        The language code used to identify the stopwords file.
+
+    Returns:
+    df : pandas DataFrame
+        A DataFrame containing stopwords loaded from the file.
     """
     lexicon_dir = os.path.join(os.getcwd(), "lexicons")
     file_path = os.path.join(lexicon_dir, f"stop_words_{lang.lower()}.csv")
@@ -150,11 +265,21 @@ def load_stopwords_df(lang):
 
     return df
 
-
-
-def add_stopwords(lang:str, new_stopwords:list, lower:bool = True):
+def add_stopwords(lang: str, new_stopwords: list, lower: bool = True) -> pd.DataFrame:
     """
     Add a list of stopwords to an existing file. It removes duplicates.
+
+    Parameters:
+    lang : str
+        The language code used to identify the stopwords file.
+    new_stopwords : list of str
+        The list of stopwords to add.
+    lower : bool, optional
+        Whether to convert the new stopwords to lowercase before adding. Default is True.
+
+    Returns:
+    new_df : pandas DataFrame
+        A DataFrame containing the updated list of stopwords.
     """
     df = load_stopwords_df(lang)
     init_size = len(df.iloc[:, 0].unique()) # Selecting the first column
@@ -173,13 +298,21 @@ def add_stopwords(lang:str, new_stopwords:list, lower:bool = True):
     lexicon_dir = os.path.join(os.getcwd(), "lexicons")
     file_path = os.path.join(lexicon_dir, f"stop_words_{lang.lower()}.csv")
     new_df.to_csv(file_path, encoding="utf-8", index=False)
-
-
     return new_df
 
-def remove_stopwords(lang:str, stopwords:list):
+def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
     """
     Remove stopwords from an existing file.
+
+    Parameters:
+    lang : str
+        The language code used to identify the stopwords file.
+    stopwords : list of str
+        The list of stopwords to remove.
+
+    Returns:
+    df : pandas DataFrame
+        A DataFrame containing the updated list of stopwords after removal.
     """
     df = load_stopwords_df(lang)
     init_size = len(df.iloc[:, 0].unique()) # Selecting the first column
@@ -193,14 +326,24 @@ def remove_stopwords(lang:str, stopwords:list):
     return df
 
 
-def keep_alphanum_char(text:str, replace:str = ''):
+def keep_alphanum_char(text: str, replace: str = '') -> str:
     """
-    Replace all non-alphanumeric characters
+    Replace all non-alphanumeric characters in a text string.
+
+    Parameters:
+    text : str
+        The input text string.
+    replace : str, optional
+        The string to replace non-alphanumeric characters with. Default is an empty string.
+
+    Returns:
+    cleaned_text : str
+        The input text string with non-alphanumeric characters replaced.
     """
     return re.sub("[^a-zA-Z0-9]", replace, text)
 
 
-def substitute_punctuations_with_white_space(text):
+def substitute_punctuations_with_white_space(text : str) -> str:
     """
     Substitute punctuations with white spaces in the input string.
 
@@ -213,7 +356,28 @@ def substitute_punctuations_with_white_space(text):
     text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
     return text
 
-def translate_wt_libre(text, source, target, filename, dir_json, url = "http://127.0.0.1:5000/translate"):
+def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+    """
+    Translate text using LibreTranslate service.
+
+    Parameters:
+    text : str
+        The text to be translated.
+    source : str
+        The source language code.
+    target : str
+        The target language code.
+    filename : str
+        The filename to save the translation result.
+    dir_json : str
+        The directory to save the translation result JSON file.
+    url : str, optional
+        The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+
+    Returns:
+    json_data : dict
+        The translation result in JSON format.
+    """
     headers = {"Content-Type": "application/json"}
     payload = {
         "q": text,
@@ -231,7 +395,28 @@ def translate_wt_libre(text, source, target, filename, dir_json, url = "http://1
     write_json(json_data, dir_json , str(filename))
     return json_data
 
-def translate_batch(batch_text, source, target, filename, dir_json, url = "http://127.0.0.1:5000/translate"):
+def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+    """
+    Translate a batch of texts using LibreTranslate service.
+
+    Parameters:
+    batch_text : list of str
+        The list of texts to be translated.
+    source : str
+        The source language code.
+    target : str
+        The target language code.
+    filename : str
+        The filename to save the translation results.
+    dir_json : str
+        The directory to save the translation result JSONL file.
+    url : str, optional
+        The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+
+    Returns:
+    json_results : list of dict
+        The translation results as a list of dictionaries containing 'translated_text' and 'clean_text'.
+    """
     headers = {"Content-Type": "application/json"}
     payload = {
         "q": batch_text,
@@ -253,7 +438,24 @@ def translate_batch(batch_text, source, target, filename, dir_json, url = "http:
     write_jsonl(json_results, dir_json , str(filename))
     return json_results
 
-def translate(text, source, target, url = "http://127.0.0.1:5000/translate"):
+def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+    """
+    Translate text using LibreTranslate service.
+
+    Parameters:
+    text : str
+        The text to be translated.
+    source : str
+        The source language code.
+    target : str
+        The target language code.
+    url : str, optional
+        The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+
+    Returns:
+    translatedText : str
+        The translated text.
+    """
     headers = {"Content-Type": "application/json"}
     payload = {
         "q": text,
@@ -268,7 +470,24 @@ def translate(text, source, target, url = "http://127.0.0.1:5000/translate"):
     translatedText = json_data.get("translatedText", "")
     return translatedText
 
-def translate_row(df, col, source="auto", target = "en"):
+def translate_row(df: pd.DataFrame, col: str, source: str = "auto", target: str = "en") -> pd.DataFrame:
+    """
+    Translate the text in a specific column of a DataFrame.
+
+    Parameters:
+    df : pandas DataFrame
+        The DataFrame containing the text to be translated.
+    col : str
+        The name of the column containing the text to be translated.
+    source : str, optional
+        The source language code. Default is "auto".
+    target : str, optional
+        The target language code. Default is "en" (English).
+
+    Returns:
+    df : pandas DataFrame
+        The DataFrame with an additional column containing the translated text.
+    """
    translations =[]
    for i, row in df.iterrows():
        txt_to_translate = row[col].replace(' | ', ', ')
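The translate helpers in these hunks all wrap the same HTTP call: a JSON payload with a "q" field posted to a LibreTranslate endpoint, whose "translatedText" field is read back. A minimal sketch of that request, assuming a LibreTranslate instance is running locally; the "source"/"target" payload keys follow the LibreTranslate API and are not themselves visible in the diff:

import requests

url = "http://127.0.0.1:5000/translate"
payload = {"q": "Bonjour tout le monde", "source": "fr", "target": "en"}  # "source"/"target" keys assumed
headers = {"Content-Type": "application/json"}

response = requests.post(url, json=payload, headers=headers)
print(response.json().get("translatedText", ""))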
@@ -281,27 +500,63 @@ def translate_row(df, col, source="auto", target = "en"):
 # METRICS
 ###################################################################
 
-def cosine_similarity(a, b):
+def cosine_similarity(a: np.array, b: np.array) -> float:
     """
-
+    Calculate the cosine similarity between two vectors.
+
+    Parameters:
+    a : numpy array
+        The first vector.
+    b : numpy array
+        The second vector.
+
+    Returns:
+    similarity : float
+        The cosine similarity between the two vectors.
     """
     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
-
-def approximate_tokens(text):
+
+def approximate_tokens(text: str) -> int:
     """
-    Approximate the number of tokens
+    Approximate the number of tokens in a text.
+
+    Parameters:
+    text : str
+        The input text.
+
+    Returns:
+    num_tokens : int
+        The approximate number of tokens in the text.
     """
     return len(text.split(' '))
 
-def approximate_unique_tokens(text):
+def approximate_unique_tokens(text: str) -> int:
     """
-    Approximate the number of distinct tokens
+    Approximate the number of distinct tokens in a text.
+
+    Parameters:
+    text : str
+        The input text.
+
+    Returns:
+    num_unique_tokens : int
+        The approximate number of distinct tokens in the text.
     """
-    return len(
+    return len(set(text.split(' ')))
 
-def count_word_occurrences(text, word):
+def count_word_occurrences(text: str, word: str) -> int:
     """
-    Count word
+    Count the occurrences of a word in a text.
+
+    Parameters:
+    text : str
+        The input text.
+    word : str
+        The word to count occurrences of.
+
+    Returns:
+    occurrences : int
+        The number of occurrences of the word in the text.
     """
     # Convert both text and word to lowercase for case-insensitive matching
     word_lower = word.lower()
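The cosine_similarity added here is the plain numpy formula np.dot(a, b) / (||a|| * ||b||). A tiny worked example:

import numpy as np

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(round(float(similarity), 3))  # 1 / (sqrt(2) * sqrt(2)) = 0.5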
@@ -312,8 +567,10 @@ def count_word_occurrences(text, word):
     return occurrences
 
 
-def chi2_per_category(lst_text, lst_categorie, col_cat, n_words = 10, p_value_limit=0.95, min_freq=3):
+def chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
     """
+    Calculate Chi-squared (Chi2) statistics per category based on the provided texts and corresponding categories.
+
     Parameters:
     lst_text : list
         List of texts for which Chi2 will be calculated.
@@ -353,7 +610,34 @@ def chi2_per_category(lst_text, lst_categorie, col_cat, n_words = 10, p_value_li
     df_chi.reset_index(drop=True)
     return df_chi
 
-def word_frequency_per_categorie(df, col_text, col_cat, ngram_range=(1, 1), stop_words=[], n_words = 20, min_freq=3):
+def word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+    """
+    Calculate word frequency per category.
+
+    Parameters:
+    df : pandas DataFrame
+        DataFrame containing text data and corresponding categories.
+    col_text : str
+        Name of the column containing the text data.
+    col_cat : str
+        Name of the column containing the categories.
+    ngram_range : tuple, optional
+        The range for n-grams. Default is (1, 1) for unigrams.
+    stop_words : list, optional
+        List of stopwords to be ignored during frequency calculation. Default is an empty list.
+    n_words : int, optional
+        Number of top words to display per category. Default is 20.
+    min_freq : int, optional
+        Minimum frequency threshold for word occurrences per category. Default is 3.
+
+    Returns:
+    DataFrame
+        DataFrame containing word frequencies per category.
+
+    Description:
+    This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+    It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+    """
     count_vectorizer = CountVectorizer(token_pattern=r'[^\s]+', ngram_range=ngram_range, stop_words=stop_words)
     X_train_count = count_vectorizer.fit_transform(df[col_text].to_list())
     X_names_count = count_vectorizer.get_feature_names_out()
@@ -370,10 +654,29 @@ def word_frequency_per_categorie(df, col_text, col_cat, ngram_range=(1, 1), stop
     return df_count
 
 
-def top_items_per_category(df, col_lst ="hashtags", col_cat = "soft_topic", col_id = "tweet_id", n_items= 10):
+def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat: str = "soft_topic", col_id: str = "tweet_id", n_items: int = 10) -> pd.DataFrame:
     """
-
-
+    Count the occurrences of items (e.g., hashtags) per category and select the top items per category.
+
+    Parameters:
+    df : pandas DataFrame
+        DataFrame containing data.
+    col_lst : str, optional
+        Name of the column containing lists of items (e.g., hashtags). Default is "hashtags".
+    col_cat : str, optional
+        Name of the column containing categories. Default is "soft_topic".
+    col_id : str, optional
+        Name of the column containing unique identifiers. Default is "tweet_id".
+    n_items : int, optional
+        Number of top items to select per category. Default is 10.
+
+    Returns:
+    DataFrame
+        DataFrame containing the top items per category.
+
+    Description:
+    This function takes a DataFrame with a column containing lists of tokens (e.g., hashtags) and counts their occurrences grouped by a category.
+    It then selects the most frequently occurring items per category based on the provided metric (e.g., volume of tweets).
     """
     df_count = (df[[col_cat, col_id, col_lst]].explode(col_lst)
                 .groupby([col_cat, col_lst], group_keys=False)
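top_items_per_category relies on the explode-then-groupby pattern visible in the context lines above. A self-contained pandas sketch of that pattern; the library's exact aggregation chain is truncated in this diff, so the tail of the chain below is an assumption:

import pandas as pd

df = pd.DataFrame({
    "soft_topic": [0, 0, 1],
    "tweet_id": ["t1", "t2", "t3"],
    "hashtags": [["#ai", "#nlp"], ["#ai"], ["#data"]],
})

counts = (df[["soft_topic", "tweet_id", "hashtags"]]
          .explode("hashtags")
          .groupby(["soft_topic", "hashtags"])["tweet_id"]
          .nunique()
          .reset_index(name="count"))

top_items = (counts.sort_values(["soft_topic", "count"], ascending=[True, False])
             .groupby("soft_topic")
             .head(2))
print(top_items)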
@@ -388,8 +691,31 @@ def top_items_per_category(df, col_lst ="hashtags", col_cat = "soft_topic", col_
                 )
     return df_count
 
-def topic_representation(df_processed_data, col_topic, col_id, col_engagement, col_user_id, metrics):
+def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
+    """
+    Calculate the representation of topics in a processed DataFrame.
+
+    Parameters:
+    df_processed_data : pandas DataFrame
+        DataFrame containing processed data.
+    col_topic : str
+        Name of the column containing topic labels.
+    col_id : str
+        Name of the column containing unique identifiers.
+    col_engagement : str
+        Name of the column containing engagement metrics.
+    col_user_id : str
+        Name of the column containing user identifiers.
+    metrics : dict
+        Dictionary containing additional metrics to aggregate.
+
+    Returns:
+    DataFrame
+        DataFrame containing the representation of topics.
 
+    Description:
+    This function aggregates various metrics for each topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+    """
     #on s'assure que les colonnes de métriques soient bien complètes et en float
     # df_processed_data[metrics]=df_processed_data[metrics].fillna(0).astype(float)
 
@@ -436,7 +762,31 @@ def topic_representation(df_processed_data, col_topic, col_id, col_engagement, c
     df_distrib_all[col_topic]=df_distrib_all[col_topic].astype(str)
     return df_distrib_all
 
-def generic_representation(df_processed_data, col_gb, col_id, col_engagement, col_user_id = None, metrics={}):
+def generic_representation(df_processed_data: pd.DataFrame, col_gb: str, col_id: str, col_engagement: str, col_user_id: str = None, metrics: dict = {}) -> pd.DataFrame:
+    """
+    Calculate a generic representation of data based on grouping by a specified column.
+
+    Parameters:
+    df_processed_data : pandas DataFrame
+        DataFrame containing processed data.
+    col_gb : str
+        Name of the column to group by.
+    col_id : str
+        Name of the column containing unique identifiers.
+    col_engagement : str
+        Name of the column containing engagement metrics.
+    col_user_id : str, optional
+        Name of the column containing user identifiers. Default is None.
+    metrics : dict, optional
+        Dictionary containing additional metrics to aggregate. Default is an empty dictionary.
+
+    Returns:
+    DataFrame
+        DataFrame containing the generic representation of data.
+
+    Description:
+    This function aggregates various metrics for each group, including verbatim counts, engagement sums, and any additional metrics provided in the `metrics` parameter. It also computes derived metrics such as verbatims per user and engagement per verbatim. Finally, it calculates percentages for verbatims, engagements, and users (if applicable) within each group.
+    """
     #on crée un dictionnaire contenant les agrégations
     metrics_dict = dict()
     metrics_dict['verbatims']=(col_id,'nunique')
@@ -460,7 +810,23 @@ def generic_representation(df_processed_data, col_gb, col_id, col_engagement, co
 
     return df_distrib_all
 
-def create_frequency_table(df, col):
+def create_frequency_table(df: pd.DataFrame, col: str) -> pd.DataFrame:
+    """
+    Create a frequency table for a given column in a DataFrame.
+
+    Parameters:
+    df : pandas DataFrame
+        DataFrame containing the data.
+    col : str
+        Name of the column for which the frequency table is to be created.
+
+    Returns:
+    pandas DataFrame
+        DataFrame containing the frequency table.
+
+    Description:
+    This function generates a frequency table for the specified column in the DataFrame. It sorts the DataFrame by the specified column in descending order, calculates the rank of each entry, and assigns dense ranks both ascending and descending.
+    """
     df_frequency=(df.sort_values(col, ascending=False)
                   .reset_index(drop=True)
                   .reset_index()
@@ -475,9 +841,22 @@ def create_frequency_table(df, col):
 # SAMPLING
 ###################################################################
 
-def calculate_sample(len_df, n_rows):
+def calculate_sample(len_df: int, n_rows: float) -> int:
     """
-
+    Convert a percentage to the number of rows to sample.
+
+    Parameters:
+    len_df : int
+        Length of the DataFrame.
+    n_rows : float
+        Number of rows to sample. If less than or equal to 1, it's treated as a percentage.
+
+    Returns:
+    int
+        Number of rows to sample.
+
+    Description:
+    This function converts a percentage of the DataFrame length into a number of rows to sample. If `n_rows` is between 0 and 1, it's treated as a percentage and converted into an integer representing the top `n_rows` percentage of the DataFrame length. If `n_rows` is greater than 1 or equal to 0, it's treated as an absolute number of rows.
     """
     if 0 < n_rows <= 1 :
         top_rows = int(n_rows * len_df)
@@ -488,12 +867,26 @@ def calculate_sample(len_df, n_rows):
     else :
         print("ERREUR - paramètre du sampling incorrect")
 
-def sampling_by_engagement(df, col_engagement, top_rows=0.3, sample_size=0.5):
+def sampling_by_engagement(df: pd.DataFrame, col_engagement: str, top_rows: float = 0.3, sample_size: float = 0.5) -> pd.DataFrame:
     """
-    Create a sample dataset by keeping a part of the top publications
-
-
-
+    Create a sample dataset by keeping a part of the top publications based on engagement metrics.
+
+    Parameters:
+    df : pandas.DataFrame
+        The original DataFrame.
+    col_engagement : str
+        The column name containing the engagement metrics.
+    top_rows : float, optional
+        The number of "most engaging" rows to keep. Values could be either an integer or a float between 0 and 1 (= sample a percentage). Default is 0.3.
+    sample_size : float, optional
+        The final size of the sample. Ex: 1000 rows from an original dataset of 100000 rows. Values could be either an integer or a float between 0 and 1 (= sample a percentage). Default is 0.5.
+
+    Returns:
+    pandas.DataFrame
+        The sampled DataFrame.
+
+    Description:
+    This function generates a sample dataset by keeping a portion of the top publications based on engagement metrics. It sorts the dataset by the specified engagement metric, keeps the top `top_rows` rows, and then samples the remaining rows to achieve the desired `sample_size`. The final sample is shuffled for randomness.
     """
 
     sample_rows = calculate_sample(len(df), sample_size)
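The sampling rule described in calculate_sample's new docstring (fractions are percentages, larger values are absolute counts) can be restated in a few lines. A simplified sketch, not the library's exact code, which also prints an error message for invalid values:

def calculate_sample_sketch(len_df: int, n_rows: float) -> int:
    # Fractions between 0 and 1 are read as a percentage of the DataFrame length,
    # anything larger is read as an absolute number of rows.
    if 0 < n_rows <= 1:
        return int(n_rows * len_df)
    return int(n_rows)

print(calculate_sample_sketch(100000, 0.5))   # 50000
print(calculate_sample_sketch(100000, 1000))  # 1000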
@@ -514,9 +907,28 @@ def sampling_by_engagement(df, col_engagement, top_rows=0.3, sample_size=0.5):
     else:
         return df
 
-def sample_most_engaging_posts(df, col_topic, col_engagement, sample_size= 0.1, min_size=10):
+def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement: str, sample_size: float = 0.1, min_size: int = 10) -> pd.DataFrame:
     """
-    "
+    Perform a "stratified sample" of the most engaging content per topic, ensuring a minimum number of items per group.
+
+    Parameters:
+    df : pandas.DataFrame
+        The DataFrame containing the data.
+    col_topic : str
+        The column name containing the topic information.
+    col_engagement : str
+        The column name containing the engagement metrics.
+    sample_size : float, optional
+        The size of the sample relative to the total data. Default is 0.1 (10%).
+    min_size : int, optional
+        The minimum number of items to retain per group. Default is 10.
+
+    Returns:
+    pandas.DataFrame
+        The sampled DataFrame.
+
+    Description:
+    This function performs a "stratified sample" of the most engaging content per topic. It sorts the data by engagement metrics within each topic group, and then takes a sample of `sample_size` proportion from each group. If a group has fewer than `min_size` items, it retains all items in that group.
     """
     df = (df.groupby(col_topic, group_keys=False)
           .apply(lambda x: x.sort_values(by=col_engagement, ascending=False)
@@ -532,10 +944,38 @@ def sample_most_engaging_posts(df, col_topic, col_engagement, sample_size= 0.1,
 def get_lang_detector(nlp, name):
     return LanguageDetector(seed=42)  # We use the seed 42
 
-def TM_nlp_process(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True, join_list = False):
+def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
-
-
+    Perform natural language processing tasks using spaCy for topic modeling.
+
+    Parameters:
+    nlp : spacy.Language
+        The spaCy language model.
+    df : pandas.DataFrame
+        The DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    col_lemma : str
+        The name of the column to store the lemmatized text data.
+    pos_to_keep : list
+        A list of part-of-speech tags to keep during lemmatization.
+    stopwords : list
+        A list of stopwords to remove during processing.
+    batch_size : int, optional
+        The batch size for spaCy processing. Default is 100.
+    n_process : int, optional
+        The number of processes for parallel processing. Default is 1.
+    stats : bool, optional
+        Whether to compute and store additional statistics. Default is True.
+    join_list : bool, optional
+        Whether to join the lemmas into a single string. Default is False.
+
+    Returns:
+    pandas.DataFrame
+        The DataFrame with processed text data.
+
+    Description:
+    This function utilizes spaCy for natural language processing tasks such as lemmatization, emoji extraction, and token counting. It processes the text data in the DataFrame and returns the DataFrame with additional columns for lemmatized text, emoji counts, token counts, and more.
     """
     all_lemmas=[]
     tokens_counts=[]
@@ -587,17 +1027,19 @@ def TM_nlp_process(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_s
     return df
 
 
-def load_spacy_model(model,
+def load_spacy_model(model: str, disable_components: list = ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"], lang_detect: bool = False, emoji: bool = False) -> spacy.language.Language:
     """
+    Load a spaCy model with optional configurations.
+
     Parameters:
-
-
-
-
-
-
-
-
+    model : str
+        Name of the spaCy model to load.
+    disable_components : list, optional
+        List of spaCy components to disable. Default is ["transformer", "morphologizer", "trainable_lemmatizer", "textcat_multilabel", "textcat", "entity_ruler", "entity_linker"].
+    lang_detect : bool, optional
+        Flag indicating whether language detection should be enabled. Default is False.
+    emoji : bool, optional
+        Flag indicating whether to include the emoji component in the spaCy pipeline. Default is False.
 
     Returns:
     nlp : spacy.language.Language
@@ -611,7 +1053,6 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
     If 'disable_components' is provided, the specified spaCy components will be disabled. If 'lang_detect' is set to True,
     language detection will be enabled using the 'get_lang_detector' function. If 'emoji' is set to True, the emoji component
     will be included in the spaCy pipeline.
-
     """
     if torch.cuda.is_available():
 
@@ -631,10 +1072,25 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
 
     return nlp
 
-def get_labels(nlp, pipe_step="ner", explanations=False):
-    """
-
-
+def get_labels(nlp: spacy.language.Language, pipe_step: str = "ner", explanations: bool = False) -> pd.DataFrame:
+    """
+    Return labels associated with a pipeline step and optionally provide explanations.
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    pipe_step : str, optional
+        The pipeline step for which labels are retrieved. Default is "ner".
+    explanations : bool, optional
+        Flag indicating whether to include explanations for the labels. Default is False.
+
+    Returns:
+    DataFrame
+        DataFrame containing the labels associated with the specified pipeline step.
+
+    Description:
+    This function retrieves the labels associated with a specific pipeline step of the spaCy language processing pipeline.
+    It returns a DataFrame containing the labels. If 'explanations' is set to True, explanations for each label are also included.
     """
     pipe_details=nlp.get_pipe(pipe_step)
     labels=list(pipe_details.labels)
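get_labels is built on the two spaCy calls visible in the context lines above (nlp.get_pipe(...).labels). A sketch of the same lookup with plain spaCy, assuming en_core_web_sm has been downloaded; the library additionally wraps the result in a DataFrame:

import spacy

nlp = spacy.load("en_core_web_sm")   # assumes the model has been downloaded
ner = nlp.get_pipe("ner")            # same call as in the context lines above
labels = list(ner.labels)
explained = {label: spacy.explain(label) for label in labels}
print(list(explained.items())[:3])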
@@ -646,9 +1102,30 @@ def get_labels(nlp, pipe_step="ner", explanations=False):
     return df
 
 
-def spacy_langdetect(nlp, df, col_text, batch_size=100, n_process=1):
+def spacy_langdetect(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
     """
-    Detect language and
+    Detect language and return a score.
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline with language detection enabled.
+    df : pd.DataFrame
+        DataFrame containing the text data to analyze.
+    col_text : str
+        The name of the column containing the text data.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for language detection. Default is 1.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the detected languages and their scores.
+
+    Description:
+    This function uses spaCy's language detection capabilities to detect the language of text data in a DataFrame.
+    It returns a DataFrame containing the detected languages and their scores, which indicate the confidence level
+    of the language detection for each text.
     """
     text=list(df[col_text].astype('unicode').values)
 
@@ -662,9 +1139,32 @@ def spacy_langdetect(nlp, df, col_text, batch_size=100, n_process=1):
 
     return df
 
-def extract_noun_chunks(nlp, df, col_text, batch_size=100, n_process=1, stats=False):
+def extract_noun_chunks(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
     """
-    Spacy implementation to extract noun chunks
+    Spacy implementation to extract noun chunks.
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data to analyze.
+    col_text : str
+        The name of the column containing the text data.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for text processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the noun chunks. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the extracted noun chunks and their statistics if enabled.
+
+    Description:
+    This function utilizes spaCy's noun chunk extraction capabilities to extract noun chunks from text data in a DataFrame.
+    It returns a DataFrame containing the extracted noun chunks for each text. Optionally, it can compute statistics such
+    as the count of noun chunks and unique noun chunks if the 'stats' parameter is set to True.
     """
     all_chunks = []
     all_unique_chunks =[]
@@ -689,10 +1189,32 @@ def extract_noun_chunks(nlp, df, col_text, batch_size=100, n_process=1, stats=Fa
         df['unique_noun_chunks_count']=unique_chunks_count
     return df
 
-def extract_emojis(nlp, df, col_text, batch_size=100, n_process=1, stats=True):
+def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
     """
     Spacy implementation to extract emojis
 
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data to analyze.
+    col_text : str
+        The name of the column containing the text data.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for text processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the emojis. Default is True.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the extracted emojis and their statistics if enabled.
+
+    Description:
+    This function utilizes spaCy's emoji detection capabilities to extract emojis from text data in a DataFrame.
+    It returns a DataFrame containing the extracted emojis for each text. Optionally, it can compute statistics such
+    as the count of emojis and unique emojis if the 'stats' parameter is set to True.
     """
     all_emojis=[]
     all_unique_emojis=[]
@@ -720,9 +1242,33 @@ def extract_emojis(nlp, df, col_text, batch_size=100, n_process=1, stats=True):
 
     return df
 
-def split_n_sentences(nlp, df, col_text, n_sentences=1, batch_size=100, n_process=1, stats=False):
+def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
     """
     Split a text into chunks of n sentences
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data to split.
+    col_text : str
+        The name of the column containing the text data.
+    n_sentences : int, optional
+        The number of sentences to group together. Default is 1.
+    batch_size : int, optional
+        The batch size for processing texts. Default is 100.
+    n_process : int, optional
+        The number of processes to use for text processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the split sentences.
+
+    Description:
+    This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
+    Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
     """
 
     text=list(df[col_text].astype('unicode').values)
@@ -753,12 +1299,32 @@ def split_n_sentences(nlp, df, col_text, n_sentences=1, batch_size=100, n_proces
     return df
 
 
-def spacy_NER(nlp, df, col_text, entities_to_keep=['PERSON','ORG'], explode= True):
+def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True) -> pd.DataFrame:
     """
     Spacy implementation of NER.
     To define entities type to keep, call get_labels(nlp, pipe_step="ner", explanations=False)
-    explode = False means it
+    explode = False means it returns 1 list of entities per document
     explode = True means it returns 1 entity per row
+
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    entities_to_keep : list, optional
+        List of entity types to keep. Default is ['PERSON','ORG'].
+    explode : bool, optional
+        Flag indicating whether to explode the DataFrame to have one entity per row. Default is True.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the NER information.
+
+    Description:
+    This function performs Named Entity Recognition (NER) using spaCy on a DataFrame with text data. It extracts entities of the specified types
+    and stores the NER information in separate columns. If 'explode' is set to True, it returns one entity per row in the DataFrame.
     """
     # Create columns to store the NER information
     df['NER_type'] = None
@@ -797,10 +1363,38 @@ def spacy_NER(nlp, df, col_text, entities_to_keep=['PERSON','ORG'], explode= Tru
     return df
 
 
-def tokenize(nlp, df, col_text, col_tokens, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True):
+def tokenize(nlp, df: pd.DataFrame, col_text: str, col_tokens: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
     """
     Spacy implementation to tokenize text
 
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    col_tokens : str
+        The name of the column to store the tokenized text.
+    pos_to_keep : list
+        List of POS tags to keep.
+    stopwords : list
+        List of stopwords to exclude from tokens.
+    batch_size : int, optional
+        Batch size for processing. Default is 100.
+    n_process : int, optional
+        Number of processes for parallel processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to calculate and store statistics. Default is True.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the tokenized text.
+
+    Description:
+    This function tokenizes text using spaCy and stores the tokens in a new column in the DataFrame.
+    It allows filtering tokens based on POS tags and stopwords. If 'stats' is set to True, it calculates
+    and stores token counts.
     """
     all_tokens=[]
     tokens_counts=[]
@@ -832,10 +1426,40 @@ def tokenize(nlp, df, col_text, col_tokens, pos_to_keep, stopwords, batch_size=1
     return df
 
 
-def lemmatize(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=100, n_process=1, stats=True, join_list = False):
+def lemmatize(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
     Spacy implementation to lemmatize text
 
+    Parameters:
+    nlp : spacy.language.Language
+        The spaCy language processing pipeline.
+    df : pd.DataFrame
+        DataFrame containing the text data.
+    col_text : str
+        The name of the column containing the text data.
+    col_lemma : str
+        The name of the column to store the lemmatized text.
+    pos_to_keep : list
+        List of POS tags to keep.
+    stopwords : list
+        List of stopwords to exclude from lemmas.
+    batch_size : int, optional
+        Batch size for processing. Default is 100.
+    n_process : int, optional
+        Number of processes for parallel processing. Default is 1.
+    stats : bool, optional
+        Flag indicating whether to calculate and store statistics. Default is True.
+    join_list : bool, optional
+        Flag indicating whether to join the lemmas into a single string. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the lemmatized text.
+
+    Description:
+    This function lemmatizes text using spaCy and stores the lemmatized text in a new column in the DataFrame.
+    It allows filtering lemmas based on POS tags and stopwords. If 'stats' is set to True, it calculates
+    and stores token counts.
    """
    all_lemmas=[]
    tokens_counts=[]
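tokenize, lemmatize and TM_nlp_process all iterate over nlp.pipe() batches and filter tokens by POS tag and stopword list, as their new docstrings describe. An illustrative sketch of that loop with plain spaCy; the exact filtering logic inside the helpers may differ:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the model has been downloaded
texts = ["The cats are running fast.", "Dogs were barking loudly."]
pos_to_keep = ["NOUN", "VERB"]
stopwords = ["be"]

all_lemmas = []
for doc in nlp.pipe(texts, batch_size=100, n_process=1):
    lemmas = [tok.lemma_.lower() for tok in doc
              if tok.pos_ in pos_to_keep and tok.lemma_.lower() not in stopwords]
    all_lemmas.append(lemmas)
print(all_lemmas)  # e.g. [['cat', 'run'], ['dog', 'bark']]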
@@ -871,12 +1495,11 @@ def lemmatize(nlp, df, col_text, col_lemma, pos_to_keep, stopwords, batch_size=1
     return df
 
 
-
 ####################################################################
 # VECTORISATION
 ####################################################################
 
-def count_vectorize(lst_text):
+def count_vectorize(lst_text: list) -> tuple:
     """
     Parameters:
     lst_text : list
@@ -905,8 +1528,8 @@ def count_vectorize(lst_text):
 
     return count_vectorizer, features, features_names, vocabulary
 
-def tfidf_vectorize(lst_text, analyzer='word', max_df=1.0, max_features=None,
-                    min_df=1, use_idf=True, ngram_range=(1,1), stop_words=None):
+def tfidf_vectorize(lst_text: list, analyzer: str = 'word', max_df: float = 1.0, max_features: int = None,
+                    min_df: float = 1, use_idf: bool = True, ngram_range: tuple = (1, 1), stop_words: list = None) -> tuple:
     """
     Parameters:
     lst_text : list
@@ -959,15 +1582,29 @@ def tfidf_vectorize(lst_text, analyzer='word', max_df=1.0, max_features=None,
 
     return tfidf_vectorizer, features, features_names, vocabulary
 
-def SF_vectorize(lst_text, model_name):
+def SF_vectorize(lst_text: list, model_name: str) -> np.array:
     """
-    Vectorize text using Sentence Transformers
+    Vectorize text using Sentence Transformers.
+
+    Parameters:
+    lst_text : list
+        List of texts to be vectorized.
+    model_name : str
+        Name of the Sentence Transformers model to be used.
+
+    Returns:
+    features : numpy.ndarray
+        Encoded features of the input texts.
+
+    Description:
+    This function vectorizes a list of texts using Sentence Transformers. It encodes the texts into fixed-size
+    vectors of features using the specified model. The function returns the encoded features as a numpy array.
     """
     model = SentenceTransformer(model_name)
     features = model.encode(lst_text)
     return features
 
-def load_HF_embeddings(model_name, encode_kwargs={'batch_size':32}, model_kwargs={'device': 'cuda:0'}):
+def load_HF_embeddings(model_name : str, encode_kwargs : dict ={'batch_size':32}, model_kwargs : dict ={'device': 'cuda:0'}):
     """
     create a HugginFace encoder
     """
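SF_vectorize's three-line body is shown in full above. A usage sketch with an arbitrary Sentence Transformers checkpoint; the model name below is only an example:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")   # example checkpoint, any ST model name works
features = model.encode(["first document", "second document"])
print(features.shape)   # (2, embedding dimension of the chosen model)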
@@ -987,20 +1624,25 @@ def HF_vectorize(HF_encoder, lst_txt):
 
     return embeddings
 
-def encode_chunked_files(chunk_files_paths
+def encode_chunked_files(chunk_files_paths: list,
+                         HF_encoder,
+                         cols: list,
+                         col_text: str,
+                         path_embedded_chunks: str,
+                         reencode: bool = False) -> list:
     """
     Encode text from files and save the results in another pickle file.
 
     Parameters:
-    chunk_files_paths (
+    chunk_files_paths (List[str]): List of file paths containing documents.
     HF_encoder (Encoder): Encoder object for text vectorization.
-    cols (
+    cols (List[str]): Columns to keep in the resulting DataFrame.
     col_text (str): Column containing text data in the DataFrame.
     path_embedded_chunks (str): Path to save the embedded chunks.
-    reencode (bool): Whether to re-encode files even if they already exist.
+    reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
 
     Returns:
-
+    List[str]: List of paths for newly created files.
     """
     new_file_paths=[]
     for file in tqdm(chunk_files_paths, total=len(chunk_files_paths), desc="Encoding text from files"):
@@ -1032,9 +1674,16 @@ def encode_chunked_files(chunk_files_paths, HF_encoder, cols, col_text, path_emb
 # SCALING FEATURES
 ####################################################################
 
-def scaling_features(features, method="standard"):
+def scaling_features(features: list, method: str = "standard") -> list:
     """
-    Scale features
+    Scale features using either standardization or min-max scaling.
+
+    Parameters:
+    features (Union[List[List[float]], List[float]]): List of features to scale.
+    method (str, optional): Method of scaling, either "standard" for standardization or "min-max" for min-max scaling. Defaults to "standard".
+
+    Returns:
+    Union[List[List[float]], List[float]]: Scaled features.
     """
     try:
         if method=="standard":