opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -2
- opsci_toolbox/apis/reddit.py +407 -0
- opsci_toolbox/apis/telegram.py +1125 -0
- opsci_toolbox/helpers/common.py +177 -5
- opsci_toolbox/helpers/dataviz.py +184 -26
- opsci_toolbox/helpers/dates.py +47 -1
- opsci_toolbox/helpers/gliner.py +88 -0
- opsci_toolbox/helpers/nlp.py +273 -15
- opsci_toolbox/helpers/nlp_cuml.py +44 -3
- opsci_toolbox/helpers/sna.py +1 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/METADATA +5 -2
- opsci_toolbox-0.0.14.dist-info/RECORD +26 -0
- opsci_toolbox-0.0.14.dist-info/dependency_links.txt +1 -0
- opsci_toolbox-0.0.12.dist-info/RECORD +0 -22
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -7,7 +7,7 @@ import os
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.manifold import TSNE
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
 from sklearn.cluster import AgglomerativeClustering
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
@@ -25,9 +25,10 @@ import requests
 import json
 from opsci_toolbox.helpers.common import write_json, write_pickle, load_pickle, create_dir, copy_file, write_jsonl
 from textacy.preprocessing.replace import urls
+from textacy.preprocessing.remove import brackets
 from eldar import Query
 import torch
-from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
 
 
@@ -93,10 +94,15 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bo
         The filtered DataFrame.
     """
     eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
-    df
+    df = df[df[col_text].apply(eldar_query)]
     df=df.reset_index(drop=True)
     return df
 
+def remove_trailing_dots(text):
+    if text.endswith('…'):
+        return text[:-3].strip()
+    return text
+
 def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     """
     Generic cleaning process for topic modeling.
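In 0.0.14, `filter_by_query` applies the eldar query to `col_text` as a boolean mask before resetting the index. A minimal usage sketch (the sample data and query string are illustrative, not from the package):

    import pandas as pd
    from opsci_toolbox.helpers.nlp import filter_by_query

    df = pd.DataFrame({"text": ["GPU prices are falling", "I love my cat", "walking the dog"]})

    # eldar-style boolean query: keeps only rows whose text matches
    filtered = filter_by_query(df, "text", '("cat" OR "dog") AND NOT "GPU"',
                               ignore_case=True, ignore_accent=True, match_word=True)
    print(filtered)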
@@ -114,12 +120,19 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
         The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(remove_rt)
+    df[col_clean] = df[col_clean].apply(remove_emoji)
+    df[col_clean] = df[col_clean].apply(remove_trailing_dots)
+    df[col_clean] = df[col_clean].apply(remove_html_tags)
+    df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
     df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
+    df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
     return df
 
+
+
 def extract_insta_shortcode(url: str) -> str:
     """
     Extracts the shortcode from an Instagram URL.
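The cleaning chain in `TM_clean_text` now also strips emojis, trailing ellipses, HTML tags, bracketed content and runs of hashtags before the existing URL and @-mention cleanup. A hedged usage sketch (column names are illustrative):

    import pandas as pd
    from opsci_toolbox.helpers.nlp import TM_clean_text

    df = pd.DataFrame({"message": ["RT @user: Great thread! 😀 #ai #nlp #llm"]})
    df = TM_clean_text(df, col="message", col_clean="message_clean")
    print(df["message_clean"].iloc[0])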
@@ -151,6 +164,39 @@ def remove_parentheses_content(text: str) -> str:
     result = re.sub(r'\([^)]*\)', '', text)
     return result
 
+def remove_hashtags(text: str) -> str:
+    """
+    Removes any hashtag from text.
+
+    Args:
+        text : str
+            The input text string to clean.
+
+    Returns:
+        result : str
+            The input text string with hashtags removed.
+    """
+    pattern = r'\B#\w+'
+    result = re.sub(pattern, '', text).strip()
+    return result
+
+def remove_multiple_hashtags(text: str) -> str:
+    """
+    Removes series of hashtags separated by spaces.
+
+    Args:
+        text : str
+            The input text string to clean.
+
+    Returns:
+        result : str
+            The input text string with series of hashtags removed.
+    """
+    pattern = r'(?:\B#\w+\s*){2,}'
+    result = re.sub(pattern, '', text).strip()
+    return result
+
+
 def remove_emojis(text: str) -> str:
     """
     Removes emojis and their textual representations from a text string.
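The two new helpers differ only in their pattern: `remove_hashtags` drops every hashtag, while `remove_multiple_hashtags` only drops runs of two or more consecutive hashtags (typically trailing hashtag blocks) and leaves isolated in-text hashtags in place. A standalone illustration of the two regexes:

    import re

    text = "Loving the new #GPU results #ai #nlp #llm"

    # pattern used by remove_hashtags: every hashtag goes
    print(re.sub(r'\B#\w+', '', text).strip())
    # 'Loving the new  results' (in TM_clean_text the leftover whitespace is later collapsed by remove_extra_spaces)

    # pattern used by remove_multiple_hashtags: only the trailing run of hashtags goes
    print(re.sub(r'(?:\B#\w+\s*){2,}', '', text).strip())
    # 'Loving the new #GPU results'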
@@ -171,6 +217,31 @@ def remove_emojis(text: str) -> str:
 
     return text_no_emojis
 
+def remove_emoji(string):
+    emoji_pattern = re.compile(
+        "["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        u"\U00002500-\U00002BEF"  # chinese char
+        u"\U00002702-\U000027B0"
+        u"\U00002702-\U000027B0"
+        u"\U000024C2-\U0001F251"
+        u"\U0001f926-\U0001f937"
+        u"\U00010000-\U0010ffff"
+        u"\u2640-\u2642"
+        u"\u2600-\u2B55"
+        u"\u200d"
+        u"\u23cf"
+        u"\u23e9"
+        u"\u231a"
+        u"\ufe0f"  # dingbats
+        u"\u3030"
+        "]+", flags=re.UNICODE)
+    return emoji_pattern.sub(r'', string)
+
+
 def extract_numbers(text: str) -> list:
     """
     Extracts all numeric values from a given text string and returns them as a list of floats.
@@ -421,6 +492,23 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
     df.to_csv(file_path, encoding="utf-8", index=False)
     print("File saved -", file_path)
     return df
+
+def keep_valid_filename_chars(text: str, replace: str = '') -> str:
+    """
+    Replace all characters not typically allowed in filenames with a specified replacement string.
+
+    Args:
+        text : str
+            The input text string.
+        replace : str, optional
+            The string to replace invalid filename characters with. Default is an empty string.
+
+    Returns:
+        cleaned_text : str
+            The input text string with invalid filename characters replaced.
+    """
+    return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)
+
 
 
 def keep_alphanum_char(text: str, replace: str = '') -> str:
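`keep_valid_filename_chars` is a single regex substitution; note that the character class also covers the dot and control characters, so file extensions are stripped as well. A standalone illustration of the pattern with the default empty replacement:

    import re

    raw = 'report: Q3/2024 "draft"?.csv'
    print(re.sub(r'[.<>:"/\\|?*\x00-\x1F]', '', raw))
    # 'report Q32024 draftcsv'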
@@ -453,7 +541,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
     text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
     return text
 
-def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> dict:
     """
     Translate text using LibreTranslate service.
 
@@ -470,6 +558,8 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
             The directory to save the translation result JSON file.
         url : str, optional
             The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+        format_payload : str, optional
+            Possible values are html or text.
 
     Returns:
         json_data : dict
@@ -480,7 +570,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
         "q": text,
         "source": source,
         "target": target,
-        "format":
+        "format": format_payload,
         "api_key": ""
     }
 
@@ -492,7 +582,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
     write_json(json_data, dir_json , str(filename))
     return json_data
 
-def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> list:
     """
     Translate a batch of texts using LibreTranslate service.
 
@@ -509,6 +599,8 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
             The directory to save the translation result JSONL file.
         url : str, optional
             The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+        format_payload : str, optional
+            Possible values are html or text.
 
     Returns:
         json_results : list of dict
@@ -519,7 +611,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
         "q": batch_text,
         "source": source,
         "target": target,
-        "format":
+        "format": format_payload,
         "api_key": ""
     }
 
@@ -535,7 +627,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
     write_jsonl(json_results, dir_json , str(filename))
     return json_results
 
-def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> str:
     """
     Translate text using LibreTranslate service.
 
@@ -548,6 +640,8 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
            The target language code.
        url : str, optional
            The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+       format_payload : str, optional
+           Possible values are html or text.
 
    Returns:
        translatedText : str
@@ -558,7 +652,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
         "q": text,
         "source": source,
         "target": target,
-        "format":
+        "format": format_payload,
         "api_key": ""
     }
 
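All three LibreTranslate wrappers (`translate_wt_libre`, `translate_batch`, `translate`) now expose `format_payload`, which is forwarded as the `format` field of the POST body. A hedged usage sketch, assuming a LibreTranslate instance is running on the default local URL:

    from opsci_toolbox.helpers.nlp import translate

    # "text" sends the payload as plain text; "html" (the default) preserves markup
    translated = translate("Bonjour tout le monde", source="fr", target="en", format_payload="text")
    print(translated)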
@@ -788,7 +882,95 @@ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat:
     )
     return df_count
 
-def
+def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_chunk_id: str, col_engagement: str, col_user_id: str=None, metrics : dict =dict())-> pd.DataFrame:
+    """
+    Calculate the intermediate agregation of chunks per Post ID and topic
+
+    Args:
+        df : pandas DataFrame
+            DataFrame containing processed data.
+        col_id : str
+            Name of the column containing unique posts identifiers.
+        col_topic : str
+            Name of the column containing topic labels.
+        col_chunk_id : str
+            Name of the column containing unique sentences identifiers.
+        col_engagement : str
+            Name of the column containing engagement metrics.
+        col_user_id : str
+            Name of the column containing user identifiers.
+        metrics : dict
+            Dictionary containing additional metrics to aggregate.
+
+    Returns:
+        DataFrame
+            DataFrame containing the agregated posts per topic
+
+    Description:
+        This function aggregates various metrics for each post and topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+    """
+    metrics_dict = dict()
+    # metrics_dict[col_id]=(col_id,'first')
+    metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
+    metrics_dict[col_engagement]=(col_engagement,'first')
+
+    if col_user_id:
+        metrics_dict[col_user_id]=(col_user_id,"first")
+    if "sentiment" in df.columns:
+        metrics_dict["sentiment"] = ("sentiment", "mean")
+    if "sentiment_score" in df.columns:
+        metrics_dict["sentiment_score"] = ("sentiment_score", "mean")
+
+    metrics_dict["tokens_count"] = ("tokens_count", "sum")
+    metrics_dict["lemmas_count"] = ("lemmas_count", "sum")
+    metrics_dict["emojis_count"] = ("emojis_count", "sum")
+    metrics_dict["unique_emojis"] = ("unique_emojis", lambda x: set(emoji for sublist in x for emoji in sublist))
+    metrics_dict["unique_emojis_count"] = ("unique_emojis", len)
+    metrics_dict["hashtags"] = ("hashtags", lambda x: list(hashtag for sublist in x for hashtag in sublist))
+    metrics_dict["hashtags_count"] = ("hashtags_count", "sum")
+    metrics_dict["mentions"] = ("mentions", lambda x: list(mention for sublist in x for mention in sublist))
+    metrics_dict["mentions_count"] = ("mentions_count", "sum")
+    metrics_dict["extracted_urls_from_text"] = ("extracted_urls_from_text", lambda x: list(url for sublist in x for url in sublist))
+    metrics_dict["domain"] = ("domain", lambda x: list(domain for sublist in x for domain in sublist))
+    metrics_dict["len_numbers"] = ("len_numbers", "sum")
+    metrics_dict["interrogation"] = ("interrogation", "sum")
+    metrics_dict["exclamation"] = ("exclamation", "sum")
+    metrics_dict["x"] = ("x", "mean")
+    metrics_dict["y"] = ("y", "mean")
+
+    metrics_dict.update(metrics)
+
+    df_gb = df.groupby([col_id, col_topic]).agg(**metrics_dict).reset_index()
+    df_gb[col_topic]=df_gb[col_topic].astype(str)
+
+    return df_gb
+
+def sentiment_to_category(sentiment : float, boundaries : list = [-1.0, -0.5, 0.5, 1.0], labels :list = ['negative', 'neutral', 'positive']) -> str:
+    """
+    Assign a sentiment category to a sentiment score.
+
+    Args:
+        sentiment : float
+            sentiment score
+        boundaries : list
+            list of boundaries for each category
+        labels : list
+            list of labels for each category
+
+    Returns:
+        str
+            category label
+
+    Description:
+        This function assigns a sentiment category to a sentiment score based on a list of boundaries and labels. If the sentiment score is outside the boundaries, it is assigned to the last category.
+    """
+    for i in range(len(boundaries) - 1):
+        if boundaries[i] <= sentiment < boundaries[i + 1]:
+            return labels[i]
+    return labels[-1]
+
+
+def topic_representation(df: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
     """
     Calculate the representation of topics in a processed DataFrame.
 
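`sentiment_to_category` is a plain threshold lookup: with the default boundaries [-1.0, -0.5, 0.5, 1.0] a score falls into the negative / neutral / positive bands, and anything outside the boundaries gets the last label. A quick check of the mapping:

    from opsci_toolbox.helpers.nlp import sentiment_to_category

    for score in (-0.8, 0.0, 0.75, 1.0):
        print(score, "->", sentiment_to_category(score))
    # -0.8 -> negative, 0.0 -> neutral, 0.75 -> positive, 1.0 -> positive (falls through to the last label)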
@@ -822,11 +1004,15 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
     metrics_dict['engagements']=(col_engagement,'sum')
     if col_user_id:
         metrics_dict["users"]=(col_user_id,"nunique")
+    panel_cols = [col for col in df.columns if col[:6] == 'panel_']
+    if len(panel_cols)>0:
+        for panel_col in panel_cols:
+            metrics_dict[panel_col+'_verbatims'] = (panel_col, "sum")
+            metrics_dict[panel_col+'_users'] = (col_user_id, lambda x : x[df[panel_col]].nunique())
+            metrics_dict[panel_col+'_engagements'] = (col_engagement, lambda x : x[df[panel_col]].sum())
 
     metrics_dict.update(metrics)
 
-    print(metrics_dict)
-
     metrics_dict['avg_word_count']=("tokens_count", lambda x: round(x.mean(),2))
     metrics_dict['verbatims_with_emoji']=("emojis_count", lambda x: (x > 0).sum() )
     metrics_dict['emojis_occurences']=("emojis_count", "sum")
|
|
843
1029
|
metrics_dict['topic_x']=("x", "mean")
|
844
1030
|
metrics_dict['topic_y']=("y", "mean")
|
845
1031
|
|
846
|
-
|
847
1032
|
# on produit la représentation des topics finale
|
848
|
-
df_distrib_all = (
|
1033
|
+
df_distrib_all = (df.groupby(col_topic)
|
849
1034
|
.agg(**metrics_dict)
|
850
1035
|
.sort_values(by="verbatims", ascending=False)
|
851
1036
|
.assign(engagement_per_verbatims = lambda x : x["engagements"] / x["verbatims"])
|
@@ -1101,10 +1286,12 @@ def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "l
             NER_type.append(ent.label_)
             NER_text.append(ent.text)
 
+
         record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
         all_records.append(record)
 
 
+
     df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
 
     return df
@@ -1819,6 +2006,75 @@ def encode_chunked_files(chunk_files_paths: list,
 
     return new_file_paths
 
+####################################################################
+# ENCODING FEATURES
+####################################################################
+
+def encode_labels(data_to_encode: np.ndarray) -> tuple:
+    """
+    Encodes a list of labels using a LabelEncoder.
+
+    Args:
+    - data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type,
+      but strings or integers are typical.
+
+    Returns:
+    - Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array
+      of encoded labels.
+    """
+    label_encoder = LabelEncoder()
+    label_encoder.fit(data_to_encode)
+    encoded_labels = label_encoder.transform(data_to_encode)
+    return label_encoder, encoded_labels
+
+
+def encode_new_labels(label_encoder : LabelEncoder, data_to_encode : np.ndarray) -> np.ndarray:
+    """
+    Encodes a list of new labels using an already fitted LabelEncoder.
+
+    Args:
+    - label_encoder (LabelEncoder): A pre-fitted LabelEncoder instance.
+    - data_to_encode (List[Union[str, int]]): The list of new labels to encode using the pre-fitted encoder.
+
+    Returns:
+    - np.ndarray: A numpy array of encoded labels.
+    """
+    encoded_labels = label_encoder.transform(data_to_encode)
+    return encoded_labels
+
+def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
+    """
+    One-hot encodes a list of categorical values using OneHotEncoder.
+
+    Args:
+    - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
+      any hashable type, typically strings or integers.
+
+    Returns:
+    - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
+      of one-hot encoded values.
+    """
+    one_hot_encoder = OneHotEncoder(sparse=False)
+    data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1)  # Reshape for OneHotEncoder
+    one_hot_encoder.fit(data_to_encode_reshaped)
+    encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+    return one_hot_encoder, encoded_array
+
+
+def one_hot_encode_new_data(one_hot_encoder: OneHotEncoder, data_to_encode: np.ndarray) -> np.ndarray:
+    """
+    One-hot encodes a list of new categorical values using an already fitted OneHotEncoder.
+
+    Args:
+    - one_hot_encoder (OneHotEncoder): A pre-fitted OneHotEncoder instance.
+    - data_to_encode (List[Union[str, int]]): The list of new categorical values to encode using the pre-fitted encoder.
+
+    Returns:
+    - np.ndarray: A numpy array of one-hot encoded values.
+    """
+    data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1)  # Reshape for OneHotEncoder
+    encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+    return encoded_array
 
 ####################################################################
 # SCALING FEATURES
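The new encoding helpers are thin wrappers around scikit-learn's LabelEncoder and OneHotEncoder; the fit-time functions return the fitted encoder so it can be reused on unseen data via the `*_new_*` variants. A usage sketch (note that `OneHotEncoder(sparse=False)` targets scikit-learn versions where the `sparse` keyword still exists; newer releases renamed it to `sparse_output`):

    import numpy as np
    from opsci_toolbox.helpers.nlp import encode_labels, encode_new_labels

    labels = np.array(["positive", "negative", "neutral", "positive"])
    label_encoder, encoded = encode_labels(labels)
    print(encoded)                                                   # [2 0 1 2] (classes sorted alphabetically)
    print(encode_new_labels(label_encoder, np.array(["neutral"])))   # [1]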
@@ -2130,13 +2386,15 @@ def check_gpu():
 def HF_load_model(model_checkpoint):
     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
     model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+    config = AutoConfig.from_pretrained(model_checkpoint)
     if torch.cuda.is_available():
         model.cuda()
-    return model, tokenizer
+    return model, tokenizer, config
 
 def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
     """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
     file_path= os.path.join(dir_json , str(filename)+'.json')
+    results = {}
     if not os.path.exists(file_path):
         with torch.no_grad():
             inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
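`HF_load_model` now also returns the checkpoint's AutoConfig, so callers can recover label mappings without reloading anything. A hedged sketch (the checkpoint name is only an example):

    from opsci_toolbox.helpers.nlp import HF_load_model

    model, tokenizer, config = HF_load_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
    print(config.id2label)   # e.g. {0: 'negative', 1: 'neutral', 2: 'positive'}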
opsci_toolbox/helpers/nlp_cuml.py
CHANGED
@@ -46,6 +46,44 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
 
+
+def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
+                                     n_neighbors: int = 5,
+                                     n_components: int = 3,
+                                     min_dist: float = 0.0,
+                                     metric: str = "cosine",
+                                     spread: float = 1.0,
+                                     learning_rate: float = 1.0,
+                                     n_epochs:int = 300,
+                                     y: np.ndarray = None,
+                                     convert_dtype: bool = False
+                                     ) -> tuple:
+    """
+    Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+    Args:
+        embeddings (np.ndarray): The input embeddings to be reduced.
+        n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+        n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+        min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+        metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+        spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+    Returns:
+        reducer (UMAP): The UMAP reducer object.
+        reduced_embeddings (np.ndarray): The reduced embeddings.
+    """
+    reducer = UMAP(n_neighbors=n_neighbors,
+                   n_components=n_components,
+                   min_dist=min_dist,
+                   metric=metric,
+                   spread = spread,
+                   n_epochs=n_epochs,
+                   learning_rate=learning_rate).fit(X = embeddings, y = y, convert_dtype = convert_dtype)
+
+    reduced_embeddings = reducer.transform(embeddings)
+    return reducer, reduced_embeddings
+
 def transform_with_cuml_UMAP(reducer,
                              new_embeddings: np.ndarray) -> np.ndarray:
     """
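`supervised_reduce_with_cuml_UMAP` is the same cuML UMAP wrapper as `reduce_with_cuml_UMAP`, but it forwards target labels `y` to `fit`, enabling (semi-)supervised dimensionality reduction. A hedged sketch, assuming a RAPIDS/cuML environment; the random data stands in for real sentence embeddings:

    import numpy as np
    from opsci_toolbox.helpers.nlp import encode_labels
    from opsci_toolbox.helpers.nlp_cuml import supervised_reduce_with_cuml_UMAP

    embeddings = np.random.rand(1000, 384).astype(np.float32)
    topics = np.random.choice(["tech", "sport", "politics"], size=1000)

    _, y = encode_labels(topics)
    reducer, reduced = supervised_reduce_with_cuml_UMAP(embeddings, n_components=2, y=y, convert_dtype=True)
    print(reduced.shape)   # (1000, 2)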
@@ -384,7 +422,6 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
     # Initialize cuML's CountVectorizer
     count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
 
-    print(type(gdf[col_text]))
     # Fit and transform the text data
     X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
     X_names_count = count_vectorizer.get_feature_names()
@@ -402,13 +439,17 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
         df_count_tmp = df_count_tmp.head(n_words)
         if min_freq:
             df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
-
+
+        df_count_tmp['word'] = df_count_tmp['word'].astype(str)
         # Concatenate the result to the main DataFrame
         df_count = cudf.concat([df_count, df_count_tmp])
 
     # Convert the result back to pandas DataFrame
     return df_count.to_pandas()
 
+
+
+
 # def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
 
 #     # Convert input lists to cuDF Series
@@ -588,7 +629,7 @@ def cudf_encode_chunked_files(chunk_files_paths: list,
         current_df = cudf_read_parquet(file)
 
         text_list = current_df[col_text].to_arrow().to_pylist()
-
+
         # text vectorization
         embeddings = HF_encoder.embed_documents(text_list)
 
opsci_toolbox/helpers/sna.py
CHANGED
@@ -421,6 +421,7 @@ def select_top_nodes_by_degrees(G: nx.Graph, degree_type : str = "degree", N : i
     return subgraph
 
 
+
 def scale_size(G, size_attribute, min_node_size = 10, max_node_size = 100):
     """
     Scale the sizes of nodes in a graph based on a specified attribute.
{opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.
+Version: 0.0.14
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -41,13 +41,16 @@ Requires-Dist: spacy-language-detection ==0.2.1
 Requires-Dist: spacymoji ==3.1.0
 Requires-Dist: supervision ==0.21.0
 Requires-Dist: textacy ==0.13.0
-Requires-Dist: torch
+Requires-Dist: torch >=2.4.0
 Requires-Dist: tqdm >=4.66.2
 Requires-Dist: trafilatura ==1.7.0
 Requires-Dist: transformers ==4.38.2
 Requires-Dist: umap-learn ==0.5.5
 Requires-Dist: urlextract ==1.9.0
 Requires-Dist: wordcloud ==1.9.3
+Requires-Dist: Unidecode ==1.3.8
+Requires-Dist: kaleido ==0.2.1
+Requires-Dist: gliner ==0.2.8
 
 UNKNOWN
 
opsci_toolbox-0.0.14.dist-info/RECORD
ADDED
@@ -0,0 +1,26 @@
+opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
+opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
+opsci_toolbox/apis/telegram.py,sha256=IJYXMvXzA2R2Z7ywKJiny38pd-ryHK4jPxVG2Nj_dms,45676
+opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
+opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
+opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/helpers/common.py,sha256=dlP6TnRggZsnPksgo7LPH7IghU_t9LFz42eMEzzg99o,53323
+opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
+opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
+opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
+opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
+opsci_toolbox/helpers/nlp.py,sha256=hXnP6rUkUzyurJ5O_fNUxqT2MZK3poC21L9zy6oa22c,102551
+opsci_toolbox/helpers/nlp_cuml.py,sha256=OBCRkaHibuyvJ8LQAE2EC7_J0KPe7Kf-ayN2jyxDlKg,30709
+opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
+opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
+opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+opsci_toolbox-0.0.14.dist-info/METADATA,sha256=X2EgVw8JlZLdgnrN1nOP6aZRs1WyztbkCkN4UKkuTLE,1727
+opsci_toolbox-0.0.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.14.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+opsci_toolbox-0.0.14.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.14.dist-info/RECORD,,
opsci_toolbox-0.0.14.dist-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+https://download.pytorch.org/whl/cu124
@@ -1,22 +0,0 @@
|
|
1
|
-
opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
|
4
|
-
opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
|
5
|
-
opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
|
6
|
-
opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
|
8
|
-
opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
|
9
|
-
opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
|
10
|
-
opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
|
11
|
-
opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
|
12
|
-
opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
|
13
|
-
opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
|
14
|
-
opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
|
15
|
-
opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
|
16
|
-
opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
|
18
|
-
opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
|
19
|
-
opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
|
20
|
-
opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
21
|
-
opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
|
22
|
-
opsci_toolbox-0.0.12.dist-info/RECORD,,
|
{opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/WHEEL
File without changes
{opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/top_level.txt
File without changes