opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -25,9 +25,10 @@ import requests
  import json
  from opsci_toolbox.helpers.common import write_json, write_pickle, load_pickle, create_dir, copy_file, write_jsonl
  from textacy.preprocessing.replace import urls
+ from textacy.preprocessing.remove import brackets
  from eldar import Query
  import torch
- from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
+ from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
  from bs4 import BeautifulSoup
@@ -97,6 +98,11 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bo
      df=df.reset_index(drop=True)
      return df

+ def remove_trailing_dots(text):
+     if text.endswith('…'):
+         return text[:-3].strip()
+     return text
+
  def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
      """
      Generic cleaning process for topic modeling.
@@ -114,12 +120,19 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
          The DataFrame with cleaned text data.
      """
      df[col_clean] = df[col].apply(remove_rt)
+     df[col_clean] = df[col_clean].apply(remove_emoji)
+     df[col_clean] = df[col_clean].apply(remove_trailing_dots)
+     df[col_clean] = df[col_clean].apply(remove_html_tags)
+     df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
      df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
      df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
+     df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
      df[col_clean] = df[col_clean].apply(remove_extra_spaces)
      # df = df.loc[(df[col_clean] != ""), :]
      return df

+
+
  def extract_insta_shortcode(url: str) -> str:
      """
      Extracts the shortcode from an Instagram URL.
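The expanded cleaning chain is easiest to see end to end on a small DataFrame. The sketch below assumes these helpers are exposed by `opsci_toolbox.helpers.nlp` (the module being diffed here); the sample text is illustrative only.

```python
# Minimal sketch of the 0.0.13 TM_clean_text chain, assuming the functions
# above live in opsci_toolbox.helpers.nlp.
import pandas as pd
from opsci_toolbox.helpers.nlp import TM_clean_text  # assumed import path

df = pd.DataFrame({"text": [
    "RT @user Great read! <b>Launch</b> 🚀 https://example.com #ai #nlp #data",
]})

# Writes the cleaned text into a new column: emojis, trailing ellipses, HTML
# tags, bracketed content, URLs, @mentions and hashtag series are stripped
# before topic modeling.
df = TM_clean_text(df, col="text", col_clean="text_clean")
print(df["text_clean"].iloc[0])
```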
@@ -151,6 +164,39 @@ def remove_parentheses_content(text: str) -> str:
      result = re.sub(r'\([^)]*\)', '', text)
      return result

+ def remove_hashtags(text: str) -> str:
+     """
+     Removes any hashtag from text.
+
+     Args:
+         text : str
+             The input text string to clean.
+
+     Returns:
+         result : str
+             The input text string with hashtags removed.
+     """
+     pattern = r'\B#\w+'
+     result = re.sub(pattern, '', text).strip()
+     return result
+
+ def remove_multiple_hashtags(text: str) -> str:
+     """
+     Removes series of hashtags separated by spaces.
+
+     Args:
+         text : str
+             The input text string to clean.
+
+     Returns:
+         result : str
+             The input text string with series of hashtags removed.
+     """
+     pattern = r'(?:\B#\w+\s*){2,}'
+     result = re.sub(pattern, '', text).strip()
+     return result
+
+
  def remove_emojis(text: str) -> str:
      """
      Removes emojis and their textual representations from a text string.
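The difference between the two new patterns is easiest to see on a sample string. The snippet below uses only the regexes shown above, so it runs without the package installed:

```python
import re

text = "New report #opensource available #ai #nlp #data"

# remove_hashtags: strips every individual hashtag
print(re.sub(r'\B#\w+', '', text).strip())
# -> "New report  available"

# remove_multiple_hashtags: only strips runs of 2+ hashtags; a lone hashtag survives
print(re.sub(r'(?:\B#\w+\s*){2,}', '', text).strip())
# -> "New report #opensource available"
```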
@@ -171,6 +217,31 @@ def remove_emojis(text: str) -> str:
      return text_no_emojis

+ def remove_emoji(string):
+     emoji_pattern = re.compile(
+         "["
+         u"\U0001F600-\U0001F64F" # emoticons
+         u"\U0001F300-\U0001F5FF" # symbols & pictographs
+         u"\U0001F680-\U0001F6FF" # transport & map symbols
+         u"\U0001F1E0-\U0001F1FF" # flags (iOS)
+         u"\U00002500-\U00002BEF" # chinese char
+         u"\U00002702-\U000027B0"
+         u"\U00002702-\U000027B0"
+         u"\U000024C2-\U0001F251"
+         u"\U0001f926-\U0001f937"
+         u"\U00010000-\U0010ffff"
+         u"\u2640-\u2642"
+         u"\u2600-\u2B55"
+         u"\u200d"
+         u"\u23cf"
+         u"\u23e9"
+         u"\u231a"
+         u"\ufe0f" # dingbats
+         u"\u3030"
+         "]+", flags=re.UNICODE)
+     return emoji_pattern.sub(r'', string)
+
+
  def extract_numbers(text: str) -> list:
      """
      Extracts all numeric values from a given text string and returns them as a list of floats.
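A quick usage sketch for the new `remove_emoji` helper; the import path is an assumption, the function is the one added above:

```python
# Usage sketch; assumes remove_emoji is exposed by opsci_toolbox.helpers.nlp.
from opsci_toolbox.helpers.nlp import remove_emoji

print(remove_emoji("Great launch 🚀🎉 see you tomorrow"))
# The rocket and party popper fall inside the U+1F680-U+1F6FF and
# U+1F300-U+1F5FF ranges above, so they are stripped; the rest of the
# text is left untouched.
```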
@@ -421,6 +492,23 @@ def remove_stopwords(lang: str, stopwords: list) -> pd.DataFrame:
      df.to_csv(file_path, encoding="utf-8", index=False)
      print("File saved -", file_path)
      return df
+
+ def keep_valid_filename_chars(text: str, replace: str = '') -> str:
+     """
+     Replace all characters not typically allowed in filenames with a specified replacement string.
+
+     Args:
+         text : str
+             The input text string.
+         replace : str, optional
+             The string to replace invalid filename characters with. Default is an empty string.
+
+     Returns:
+         cleaned_text : str
+             The input text string with invalid filename characters replaced.
+     """
+     return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)
+


  def keep_alphanum_char(text: str, replace: str = '') -> str:
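Note that the character class in `keep_valid_filename_chars` also covers the dot and control characters, which matters when the input already carries an extension. A self-contained illustration using only the regex shown above:

```python
import re

def keep_valid_filename_chars(text: str, replace: str = '') -> str:
    # same pattern as the function added above
    return re.sub(r'[.<>:"/\\|?*\x00-\x1F]', replace, text)

print(keep_valid_filename_chars('report: Q1/Q2 "draft".csv', replace='_'))
# -> 'report_ Q1_Q2 _draft__csv'  (the dot is replaced as well)
```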
@@ -788,7 +876,95 @@ def top_items_per_category(df: pd.DataFrame, col_lst: str = "hashtags", col_cat:
      )
      return df_count

- def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
+ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_chunk_id: str, col_engagement: str, col_user_id: str=None, metrics : dict =dict())-> pd.DataFrame:
+     """
+     Calculate the intermediate agregation of chunks per Post ID and topic
+
+     Args:
+         df : pandas DataFrame
+             DataFrame containing processed data.
+         col_id : str
+             Name of the column containing unique posts identifiers.
+         col_topic : str
+             Name of the column containing topic labels.
+         col_chunk_id : str
+             Name of the column containing unique sentences identifiers.
+         col_engagement : str
+             Name of the column containing engagement metrics.
+         col_user_id : str
+             Name of the column containing user identifiers.
+         metrics : dict
+             Dictionary containing additional metrics to aggregate.
+
+     Returns:
+         DataFrame
+             DataFrame containing the agregated posts per topic
+
+     Description:
+         This function aggregates various metrics for each post and topic, including verbatim counts, engagement sums, average word counts, occurrences of emojis, hashtags, and mentions, as well as unique counts for emojis, hashtags, and mentions. Additionally, it computes the average topic coordinates (x and y) if available. Finally, it calculates percentages for verbatims, engagements, users (if applicable), occurrences of emojis, hashtags, and mentions, and their respective combinations with verbatims.
+     """
+     metrics_dict = dict()
+     # metrics_dict[col_id]=(col_id,'first')
+     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
+     metrics_dict[col_engagement]=(col_engagement,'first')
+
+     if col_user_id:
+         metrics_dict[col_user_id]=(col_user_id,"first")
+     if "sentiment" in df.columns:
+         metrics_dict["sentiment"] = ("sentiment", "mean")
+     if "sentiment_score" in df.columns:
+         metrics_dict["sentiment_score"] = ("sentiment_score", "mean")
+
+     metrics_dict["tokens_count"] = ("tokens_count", "sum")
+     metrics_dict["lemmas_count"] = ("lemmas_count", "sum")
+     metrics_dict["emojis_count"] = ("emojis_count", "sum")
+     metrics_dict["unique_emojis"] = ("unique_emojis", lambda x: set(emoji for sublist in x for emoji in sublist))
+     metrics_dict["unique_emojis_count"] = ("unique_emojis", len)
+     metrics_dict["hashtags"] = ("hashtags", lambda x: list(hashtag for sublist in x for hashtag in sublist))
+     metrics_dict["hashtags_count"] = ("hashtags_count", "sum")
+     metrics_dict["mentions"] = ("mentions", lambda x: list(mention for sublist in x for mention in sublist))
+     metrics_dict["mentions_count"] = ("mentions_count", "sum")
+     metrics_dict["extracted_urls_from_text"] = ("extracted_urls_from_text", lambda x: list(url for sublist in x for url in sublist))
+     metrics_dict["domain"] = ("domain", lambda x: list(domain for sublist in x for domain in sublist))
+     metrics_dict["len_numbers"] = ("len_numbers", "sum")
+     metrics_dict["interrogation"] = ("interrogation", "sum")
+     metrics_dict["exclamation"] = ("exclamation", "sum")
+     metrics_dict["x"] = ("x", "mean")
+     metrics_dict["y"] = ("y", "mean")
+
+     metrics_dict.update(metrics)
+
+     df_gb = df.groupby([col_id, col_topic]).agg(**metrics_dict).reset_index()
+     df_gb[col_topic]=df_gb[col_topic].astype(str)
+
+     return df_gb
+
+ def sentiment_to_category(sentiment : float, boundaries : list = [-1.0, -0.5, 0.5, 1.0], labels :list = ['negative', 'neutral', 'positive']) -> str:
+     """
+     Assign a sentiment category to a sentiment score.
+
+     Args:
+         sentiment : float
+             sentiment score
+         boundaries : list
+             list of boundaries for each category
+         labels : list
+             list of labels for each category
+
+     Returns:
+         str
+             category label
+
+     Description:
+         This function assigns a sentiment category to a sentiment score based on a list of boundaries and labels. If the sentiment score is outside the boundaries, it is assigned to the last category.
+     """
+     for i in range(len(boundaries) - 1):
+         if boundaries[i] <= sentiment < boundaries[i + 1]:
+             return labels[i]
+     return labels[-1]
+
+
+ def topic_representation(df: pd.DataFrame, col_topic: str, col_id: str, col_engagement: str, col_user_id: str, metrics: dict) -> pd.DataFrame:
      """
      Calculate the representation of topics in a processed DataFrame.

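`sentiment_to_category` is a plain boundary lookup, so its behaviour is easy to check in isolation. The sketch below reproduces the function body added above with its default boundaries and labels:

```python
def sentiment_to_category(sentiment, boundaries=[-1.0, -0.5, 0.5, 1.0],
                          labels=['negative', 'neutral', 'positive']):
    # Same logic as above: the first matching half-open interval wins,
    # anything outside the boundaries falls into the last label.
    for i in range(len(boundaries) - 1):
        if boundaries[i] <= sentiment < boundaries[i + 1]:
            return labels[i]
    return labels[-1]

print(sentiment_to_category(-0.8))  # negative  ([-1.0, -0.5))
print(sentiment_to_category(0.1))   # neutral   ([-0.5, 0.5))
print(sentiment_to_category(0.7))   # positive  ([0.5, 1.0))
print(sentiment_to_category(1.0))   # positive  (outside the boundaries -> last label)
```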
@@ -822,11 +998,15 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
      metrics_dict['engagements']=(col_engagement,'sum')
      if col_user_id:
          metrics_dict["users"]=(col_user_id,"nunique")
+     panel_cols = [col for col in df.columns if col[:6] == 'panel_']
+     if len(panel_cols)>0:
+         for panel_col in panel_cols:
+             metrics_dict[panel_col+'_verbatims'] = (panel_col, "sum")
+             metrics_dict[panel_col+'_users'] = (col_user_id, lambda x : x[df[panel_col]].nunique())
+             metrics_dict[panel_col+'_engagements'] = (col_engagement, lambda x : x[df[panel_col]].sum())

      metrics_dict.update(metrics)

-     print(metrics_dict)
-
      metrics_dict['avg_word_count']=("tokens_count", lambda x: round(x.mean(),2))
      metrics_dict['verbatims_with_emoji']=("emojis_count", lambda x: (x > 0).sum() )
      metrics_dict['emojis_occurences']=("emojis_count", "sum")
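Judging from the aggregations (a "sum" on the column itself and boolean masking inside the lambdas), the `panel_*` columns appear to be expected as boolean membership flags on the input DataFrame; a column such as `panel_media` would then yield `panel_media_verbatims`, `panel_media_users` and `panel_media_engagements` in the output. A minimal sketch of that convention (column names are illustrative, not part of the package):

```python
# Hypothetical input: panel_* columns are boolean membership flags per post.
import pandas as pd

df = pd.DataFrame({
    "topic": ["T0", "T0", "T1"],
    "user_id": ["u1", "u2", "u1"],
    "engagements": [10, 5, 3],
    "panel_media": [True, False, True],   # illustrative panel flag
})

# Same detection logic as the added code above; for each such column,
# topic_representation derives the three extra panel metrics.
panel_cols = [col for col in df.columns if col[:6] == 'panel_']
print(panel_cols)  # ['panel_media']
```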
@@ -843,9 +1023,8 @@ def topic_representation(df_processed_data: pd.DataFrame, col_topic: str, col_id
      metrics_dict['topic_x']=("x", "mean")
      metrics_dict['topic_y']=("y", "mean")

-
      # on produit la représentation des topics finale
-     df_distrib_all = (df_processed_data.groupby(col_topic)
+     df_distrib_all = (df.groupby(col_topic)
                        .agg(**metrics_dict)
                        .sort_values(by="verbatims", ascending=False)
                        .assign(engagement_per_verbatims = lambda x : x["engagements"] / x["verbatims"])
@@ -2130,13 +2309,15 @@ def check_gpu():
  def HF_load_model(model_checkpoint):
      tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
      model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+     config = AutoConfig.from_pretrained(model_checkpoint)
      if torch.cuda.is_available():
          model.cuda()
-     return model, tokenizer
+     return model, tokenizer, config

  def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
      """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
      file_path= os.path.join(dir_json , str(filename)+'.json')
+     results = {}
      if not os.path.exists(file_path):
          with torch.no_grad():
              inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
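`HF_load_model` now returns the checkpoint's config as a third element, so existing call sites need to unpack three values. A usage sketch; the import path and checkpoint name are examples, not prescribed by the package:

```python
from opsci_toolbox.helpers.nlp import HF_load_model  # assumed module location

# Callers now unpack three values instead of two.
model, tokenizer, config = HF_load_model("distilbert-base-uncased-finetuned-sst-2-english")
print(config.id2label)  # label mapping carried by the AutoConfig
```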
@@ -384,7 +384,6 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
      # Initialize cuML's CountVectorizer
      count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)

-     print(type(gdf[col_text]))
      # Fit and transform the text data
      X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
      X_names_count = count_vectorizer.get_feature_names()
@@ -402,7 +401,8 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
          df_count_tmp = df_count_tmp.head(n_words)
      if min_freq:
          df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
-
+
+     df_count_tmp['word'] = df_count_tmp['word'].astype(str)
      # Concatenate the result to the main DataFrame
      df_count = cudf.concat([df_count, df_count_tmp])

@@ -588,7 +588,7 @@ def cudf_encode_chunked_files(chunk_files_paths: list,
          current_df = cudf_read_parquet(file)

          text_list = current_df[col_text].to_arrow().to_pylist()
-
+
          # text vectorization
          embeddings = HF_encoder.embed_documents(text_list)

@@ -421,6 +421,7 @@ def select_top_nodes_by_degrees(G: nx.Graph, degree_type : str = "degree", N : i
      return subgraph


+
  def scale_size(G, size_attribute, min_node_size = 10, max_node_size = 100):
      """
      Scale the sizes of nodes in a graph based on a specified attribute.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: opsci-toolbox
- Version: 0.0.12
+ Version: 0.0.13
  Summary: a complete toolbox
  Home-page: UNKNOWN
  Author: Erwan Le Nagard
@@ -48,6 +48,9 @@ Requires-Dist: transformers ==4.38.2
  Requires-Dist: umap-learn ==0.5.5
  Requires-Dist: urlextract ==1.9.0
  Requires-Dist: wordcloud ==1.9.3
+ Requires-Dist: Unidecode ==1.3.8
+ Requires-Dist: kaleido ==0.2.1
+ Requires-Dist: gliner ==0.2.8

  UNKNOWN

@@ -1,22 +1,25 @@
  opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
+ opsci_toolbox/apis/reddit.py,sha256=zhK2CY9CkCezNcekQFdv1So3NmHHYxB7-tgMVErHOGI,15763
+ opsci_toolbox/apis/telegram.py,sha256=GKDLpZg1fc9D_PGCgi9pfTaW7Jjm_2luQ-2trXTr38A,42208
  opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
  opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
  opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
+ opsci_toolbox/helpers/common.py,sha256=ZGjWIPEpqr-gIYjkfsS97PmCtQWHa_iF8tBbVxrQsOQ,53321
  opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
- opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
- opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
- opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
- opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
- opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
+ opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
+ opsci_toolbox/helpers/dates.py,sha256=CxbXSo61GPZ2L37PV0ujvp78vwl0DoBq7t0nkk9qHp8,4751
+ opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
+ opsci_toolbox/helpers/nlp.py,sha256=I72F32ieofZaCIkjZ9kqpiJLktfRoM7mMhzzxyXDQ3I,99316
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=CGyThKNgo6fdFPV-iooPG0oNrzA__Hvv08t_sdEp3BE,28919
+ opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
  opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
  opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
  opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
  opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
- opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.12.dist-info/RECORD,,
+ opsci_toolbox-0.0.13.dist-info/METADATA,sha256=G_JhKg5tmYPkRUhAN2Uj9B6orX7x3TKWqIOKU_TjeIA,1727
+ opsci_toolbox-0.0.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ opsci_toolbox-0.0.13.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.13.dist-info/RECORD,,