opsci-toolbox 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as published in their respective public registries.
- opsci_toolbox/apis/webscraping.py +75 -0
- opsci_toolbox/helpers/nlp.py +68 -1
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/METADATA +1 -1
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD +6 -6
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/webscraping.py
CHANGED
@@ -11,6 +11,81 @@ import concurrent.futures
 import pandas as pd
 from tqdm import tqdm
 
+def get_tweet_html(username: str, tweet_id: str, **kwargs) -> str:
+    """
+    Retrieves the HTML code of a tweet given the username and tweet ID.
+
+    Args:
+        username (str): The username of the Twitter account.
+        tweet_id (str): The ID of the tweet.
+        kwargs : additional parameters to pass to the Twitter API.
+
+    Returns:
+        str: The HTML code of the tweet.
+
+    """
+    params = {'lang':"en", # language of the features around the tweet
+              "maxwidth" : 550, # size of the tweet
+              "hide_media":False, # to hide photo / video
+              "hide_thread":False, # to hide original message on replies
+              "omit_script": True, # to include or not the JS script : <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
+              "align": None, # to align the tweet {left,right,center,none}
+              "theme": "light", # theme of the tweet {light,dark}
+              "dnt": True # When set to true, the Tweet and its embedded page on your site are not used for purposes that include personalized suggestions and personalized ads.
+              }
+
+    params.update(kwargs)
+
+    url = f'https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{tweet_id}'
+    response = requests.get(url, params=params)
+
+    if response.status_code == 200:
+        data = response.json()
+        html = data.get('html')
+        return html, username, tweet_id
+    else:
+        print(response.url, "Failed to fetch data from Twitter.")
+        return None, username, tweet_id
+
+
+def parallel_twitter_oembed(usernames, tweet_ids, **kwargs):
+    """
+    Scrapes Twitter oEmbed data for multiple tweets in parallel.
+
+    Args:
+        usernames (list): A list of Twitter usernames.
+        tweet_ids (list): A list of tweet IDs corresponding to the tweets of the given usernames.
+        **kwargs: Additional keyword arguments to be passed to the `get_tweet_html` function.
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing the scraped tweet HTML, username, and message ID.
+
+    Raises:
+        Exception: If there is an error while downloading the tweet HTML.
+
+    """
+    all_data = []
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Submit scraping tasks for each URL and add tqdm progress bar
+        futures = [
+            executor.submit(get_tweet_html, username, tweet_id, **kwargs)
+            for username, tweet_id in zip(usernames, tweet_ids)
+        ]
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            total=len(usernames),
+            desc="Scraping Progress",
+        ):
+            try:
+                data, username, tweet_id = future.result()
+                all_data.append((data, username, tweet_id))
+            except Exception as e:
+                print(f"Error downloading : {e}")
+
+    df = pd.DataFrame(all_data, columns=["tweet_html", "user_name", "message_id"])
+    return df
+
 
 def url_get_domain(url: str) -> str:
     """
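The two added functions can be combined to fetch embeds in batch. A minimal usage sketch, assuming the module is importable as `opsci_toolbox.apis.webscraping`; the usernames and tweet IDs below are placeholders:

    from opsci_toolbox.apis.webscraping import parallel_twitter_oembed

    # Placeholder handles and tweet IDs, for illustration only
    usernames = ["jack", "TwitterDev"]
    tweet_ids = ["20", "1293593516040269825"]

    # Each embed is fetched in its own thread; any oEmbed keyword argument
    # (theme, maxwidth, hide_media, ...) is forwarded to get_tweet_html.
    df_embeds = parallel_twitter_oembed(usernames, tweet_ids, theme="dark")
    print(df_embeds[["user_name", "message_id"]].head())

Requests that return a non-200 status are reported on stdout and keep a None `tweet_html`, while tasks that raise an exception are skipped.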
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -114,7 +114,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
         The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(remove_rt)
-    df[col_clean] = df[
+    df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
@@ -1042,6 +1042,73 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
 def get_lang_detector(nlp, name):
     return LanguageDetector(seed=42)  # We use the seed 42
 
+def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "lemmatized_text", pos_to_keep: list = ["VERB","NOUN","ADJ", "ADV", "PROPN"], entities_to_keep: list = ['PERSON','ORG', 'LOC'], stopwords: list = [], batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
+    """
+    Perform natural language processing tasks using spaCy for PR Army project.
+    Its main tasks are lemmatization and named entity recognition (NER).
+
+    Args:
+        nlp : spacy.Language
+            The spaCy language model.
+        df : pandas.DataFrame
+            The DataFrame containing the text data.
+        col_text : str
+            The name of the column containing the text data.
+        col_lemma : str
+            The name of the column to store the lemmatized text data.
+        pos_to_keep : list
+            A list of part-of-speech tags to keep during lemmatization.
+        entities_to_keep : list
+            A list of NER tags to keep.
+        stopwords : list
+            A list of stopwords to remove during processing.
+        batch_size : int, optional
+            The batch size for spaCy processing. Default is 100.
+        n_process : int, optional
+            The number of processes for parallel processing. Default is 1.
+    Returns:
+        pandas.DataFrame
+            The DataFrame with processed text data.
+
+    """
+    all_records = []
+    text = list(df[col_text].astype('unicode').values)
+
+    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total=len(text), desc="NLP Process"):
+        NER_type = []
+        NER_text = []
+
+        ### LEMMATIZATION
+
+        if len(pos_to_keep)>0 and len(stopwords)>0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep)>0 and len(stopwords) < 1:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep) < 1 and len(stopwords) > 0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords]
+        else:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space)]
+
+        ### NER
+        if len(entities_to_keep)>0:
+            for ent in doc.ents:
+                if ent.label_ in entities_to_keep:
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+        else:
+            for ent in doc.ents:
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+
+        record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
+        all_records.append(record)
+
+    df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
+
+    return df
+
 def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
     Perform natural language processing tasks using spaCy for topic modeling.
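A minimal sketch of calling the new `PRarmy_nlp_process` helper, assuming it is importable from `opsci_toolbox.helpers.nlp` and that a spaCy pipeline with tagger and NER components is installed (`en_core_web_sm` is just an example model; the sample sentence is made up):

    import pandas as pd
    import spacy
    from opsci_toolbox.helpers.nlp import PRarmy_nlp_process

    # Any model exposing .pipe, POS tags and doc.ents should work here
    nlp = spacy.load("en_core_web_sm")

    df = pd.DataFrame({"text": ["The Red Cross opened a new office in Berlin."]})
    # Adds the NER_type, NER_text and lemmatized-text columns to the DataFrame
    df = PRarmy_nlp_process(nlp, df, col_text="text", col_lemma="lemmas",
                            entities_to_keep=["ORG", "GPE", "LOC"])
    print(df[["NER_type", "NER_text", "lemmas"]])

The results are assigned back with `index=df.index`, so rows keep their original alignment with the input DataFrame.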
{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
-opsci_toolbox/apis/webscraping.py,sha256=
+opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
 opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
 opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
-opsci_toolbox/helpers/nlp.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
 opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
 opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -16,7 +16,7 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
+opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.12.dist-info/RECORD,,
{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/WHEEL
File without changes

{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt
File without changes
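For reference, the RECORD lines above follow the wheel convention `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing padding stripped. A short sketch (not part of the package) for recomputing an entry from an unpacked wheel; the file path is an example:

    import base64
    import hashlib

    def record_entry(path: str) -> str:
        """Recompute a wheel RECORD-style line for a local file."""
        with open(path, "rb") as f:
            data = f.read()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
        return f"{path},sha256={digest},{len(data)}"

    # Example: verify the value recorded for webscraping.py
    print(record_entry("opsci_toolbox/apis/webscraping.py"))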