PyPI - opsci-toolbox - Versions diffs - 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl - Mend

opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

opsci_toolbox/apis/webscraping.py +75 -0
opsci_toolbox/helpers/common.py +39 -20
opsci_toolbox/helpers/dataviz.py +4262 -1975
opsci_toolbox/helpers/nlp.py +121 -33
opsci_toolbox-0.0.12.dist-info/METADATA +53 -0
{opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD +8 -8
{opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/WHEEL +1 -1
opsci_toolbox-0.0.10.dist-info/METADATA +0 -53
{opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt +0 -0

opsci_toolbox/apis/webscraping.py CHANGED Viewed

@@ -11,6 +11,81 @@ import concurrent.futures
 import pandas as pd
 from tqdm import tqdm
+def get_tweet_html(username: str, tweet_id: str, **kwargs) -> str:
+    """
+    Retrieves the HTML code of a tweet given the username and tweet ID.
+    Args:
+        username (str): The username of the Twitter account.
+        tweet_id (str): The ID of the tweet.
+        kwargs : additional parameters to pass to the Twitter API.
+    Returns:
+        str: The HTML code of the tweet.
+    """
+    params = {'lang':"en",             # language of the features around the tweet
+              "maxwidth" : 550,        # size of the tweet
+              "hide_media":False,      # to hide photo / video
+              "hide_thread":False,     # to hide original message on replies
+              "omit_script": True,     # to include or not the JS script : <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
+              "align": None,           # to align the tweet {left,right,center,none}
+              "theme": "light",        # theme of the tweet {light,dark}
+              "dnt": True              # When set to true, the Tweet and its embedded page on your site are not used for purposes that include personalized suggestions and personalized ads.
+              }
+    params.update(kwargs)
+    url = f'https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{tweet_id}'
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        data = response.json()
+        html = data.get('html')
+        return html, username, tweet_id
+    else:
+        print(response.url, "Failed to fetch data from Twitter.")
+        return None, username, tweet_id
+def parallel_twitter_oembed(usernames, tweet_ids, **kwargs):
+    """
+    Scrapes Twitter oEmbed data for multiple tweets in parallel.
+    Args:
+        usernames (list): A list of Twitter usernames.
+        tweet_ids (list): A list of tweet IDs corresponding to the tweets of the given usernames.
+        **kwargs: Additional keyword arguments to be passed to the `get_tweet_html` function.
+    Returns:
+        pandas.DataFrame: A DataFrame containing the scraped tweet HTML, username, and message ID.
+    Raises:
+        Exception: If there is an error while downloading the tweet HTML.
+    """
+    all_data = []
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Submit scraping tasks for each URL and add tqdm progress bar
+        futures = [
+            executor.submit(get_tweet_html, username, tweet_id, **kwargs)
+            for username, tweet_id in zip(usernames, tweet_ids)
+        ]
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            total=len(usernames),
+            desc="Scraping Progress",
+        ):
+            try:
+                data, username, tweet_id = future.result()
+                all_data.append((data, username, tweet_id))
+            except Exception as e:
+                print(f"Error downloading : {e}")
+    df = pd.DataFrame(all_data, columns=["tweet_html", "user_name", "message_id"])
+    return df
 def url_get_domain(url: str) -> str:
     """

opsci_toolbox/helpers/common.py CHANGED Viewed

@@ -1219,25 +1219,6 @@ def top_rows_per_category(df: pd.DataFrame,
                 )[cols_to_keep]
     return df_gb
-def format_number(number: int) -> str:
-    """
-    Format a number into a human-readable string with K, M, or B suffixes.
-    Args:
-        number (int): The number to format.
-    Returns:
-        str: The formatted number as a string with an appropriate suffix.
-    """
-    if number < 1000:
-        return str(number)
-    elif number < 1000000:
-        return f"{number / 1000:.1f}K"
-    elif number < 1000000000:
-        return f"{number / 1000000:.1f}M"
-    else:
-        return f"{number / 1000000000:.1f}B"
 def unrar_file(rar_file_path : str, output_dir : str) -> None:
@@ -1330,4 +1311,42 @@ def remove_empty_folders(path: str):
             # If the directory is empty, remove it
             if not os.listdir(dir_path):
                 os.rmdir(dir_path)
-                print(f"Removed empty folder: {dir_path}")
+                print(f"Removed empty folder: {dir_path}")
+def categorize_percentiles(percentile: float) -> str:
+    """
+    Categorizes a percentile value into a string representing its range.
+    Args:
+    - percentile (float): The percentile value (between 0 and 1).
+    Returns:
+    - str: The category of the percentile value.
+    Raises:
+    - ValueError: If the percentile value is outside the range [0, 1].
+    """
+    if not (0 <= percentile <= 1):
+        raise ValueError("Percentile must be between 0 and 1 inclusive.")
+    if percentile <= 0.1:
+        return '0-10%'
+    elif percentile <= 0.2:
+        return '10-20%'
+    elif percentile <= 0.3:
+        return '20-30%'
+    elif percentile <= 0.4:
+        return '30-40%'
+    elif percentile <= 0.5:
+        return '40-50%'
+    elif percentile <= 0.6:
+        return '50-60%'
+    elif percentile <= 0.7:
+        return '60-70%'
+    elif percentile <= 0.8:
+        return '70-80%'
+    elif percentile <= 0.9:
+        return '80-90%'
+    else:
+        return '90-100%'

opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl