opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,81 @@ import concurrent.futures
11
11
  import pandas as pd
12
12
  from tqdm import tqdm
13
13
 
14
def get_tweet_html(username: str, tweet_id: str, **kwargs) -> tuple:
    """
    Retrieves the HTML code of a tweet given the username and tweet ID.

    Args:
        username (str): The username of the Twitter account.
        tweet_id (str): The ID of the tweet.
        kwargs : additional query parameters to pass to the Twitter oEmbed
            endpoint. They override the defaults defined below.

    Returns:
        tuple: ``(html, username, tweet_id)``. ``html`` is the value of the
        ``"html"`` key of the JSON response when the endpoint answers with
        HTTP 200, and ``None`` otherwise. ``username`` and ``tweet_id`` are
        echoed back so that callers running requests in parallel can match
        each result to its input.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed (connection error, timeout, ...).
    """
    params = {
        "lang": "en",          # language of the features around the tweet
        "maxwidth": 550,       # size of the tweet
        "hide_media": False,   # to hide photo / video
        "hide_thread": False,  # to hide original message on replies
        "omit_script": True,   # whether to leave out the widgets.js script tag
        "align": None,         # to align the tweet {left,right,center,none}
        "theme": "light",      # theme of the tweet {light,dark}
        "dnt": True,           # do not use the embed for personalized suggestions / ads
    }
    params.update(kwargs)

    # Send the tweet URL as a regular query parameter so that requests
    # percent-encodes it, instead of splicing a raw URL into the query string.
    # It is set last so that it always reflects username / tweet_id.
    params["url"] = f"https://twitter.com/{username}/status/{tweet_id}"

    # Without a timeout a stalled connection would block the calling thread
    # forever; 30 seconds is a generous upper bound for a small JSON payload.
    response = requests.get(
        "https://publish.twitter.com/oembed", params=params, timeout=30
    )

    if response.status_code != 200:
        print(response.url, "Failed to fetch data from Twitter.")
        return None, username, tweet_id

    html = response.json().get("html")
    return html, username, tweet_id
50
+
51
+
52
def parallel_twitter_oembed(usernames, tweet_ids, **kwargs):
    """
    Scrapes Twitter oEmbed data for multiple tweets in parallel.

    Args:
        usernames (list): A list of Twitter usernames.
        tweet_ids (list): A list of tweet IDs corresponding to the tweets of the given usernames.
            The two inputs are paired with ``zip``, so extra items in the
            longer one are ignored.
        **kwargs: Additional keyword arguments to be passed to the `get_tweet_html` function.

    Returns:
        pandas.DataFrame: A DataFrame with the columns ``tweet_html``,
        ``user_name`` and ``message_id``, one row per completed request, in
        completion order (not input order). Requests that raised an exception
        are reported on stdout and left out of the DataFrame.
    """
    all_data = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit one scraping task per (username, tweet_id) pair
        futures = [
            executor.submit(get_tweet_html, username, tweet_id, **kwargs)
            for username, tweet_id in zip(usernames, tweet_ids)
        ]
        for future in tqdm(
            concurrent.futures.as_completed(futures),
            # Count the submitted tasks rather than len(usernames): zip() may
            # have truncated the inputs, and the inputs may not support len().
            total=len(futures),
            desc="Scraping Progress",
        ):
            try:
                data, username, tweet_id = future.result()
                all_data.append((data, username, tweet_id))
            except Exception as e:
                # Best effort: one failing tweet must not abort the whole batch
                print(f"Error downloading : {e}")

    df = pd.DataFrame(all_data, columns=["tweet_html", "user_name", "message_id"])
    return df
88
+
14
89
 
15
90
  def url_get_domain(url: str) -> str:
16
91
  """
@@ -1219,25 +1219,6 @@ def top_rows_per_category(df: pd.DataFrame,
1219
1219
  )[cols_to_keep]
1220
1220
  return df_gb
1221
1221
 
1222
def format_number(number: int) -> str:
    """
    Format a number into a human-readable string with K, M, or B suffixes.

    Args:
        number (int): The number to format. Negative values are supported and
            keep their sign.

    Returns:
        str: The formatted number as a string with an appropriate suffix.
        Values whose magnitude is below 1000 are returned unchanged via
        ``str(number)``; larger values are scaled and shown with one decimal.
    """
    magnitude = abs(number)
    if magnitude < 1000:
        return str(number)

    sign = "-" if number < 0 else ""
    for divisor, suffix in ((1_000, "K"), (1_000_000, "M")):
        scaled = round(magnitude / divisor, 1)
        # If rounding pushes the value up to 1000.0, move on to the next unit
        # so that 999_999 is shown as "1.0M" rather than "1000.0K".
        if scaled < 1000:
            return f"{sign}{scaled:.1f}{suffix}"
    return f"{sign}{magnitude / 1_000_000_000:.1f}B"
1240
-
1241
1222
 
1242
1223
 
1243
1224
  def unrar_file(rar_file_path : str, output_dir : str) -> None:
@@ -1330,4 +1311,42 @@ def remove_empty_folders(path: str):
1330
1311
  # If the directory is empty, remove it
1331
1312
  if not os.listdir(dir_path):
1332
1313
  os.rmdir(dir_path)
1333
- print(f"Removed empty folder: {dir_path}")
1314
+ print(f"Removed empty folder: {dir_path}")
1315
+
1316
+
1317
def categorize_percentiles(percentile: float) -> str:
    """
    Map a percentile (expressed as a fraction) to the label of its 10% band.

    Args:
    - percentile (float): The percentile value (between 0 and 1).

    Returns:
    - str: The band label, from '0-10%' up to '90-100%'. A value lying exactly
      on a band boundary belongs to the lower band (0.1 gives '0-10%').

    Raises:
    - ValueError: If the percentile value is outside the range [0, 1].
    """
    if not (0 <= percentile <= 1):
        raise ValueError("Percentile must be between 0 and 1 inclusive.")

    # Inclusive upper bound of each band, in ascending order
    bands = (
        (0.1, '0-10%'),
        (0.2, '10-20%'),
        (0.3, '20-30%'),
        (0.4, '30-40%'),
        (0.5, '40-50%'),
        (0.6, '50-60%'),
        (0.7, '60-70%'),
        (0.8, '70-80%'),
        (0.9, '80-90%'),
    )
    for upper_bound, label in bands:
        if percentile <= upper_bound:
            return label
    # Anything above 0.9 (and at most 1) falls in the top band
    return '90-100%'