opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/webscraping.py +75 -0
- opsci_toolbox/helpers/common.py +39 -20
- opsci_toolbox/helpers/dataviz.py +4262 -1975
- opsci_toolbox/helpers/nlp.py +121 -33
- opsci_toolbox-0.0.12.dist-info/METADATA +53 -0
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD +8 -8
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/WHEEL +1 -1
- opsci_toolbox-0.0.10.dist-info/METADATA +0 -53
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,81 @@ import concurrent.futures
|
|
11
11
|
import pandas as pd
|
12
12
|
from tqdm import tqdm
|
13
13
|
|
14
|
+
def get_tweet_html(username: str, tweet_id: str, **kwargs) -> "Tuple[Optional[str], str, str]":
    """
    Retrieves the embeddable HTML code of a tweet through Twitter's oEmbed endpoint.

    Args:
        username (str): The username of the Twitter account.
        tweet_id (str): The ID of the tweet.
        kwargs : additional query parameters to pass to the oEmbed endpoint.
            They are merged over the defaults defined below, so a key given
            here replaces the default of the same name.

    Returns:
        tuple: `(html, username, tweet_id)`. `html` is the value of the
        "html" key of the JSON response, or None when the endpoint answers
        with a status code other than 200 (or when the key is absent).
        `username` and `tweet_id` are echoed back unchanged so that callers
        running many requests concurrently can tell the results apart.

    Raises:
        requests.exceptions.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...).
    """
    params = {'lang': "en",  # language of the features around the tweet
              "maxwidth": 550,  # size of the tweet
              "hide_media": False,  # to hide photo / video
              "hide_thread": False,  # to hide original message on replies
              "omit_script": True,  # to include or not the JS script : <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
              "align": None,  # to align the tweet {left,right,center,none}; requests leaves None values out of the query string
              "theme": "light",  # theme of the tweet {light,dark}
              "dnt": True  # When set to true, the Tweet and its embedded page on your site are not used for purposes that include personalized suggestions and personalized ads.
              }

    params.update(kwargs)

    url = f'https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{tweet_id}'
    # Without a timeout a stalled connection would block forever, and would
    # also keep a worker thread busy when this function runs in a thread pool.
    response = requests.get(url, params=params, timeout=30)

    if response.status_code != 200:
        print(response.url, "Failed to fetch data from Twitter.")
        return None, username, tweet_id

    html = response.json().get('html')
    return html, username, tweet_id
|
50
|
+
|
51
|
+
|
52
|
+
def parallel_twitter_oembed(usernames, tweet_ids, **kwargs):
    """
    Scrapes Twitter oEmbed data for multiple tweets in parallel.

    Each `(username, tweet_id)` pair is handed to `get_tweet_html` on a thread
    pool. Pairs are built with `zip`, so if the two inputs differ in length
    the surplus items of the longer one are ignored.

    Args:
        usernames (list): A list of Twitter usernames.
        tweet_ids (list): A list of tweet IDs corresponding to the tweets of the given usernames.
        **kwargs: Additional keyword arguments to be passed to the `get_tweet_html` function.

    Returns:
        pandas.DataFrame: A DataFrame with the columns "tweet_html",
        "user_name" and "message_id", one row per task that completed without
        raising. Rows are appended in completion order, which is not
        necessarily the input order.

    Note:
        An exception raised by a task is caught, printed, and the matching
        row is left out of the result; it is not propagated to the caller.
    """
    all_data = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit one scraping task per (username, tweet_id) pair
        futures = [
            executor.submit(get_tweet_html, username, tweet_id, **kwargs)
            for username, tweet_id in zip(usernames, tweet_ids)
        ]
        for future in tqdm(
            concurrent.futures.as_completed(futures),
            # Count the tasks actually submitted: zip() may have truncated the
            # inputs, and this also works for iterables that have no len().
            total=len(futures),
            desc="Scraping Progress",
        ):
            try:
                data, username, tweet_id = future.result()
                all_data.append((data, username, tweet_id))
            except Exception as e:
                print(f"Error downloading : {e}")

    df = pd.DataFrame(all_data, columns=["tweet_html", "user_name", "message_id"])
    return df
|
88
|
+
|
14
89
|
|
15
90
|
def url_get_domain(url: str) -> str:
|
16
91
|
"""
|
opsci_toolbox/helpers/common.py
CHANGED
@@ -1219,25 +1219,6 @@ def top_rows_per_category(df: pd.DataFrame,
|
|
1219
1219
|
)[cols_to_keep]
|
1220
1220
|
return df_gb
|
1221
1221
|
|
1222
|
-
def format_number(number: int) -> str:
    """
    Format a number into a human-readable string with K, M, or B suffixes.

    Magnitudes below 1000 are returned unchanged via `str()`. Larger
    magnitudes are scaled to thousands, millions or billions and rendered
    with one decimal place. The sign is preserved, so negative numbers are
    abbreviated the same way as positive ones.

    Args:
        number (int): The number to format.

    Returns:
        str: The formatted number as a string with an appropriate suffix.
    """
    magnitude = abs(number)
    if magnitude < 1000:
        return str(number)

    sign = "-" if number < 0 else ""
    for divisor, suffix in ((1000, "K"), (1000000, "M")):
        scaled = f"{magnitude / divisor:.1f}"
        # Rounding to one decimal can push the value up to 1000.0 (for
        # example 999999 -> "1000.0K"); in that case move on to the next
        # unit so the result reads "1.0M" instead.
        if float(scaled) < 1000:
            return f"{sign}{scaled}{suffix}"
    return f"{sign}{magnitude / 1000000000:.1f}B"
|
1240
|
-
|
1241
1222
|
|
1242
1223
|
|
1243
1224
|
def unrar_file(rar_file_path : str, output_dir : str) -> None:
|
@@ -1330,4 +1311,42 @@ def remove_empty_folders(path: str):
|
|
1330
1311
|
# If the directory is empty, remove it
|
1331
1312
|
if not os.listdir(dir_path):
|
1332
1313
|
os.rmdir(dir_path)
|
1333
|
-
print(f"Removed empty folder: {dir_path}")
|
1314
|
+
print(f"Removed empty folder: {dir_path}")
|
1315
|
+
|
1316
|
+
|
1317
|
+
def categorize_percentiles(percentile: float) -> str:
    """
    Maps a percentile to the label of the ten-point band it falls into.

    Each band includes its upper bound, so 0.1 is labelled '0-10%' and
    0.2 is labelled '10-20%'.

    Args:
        - percentile (float): The percentile value (between 0 and 1).

    Returns:
        - str: The band label, from '0-10%' up to '90-100%'.

    Raises:
        - ValueError: If the percentile value is outside the range [0, 1].
    """
    in_range = 0 <= percentile <= 1
    if not in_range:
        raise ValueError("Percentile must be between 0 and 1 inclusive.")

    # Inclusive upper bound of each band, paired with the band's label.
    bands = (
        (0.1, '0-10%'),
        (0.2, '10-20%'),
        (0.3, '20-30%'),
        (0.4, '30-40%'),
        (0.5, '40-50%'),
        (0.6, '50-60%'),
        (0.7, '60-70%'),
        (0.8, '70-80%'),
        (0.9, '80-90%'),
    )
    for upper_bound, label in bands:
        if percentile <= upper_bound:
            return label

    # Anything above 0.9 (and at most 1) belongs to the top band.
    return '90-100%'
|