opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff reflects the changes between publicly released versions of the package as they appear in its public registry, and is provided for informational purposes only.
- opsci_toolbox/apis/rapidapi_helpers.py +82 -0
- opsci_toolbox/helpers/common.py +566 -191
- opsci_toolbox/helpers/cv.py +298 -123
- opsci_toolbox/helpers/dataviz.py +1005 -216
- opsci_toolbox/helpers/dates.py +55 -8
- opsci_toolbox/helpers/nlp.py +768 -110
- opsci_toolbox/helpers/nlp_cuml.py +280 -0
- opsci_toolbox/helpers/sna.py +101 -10
- opsci_toolbox/helpers/surreaction.py +156 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/METADATA +9 -11
- opsci_toolbox-0.0.6.dist-info/RECORD +21 -0
- opsci_toolbox-0.0.2.dist-info/RECORD +0 -19
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@ from tqdm import tqdm
 import re
 from datetime import datetime,timedelta
 from opsci_toolbox.helpers.dates import str_to_datetime
+from opsci_toolbox.helpers.common import write_jsonl
 
 def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "start_date", col_publishedBefore = "end_date", date_format = '%Y-%m-%d', rolling_days = 7 ):
     datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
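The only change in this first hunk is the new import of write_jsonl, which the Twitter list helpers added further down use to persist raw API responses. As a rough sketch of the contract implied by the call write_jsonl(results, path_json, filename) later in this diff, a stand-in could look like the following; the name write_jsonl_sketch, the .jsonl extension and the return value are assumptions, not the actual implementation in opsci_toolbox.helpers.common.

# Hypothetical stand-in for opsci_toolbox.helpers.common.write_jsonl, inferred only
# from the call write_jsonl(results, path_json, filename) later in this diff.
# The real helper may differ (extension handling, encoding, return value).
import json
import os

def write_jsonl_sketch(records: list, path_json: str, filename: str) -> str:
    """Write a list of dicts to <path_json>/<filename>.jsonl, one JSON object per line."""
    os.makedirs(path_json, exist_ok=True)
    file_path = os.path.join(path_json, filename + ".jsonl")
    with open(file_path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    return file_path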
@@ -278,6 +279,31 @@ def parse_tweet(json_data):
     df = pd.DataFrame.from_records(all_records, columns = all_cols)
     return df
 
+def parse_twitter_list_details(json_data):
+    """
+    Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154
+    """
+    list_id = json_data.get("list_id", "")
+    list_id_str = json_data.get("list_id_str", "")
+    member_count = json_data.get("member_count", 0)
+    name = json_data.get("name", "")
+    suscriber_count = json_data.get("subscriber_count", 0)
+    creation_date = json_data.get("creation_date", 0)
+    mode = json_data.get("mode", "0")
+
+    user_record = parse_user(json_data.get("user", {}))
+    record = (list_id, list_id_str, member_count, name, suscriber_count, creation_date, mode) + user_record
+    cols = ["list_id", "list_id_str", "member_count", "name", "suscriber_count", "creation_date", "mode", "user_creation_date", "user_id", "user_username", "user_name", "user_follower_count", "user_following_count", "user_favourites_count", "user_is_private", "user_is_verified", "user_is_blue_verified", "user_location", "user_profile_pic_url", "user_profile_banner_url", "user_description", "user_external_url", "user_number_of_tweets", "user_bot", "user_timestamp", "user_has_nft_avatar", "user_category", "user_default_profile", "user_default_profile_image", "user_listed_count", "user_verified_type"]
+
+    df = pd.DataFrame.from_records(record, cols)
+    return df
+
+######################################################################################
+# function to parse Instagram data
+# https://rapidapi.com/JoTucker/api/instagram-scraper2
+# https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
+######################################################################################
+
 def instagram_parse_hashtag_data(hashtag_data):
     hashtag_id = hashtag_data.get("id")
     hashtag_name = hashtag_data.get("name")
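The new parse_twitter_list_details helper flattens a single list-details payload from the twitter154 RapidAPI endpoint into one record, reusing the module's parse_user for the owner's profile fields. Below is a hypothetical usage sketch on a response saved to disk; the file name is illustrative. As a caution, pd.DataFrame.from_records(record, cols) passes cols as the positional index argument of from_records rather than as columns, so callers may want to inspect the resulting frame's shape.

# Hypothetical usage of parse_twitter_list_details on a saved twitter154 response.
# The file name list_details_response.json is an assumption for illustration only.
import json
from opsci_toolbox.apis.rapidapi_helpers import parse_twitter_list_details

with open("list_details_response.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)  # raw payload from the twitter154 list-details endpoint

df_list = parse_twitter_list_details(json_data)
print(df_list)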
@@ -324,3 +350,59 @@ def instagram_parse_hashtag_data(hashtag_data):
     return df
 
 
+######################################################################################
+# function to parse Twitter data
+# https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
+######################################################################################
+def compile_list_entries(json_data, path_json, filename):
+    """
+    Function to return next cursor and list details from https://twttrapi.p.rapidapi.com/list-members
+    """
+    results = []
+    next_cursor = None
+    entries = json_data.get('data', {}).get('list', {}).get('timeline_response', {}).get("timeline", {}).get("instructions", [{}])[-1].get('entries',[])
+    if len(entries)>0:
+        for entry in entries:
+            content = entry.get("content")
+            if (content.get("__typename") == "TimelineTimelineCursor") & (content.get("cursorType") =="Bottom"):
+                next_cursor = content.get("value", None)
+                if next_cursor:
+                    if next_cursor.split('|')[0]=="0":
+                        next_cursor = None
+            if content.get("__typename") != "TimelineTimelineCursor":
+                legacy = content.get("content", {}). get('userResult', {}).get("result", {}).get("legacy", {})
+                results.append(legacy)
+
+    write_jsonl(results, path_json, filename)
+    return results, next_cursor
+
+
+def parse_list_entries(jsonl_data):
+    """
+    Function to parse list details from https://twttrapi.p.rapidapi.com/list-members
+    """
+    all_records=[]
+    for data in jsonl_data:
+        id_str = data.get("id_str","")
+        name = data.get("name","")
+        screen_name = data.get("screen_name", "")
+        created_at = data.get("created_at")
+        description = data.get("description")
+        statuses_count = data.get("statuses_count", 0)
+        followers_count = data.get("followers_count",0)
+        friends_count = data.get("friends_count",0)
+        favourites_count = data.get("favourites_count",0)
+        media_count = data.get("media_count",0)
+        protected = data.get("protected", False)
+        verified = data.get("verified", False)
+        verified_type = data.get("verified_type", "")
+        entities = data.get("entities")
+        urls = [url.get("expanded_url","") for url in entities.get('url', {}).get("urls",[])]
+        user_mentions = [um.get("screen_name","") for um in entities.get('description', {}).get('user_mentions', [])]
+        user_mentions_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('user_mentions', [])]
+        hashtags = [um.get("text","") for um in entities.get('description', {}).get('hashtags', [])]
+        hashtags_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('hashtags', [])]
+        record = (id_str, name, screen_name, created_at, description, statuses_count, followers_count, friends_count, favourites_count, media_count, protected, verified, verified_type, urls, user_mentions, user_mentions_indices, hashtags, hashtags_indices)
+        all_records.append(record)
+    df = pd.DataFrame.from_records(all_records, columns = ["id_str", "name", "screen_name", "created_at", "description", "statuses_count", "followers_count", "friends_count", "favourites_count", "media_count", "protected", "verified", "verified_type", "urls", "user_mentions", "user_mentions_indices", "hashtags", "hashtags_indices"])
+    return df
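In this hunk, compile_list_entries walks one page of the twttrapi list-members timeline, collects the legacy user objects, clears the Bottom cursor once its first "|"-separated segment is "0", and dumps the page to JSONL via write_jsonl; parse_list_entries then flattens those user objects into a DataFrame. A hypothetical pagination loop chaining the two could look like the sketch below; the request URL, header names, parameter names, output directory and the RAPIDAPI_KEY / LIST_ID placeholders are illustrative and should be checked against the twttrapi documentation.

# Hypothetical pagination loop combining compile_list_entries and parse_list_entries.
# Endpoint URL, headers, parameter names and placeholders are assumptions for illustration.
import requests
from opsci_toolbox.apis.rapidapi_helpers import compile_list_entries, parse_list_entries

url = "https://twttrapi.p.rapidapi.com/list-members"
headers = {
    "X-RapidAPI-Key": "RAPIDAPI_KEY",              # replace with a real key
    "X-RapidAPI-Host": "twttrapi.p.rapidapi.com",
}

all_members, cursor, page = [], None, 0
while True:
    params = {"list_id": "LIST_ID"}                # placeholder list id
    if cursor:
        params["cursor"] = cursor
    payload = requests.get(url, headers=headers, params=params).json()
    # Each page is also written to disk as JSONL by compile_list_entries.
    results, cursor = compile_list_entries(payload, "data/list_members", f"page_{page}")
    all_members.extend(results)
    page += 1
    if cursor is None:                             # no usable Bottom cursor left to follow
        break

df_members = parse_list_entries(all_members)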