opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ from tqdm import tqdm
5
5
  import re
6
6
  from datetime import datetime,timedelta
7
7
  from opsci_toolbox.helpers.dates import str_to_datetime
8
+ from opsci_toolbox.helpers.common import write_jsonl
8
9
 
9
10
  def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "start_date", col_publishedBefore = "end_date", date_format = '%Y-%m-%d', rolling_days = 7 ):
10
11
  datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
@@ -278,6 +279,31 @@ def parse_tweet(json_data):
278
279
  df = pd.DataFrame.from_records(all_records, columns = all_cols)
279
280
  return df
280
281
 
282
def parse_twitter_list_details(json_data):
    """
    Parse Twitter list metadata returned by
    https://rapidapi.com/omarmhaimdat/api/twitter154 into a one-row DataFrame.

    Args:
        json_data (dict): raw JSON payload describing a single Twitter list.

    Returns:
        pd.DataFrame: a single row holding the list attributes followed by the
        list owner's fields as produced by parse_user.
    """
    list_id = json_data.get("list_id", "")
    list_id_str = json_data.get("list_id_str", "")
    member_count = json_data.get("member_count", 0)
    name = json_data.get("name", "")
    # NOTE: the output column keeps the historical misspelling "suscriber_count"
    # so existing consumers of this DataFrame are not broken.
    suscriber_count = json_data.get("subscriber_count", 0)
    creation_date = json_data.get("creation_date", 0)
    mode = json_data.get("mode", "0")

    user_record = parse_user(json_data.get("user", {}))
    record = (list_id, list_id_str, member_count, name, suscriber_count, creation_date, mode) + user_record
    cols = ["list_id", "list_id_str", "member_count", "name", "suscriber_count", "creation_date", "mode", "user_creation_date", "user_id", "user_username", "user_name", "user_follower_count", "user_following_count", "user_favourites_count", "user_is_private", "user_is_verified", "user_is_blue_verified", "user_location", "user_profile_pic_url", "user_profile_banner_url", "user_description", "user_external_url", "user_number_of_tweets", "user_bot", "user_timestamp", "user_has_nft_avatar", "user_category", "user_default_profile", "user_default_profile_image", "user_listed_count", "user_verified_type"]

    # BUG FIX: the original `from_records(record, cols)` passed `cols` as the
    # positional `index` argument and treated the scalar tuple as the records
    # sequence. Wrap the single record in a list and name the columns explicitly.
    df = pd.DataFrame.from_records([record], columns=cols)
    return df
300
+
301
+ ######################################################################################
302
+ # function to parse Instagram data
303
+ # https://rapidapi.com/JoTucker/api/instagram-scraper2
304
+ # https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
305
+ ######################################################################################
306
+
281
307
  def instagram_parse_hashtag_data(hashtag_data):
282
308
  hashtag_id = hashtag_data.get("id")
283
309
  hashtag_name = hashtag_data.get("name")
@@ -324,3 +350,59 @@ def instagram_parse_hashtag_data(hashtag_data):
324
350
  return df
325
351
 
326
352
 
353
+ ######################################################################################
354
+ # function to parse Twitter data
355
+ # https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
356
+ ######################################################################################
357
def compile_list_entries(json_data, path_json, filename):
    """
    Extract the pagination cursor and member records from a
    https://twttrapi.p.rapidapi.com/list-members response, persisting the
    member records as JSONL via write_jsonl.

    Args:
        json_data (dict): raw API response.
        path_json (str): directory where the JSONL file is written.
        filename (str): name of the JSONL file (as expected by write_jsonl).

    Returns:
        tuple: (results, next_cursor) — results is the list of "legacy" user
        dicts found in the timeline entries; next_cursor is the "Bottom"
        cursor string to request the next page, or None when pagination is
        exhausted (cursor value starting with "0|") or absent.
    """
    results = []
    next_cursor = None
    instructions = json_data.get('data', {}).get('list', {}).get('timeline_response', {}).get("timeline", {}).get("instructions", [{}])
    entries = instructions[-1].get('entries', [])
    for entry in entries:
        # BUG FIX: "content" may be missing or explicitly null in the payload;
        # default to {} so the .get() calls below cannot raise AttributeError.
        content = entry.get("content") or {}
        if content.get("__typename") == "TimelineTimelineCursor":
            # Use logical `and` (original used bitwise `&`, which does not
            # short-circuit) — only the Bottom cursor drives pagination.
            if content.get("cursorType") == "Bottom":
                next_cursor = content.get("value", None)
                # A cursor whose first '|'-separated field is "0" marks the
                # final page; normalize it to None.
                if next_cursor and next_cursor.split('|')[0] == "0":
                    next_cursor = None
        else:
            legacy = content.get("content", {}).get('userResult', {}).get("result", {}).get("legacy", {})
            results.append(legacy)

    write_jsonl(results, path_json, filename)
    return results, next_cursor
378
+
379
+
380
def parse_list_entries(jsonl_data):
    """
    Parse the "legacy" user records collected from
    https://twttrapi.p.rapidapi.com/list-members into a DataFrame.

    Args:
        jsonl_data (iterable): iterable of user dicts (one per list member).

    Returns:
        pd.DataFrame: one row per user with profile counters plus the URLs,
        user mentions and hashtags (with their indices) extracted from the
        profile "entities" payload.
    """
    cols = ["id_str", "name", "screen_name", "created_at", "description", "statuses_count", "followers_count", "friends_count", "favourites_count", "media_count", "protected", "verified", "verified_type", "urls", "user_mentions", "user_mentions_indices", "hashtags", "hashtags_indices"]
    all_records = []
    for data in jsonl_data:
        id_str = data.get("id_str", "")
        name = data.get("name", "")
        screen_name = data.get("screen_name", "")
        created_at = data.get("created_at")
        description = data.get("description")
        statuses_count = data.get("statuses_count", 0)
        followers_count = data.get("followers_count", 0)
        friends_count = data.get("friends_count", 0)
        favourites_count = data.get("favourites_count", 0)
        media_count = data.get("media_count", 0)
        protected = data.get("protected", False)
        verified = data.get("verified", False)
        verified_type = data.get("verified_type", "")

        # BUG FIX: "entities" may be absent or null; the original
        # `data.get("entities")` returned None and the .get chains below
        # raised AttributeError. Default to {} instead.
        entities = data.get("entities") or {}
        description_entities = entities.get('description', {})
        urls = [u.get("expanded_url", "") for u in entities.get('url', {}).get("urls", [])]
        user_mentions = [m.get("screen_name", "") for m in description_entities.get('user_mentions', [])]
        user_mentions_indices = [m.get("indices", []) for m in description_entities.get('user_mentions', [])]
        hashtags = [h.get("text", "") for h in description_entities.get('hashtags', [])]
        hashtags_indices = [h.get("indices", []) for h in description_entities.get('hashtags', [])]

        record = (id_str, name, screen_name, created_at, description, statuses_count, followers_count, friends_count, favourites_count, media_count, protected, verified, verified_type, urls, user_mentions, user_mentions_indices, hashtags, hashtags_indices)
        all_records.append(record)
    df = pd.DataFrame.from_records(all_records, columns=cols)
    return df