twitwi 0.20.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
twitwi/normalizers.py CHANGED
@@ -8,31 +8,27 @@
8
8
  #
9
9
  import re
10
10
  from copy import deepcopy
11
- from datetime import datetime
12
11
  from html import unescape
13
12
 
14
13
  from twitwi.exceptions import TwitterPayloadV2IncompleteIncludesError
15
14
  from twitwi.utils import (
15
+ get_collection_time,
16
16
  get_dates,
17
17
  custom_normalize_url,
18
18
  validate_payload_v2,
19
- custom_get_normalized_hostname
19
+ custom_get_normalized_hostname,
20
20
  )
21
21
 
22
- CLEAN_RT_PATTERN = re.compile(r'^RT @\w+: ')
23
-
24
-
25
- def get_collection_time():
26
- return datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')
22
+ CLEAN_RT_PATTERN = re.compile(r"^RT @\w+: ")
27
23
 
28
24
 
29
25
  def format_rt_text(user, text):
30
- return 'RT @%s: %s' % (user, text)
26
+ return "RT @%s: %s" % (user, text)
31
27
 
32
28
 
33
29
  def format_qt_text(user, text, quoted_text, url):
34
- clean_url = re.sub(r'(\?s=\d+|/(video|photo)/\d+)+', '', url.lower())
35
- quote = '« %s: %s — %s »' % (user, quoted_text, clean_url)
30
+ clean_url = re.sub(r"(\?s=\d+|/(video|photo)/\d+)+", "", url.lower())
31
+ quote = "« %s: %s — %s »" % (user, quoted_text, clean_url)
36
32
  text_lc = text.lower()
37
33
  if quote.lower() in text_lc:
38
34
  return text
@@ -40,43 +36,43 @@ def format_qt_text(user, text, quoted_text, url):
40
36
  url_pos = text_lc.find(url_lc)
41
37
  if url_pos != -1:
42
38
  url_len = len(url)
43
- return ("%s%s%s" % (text[:url_pos], quote, text[url_pos + url_len:])).strip()
39
+ return ("%s%s%s" % (text[:url_pos], quote, text[url_pos + url_len :])).strip()
44
40
  return "%s %s" % (text, quote)
45
41
 
46
42
 
47
43
  def format_tweet_url(screen_name, tweet_id):
48
- return 'https://twitter.com/%s/status/%s' % (screen_name, tweet_id)
44
+ return "https://twitter.com/%s/status/%s" % (screen_name, tweet_id)
49
45
 
50
46
 
51
47
  def extract_media_name_from_url(media_url):
52
- return media_url.rsplit('/', 1)[-1].split('?tag=', 1)[0]
48
+ return media_url.rsplit("/", 1)[-1].split("?tag=", 1)[0]
53
49
 
54
50
 
55
51
  def extract_items_from_text(text, char):
56
- splitter = re.compile(r'[^\w%s]+' % char)
52
+ splitter = re.compile(r"[^\w%s]+" % char)
57
53
 
58
54
  return sorted(
59
55
  set(
60
56
  r.lstrip(char).lower()
61
- for r in splitter.split(CLEAN_RT_PATTERN.sub('', text))
57
+ for r in splitter.split(CLEAN_RT_PATTERN.sub("", text))
62
58
  if r.startswith(char)
63
59
  )
64
60
  )
65
61
 
66
62
 
67
63
  def extract_hashtags_from_text(text):
68
- return extract_items_from_text(text, '#')
64
+ return extract_items_from_text(text, "#")
69
65
 
70
66
 
71
67
  def extract_mentions_from_text(text):
72
- return extract_items_from_text(text, '@')
68
+ return extract_items_from_text(text, "@")
73
69
 
74
70
 
75
71
  def resolve_entities(tweet, prefix):
76
- status_key = '%s_status' % prefix
72
+ status_key = "%s_status" % prefix
77
73
  target = tweet[status_key]
78
74
 
79
- for ent in ['entities', 'extended_entities']:
75
+ for ent in ["entities", "extended_entities"]:
80
76
  if ent not in target:
81
77
  continue
82
78
  tweet[ent] = tweet.get(ent, {})
@@ -87,63 +83,57 @@ def resolve_entities(tweet, prefix):
87
83
 
88
84
 
89
85
  def get_bitrate(x):
90
- return x.get('bitrate', 0)
86
+ return x.get("bitrate", 0)
91
87
 
92
88
 
93
89
  def get_bitrate_v2(x):
94
- return x.get('bit_rate', 0)
90
+ return x.get("bit_rate", 0)
95
91
 
96
92
 
97
93
  def nostr_field(f):
98
- return f.replace('_str', '')
94
+ return f.replace("_str", "")
99
95
 
100
96
 
101
97
  META_FIELDS = [
102
- 'in_reply_to_status_id_str',
103
- 'in_reply_to_screen_name',
104
- 'in_reply_to_user_id_str',
105
- 'lang',
106
- 'possibly_sensitive',
107
- 'retweet_count',
108
- 'favorite_count',
109
- 'reply_count'
98
+ "in_reply_to_status_id_str",
99
+ "in_reply_to_screen_name",
100
+ "in_reply_to_user_id_str",
101
+ "lang",
102
+ "possibly_sensitive",
103
+ "retweet_count",
104
+ "favorite_count",
105
+ "reply_count",
110
106
  ]
111
107
 
112
108
  META_FIELD_TRANSLATIONS = {
113
- 'in_reply_to_status_id_str': 'to_tweetid',
114
- 'in_reply_to_screen_name': 'to_username',
115
- 'in_reply_to_user_id_str': 'to_userid',
116
- 'favorite_count': 'like_count'
109
+ "in_reply_to_status_id_str": "to_tweetid",
110
+ "in_reply_to_screen_name": "to_username",
111
+ "in_reply_to_user_id_str": "to_userid",
112
+ "favorite_count": "like_count",
117
113
  }
118
114
 
119
115
  USER_META_FIELDS = [
120
- 'screen_name',
121
- 'name',
122
- 'friends_count',
123
- 'followers_count',
124
- 'location',
125
- 'verified',
126
- 'description',
127
- 'created_at'
116
+ "screen_name",
117
+ "name",
118
+ "friends_count",
119
+ "followers_count",
120
+ "location",
121
+ "verified",
122
+ "description",
123
+ "created_at",
128
124
  ]
129
125
 
130
- PLACE_META_FIELDS = [
131
- 'country_code',
132
- 'full_name',
133
- 'place_type'
134
- ]
126
+ PLACE_META_FIELDS = ["country_code", "full_name", "place_type"]
135
127
 
136
128
 
137
129
  def grab_extra_meta(source, result, locale=None):
138
-
139
- if source.get('coordinates'):
140
- result['coordinates'] = source['coordinates']['coordinates']
141
- result['lat'] = source['coordinates']['coordinates'][1]
142
- result['lng'] = source['coordinates']['coordinates'][0]
130
+ if source.get("coordinates"):
131
+ result["coordinates"] = source["coordinates"]["coordinates"]
132
+ result["lat"] = source["coordinates"]["coordinates"][1]
133
+ result["lng"] = source["coordinates"]["coordinates"][0]
143
134
  else:
144
-
145
135
  # TODO: this is hardly optimal
146
- result['coordinates'] = None
136
+ result["coordinates"] = None
147
137
 
148
138
  for meta in META_FIELDS:
149
139
  if meta in source:
@@ -152,54 +142,72 @@ def grab_extra_meta(source, result, locale=None):
152
142
  result[meta] = str(source[nostr_field(meta)])
153
143
 
154
144
  # impression_count when scraping
155
- if 'ext_views' in source:
156
- result['impression_count'] = source['ext_views'].get('count')
145
+ if "ext_views" in source:
146
+ result["impression_count"] = source["ext_views"].get("count")
157
147
 
158
148
  for meta in USER_META_FIELDS:
159
- key = 'user_%s' % meta.replace('_count', '')
149
+ key = "user_%s" % meta.replace("_count", "")
160
150
  if key in source:
161
151
  result[key] = source[key]
162
- elif 'user' in source and meta in source['user']:
163
- result[key] = source['user'][meta] if source['user'][meta] != "" else None
152
+ elif "user" in source and meta in source["user"]:
153
+ result[key] = source["user"][meta] if source["user"][meta] != "" else None
164
154
 
165
- if 'user' in source:
166
- result['user_id'] = source['user']['id_str']
167
- result['user_tweets'] = source['user']['statuses_count']
168
- result['user_likes'] = source['user']['favourites_count']
169
- result['user_lists'] = source['user']['listed_count']
170
- result['user_image'] = source['user']['profile_image_url_https']
155
+ if "user" in source:
156
+ result["user_id"] = source["user"]["id_str"]
157
+ result["user_tweets"] = source["user"]["statuses_count"]
158
+ result["user_likes"] = source["user"]["favourites_count"]
159
+ result["user_lists"] = source["user"]["listed_count"]
160
+ result["user_image"] = source["user"]["profile_image_url_https"]
171
161
 
172
- if 'place' in source and source['place'] is not None:
162
+ if "place" in source and source["place"] is not None:
173
163
  for meta in PLACE_META_FIELDS:
174
- if meta in source['place']:
175
- key = 'place_%s' % meta.replace('place_', '').replace('full_', '')
176
- result[key] = source['place'][meta]
177
-
178
- if 'bounding_box' in source['place'] \
179
- and source['place']['bounding_box'] is not None \
180
- and 'coordinates' in source['place']['bounding_box']:
181
- result['place_coordinates'] = source['place']['bounding_box']['coordinates'][0]
164
+ if meta in source["place"]:
165
+ key = "place_%s" % meta.replace("place_", "").replace("full_", "")
166
+ result[key] = source["place"][meta]
167
+
168
+ if (
169
+ "bounding_box" in source["place"]
170
+ and source["place"]["bounding_box"] is not None
171
+ and "coordinates" in source["place"]["bounding_box"]
172
+ ):
173
+ result["place_coordinates"] = source["place"]["bounding_box"][
174
+ "coordinates"
175
+ ][0]
182
176
 
183
177
  # TODO: nested_get
184
178
  try:
185
- result['user_url'] = source['user']['entities']['url']['urls'][0]['expanded_url']
179
+ result["user_url"] = source["user"]["entities"]["url"]["urls"][0][
180
+ "expanded_url"
181
+ ]
186
182
  except (KeyError, IndexError):
187
183
  try:
188
- result['user_url'] = source['user']['url']
184
+ result["user_url"] = source["user"]["url"]
189
185
  except KeyError:
190
186
  pass
191
187
 
192
- if 'user_created_at' in result:
193
- result['user_timestamp_utc'], result['user_created_at'] = get_dates(result['user_created_at'], locale)
188
+ if "user_created_at" in result:
189
+ result["user_timestamp_utc"], result["user_created_at"] = get_dates(
190
+ result["user_created_at"], locale
191
+ )
194
192
 
195
- if source.get('source'):
196
- result['source_url'], result['source_name'] = source['source'].replace('<a href="', '').replace('</a>', '').split('" rel="nofollow">')
193
+ if source.get("source"):
194
+ result["source_url"], result["source_name"] = (
195
+ source["source"]
196
+ .replace('<a href="', "")
197
+ .replace("</a>", "")
198
+ .split('" rel="nofollow">')
199
+ )
197
200
 
198
201
  return result
199
202
 
200
203
 
201
- def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
202
- collection_source=None, pure=True):
204
+ def normalize_tweet(
205
+ tweet,
206
+ locale=None,
207
+ extract_referenced_tweets=False,
208
+ collection_source=None,
209
+ pure=True,
210
+ ):
203
211
  """
204
212
  Function "normalizing" a tweet as returned by Twitter's API in order to
205
213
  cleanup and optimize some fields.
@@ -229,11 +237,11 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
229
237
 
230
238
  results = []
231
239
 
232
- if 'extended_tweet' in tweet:
233
- for field in tweet['extended_tweet']:
234
- tweet[field] = tweet['extended_tweet'][field]
240
+ if "extended_tweet" in tweet:
241
+ for field in tweet["extended_tweet"]:
242
+ tweet[field] = tweet["extended_tweet"][field]
235
243
 
236
- text = tweet.get('full_text', tweet.get('text', ''))
244
+ text = tweet.get("full_text", tweet.get("text", ""))
237
245
 
238
246
  rti = None
239
247
  rtu = None
@@ -245,17 +253,20 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
245
253
  qtuid = None
246
254
  qtime = None
247
255
 
248
- if 'retweeted_status' in tweet and tweet['retweeted_status']['id_str'] != tweet['id_str']:
249
- rti = tweet['retweeted_status']['id_str']
250
- rtu = tweet['retweeted_status']['user']['screen_name']
251
- rtuid = tweet['retweeted_status']['user']['id_str']
256
+ if (
257
+ "retweeted_status" in tweet
258
+ and tweet["retweeted_status"]["id_str"] != tweet["id_str"]
259
+ ):
260
+ rti = tweet["retweeted_status"]["id_str"]
261
+ rtu = tweet["retweeted_status"]["user"]["screen_name"]
262
+ rtuid = tweet["retweeted_status"]["user"]["id_str"]
252
263
 
253
264
  nested = normalize_tweet(
254
- tweet['retweeted_status'],
265
+ tweet["retweeted_status"],
255
266
  locale=locale,
256
267
  extract_referenced_tweets=True,
257
- collection_source='retweet',
258
- pure=False
268
+ collection_source="retweet",
269
+ pure=False,
259
270
  )
260
271
 
261
272
  rtweet = nested[-1]
@@ -263,21 +274,23 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
263
274
  if extract_referenced_tweets:
264
275
  results.extend(nested)
265
276
 
266
- rtime = rtweet['timestamp_utc']
277
+ rtime = rtweet["timestamp_utc"]
267
278
 
268
- resolve_entities(tweet, 'retweeted')
279
+ resolve_entities(tweet, "retweeted")
269
280
 
270
- elif 'quoted_status' in tweet and tweet['quoted_status']['id_str'] != tweet['id_str']:
271
- qti = tweet['quoted_status']['id_str']
272
- qtu = tweet['quoted_status']['user']['screen_name']
273
- qtuid = tweet['quoted_status']['user']['id_str']
281
+ elif (
282
+ "quoted_status" in tweet and tweet["quoted_status"]["id_str"] != tweet["id_str"]
283
+ ):
284
+ qti = tweet["quoted_status"]["id_str"]
285
+ qtu = tweet["quoted_status"]["user"]["screen_name"]
286
+ qtuid = tweet["quoted_status"]["user"]["id_str"]
274
287
 
275
288
  nested = normalize_tweet(
276
- tweet['quoted_status'],
289
+ tweet["quoted_status"],
277
290
  locale=locale,
278
291
  extract_referenced_tweets=True,
279
- collection_source='quote',
280
- pure=False
292
+ collection_source="quote",
293
+ pure=False,
281
294
  )
282
295
 
283
296
  qtweet = nested[-1]
@@ -286,12 +299,12 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
286
299
  results.extend(nested)
287
300
 
288
301
  if "quoted_status_permalink" in tweet:
289
- qturl = tweet['quoted_status_permalink']['expanded']
302
+ qturl = tweet["quoted_status_permalink"]["expanded"]
290
303
  else:
291
- qturl = qtweet['url']
292
- qtime = qtweet['timestamp_utc']
304
+ qturl = qtweet["url"]
305
+ qtime = qtweet["timestamp_utc"]
293
306
 
294
- resolve_entities(tweet, 'quoted')
307
+ resolve_entities(tweet, "quoted")
295
308
 
296
309
  medids = set()
297
310
  media_urls = []
@@ -303,54 +316,56 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
303
316
  hashtags = set()
304
317
  mentions = {}
305
318
 
306
- if 'entities' in tweet or 'extended_entities' in tweet:
307
- source_id = rti or qti or tweet['id_str']
319
+ if "entities" in tweet or "extended_entities" in tweet:
320
+ source_id = rti or qti or tweet["id_str"]
308
321
 
309
- entities = tweet.get('extended_entities', tweet['entities']).get('media', [])
310
- entities += tweet['entities'].get('urls', [])
322
+ entities = tweet.get("extended_entities", tweet["entities"]).get("media", [])
323
+ entities += tweet["entities"].get("urls", [])
311
324
 
312
325
  for entity in entities:
313
- if 'expanded_url' in entity and 'url' in entity and entity['expanded_url']:
326
+ if "expanded_url" in entity and "url" in entity and entity["expanded_url"]:
314
327
  try:
315
- text = text.replace(entity['url'], entity['expanded_url'])
328
+ text = text.replace(entity["url"], entity["expanded_url"])
316
329
  except KeyError:
317
330
  pass
318
331
 
319
- if 'media_url' in entity or 'media_url_https' in entity:
320
- if 'video_info' in entity:
321
- med_url = max(entity['video_info']['variants'], key=get_bitrate)['url']
332
+ if "media_url" in entity or "media_url_https" in entity:
333
+ if "video_info" in entity:
334
+ med_url = max(entity["video_info"]["variants"], key=get_bitrate)[
335
+ "url"
336
+ ]
322
337
  else:
323
- med_url = entity['media_url_https']
338
+ med_url = entity["media_url_https"]
324
339
 
325
340
  med_name = extract_media_name_from_url(med_url)
326
341
 
327
342
  if med_name not in medids:
328
343
  medids.add(med_name)
329
- media_types.append(entity['type'])
330
- media_urls.append(med_url.split('?tag=')[0])
331
- media_files.append('%s_%s' % (source_id, med_name))
332
- media_alt_texts.append(entity.get("ext_alt_text") or '')
344
+ media_types.append(entity["type"])
345
+ media_urls.append(med_url.split("?tag=")[0])
346
+ media_files.append("%s_%s" % (source_id, med_name))
347
+ media_alt_texts.append(entity.get("ext_alt_text") or "")
333
348
 
334
349
  # NOTE: fun fact, Twitter is starting to break down and we cannot guarantee
335
350
  # expanded_url exists anymore. It even crashes the website itself lol:
336
351
  # https://x.com/lmerzeau/status/426318495450943488
337
352
  elif "expanded_url" in entity:
338
- normalized = custom_normalize_url(entity['expanded_url'])
353
+ normalized = custom_normalize_url(entity["expanded_url"])
339
354
  links.add(normalized)
340
355
 
341
- for hashtag in tweet['entities'].get('hashtags', []):
342
- hashtags.add(hashtag['text'].lower())
356
+ for hashtag in tweet["entities"].get("hashtags", []):
357
+ hashtags.add(hashtag["text"].lower())
343
358
 
344
- for mention in tweet['entities'].get('user_mentions', []):
345
- mentions[mention['screen_name'].lower()] = mention['id_str']
359
+ for mention in tweet["entities"].get("user_mentions", []):
360
+ mentions[mention["screen_name"].lower()] = mention["id_str"]
346
361
 
347
362
  if rtu:
348
- text = format_rt_text(rtu, rtweet['text'])
363
+ text = format_rt_text(rtu, rtweet["text"])
349
364
  if rtweet["quoted_id"]:
350
365
  qturl = format_tweet_url(rtweet["quoted_user"], rtweet["quoted_id"])
351
366
 
352
367
  elif qtu:
353
- text = format_qt_text(qtu, text, qtweet['text'], qturl)
368
+ text = format_qt_text(qtu, text, qtweet["text"], qturl)
354
369
 
355
370
  if qturl:
356
371
  qturl_lc = custom_normalize_url(qturl).lower()
@@ -358,44 +373,50 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
358
373
  if link.lower() == qturl_lc:
359
374
  links.remove(link)
360
375
 
361
- timestamp_utc, local_time = get_dates(tweet['created_at'], locale)
376
+ timestamp_utc, local_time = get_dates(tweet["created_at"], locale)
362
377
  text = unescape(text)
363
378
 
364
379
  if collection_source is None:
365
- collection_source = tweet.get('collection_source')
380
+ collection_source = tweet.get("collection_source")
366
381
  links = sorted(links)
367
- domains = [custom_get_normalized_hostname(link, normalize_amp=False, infer_redirection=False)
368
- for link in links]
382
+ domains = [
383
+ custom_get_normalized_hostname(
384
+ link, normalize_amp=False, infer_redirection=False
385
+ )
386
+ for link in links
387
+ ]
369
388
  normalized_tweet = {
370
- 'id': tweet['id_str'],
371
- 'local_time': local_time,
372
- 'timestamp_utc': timestamp_utc,
373
- 'text': text,
374
- 'url': format_tweet_url(tweet['user']['screen_name'], tweet['id_str']),
375
- 'quoted_id': qti,
376
- 'quoted_user': qtu,
377
- 'quoted_user_id': qtuid,
378
- 'quoted_timestamp_utc': qtime,
379
- 'retweeted_id': rti,
380
- 'retweeted_user': rtu,
381
- 'retweeted_user_id': rtuid,
382
- 'retweeted_timestamp_utc': rtime,
383
- 'media_files': media_files,
384
- 'media_types': media_types,
385
- 'media_urls': media_urls,
386
- 'media_alt_texts': media_alt_texts,
387
- 'links': links,
388
- 'links_to_resolve': len(links) > 0,
389
- 'domains': domains,
390
- 'hashtags': sorted(hashtags) if hashtags else extract_hashtags_from_text(text),
391
- 'mentioned_ids': [mentions[m] for m in sorted(mentions.keys())],
392
- 'mentioned_names': sorted(mentions.keys()) if mentions else extract_mentions_from_text(text),
393
- 'collection_time': get_collection_time(),
394
- 'match_query': collection_source != 'thread' and collection_source != 'quote'
389
+ "id": tweet["id_str"],
390
+ "local_time": local_time,
391
+ "timestamp_utc": timestamp_utc,
392
+ "text": text,
393
+ "url": format_tweet_url(tweet["user"]["screen_name"], tweet["id_str"]),
394
+ "quoted_id": qti,
395
+ "quoted_user": qtu,
396
+ "quoted_user_id": qtuid,
397
+ "quoted_timestamp_utc": qtime,
398
+ "retweeted_id": rti,
399
+ "retweeted_user": rtu,
400
+ "retweeted_user_id": rtuid,
401
+ "retweeted_timestamp_utc": rtime,
402
+ "media_files": media_files,
403
+ "media_types": media_types,
404
+ "media_urls": media_urls,
405
+ "media_alt_texts": media_alt_texts,
406
+ "links": links,
407
+ "links_to_resolve": len(links) > 0,
408
+ "domains": domains,
409
+ "hashtags": sorted(hashtags) if hashtags else extract_hashtags_from_text(text),
410
+ "mentioned_ids": [mentions[m] for m in sorted(mentions.keys())],
411
+ "mentioned_names": sorted(mentions.keys())
412
+ if mentions
413
+ else extract_mentions_from_text(text),
414
+ "collection_time": get_collection_time(),
415
+ "match_query": collection_source != "thread" and collection_source != "quote",
395
416
  }
396
417
 
397
418
  if collection_source is not None:
398
- normalized_tweet['collected_via'] = [collection_source]
419
+ normalized_tweet["collected_via"] = [collection_source]
399
420
 
400
421
  grab_extra_meta(tweet, normalized_tweet, locale)
401
422
 
@@ -411,14 +432,14 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
411
432
 
412
433
 
413
434
  def resolve_user_entities(user):
414
- if 'entities' in user:
415
- for k in user['entities']:
416
- if 'urls' in user['entities'][k]:
417
- for url in user['entities'][k]['urls']:
418
- if not url.get('expanded_url'):
435
+ if "entities" in user:
436
+ for k in user["entities"]:
437
+ if "urls" in user["entities"][k]:
438
+ for url in user["entities"][k]["urls"]:
439
+ if not url.get("expanded_url"):
419
440
  continue
420
441
  if k in user:
421
- user[k] = user[k].replace(url['url'], url['expanded_url'])
442
+ user[k] = user[k].replace(url["url"], url["expanded_url"])
422
443
 
423
444
 
424
445
  def normalize_user(user, locale=None, pure=True, v2=False):
@@ -444,146 +465,179 @@ def normalize_user(user, locale=None, pure=True, v2=False):
444
465
 
445
466
  resolve_user_entities(user)
446
467
 
447
- timestamp_utc, local_time = get_dates(user['created_at'], locale, v2)
468
+ timestamp_utc, local_time = get_dates(
469
+ user["created_at"], locale, source="v2" if v2 else "v1"
470
+ )
448
471
 
449
- if v2 and 'withheld' in user:
450
- withheld = user['withheld']
451
- withheld_in_countries = withheld.get('country_codes', [])
452
- withheld_scope = withheld.get('withheld_scope', '')
472
+ if v2 and "withheld" in user:
473
+ withheld = user["withheld"]
474
+ withheld_in_countries = withheld.get("country_codes", [])
475
+ withheld_scope = withheld.get("withheld_scope", "")
453
476
  else:
454
477
  withheld_in_countries = []
455
- withheld_scope = ''
478
+ withheld_scope = ""
456
479
 
457
480
  normalized_user = {
458
- 'id': user['id_str'] if not v2 else user['id'],
459
- 'screen_name': user['screen_name'] if not v2 else user['username'],
460
- 'name': user['name'],
461
- 'description': user['description'] if user["description"] else None,
462
- 'url': user.get('url'),
463
- 'timestamp_utc': timestamp_utc,
464
- 'local_time': local_time,
465
- 'location': user.get('location'),
466
- 'verified': user.get('verified', False),
467
- 'protected': user.get('protected', False),
468
- 'tweets': user['statuses_count'] if not v2 else user['public_metrics']['tweet_count'],
469
- 'followers': user['followers_count'] if not v2 else user['public_metrics']['followers_count'],
470
- 'friends': user['friends_count'] if not v2 else user['public_metrics']['following_count'],
471
- 'likes': user['favourites_count'] if not v2 else None,
472
- 'lists': user['listed_count'] if not v2 else user['public_metrics']['listed_count'],
473
- 'image': user.get('profile_image_url_https') if not v2 else user.get('profile_image_url'),
474
- 'default_profile': user.get('default_profile', False),
475
- 'default_profile_image': user.get('default_profile_image', False),
476
- 'witheld_in_countries': user.get('witheld_in_countries', []) if not v2 else withheld_in_countries,
477
- 'witheld_scope': user.get('witheld_scope') if not v2 else withheld_scope
481
+ "id": user["id_str"] if not v2 else user["id"],
482
+ "screen_name": user["screen_name"] if not v2 else user["username"],
483
+ "name": user["name"],
484
+ "description": user["description"] if user["description"] else None,
485
+ "url": user.get("url"),
486
+ "timestamp_utc": timestamp_utc,
487
+ "local_time": local_time,
488
+ "location": user.get("location"),
489
+ "verified": user.get("verified", False),
490
+ "protected": user.get("protected", False),
491
+ "tweets": user["statuses_count"]
492
+ if not v2
493
+ else user["public_metrics"]["tweet_count"],
494
+ "followers": user["followers_count"]
495
+ if not v2
496
+ else user["public_metrics"]["followers_count"],
497
+ "friends": user["friends_count"]
498
+ if not v2
499
+ else user["public_metrics"]["following_count"],
500
+ "likes": user["favourites_count"] if not v2 else None,
501
+ "lists": user["listed_count"]
502
+ if not v2
503
+ else user["public_metrics"]["listed_count"],
504
+ "image": user.get("profile_image_url_https")
505
+ if not v2
506
+ else user.get("profile_image_url"),
507
+ "default_profile": user.get("default_profile", False),
508
+ "default_profile_image": user.get("default_profile_image", False),
509
+ "witheld_in_countries": user.get("witheld_in_countries", [])
510
+ if not v2
511
+ else withheld_in_countries,
512
+ "witheld_scope": user.get("witheld_scope") if not v2 else withheld_scope,
478
513
  }
479
514
 
480
515
  return normalized_user
481
516
 
482
517
 
483
- def includes_index(payload, key, index_key='id'):
484
- return {item[index_key]: item for item in payload['includes'].get(key, [])}
518
+ def includes_index(payload, key, index_key="id"):
519
+ return {item[index_key]: item for item in payload["includes"].get(key, [])}
485
520
 
486
521
 
487
522
  def get_best_url(item):
488
- if 'unwound_url' in item:
489
- return item['unwound_url']
523
+ if "unwound_url" in item:
524
+ return item["unwound_url"]
490
525
 
491
- if 'expanded_url' in item:
492
- return item['expanded_url']
526
+ if "expanded_url" in item:
527
+ return item["expanded_url"]
493
528
 
494
529
  return None
495
530
 
496
531
 
497
- def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_id,
498
- users_by_id, media_by_key, locale=None, collection_source=None,
499
- extract_referenced_tweets=False):
500
- timestamp_utc, local_time = get_dates(tweet['created_at'], locale=locale, v2=True)
532
+ def normalize_tweet_v2(
533
+ tweet,
534
+ *,
535
+ users_by_screen_name,
536
+ places_by_id,
537
+ tweets_by_id,
538
+ users_by_id,
539
+ media_by_key,
540
+ locale=None,
541
+ collection_source=None,
542
+ extract_referenced_tweets=False,
543
+ ):
544
+ timestamp_utc, local_time = get_dates(
545
+ tweet["created_at"], locale=locale, source="v2"
546
+ )
501
547
 
502
548
  try:
503
- user = users_by_id[tweet['author_id']]
549
+ user = users_by_id[tweet["author_id"]]
504
550
  except KeyError:
505
- raise TwitterPayloadV2IncompleteIncludesError('user', tweet['author_id'])
551
+ raise TwitterPayloadV2IncompleteIncludesError("user", tweet["author_id"])
506
552
 
507
- user_timestamp_utc, user_created_at = get_dates(user['created_at'], locale=locale, v2=True)
508
- user_entities = user.get('entities', {})
553
+ user_timestamp_utc, user_created_at = get_dates(
554
+ user["created_at"], locale=locale, source="v2"
555
+ )
556
+ user_entities = user.get("entities", {})
509
557
 
510
- entities = tweet.get('entities', {})
511
- referenced_tweets = tweet.get('referenced_tweets', [])
558
+ entities = tweet.get("entities", {})
559
+ referenced_tweets = tweet.get("referenced_tweets", [])
512
560
 
513
561
  hashtags = set()
514
562
 
515
- for hashtag in entities.get('hashtags', []):
516
- hashtags.add(hashtag['tag'])
563
+ for hashtag in entities.get("hashtags", []):
564
+ hashtags.add(hashtag["tag"])
517
565
 
518
566
  mentions = {}
519
567
 
520
- for mention in entities.get('mentions', []):
521
- if 'id' in mention:
522
- mentions[mention['username']] = mention['id']
568
+ for mention in entities.get("mentions", []):
569
+ if "id" in mention:
570
+ mentions[mention["username"]] = mention["id"]
523
571
  else:
524
572
  try:
525
- mentions[mention['username']] = users_by_screen_name[mention['username']]['id']
573
+ mentions[mention["username"]] = users_by_screen_name[
574
+ mention["username"]
575
+ ]["id"]
526
576
  except KeyError:
527
- raise TwitterPayloadV2IncompleteIncludesError('user', mention['username'])
577
+ raise TwitterPayloadV2IncompleteIncludesError(
578
+ "user", mention["username"]
579
+ )
528
580
 
529
581
  place_info = {}
530
582
 
531
- if 'geo' in tweet:
532
- geo_data = tweet['geo']
583
+ if "geo" in tweet:
584
+ geo_data = tweet["geo"]
533
585
 
534
- if 'coordinates' in geo_data:
535
- point = geo_data['coordinates']
586
+ if "coordinates" in geo_data:
587
+ point = geo_data["coordinates"]
536
588
 
537
- if point['type'] == 'Point':
538
- lng, lat = point['coordinates']
539
- place_info['lng'] = lng
540
- place_info['lat'] = lat
589
+ if point["type"] == "Point":
590
+ lng, lat = point["coordinates"]
591
+ place_info["lng"] = lng
592
+ place_info["lat"] = lat
541
593
 
542
- if 'place_id' in geo_data:
543
- place_data = places_by_id.get(geo_data['place_id'], {})
594
+ if "place_id" in geo_data:
595
+ place_data = places_by_id.get(geo_data["place_id"], {})
544
596
 
545
- if 'country_code' in place_data:
546
- place_info['place_country_code'] = place_data['country_code']
597
+ if "country_code" in place_data:
598
+ place_info["place_country_code"] = place_data["country_code"]
547
599
 
548
- if 'full_name' in place_data:
549
- place_info['place_name'] = place_data['full_name']
600
+ if "full_name" in place_data:
601
+ place_info["place_name"] = place_data["full_name"]
550
602
 
551
- if 'place_type' in place_data:
552
- place_info['place_type'] = place_data['place_type']
603
+ if "place_type" in place_data:
604
+ place_info["place_type"] = place_data["place_type"]
553
605
 
554
- if 'geo' in place_data and 'bbox' in place_data['geo']:
555
- place_info['place_coordinates'] = place_data['geo']['bbox']
606
+ if "geo" in place_data and "bbox" in place_data["geo"]:
607
+ place_info["place_coordinates"] = place_data["geo"]["bbox"]
556
608
 
557
609
  # Text
558
- text = tweet['text']
610
+ text = tweet["text"]
559
611
 
560
612
  # References
561
- refs = {t['type']: t['id'] for t in referenced_tweets}
613
+ refs = {t["type"]: t["id"] for t in referenced_tweets}
562
614
 
563
615
  # Reply
564
616
  reply_info = {}
565
617
 
566
- if 'replied_to' in refs:
567
- reply = tweets_by_id.get(refs['replied_to'], {})
568
- if 'author_id' in reply:
618
+ if "replied_to" in refs:
619
+ reply = tweets_by_id.get(refs["replied_to"], {})
620
+ if "author_id" in reply:
569
621
  try:
570
- reply_info['to_username'] = users_by_id[reply['author_id']]['username']
622
+ reply_info["to_username"] = users_by_id[reply["author_id"]]["username"]
571
623
  except KeyError:
572
- raise TwitterPayloadV2IncompleteIncludesError('replied_user', reply['author_id'])
624
+ raise TwitterPayloadV2IncompleteIncludesError(
625
+ "replied_user", reply["author_id"]
626
+ )
573
627
  else:
574
- reply_info['to_username'] = ''
575
- reply_info['to_userid'] = reply.get('author_id', '')
576
- reply_info['to_tweetid'] = reply.get('id', '')
628
+ reply_info["to_username"] = ""
629
+ reply_info["to_userid"] = reply.get("author_id", "")
630
+ reply_info["to_tweetid"] = reply.get("id", "")
577
631
 
578
632
  # Retweet
579
633
  retweet_info = {}
580
634
  normalized_retweet = None
581
635
 
582
- if 'retweeted' in refs:
583
- retweet_info['retweeted_id'] = refs['retweeted']
636
+ if "retweeted" in refs:
637
+ retweet_info["retweeted_id"] = refs["retweeted"]
584
638
 
585
- if refs['retweeted'] in tweets_by_id:
586
- retweet = tweets_by_id[refs['retweeted']]
639
+ if refs["retweeted"] in tweets_by_id:
640
+ retweet = tweets_by_id[refs["retweeted"]]
587
641
  normalized_retweet = normalize_tweet_v2(
588
642
  retweet,
589
643
  users_by_screen_name=users_by_screen_name,
@@ -592,22 +646,24 @@ def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_i
592
646
  users_by_id=users_by_id,
593
647
  media_by_key=media_by_key,
594
648
  locale=locale,
595
- collection_source='retweet'
649
+ collection_source="retweet",
596
650
  )
597
651
 
598
- retweet_info['retweeted_user'] = normalized_retweet['user_screen_name']
599
- retweet_info['retweeted_user_id'] = normalized_retweet['user_id']
600
- retweet_info['retweeted_timestamp_utc'] = normalized_retweet['timestamp_utc']
652
+ retweet_info["retweeted_user"] = normalized_retweet["user_screen_name"]
653
+ retweet_info["retweeted_user_id"] = normalized_retweet["user_id"]
654
+ retweet_info["retweeted_timestamp_utc"] = normalized_retweet[
655
+ "timestamp_utc"
656
+ ]
601
657
 
602
658
  # Quoted
603
659
  quote_info = {}
604
660
  normalized_quote = None
605
661
 
606
- if 'quoted' in refs:
607
- quote_info['quoted_id'] = refs['quoted']
662
+ if "quoted" in refs:
663
+ quote_info["quoted_id"] = refs["quoted"]
608
664
 
609
- if refs['quoted'] in tweets_by_id:
610
- quote = tweets_by_id[refs['quoted']]
665
+ if refs["quoted"] in tweets_by_id:
666
+ quote = tweets_by_id[refs["quoted"]]
611
667
  normalized_quote = normalize_tweet_v2(
612
668
  quote,
613
669
  users_by_screen_name=users_by_screen_name,
@@ -616,125 +672,126 @@ def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_i
616
672
  users_by_id=users_by_id,
617
673
  media_by_key=media_by_key,
618
674
  locale=locale,
619
- collection_source='quote'
675
+ collection_source="quote",
620
676
  )
621
677
 
622
- quote_info['quoted_user'] = normalized_quote['user_screen_name']
623
- quote_info['quoted_user_id'] = normalized_quote['user_id']
624
- quote_info['quoted_timestamp_utc'] = normalized_quote['timestamp_utc']
678
+ quote_info["quoted_user"] = normalized_quote["user_screen_name"]
679
+ quote_info["quoted_user_id"] = normalized_quote["user_id"]
680
+ quote_info["quoted_timestamp_utc"] = normalized_quote["timestamp_utc"]
625
681
 
626
682
  # Replace urls in text
627
683
  links = set()
628
684
 
629
- for url_data in entities.get('urls', []):
685
+ for url_data in entities.get("urls", []):
630
686
  replacement_url = get_best_url(url_data)
631
687
 
632
688
  if replacement_url:
633
- text = text.replace(url_data['url'], replacement_url)
689
+ text = text.replace(url_data["url"], replacement_url)
634
690
 
635
691
  if not replacement_url:
636
- replacement_url = url_data['url']
692
+ replacement_url = url_data["url"]
637
693
 
638
694
  links.add(custom_normalize_url(replacement_url))
639
695
 
640
696
  if normalized_retweet:
641
697
  text = format_rt_text(
642
- normalized_retweet['user_screen_name'],
643
- normalized_retweet['text']
698
+ normalized_retweet["user_screen_name"], normalized_retweet["text"]
644
699
  )
645
700
 
646
701
  if normalized_quote:
647
702
  text = format_qt_text(
648
- normalized_quote['user_screen_name'],
703
+ normalized_quote["user_screen_name"],
649
704
  text,
650
- normalized_quote['text'],
651
- normalized_quote['url']
705
+ normalized_quote["text"],
706
+ normalized_quote["url"],
652
707
  )
653
708
 
654
709
  # Metrics
655
- is_retweet = 'retweeted' in refs
656
- public_metrics = tweet['public_metrics']
657
- user_public_metrics = user['public_metrics']
710
+ is_retweet = "retweeted" in refs
711
+ public_metrics = tweet["public_metrics"]
712
+ user_public_metrics = user["public_metrics"]
658
713
 
659
- user_url = user.get('url')
714
+ user_url = user.get("url")
660
715
 
661
- if 'url' in user_entities and 'urls' in user_entities['url']:
662
- user_url_entity = user_entities['url']['urls'][0]
716
+ if "url" in user_entities and "urls" in user_entities["url"]:
717
+ user_url_entity = user_entities["url"]["urls"][0]
663
718
  user_url = get_best_url(user_url_entity) or user_url
664
719
 
665
720
  # Media
666
721
  medias = []
667
722
 
668
- if 'attachments' in tweet and 'media_keys' in tweet['attachments']:
669
- source_id = refs.get('retweeted_id', tweet['id'])
723
+ if "attachments" in tweet and "media_keys" in tweet["attachments"]:
724
+ source_id = refs.get("retweeted_id", tweet["id"])
670
725
 
671
- for media_key in tweet['attachments']['media_keys']:
726
+ for media_key in tweet["attachments"]["media_keys"]:
672
727
  if media_key in media_by_key:
673
728
  try:
674
729
  media_data = media_by_key[media_key]
675
730
  except KeyError:
676
- raise TwitterPayloadV2IncompleteIncludesError('media', media_key)
731
+ raise TwitterPayloadV2IncompleteIncludesError("media", media_key)
677
732
 
678
733
  if "variants" in media_data:
679
- media_url = max(media_data['variants'], key=get_bitrate_v2)['url']
734
+ media_url = max(media_data["variants"], key=get_bitrate_v2)["url"]
680
735
  else:
681
- media_url = media_data.get('url', '')
682
- medias.append((
683
- media_url,
684
- '%s_%s' % (source_id, extract_media_name_from_url(media_url)),
685
- media_data['type']
686
- ))
736
+ media_url = media_data.get("url", "")
737
+ medias.append(
738
+ (
739
+ media_url,
740
+ "%s_%s" % (source_id, extract_media_name_from_url(media_url)),
741
+ media_data["type"],
742
+ )
743
+ )
687
744
 
688
745
  if collection_source is None:
689
- collection_source = tweet.get('collection_source')
746
+ collection_source = tweet.get("collection_source")
690
747
 
691
748
  sorted_mentions = sorted(mentions.keys())
692
749
 
693
750
  normalized_tweet = {
694
- 'id': tweet['id'],
695
- 'local_time': local_time,
696
- 'timestamp_utc': timestamp_utc,
697
- 'text': unescape(text),
698
- 'url': format_tweet_url(user['username'], tweet['id']),
699
- 'hashtags': sorted(hashtags),
700
- 'mentioned_names': sorted_mentions,
701
- 'mentioned_ids': [mentions[k] for k in sorted_mentions],
702
- 'collection_time': get_collection_time(),
703
- 'user_id': user['id'],
704
- 'user_screen_name': user['username'],
705
- 'user_name': user['name'],
706
- 'user_image': user['profile_image_url'],
707
- 'user_url': user_url or None,
708
- 'user_location': user.get("location"),
709
- 'user_verified': user['verified'],
710
- 'user_description': user['description'] if user["description"] else None,
711
- 'user_tweets': user_public_metrics.get('tweet_count'),
712
- 'user_followers': user_public_metrics.get('followers_count'),
713
- 'user_friends': user_public_metrics.get('following_count'),
714
- 'user_lists': user_public_metrics.get('listed_count'),
715
- 'user_created_at': user_created_at,
716
- 'user_timestamp_utc': user_timestamp_utc,
717
- 'possibly_sensitive': tweet['possibly_sensitive'],
718
- 'like_count': public_metrics['like_count'] if not is_retweet else 0,
719
- 'retweet_count': public_metrics['retweet_count'] if not is_retweet else 0,
720
- 'quote_count': public_metrics['quote_count'] if not is_retweet else 0,
721
- 'reply_count': public_metrics['reply_count'] if not is_retweet else 0,
722
- 'impression_count': public_metrics.get('impression_count'),
723
- 'lang': tweet['lang'],
724
- 'source_name': tweet.get('source'),
725
- 'links': sorted(links),
726
- 'media_urls': [m[0] for m in medias],
727
- 'media_files': [m[1] for m in medias],
728
- 'media_types': [m[2] for m in medias],
729
- 'match_query': collection_source != 'thread' and collection_source != 'quote',
751
+ "id": tweet["id"],
752
+ "local_time": local_time,
753
+ "timestamp_utc": timestamp_utc,
754
+ "text": unescape(text),
755
+ "url": format_tweet_url(user["username"], tweet["id"]),
756
+ "hashtags": sorted(hashtags),
757
+ "mentioned_names": sorted_mentions,
758
+ "mentioned_ids": [mentions[k] for k in sorted_mentions],
759
+ "collection_time": get_collection_time(),
760
+ "user_id": user["id"],
761
+ "user_screen_name": user["username"],
762
+ "user_name": user["name"],
763
+ "user_image": user["profile_image_url"],
764
+ "user_url": user_url or None,
765
+ "user_location": user.get("location"),
766
+ "user_verified": user["verified"],
767
+ "user_description": user["description"] if user["description"] else None,
768
+ "user_tweets": user_public_metrics.get("tweet_count"),
769
+ "user_followers": user_public_metrics.get("followers_count"),
770
+ "user_friends": user_public_metrics.get("following_count"),
771
+ "user_lists": user_public_metrics.get("listed_count"),
772
+ "user_created_at": user_created_at,
773
+ "user_timestamp_utc": user_timestamp_utc,
774
+ "possibly_sensitive": tweet["possibly_sensitive"],
775
+ "like_count": public_metrics["like_count"] if not is_retweet else 0,
776
+ "retweet_count": public_metrics["retweet_count"] if not is_retweet else 0,
777
+ "quote_count": public_metrics["quote_count"] if not is_retweet else 0,
778
+ "reply_count": public_metrics["reply_count"] if not is_retweet else 0,
779
+ "impression_count": public_metrics.get("impression_count"),
780
+ "lang": tweet["lang"],
781
+ "source_name": tweet.get("source"),
782
+ "links": sorted(links),
783
+ "media_urls": [m[0] for m in medias],
784
+ "media_files": [m[1] for m in medias],
785
+ "media_types": [m[2] for m in medias],
786
+ "match_query": collection_source != "thread" and collection_source != "quote",
730
787
  **place_info,
731
788
  **reply_info,
732
789
  **retweet_info,
733
- **quote_info
790
+ **quote_info,
734
791
  }
735
792
 
736
793
  if collection_source is not None:
737
- normalized_tweet['collected_via'] = [collection_source]
794
+ normalized_tweet["collected_via"] = [collection_source]
738
795
 
739
796
  if extract_referenced_tweets:
740
797
  normalized_tweets = [normalized_tweet]
@@ -750,25 +807,25 @@ def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_i
750
807
  return normalized_tweet
751
808
 
752
809
 
753
- def normalize_tweets_payload_v2(payload, locale=None, extract_referenced_tweets=False,
754
- collection_source=None):
755
-
810
+ def normalize_tweets_payload_v2(
811
+ payload, locale=None, extract_referenced_tweets=False, collection_source=None
812
+ ):
756
813
  if not validate_payload_v2(payload):
757
- raise TypeError('given value is not a Twitter API v2 payload')
814
+ raise TypeError("given value is not a Twitter API v2 payload")
758
815
 
759
- if 'data' not in payload:
816
+ if "data" not in payload:
760
817
  return []
761
818
 
762
- users_by_screen_name = includes_index(payload, 'users', index_key='username')
763
- users_by_id = includes_index(payload, 'users')
764
- places_by_id = includes_index(payload, 'places')
765
- tweets_by_id = includes_index(payload, 'tweets')
766
- media_by_key = includes_index(payload, 'media', index_key='media_key')
819
+ users_by_screen_name = includes_index(payload, "users", index_key="username")
820
+ users_by_id = includes_index(payload, "users")
821
+ places_by_id = includes_index(payload, "places")
822
+ tweets_by_id = includes_index(payload, "tweets")
823
+ media_by_key = includes_index(payload, "media", index_key="media_key")
767
824
 
768
825
  output = []
769
826
  already_seen = {}
770
827
 
771
- for item in payload['data']:
828
+ for item in payload["data"]:
772
829
  normalized_tweets = normalize_tweet_v2(
773
830
  item,
774
831
  users_by_id=users_by_id,
@@ -778,23 +835,30 @@ def normalize_tweets_payload_v2(payload, locale=None, extract_referenced_tweets=
778
835
  media_by_key=media_by_key,
779
836
  locale=locale,
780
837
  collection_source=collection_source,
781
- extract_referenced_tweets=True
838
+ extract_referenced_tweets=True,
782
839
  )
783
840
 
784
841
  if extract_referenced_tweets:
785
842
  for normalized_tweet in normalized_tweets:
786
- k = int(normalized_tweet['id'])
843
+ k = int(normalized_tweet["id"])
787
844
  earlier_normalized_tweet = already_seen.get(k)
788
845
 
789
846
  if earlier_normalized_tweet is not None:
790
- if 'collected_via' in normalized_tweet:
791
- new_collection_source = normalized_tweet['collected_via'][0]
847
+ if "collected_via" in normalized_tweet:
848
+ new_collection_source = normalized_tweet["collected_via"][0]
792
849
 
793
- if 'collected_via' not in earlier_normalized_tweet:
794
- earlier_normalized_tweet['collected_via'] = [new_collection_source]
850
+ if "collected_via" not in earlier_normalized_tweet:
851
+ earlier_normalized_tweet["collected_via"] = [
852
+ new_collection_source
853
+ ]
795
854
  else:
796
- if new_collection_source not in earlier_normalized_tweet['collected_via']:
797
- earlier_normalized_tweet['collected_via'].append(new_collection_source)
855
+ if (
856
+ new_collection_source
857
+ not in earlier_normalized_tweet["collected_via"]
858
+ ):
859
+ earlier_normalized_tweet["collected_via"].append(
860
+ new_collection_source
861
+ )
798
862
 
799
863
  continue
800
864