twitwi-0.20.0-py3-none-any.whl → twitwi-0.21.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/bluesky/__init__.py +0 -0
- test/bluesky/formatters_test.py +101 -0
- test/bluesky/normalizers_test.py +130 -0
- twitwi/__init__.py +19 -2
- twitwi/anonymizers.py +3 -9
- twitwi/bluesky/__init__.py +16 -0
- twitwi/bluesky/constants.py +19 -0
- twitwi/bluesky/formatters.py +29 -0
- twitwi/bluesky/normalizers.py +641 -0
- twitwi/bluesky/types.py +135 -0
- twitwi/bluesky/utils.py +103 -0
- twitwi/constants.py +324 -349
- twitwi/exceptions.py +8 -1
- twitwi/formatters.py +35 -37
- twitwi/normalizers.py +403 -339
- twitwi/utils.py +44 -17
- twitwi-0.21.0.dist-info/METADATA +435 -0
- twitwi-0.21.0.dist-info/RECORD +22 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/WHEEL +1 -1
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/top_level.txt +1 -0
- twitwi-0.20.0.dist-info/METADATA +0 -156
- twitwi-0.20.0.dist-info/RECORD +0 -13
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/zip-safe +0 -0
twitwi/normalizers.py
CHANGED
@@ -8,31 +8,27 @@
 #
 import re
 from copy import deepcopy
-from datetime import datetime
 from html import unescape
 
 from twitwi.exceptions import TwitterPayloadV2IncompleteIncludesError
 from twitwi.utils import (
+    get_collection_time,
     get_dates,
     custom_normalize_url,
     validate_payload_v2,
-    custom_get_normalized_hostname
+    custom_get_normalized_hostname,
 )
 
-CLEAN_RT_PATTERN = re.compile(r'^RT @\w+: ')
-
-
-def get_collection_time():
-    return datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')
+CLEAN_RT_PATTERN = re.compile(r"^RT @\w+: ")
 
 
 def format_rt_text(user, text):
-    return 'RT @%s: %s' % (user, text)
+    return "RT @%s: %s" % (user, text)
 
 
 def format_qt_text(user, text, quoted_text, url):
-    clean_url = re.sub(r'(\?s=\d+|/(video|photo)/\d+)+', '', url.lower())
-    quote = '« %s: %s — %s »' % (user, quoted_text, clean_url)
+    clean_url = re.sub(r"(\?s=\d+|/(video|photo)/\d+)+", "", url.lower())
+    quote = "« %s: %s — %s »" % (user, quoted_text, clean_url)
     text_lc = text.lower()
     if quote.lower() in text_lc:
         return text
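For context, a quick behavioral sketch of the two formatting helpers above. This is illustrative only: the outputs were traced by hand from the code in this hunk, and the quote case also relies on the unchanged `url_lc = url.lower()` line sitting between this hunk and the next.

    from twitwi.normalizers import format_rt_text, format_qt_text

    # Retweets get their legacy "RT @user: " prefix rebuilt:
    format_rt_text("jack", "just setting up my twttr")
    # -> 'RT @jack: just setting up my twttr'

    # Quotes splice the quoted text in place of the quote URL, with ?s=... and
    # /video/..., /photo/... suffixes stripped from the displayed permalink:
    format_qt_text(
        "jack",
        "so true https://twitter.com/jack/status/20?s=21",
        "just setting up my twttr",
        "https://twitter.com/jack/status/20?s=21",
    )
    # -> 'so true « jack: just setting up my twttr — https://twitter.com/jack/status/20 »'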
@@ -40,43 +36,43 @@ def format_qt_text(user, text, quoted_text, url):
     url_pos = text_lc.find(url_lc)
     if url_pos != -1:
         url_len = len(url)
-        return ("%s%s%s" % (text[:url_pos], quote, text[url_pos + url_len:])).strip()
+        return ("%s%s%s" % (text[:url_pos], quote, text[url_pos + url_len :])).strip()
     return "%s %s" % (text, quote)
 
 
 def format_tweet_url(screen_name, tweet_id):
-    return 'https://twitter.com/%s/status/%s' % (screen_name, tweet_id)
+    return "https://twitter.com/%s/status/%s" % (screen_name, tweet_id)
 
 
 def extract_media_name_from_url(media_url):
-    return media_url.rsplit('/', 1)[-1].split('?tag=', 1)[0]
+    return media_url.rsplit("/", 1)[-1].split("?tag=", 1)[0]
 
 
 def extract_items_from_text(text, char):
-    splitter = re.compile(r'[^\w%s]+' % char)
+    splitter = re.compile(r"[^\w%s]+" % char)
 
     return sorted(
         set(
             r.lstrip(char).lower()
-            for r in splitter.split(CLEAN_RT_PATTERN.sub('', text))
+            for r in splitter.split(CLEAN_RT_PATTERN.sub("", text))
             if r.startswith(char)
         )
     )
 
 
 def extract_hashtags_from_text(text):
-    return extract_items_from_text(text, '#')
+    return extract_items_from_text(text, "#")
 
 
 def extract_mentions_from_text(text):
-    return extract_items_from_text(text, '@')
+    return extract_items_from_text(text, "@")
 
 
 def resolve_entities(tweet, prefix):
-    status_key = '%s_status' % prefix
+    status_key = "%s_status" % prefix
     target = tweet[status_key]
 
-    for ent in ['entities', 'extended_entities']:
+    for ent in ["entities", "extended_entities"]:
         if ent not in target:
             continue
         tweet[ent] = tweet.get(ent, {})
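A hand-traced sketch of the extraction helpers above (inputs are made up; the behavior follows directly from extract_items_from_text as shown):

    from twitwi.normalizers import (
        extract_hashtags_from_text,
        extract_mentions_from_text,
    )

    # The "RT @user: " prefix is stripped first; items are then deduplicated,
    # lowercased and sorted:
    extract_hashtags_from_text("RT @user: Try #Twitwi, it rocks #twitwi")
    # -> ['twitwi']

    extract_mentions_from_text("hello @Jane and @john!")
    # -> ['jane', 'john']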
@@ -87,63 +83,57 @@ def resolve_entities(tweet, prefix):
 
 
 def get_bitrate(x):
-    return x.get('bitrate', 0)
+    return x.get("bitrate", 0)
 
 
 def get_bitrate_v2(x):
-    return x.get('bit_rate', 0)
+    return x.get("bit_rate", 0)
 
 
 def nostr_field(f):
-    return f.replace('_str', '')
+    return f.replace("_str", "")
 
 
 META_FIELDS = [
-    'in_reply_to_status_id_str',
-    'in_reply_to_screen_name',
-    'in_reply_to_user_id_str',
-    'lang',
-    'possibly_sensitive',
-    'retweet_count',
-    'favorite_count',
-    'reply_count',
+    "in_reply_to_status_id_str",
+    "in_reply_to_screen_name",
+    "in_reply_to_user_id_str",
+    "lang",
+    "possibly_sensitive",
+    "retweet_count",
+    "favorite_count",
+    "reply_count",
 ]
 
 META_FIELD_TRANSLATIONS = {
-    'in_reply_to_status_id_str': 'to_tweetid',
-    'in_reply_to_screen_name': 'to_username',
-    'in_reply_to_user_id_str': 'to_userid',
-    'favorite_count': 'like_count',
+    "in_reply_to_status_id_str": "to_tweetid",
+    "in_reply_to_screen_name": "to_username",
+    "in_reply_to_user_id_str": "to_userid",
+    "favorite_count": "like_count",
 }
 
 USER_META_FIELDS = [
-    'screen_name',
-    'name',
-    'friends_count',
-    'followers_count',
-    'location',
-    'verified',
-    'description',
-    'created_at',
+    "screen_name",
+    "name",
+    "friends_count",
+    "followers_count",
+    "location",
+    "verified",
+    "description",
+    "created_at",
 ]
 
-PLACE_META_FIELDS = [
-    'country_code',
-    'full_name',
-    'place_type'
-]
+PLACE_META_FIELDS = ["country_code", "full_name", "place_type"]
 
 
 def grab_extra_meta(source, result, locale=None):
-
-    if source.get('coordinates'):
-        result['coordinates'] = source['coordinates']['coordinates']
-        result['lat'] = source['coordinates']['coordinates'][1]
-        result['lng'] = source['coordinates']['coordinates'][0]
+    if source.get("coordinates"):
+        result["coordinates"] = source["coordinates"]["coordinates"]
+        result["lat"] = source["coordinates"]["coordinates"][1]
+        result["lng"] = source["coordinates"]["coordinates"][0]
     else:
-
         # TODO: this is hardly optimal
-        result['coordinates'] = None
+        result["coordinates"] = None
 
     for meta in META_FIELDS:
         if meta in source:
@@ -152,54 +142,72 @@ def grab_extra_meta(source, result, locale=None):
             result[meta] = str(source[nostr_field(meta)])
 
     # impression_count when scraping
-    if 'ext_views' in source:
-        result['impression_count'] = source['ext_views'].get('count')
+    if "ext_views" in source:
+        result["impression_count"] = source["ext_views"].get("count")
 
     for meta in USER_META_FIELDS:
-        key = 'user_%s' % meta.replace('_count', '')
+        key = "user_%s" % meta.replace("_count", "")
         if key in source:
             result[key] = source[key]
-        elif 'user' in source and meta in source['user']:
-            result[key] = source['user'][meta] if source['user'][meta] != '' else None
+        elif "user" in source and meta in source["user"]:
+            result[key] = source["user"][meta] if source["user"][meta] != "" else None
 
-    if 'user' in source:
-        result['user_id'] = source['user']['id_str']
-        result['user_tweets'] = source['user']['statuses_count']
-        result['user_likes'] = source['user']['favourites_count']
-        result['user_lists'] = source['user']['listed_count']
-        result['user_image'] = source['user']['profile_image_url_https']
+    if "user" in source:
+        result["user_id"] = source["user"]["id_str"]
+        result["user_tweets"] = source["user"]["statuses_count"]
+        result["user_likes"] = source["user"]["favourites_count"]
+        result["user_lists"] = source["user"]["listed_count"]
+        result["user_image"] = source["user"]["profile_image_url_https"]
 
-    if 'place' in source and source['place'] is not None:
+    if "place" in source and source["place"] is not None:
         for meta in PLACE_META_FIELDS:
-            if meta in source['place']:
-                key = 'place_%s' % meta.replace('place_', '').replace('full_', '')
-                result[key] = source['place'][meta]
-
-        if 'bounding_box' in source['place'] and \
-           source['place']['bounding_box'] is not None and \
-           'coordinates' in source['place']['bounding_box']:
-            result['place_coordinates'] = source['place']['bounding_box']['coordinates'][0]
+            if meta in source["place"]:
+                key = "place_%s" % meta.replace("place_", "").replace("full_", "")
+                result[key] = source["place"][meta]
+
+        if (
+            "bounding_box" in source["place"]
+            and source["place"]["bounding_box"] is not None
+            and "coordinates" in source["place"]["bounding_box"]
+        ):
+            result["place_coordinates"] = source["place"]["bounding_box"][
+                "coordinates"
+            ][0]
 
     # TODO: nested_get
     try:
-        result['user_url'] = source['user']['entities']['url']['urls'][0]['expanded_url']
+        result["user_url"] = source["user"]["entities"]["url"]["urls"][0][
+            "expanded_url"
+        ]
     except (KeyError, IndexError):
         try:
-            result['user_url'] = source['user']['url']
+            result["user_url"] = source["user"]["url"]
         except KeyError:
             pass
 
-    if 'user_created_at' in result:
-        result['user_timestamp_utc'], result['user_created_at'] = get_dates(result['user_created_at'], locale)
+    if "user_created_at" in result:
+        result["user_timestamp_utc"], result["user_created_at"] = get_dates(
+            result["user_created_at"], locale
+        )
 
-    if source.get('source'):
-        result['source_url'], result['source_name'] = source['source'].replace('<a href="', '').replace('</a>', '').split('" rel="nofollow">')
+    if source.get("source"):
+        result["source_url"], result["source_name"] = (
+            source["source"]
+            .replace('<a href="', "")
+            .replace("</a>", "")
+            .split('" rel="nofollow">')
+        )
 
     return result
 
 
-def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
-                    collection_source=None, pure=True):
+def normalize_tweet(
+    tweet,
+    locale=None,
+    extract_referenced_tweets=False,
+    collection_source=None,
+    pure=True,
+):
    """
    Function "normalizing" a tweet as returned by Twitter's API in order to
    cleanup and optimize some fields.
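As an aside, the `source` handling at the end of grab_extra_meta splits the v1 `source` HTML anchor into a URL and a client name. Traced by hand on an illustrative value:

    s = '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'
    url, name = (
        s.replace('<a href="', "").replace("</a>", "").split('" rel="nofollow">')
    )
    # url == 'http://twitter.com', name == 'Twitter Web Client'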
@@ -229,11 +237,11 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
 
     results = []
 
-    if 'extended_tweet' in tweet:
-        for field in tweet['extended_tweet']:
-            tweet[field] = tweet['extended_tweet'][field]
+    if "extended_tweet" in tweet:
+        for field in tweet["extended_tweet"]:
+            tweet[field] = tweet["extended_tweet"][field]
 
-    text = tweet.get('full_text', tweet.get('text', ''))
+    text = tweet.get("full_text", tweet.get("text", ""))
 
     rti = None
     rtu = None
@@ -245,17 +253,20 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
     qtuid = None
     qtime = None
 
-    if 'retweeted_status' in tweet and tweet['retweeted_status']['id_str'] != tweet['id_str']:
-        rti = tweet['retweeted_status']['id_str']
-        rtu = tweet['retweeted_status']['user']['screen_name']
-        rtuid = tweet['retweeted_status']['user']['id_str']
+    if (
+        "retweeted_status" in tweet
+        and tweet["retweeted_status"]["id_str"] != tweet["id_str"]
+    ):
+        rti = tweet["retweeted_status"]["id_str"]
+        rtu = tweet["retweeted_status"]["user"]["screen_name"]
+        rtuid = tweet["retweeted_status"]["user"]["id_str"]
 
         nested = normalize_tweet(
-            tweet['retweeted_status'],
+            tweet["retweeted_status"],
             locale=locale,
             extract_referenced_tweets=True,
-            collection_source='retweet',
-            pure=False
+            collection_source="retweet",
+            pure=False,
         )
 
         rtweet = nested[-1]
@@ -263,21 +274,23 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
         if extract_referenced_tweets:
             results.extend(nested)
 
-        rtime = rtweet['timestamp_utc']
+        rtime = rtweet["timestamp_utc"]
 
-        resolve_entities(tweet, 'retweeted')
+        resolve_entities(tweet, "retweeted")
 
-    elif 'quoted_status' in tweet and tweet['quoted_status']['id_str'] != tweet['id_str']:
-        qti = tweet['quoted_status']['id_str']
-        qtu = tweet['quoted_status']['user']['screen_name']
-        qtuid = tweet['quoted_status']['user']['id_str']
+    elif (
+        "quoted_status" in tweet and tweet["quoted_status"]["id_str"] != tweet["id_str"]
+    ):
+        qti = tweet["quoted_status"]["id_str"]
+        qtu = tweet["quoted_status"]["user"]["screen_name"]
+        qtuid = tweet["quoted_status"]["user"]["id_str"]
 
         nested = normalize_tweet(
-            tweet['quoted_status'],
+            tweet["quoted_status"],
             locale=locale,
             extract_referenced_tweets=True,
-            collection_source='quote',
-            pure=False
+            collection_source="quote",
+            pure=False,
        )
 
         qtweet = nested[-1]
@@ -286,12 +299,12 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
         results.extend(nested)
 
         if "quoted_status_permalink" in tweet:
-            qturl = tweet['quoted_status_permalink']['expanded']
+            qturl = tweet["quoted_status_permalink"]["expanded"]
         else:
-            qturl = qtweet['url']
-        qtime = qtweet['timestamp_utc']
+            qturl = qtweet["url"]
+        qtime = qtweet["timestamp_utc"]
 
-        resolve_entities(tweet, 'quoted')
+        resolve_entities(tweet, "quoted")
 
     medids = set()
     media_urls = []
@@ -303,54 +316,56 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
     hashtags = set()
     mentions = {}
 
-    if 'entities' in tweet or 'extended_entities' in tweet:
-        source_id = rti or qti or tweet['id_str']
+    if "entities" in tweet or "extended_entities" in tweet:
+        source_id = rti or qti or tweet["id_str"]
 
-        entities = tweet.get('extended_entities', tweet['entities']).get('media', [])
-        entities += tweet['entities'].get('urls', [])
+        entities = tweet.get("extended_entities", tweet["entities"]).get("media", [])
+        entities += tweet["entities"].get("urls", [])
 
         for entity in entities:
-            if 'expanded_url' in entity and 'url' in entity and entity['expanded_url']:
+            if "expanded_url" in entity and "url" in entity and entity["expanded_url"]:
                 try:
-                    text = text.replace(entity['url'], entity['expanded_url'])
+                    text = text.replace(entity["url"], entity["expanded_url"])
                 except KeyError:
                     pass
 
-            if 'media_url' in entity or 'media_url_https' in entity:
-                if 'video_info' in entity:
-                    med_url = max(entity['video_info']['variants'], key=get_bitrate)['url']
+            if "media_url" in entity or "media_url_https" in entity:
+                if "video_info" in entity:
+                    med_url = max(entity["video_info"]["variants"], key=get_bitrate)[
+                        "url"
+                    ]
                 else:
-                    med_url = entity['media_url_https']
+                    med_url = entity["media_url_https"]
 
                 med_name = extract_media_name_from_url(med_url)
 
                 if med_name not in medids:
                     medids.add(med_name)
-                    media_types.append(entity['type'])
-                    media_urls.append(med_url.split('?tag=')[0])
-                    media_files.append('%s_%s' % (source_id, med_name))
-                    media_alt_texts.append(entity.get("ext_alt_text") or '')
+                    media_types.append(entity["type"])
+                    media_urls.append(med_url.split("?tag=")[0])
+                    media_files.append("%s_%s" % (source_id, med_name))
+                    media_alt_texts.append(entity.get("ext_alt_text") or "")
 
             # NOTE: fun fact, Twitter is starting to break down and we cannot guarantee
             # expanded_url exists anymore. It even crashes the website itself lol:
             # https://x.com/lmerzeau/status/426318495450943488
             elif "expanded_url" in entity:
-                normalized = custom_normalize_url(entity['expanded_url'])
+                normalized = custom_normalize_url(entity["expanded_url"])
                 links.add(normalized)
 
-        for hashtag in tweet['entities'].get('hashtags', []):
-            hashtags.add(hashtag['text'].lower())
+        for hashtag in tweet["entities"].get("hashtags", []):
+            hashtags.add(hashtag["text"].lower())
 
-        for mention in tweet['entities'].get('user_mentions', []):
-            mentions[mention['screen_name'].lower()] = mention['id_str']
+        for mention in tweet["entities"].get("user_mentions", []):
+            mentions[mention["screen_name"].lower()] = mention["id_str"]
 
     if rtu:
-        text = format_rt_text(rtu, rtweet['text'])
+        text = format_rt_text(rtu, rtweet["text"])
         if rtweet["quoted_id"]:
             qturl = format_tweet_url(rtweet["quoted_user"], rtweet["quoted_id"])
 
     elif qtu:
-        text = format_qt_text(qtu, text, qtweet['text'], qturl)
+        text = format_qt_text(qtu, text, qtweet["text"], qturl)
 
     if qturl:
         qturl_lc = custom_normalize_url(qturl).lower()
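For reference, the media_files entries built above combine the source tweet id with the URL basename returned by extract_media_name_from_url (defined earlier in this diff); traced on illustrative values:

    extract_media_name_from_url("https://pbs.twimg.com/media/abc123.jpg?tag=10")
    # -> 'abc123.jpg'
    # For source_id '1234' the corresponding media_files entry is '1234_abc123.jpg'.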
@@ -358,44 +373,50 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
             if link.lower() == qturl_lc:
                 links.remove(link)
 
-    timestamp_utc, local_time = get_dates(tweet['created_at'], locale)
+    timestamp_utc, local_time = get_dates(tweet["created_at"], locale)
     text = unescape(text)
 
     if collection_source is None:
-        collection_source = tweet.get('collection_source')
+        collection_source = tweet.get("collection_source")
     links = sorted(links)
-    domains = [custom_get_normalized_hostname(link, normalize_amp=False, infer_redirection=False)
-               for link in links]
+    domains = [
+        custom_get_normalized_hostname(
+            link, normalize_amp=False, infer_redirection=False
+        )
+        for link in links
+    ]
     normalized_tweet = {
-        'id': tweet['id_str'],
-        'local_time': local_time,
-        'timestamp_utc': timestamp_utc,
-        'text': text,
-        'url': format_tweet_url(tweet['user']['screen_name'], tweet['id_str']),
-        'quoted_id': qti,
-        'quoted_user': qtu,
-        'quoted_user_id': qtuid,
-        'quoted_timestamp_utc': qtime,
-        'retweeted_id': rti,
-        'retweeted_user': rtu,
-        'retweeted_user_id': rtuid,
-        'retweeted_timestamp_utc': rtime,
-        'media_files': media_files,
-        'media_types': media_types,
-        'media_urls': media_urls,
-        'media_alt_texts': media_alt_texts,
-        'links': links,
-        'links_to_resolve': len(links) > 0,
-        'domains': domains,
-        'hashtags': sorted(hashtags) if hashtags else extract_hashtags_from_text(text),
-        'mentioned_ids': [mentions[m] for m in sorted(mentions.keys())],
-        'mentioned_names': sorted(mentions.keys()) if mentions else extract_mentions_from_text(text),
-        'collection_time': get_collection_time(),
-        'match_query': collection_source != 'thread' and collection_source != 'quote'
+        "id": tweet["id_str"],
+        "local_time": local_time,
+        "timestamp_utc": timestamp_utc,
+        "text": text,
+        "url": format_tweet_url(tweet["user"]["screen_name"], tweet["id_str"]),
+        "quoted_id": qti,
+        "quoted_user": qtu,
+        "quoted_user_id": qtuid,
+        "quoted_timestamp_utc": qtime,
+        "retweeted_id": rti,
+        "retweeted_user": rtu,
+        "retweeted_user_id": rtuid,
+        "retweeted_timestamp_utc": rtime,
+        "media_files": media_files,
+        "media_types": media_types,
+        "media_urls": media_urls,
+        "media_alt_texts": media_alt_texts,
+        "links": links,
+        "links_to_resolve": len(links) > 0,
+        "domains": domains,
+        "hashtags": sorted(hashtags) if hashtags else extract_hashtags_from_text(text),
+        "mentioned_ids": [mentions[m] for m in sorted(mentions.keys())],
+        "mentioned_names": sorted(mentions.keys())
+        if mentions
+        else extract_mentions_from_text(text),
+        "collection_time": get_collection_time(),
+        "match_query": collection_source != "thread" and collection_source != "quote",
     }
 
     if collection_source is not None:
-        normalized_tweet['collected_via'] = [collection_source]
+        normalized_tweet["collected_via"] = [collection_source]
 
     grab_extra_meta(tweet, normalized_tweet, locale)
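A minimal usage sketch of the v1 entry point. The raw_tweet value is hypothetical (a tweet dict as returned by Twitter's v1.1 API), and the top-level re-export from the twitwi package is assumed:

    from twitwi import normalize_tweet

    normalized = normalize_tweet(raw_tweet, extract_referenced_tweets=True)
    # With extract_referenced_tweets=True a list is returned: referenced
    # (retweeted/quoted) tweets come first and the source tweet is last.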
@@ -411,14 +432,14 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
 
 
 def resolve_user_entities(user):
-    if 'entities' in user:
-        for k in user['entities']:
-            if 'urls' in user['entities'][k]:
-                for url in user['entities'][k]['urls']:
-                    if not url.get('expanded_url'):
+    if "entities" in user:
+        for k in user["entities"]:
+            if "urls" in user["entities"][k]:
+                for url in user["entities"][k]["urls"]:
+                    if not url.get("expanded_url"):
                         continue
                     if k in user:
-                        user[k] = user[k].replace(url['url'], url['expanded_url'])
+                        user[k] = user[k].replace(url["url"], url["expanded_url"])
 
 
 def normalize_user(user, locale=None, pure=True, v2=False):
@@ -444,146 +465,179 @@ def normalize_user(user, locale=None, pure=True, v2=False):
 
     resolve_user_entities(user)
 
-    timestamp_utc, local_time = get_dates(user['created_at'], locale, source='v2' if v2 else 'v1')
+    timestamp_utc, local_time = get_dates(
+        user["created_at"], locale, source="v2" if v2 else "v1"
+    )
 
-    if v2 and 'withheld' in user:
-        withheld = user['withheld']
-        withheld_in_countries = withheld.get('country_codes', [])
-        withheld_scope = withheld.get('withheld_scope', '')
+    if v2 and "withheld" in user:
+        withheld = user["withheld"]
+        withheld_in_countries = withheld.get("country_codes", [])
+        withheld_scope = withheld.get("withheld_scope", "")
     else:
         withheld_in_countries = []
-        withheld_scope = ''
+        withheld_scope = ""
 
     normalized_user = {
-        'id': user['id_str'] if not v2 else user['id'],
-        'screen_name': user['screen_name'] if not v2 else user['username'],
-        'name': user['name'],
-        'description': user['description'] if user['description'] else None,
-        'url': user.get('url'),
-        'timestamp_utc': timestamp_utc,
-        'local_time': local_time,
-        'location': user.get('location'),
-        'verified': user.get('verified', False),
-        'protected': user.get('protected', False),
-        'tweets': user['statuses_count'] if not v2 else user['public_metrics']['tweet_count'],
-        'followers': user['followers_count'] if not v2 else user['public_metrics']['followers_count'],
-        'friends': user['friends_count'] if not v2 else user['public_metrics']['following_count'],
-        'likes': user['favourites_count'] if not v2 else None,
-        'lists': user['listed_count'] if not v2 else user['public_metrics']['listed_count'],
-        'image': user.get('profile_image_url_https') if not v2 else user.get('profile_image_url'),
-        'default_profile': user.get('default_profile', False),
-        'default_profile_image': user.get('default_profile_image', False),
-        'witheld_in_countries': user.get('witheld_in_countries', []) if not v2 else withheld_in_countries,
-        'witheld_scope': user.get('witheld_scope') if not v2 else withheld_scope
+        "id": user["id_str"] if not v2 else user["id"],
+        "screen_name": user["screen_name"] if not v2 else user["username"],
+        "name": user["name"],
+        "description": user["description"] if user["description"] else None,
+        "url": user.get("url"),
+        "timestamp_utc": timestamp_utc,
+        "local_time": local_time,
+        "location": user.get("location"),
+        "verified": user.get("verified", False),
+        "protected": user.get("protected", False),
+        "tweets": user["statuses_count"]
+        if not v2
+        else user["public_metrics"]["tweet_count"],
+        "followers": user["followers_count"]
+        if not v2
+        else user["public_metrics"]["followers_count"],
+        "friends": user["friends_count"]
+        if not v2
+        else user["public_metrics"]["following_count"],
+        "likes": user["favourites_count"] if not v2 else None,
+        "lists": user["listed_count"]
+        if not v2
+        else user["public_metrics"]["listed_count"],
+        "image": user.get("profile_image_url_https")
+        if not v2
+        else user.get("profile_image_url"),
+        "default_profile": user.get("default_profile", False),
+        "default_profile_image": user.get("default_profile_image", False),
+        "witheld_in_countries": user.get("witheld_in_countries", [])
+        if not v2
+        else withheld_in_countries,
+        "witheld_scope": user.get("witheld_scope") if not v2 else withheld_scope,
     }
 
     return normalized_user
 
 
-def includes_index(payload, key, index_key='id'):
-    return {item[index_key]: item for item in payload['includes'].get(key, [])}
+def includes_index(payload, key, index_key="id"):
+    return {item[index_key]: item for item in payload["includes"].get(key, [])}
 
 
 def get_best_url(item):
-    if 'unwound_url' in item:
-        return item['unwound_url']
+    if "unwound_url" in item:
+        return item["unwound_url"]
 
-    if 'expanded_url' in item:
-        return item['expanded_url']
+    if "expanded_url" in item:
+        return item["expanded_url"]
 
     return None
 
 
-def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_id,
-                       users_by_id, media_by_key, locale=None, collection_source=None,
-                       extract_referenced_tweets=False):
-    timestamp_utc, local_time = get_dates(tweet['created_at'], locale=locale, source='v2')
+def normalize_tweet_v2(
+    tweet,
+    *,
+    users_by_screen_name,
+    places_by_id,
+    tweets_by_id,
+    users_by_id,
+    media_by_key,
+    locale=None,
+    collection_source=None,
+    extract_referenced_tweets=False,
+):
+    timestamp_utc, local_time = get_dates(
+        tweet["created_at"], locale=locale, source="v2"
+    )
 
     try:
-        user = users_by_id[tweet['author_id']]
+        user = users_by_id[tweet["author_id"]]
     except KeyError:
-        raise TwitterPayloadV2IncompleteIncludesError('user', tweet['author_id'])
+        raise TwitterPayloadV2IncompleteIncludesError("user", tweet["author_id"])
 
-    user_timestamp_utc, user_created_at = get_dates(user['created_at'], locale=locale, source='v2')
-    user_entities = user.get('entities', {})
+    user_timestamp_utc, user_created_at = get_dates(
+        user["created_at"], locale=locale, source="v2"
+    )
+    user_entities = user.get("entities", {})
 
-    entities = tweet.get('entities', {})
-    referenced_tweets = tweet.get('referenced_tweets', [])
+    entities = tweet.get("entities", {})
+    referenced_tweets = tweet.get("referenced_tweets", [])
 
     hashtags = set()
 
-    for hashtag in entities.get('hashtags', []):
-        hashtags.add(hashtag['tag'])
+    for hashtag in entities.get("hashtags", []):
+        hashtags.add(hashtag["tag"])
 
     mentions = {}
 
-    for mention in entities.get('mentions', []):
-        if 'id' in mention:
-            mentions[mention['username']] = mention['id']
+    for mention in entities.get("mentions", []):
+        if "id" in mention:
+            mentions[mention["username"]] = mention["id"]
         else:
             try:
-                mentions[mention['username']] = users_by_screen_name[mention['username']]['id']
+                mentions[mention["username"]] = users_by_screen_name[
+                    mention["username"]
+                ]["id"]
             except KeyError:
-                raise TwitterPayloadV2IncompleteIncludesError('user', mention['username'])
+                raise TwitterPayloadV2IncompleteIncludesError(
+                    "user", mention["username"]
+                )
 
     place_info = {}
 
-    if 'geo' in tweet:
-        geo_data = tweet['geo']
+    if "geo" in tweet:
+        geo_data = tweet["geo"]
 
-        if 'coordinates' in geo_data:
-            point = geo_data['coordinates']
+        if "coordinates" in geo_data:
+            point = geo_data["coordinates"]
 
-            if point['type'] == 'Point':
-                lng, lat = point['coordinates']
-                place_info['lng'] = lng
-                place_info['lat'] = lat
+            if point["type"] == "Point":
+                lng, lat = point["coordinates"]
+                place_info["lng"] = lng
+                place_info["lat"] = lat
 
-        if 'place_id' in geo_data:
-            place_data = places_by_id.get(geo_data['place_id'], {})
+        if "place_id" in geo_data:
+            place_data = places_by_id.get(geo_data["place_id"], {})
 
-            if 'country_code' in place_data:
-                place_info['place_country_code'] = place_data['country_code']
+            if "country_code" in place_data:
+                place_info["place_country_code"] = place_data["country_code"]
 
-            if 'full_name' in place_data:
-                place_info['place_name'] = place_data['full_name']
+            if "full_name" in place_data:
+                place_info["place_name"] = place_data["full_name"]
 
-            if 'place_type' in place_data:
-                place_info['place_type'] = place_data['place_type']
+            if "place_type" in place_data:
+                place_info["place_type"] = place_data["place_type"]
 
-            if 'geo' in place_data and 'bbox' in place_data['geo']:
-                place_info['place_coordinates'] = place_data['geo']['bbox']
+            if "geo" in place_data and "bbox" in place_data["geo"]:
+                place_info["place_coordinates"] = place_data["geo"]["bbox"]
 
     # Text
-    text = tweet['text']
+    text = tweet["text"]
 
     # References
-    refs = {t['type']: t['id'] for t in referenced_tweets}
+    refs = {t["type"]: t["id"] for t in referenced_tweets}
 
     # Reply
     reply_info = {}
 
-    if 'replied_to' in refs:
-        reply = tweets_by_id.get(refs['replied_to'], {})
-        if 'author_id' in reply:
+    if "replied_to" in refs:
+        reply = tweets_by_id.get(refs["replied_to"], {})
+        if "author_id" in reply:
             try:
-                reply_info['to_username'] = users_by_id[reply['author_id']]['username']
+                reply_info["to_username"] = users_by_id[reply["author_id"]]["username"]
             except KeyError:
-                raise TwitterPayloadV2IncompleteIncludesError('replied_user', reply['author_id'])
+                raise TwitterPayloadV2IncompleteIncludesError(
+                    "replied_user", reply["author_id"]
+                )
         else:
-            reply_info['to_username'] = ''
-        reply_info['to_userid'] = reply.get('author_id', '')
-        reply_info['to_tweetid'] = reply.get('id', '')
+            reply_info["to_username"] = ""
+        reply_info["to_userid"] = reply.get("author_id", "")
+        reply_info["to_tweetid"] = reply.get("id", "")
 
     # Retweet
     retweet_info = {}
     normalized_retweet = None
 
-    if 'retweeted' in refs:
-        retweet_info['retweeted_id'] = refs['retweeted']
+    if "retweeted" in refs:
+        retweet_info["retweeted_id"] = refs["retweeted"]
 
-        if refs['retweeted'] in tweets_by_id:
-            retweet = tweets_by_id[refs['retweeted']]
+        if refs["retweeted"] in tweets_by_id:
+            retweet = tweets_by_id[refs["retweeted"]]
             normalized_retweet = normalize_tweet_v2(
                 retweet,
                 users_by_screen_name=users_by_screen_name,
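Hand-traced sketch of includes_index on a made-up v2 payload fragment:

    payload = {"includes": {"users": [{"id": "12", "username": "jack"}]}}

    includes_index(payload, "users")
    # -> {'12': {'id': '12', 'username': 'jack'}}
    includes_index(payload, "users", index_key="username")
    # -> {'jack': {'id': '12', 'username': 'jack'}}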
@@ -592,22 +646,24 @@ def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_i
                 users_by_id=users_by_id,
                 media_by_key=media_by_key,
                 locale=locale,
-                collection_source='retweet',
+                collection_source="retweet",
             )
 
-            retweet_info['retweeted_user'] = normalized_retweet['user_screen_name']
-            retweet_info['retweeted_user_id'] = normalized_retweet['user_id']
-            retweet_info['retweeted_timestamp_utc'] = normalized_retweet['timestamp_utc']
+            retweet_info["retweeted_user"] = normalized_retweet["user_screen_name"]
+            retweet_info["retweeted_user_id"] = normalized_retweet["user_id"]
+            retweet_info["retweeted_timestamp_utc"] = normalized_retweet[
+                "timestamp_utc"
+            ]
 
     # Quoted
     quote_info = {}
     normalized_quote = None
 
-    if 'quoted' in refs:
-        quote_info['quoted_id'] = refs['quoted']
+    if "quoted" in refs:
+        quote_info["quoted_id"] = refs["quoted"]
 
-        if refs['quoted'] in tweets_by_id:
-            quote = tweets_by_id[refs['quoted']]
+        if refs["quoted"] in tweets_by_id:
+            quote = tweets_by_id[refs["quoted"]]
             normalized_quote = normalize_tweet_v2(
                 quote,
                 users_by_screen_name=users_by_screen_name,
@@ -616,125 +672,126 @@ def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_i
                 users_by_id=users_by_id,
                 media_by_key=media_by_key,
                 locale=locale,
-                collection_source='quote',
+                collection_source="quote",
             )
 
-            quote_info['quoted_user'] = normalized_quote['user_screen_name']
-            quote_info['quoted_user_id'] = normalized_quote['user_id']
-            quote_info['quoted_timestamp_utc'] = normalized_quote['timestamp_utc']
+            quote_info["quoted_user"] = normalized_quote["user_screen_name"]
+            quote_info["quoted_user_id"] = normalized_quote["user_id"]
+            quote_info["quoted_timestamp_utc"] = normalized_quote["timestamp_utc"]
 
     # Replace urls in text
     links = set()
 
-    for url_data in entities.get('urls', []):
+    for url_data in entities.get("urls", []):
         replacement_url = get_best_url(url_data)
 
         if replacement_url:
-            text = text.replace(url_data['url'], replacement_url)
+            text = text.replace(url_data["url"], replacement_url)
 
         if not replacement_url:
-            replacement_url = url_data['url']
+            replacement_url = url_data["url"]
 
         links.add(custom_normalize_url(replacement_url))
 
     if normalized_retweet:
         text = format_rt_text(
-            normalized_retweet['user_screen_name'],
-            normalized_retweet['text']
+            normalized_retweet["user_screen_name"], normalized_retweet["text"]
        )
 
     if normalized_quote:
         text = format_qt_text(
-            normalized_quote['user_screen_name'],
+            normalized_quote["user_screen_name"],
             text,
-            normalized_quote['text'],
-            normalized_quote['url']
+            normalized_quote["text"],
+            normalized_quote["url"],
         )
 
     # Metrics
-    is_retweet = 'retweeted' in refs
-    public_metrics = tweet['public_metrics']
-    user_public_metrics = user['public_metrics']
+    is_retweet = "retweeted" in refs
+    public_metrics = tweet["public_metrics"]
+    user_public_metrics = user["public_metrics"]
 
-    user_url = user.get('url')
+    user_url = user.get("url")
 
-    if 'url' in user_entities and 'urls' in user_entities['url']:
-        user_url_entity = user_entities['url']['urls'][0]
+    if "url" in user_entities and "urls" in user_entities["url"]:
+        user_url_entity = user_entities["url"]["urls"][0]
         user_url = get_best_url(user_url_entity) or user_url
 
     # Media
     medias = []
 
-    if 'attachments' in tweet and 'media_keys' in tweet['attachments']:
-        source_id = refs.get('retweeted_id', tweet['id'])
+    if "attachments" in tweet and "media_keys" in tweet["attachments"]:
+        source_id = refs.get("retweeted_id", tweet["id"])
 
-        for media_key in tweet['attachments']['media_keys']:
+        for media_key in tweet["attachments"]["media_keys"]:
             if media_key in media_by_key:
                 try:
                     media_data = media_by_key[media_key]
                 except KeyError:
-                    raise TwitterPayloadV2IncompleteIncludesError('media', media_key)
+                    raise TwitterPayloadV2IncompleteIncludesError("media", media_key)
 
                 if "variants" in media_data:
-                    media_url = max(media_data['variants'], key=get_bitrate_v2)['url']
+                    media_url = max(media_data["variants"], key=get_bitrate_v2)["url"]
                 else:
-                    media_url = media_data.get('url', '')
-                medias.append((
-                    media_url,
-                    '%s_%s' % (source_id, extract_media_name_from_url(media_url)),
-                    media_data['type']
-                ))
+                    media_url = media_data.get("url", "")
+                medias.append(
+                    (
+                        media_url,
+                        "%s_%s" % (source_id, extract_media_name_from_url(media_url)),
+                        media_data["type"],
+                    )
+                )
 
     if collection_source is None:
-        collection_source = tweet.get('collection_source')
+        collection_source = tweet.get("collection_source")
 
     sorted_mentions = sorted(mentions.keys())
 
     normalized_tweet = {
-        'id': tweet['id'],
-        'local_time': local_time,
-        'timestamp_utc': timestamp_utc,
-        'text': unescape(text),
-        'url': format_tweet_url(user['username'], tweet['id']),
-        'hashtags': sorted(hashtags),
-        'mentioned_names': sorted_mentions,
-        'mentioned_ids': [mentions[k] for k in sorted_mentions],
-        'collection_time': get_collection_time(),
-        'user_id': user['id'],
-        'user_screen_name': user['username'],
-        'user_name': user['name'],
-        'user_image': user['profile_image_url'],
-        'user_url': user_url or None,
-        'user_location': user.get('location'),
-        'user_verified': user['verified'],
-        'user_description': user['description'] if user['description'] else None,
-        'user_tweets': user_public_metrics.get('tweet_count'),
-        'user_followers': user_public_metrics.get('followers_count'),
-        'user_friends': user_public_metrics.get('following_count'),
-        'user_lists': user_public_metrics.get('listed_count'),
-        'user_created_at': user_created_at,
-        'user_timestamp_utc': user_timestamp_utc,
-        'possibly_sensitive': tweet['possibly_sensitive'],
-        'like_count': public_metrics['like_count'] if not is_retweet else 0,
-        'retweet_count': public_metrics['retweet_count'] if not is_retweet else 0,
-        'quote_count': public_metrics['quote_count'] if not is_retweet else 0,
-        'reply_count': public_metrics['reply_count'] if not is_retweet else 0,
-        'impression_count': public_metrics.get('impression_count'),
-        'lang': tweet['lang'],
-        'source_name': tweet.get('source'),
-        'links': sorted(links),
-        'media_urls': [m[0] for m in medias],
-        'media_files': [m[1] for m in medias],
-        'media_types': [m[2] for m in medias],
-        'match_query': collection_source != 'thread' and collection_source != 'quote',
+        "id": tweet["id"],
+        "local_time": local_time,
+        "timestamp_utc": timestamp_utc,
+        "text": unescape(text),
+        "url": format_tweet_url(user["username"], tweet["id"]),
+        "hashtags": sorted(hashtags),
+        "mentioned_names": sorted_mentions,
+        "mentioned_ids": [mentions[k] for k in sorted_mentions],
+        "collection_time": get_collection_time(),
+        "user_id": user["id"],
+        "user_screen_name": user["username"],
+        "user_name": user["name"],
+        "user_image": user["profile_image_url"],
+        "user_url": user_url or None,
+        "user_location": user.get("location"),
+        "user_verified": user["verified"],
+        "user_description": user["description"] if user["description"] else None,
+        "user_tweets": user_public_metrics.get("tweet_count"),
+        "user_followers": user_public_metrics.get("followers_count"),
+        "user_friends": user_public_metrics.get("following_count"),
+        "user_lists": user_public_metrics.get("listed_count"),
+        "user_created_at": user_created_at,
+        "user_timestamp_utc": user_timestamp_utc,
+        "possibly_sensitive": tweet["possibly_sensitive"],
+        "like_count": public_metrics["like_count"] if not is_retweet else 0,
+        "retweet_count": public_metrics["retweet_count"] if not is_retweet else 0,
+        "quote_count": public_metrics["quote_count"] if not is_retweet else 0,
+        "reply_count": public_metrics["reply_count"] if not is_retweet else 0,
+        "impression_count": public_metrics.get("impression_count"),
+        "lang": tweet["lang"],
+        "source_name": tweet.get("source"),
+        "links": sorted(links),
+        "media_urls": [m[0] for m in medias],
+        "media_files": [m[1] for m in medias],
+        "media_types": [m[2] for m in medias],
+        "match_query": collection_source != "thread" and collection_source != "quote",
         **place_info,
         **reply_info,
         **retweet_info,
-        **quote_info
+        **quote_info,
     }
 
     if collection_source is not None:
-        normalized_tweet['collected_via'] = [collection_source]
+        normalized_tweet["collected_via"] = [collection_source]
 
     if extract_referenced_tweets:
         normalized_tweets = [normalized_tweet]
@@ -750,25 +807,25 @@ def normalize_tweet_v2(tweet, *, users_by_screen_name, places_by_id, tweets_by_i
     return normalized_tweet
 
 
-def normalize_tweets_payload_v2(payload, locale=None, extract_referenced_tweets=False,
-                                collection_source=None):
-
+def normalize_tweets_payload_v2(
+    payload, locale=None, extract_referenced_tweets=False, collection_source=None
+):
     if not validate_payload_v2(payload):
-        raise TypeError('given value is not a Twitter API v2 payload')
+        raise TypeError("given value is not a Twitter API v2 payload")
 
-    if 'data' not in payload:
+    if "data" not in payload:
         return []
 
-    users_by_screen_name = includes_index(payload, 'users', index_key='username')
-    users_by_id = includes_index(payload, 'users')
-    places_by_id = includes_index(payload, 'places')
-    tweets_by_id = includes_index(payload, 'tweets')
-    media_by_key = includes_index(payload, 'media', index_key='media_key')
+    users_by_screen_name = includes_index(payload, "users", index_key="username")
+    users_by_id = includes_index(payload, "users")
+    places_by_id = includes_index(payload, "places")
+    tweets_by_id = includes_index(payload, "tweets")
+    media_by_key = includes_index(payload, "media", index_key="media_key")
 
     output = []
     already_seen = {}
 
-    for item in payload['data']:
+    for item in payload["data"]:
         normalized_tweets = normalize_tweet_v2(
             item,
             users_by_id=users_by_id,
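A minimal usage sketch of the v2 entry point, assuming payload is a hypothetical v2 API response fetched with user, tweet, place and media expansions (the top-level re-export from the twitwi package is assumed):

    from twitwi import normalize_tweets_payload_v2

    tweets = normalize_tweets_payload_v2(payload, extract_referenced_tweets=True)
    # Tweets collected more than once end up with a merged "collected_via"
    # list, as implemented by the deduplication loop in the next hunk.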
@@ -778,23 +835,30 @@ def normalize_tweets_payload_v2(payload, locale=None, extract_referenced_tweets=
             media_by_key=media_by_key,
             locale=locale,
             collection_source=collection_source,
-            extract_referenced_tweets=True
+            extract_referenced_tweets=True,
         )
 
         if extract_referenced_tweets:
             for normalized_tweet in normalized_tweets:
-                k = int(normalized_tweet['id'])
+                k = int(normalized_tweet["id"])
                 earlier_normalized_tweet = already_seen.get(k)
 
                 if earlier_normalized_tweet is not None:
-                    if 'collected_via' in normalized_tweet:
-                        new_collection_source = normalized_tweet['collected_via'][0]
+                    if "collected_via" in normalized_tweet:
+                        new_collection_source = normalized_tweet["collected_via"][0]
 
-                        if 'collected_via' not in earlier_normalized_tweet:
-                            earlier_normalized_tweet['collected_via'] = [new_collection_source]
+                        if "collected_via" not in earlier_normalized_tweet:
+                            earlier_normalized_tweet["collected_via"] = [
+                                new_collection_source
+                            ]
                         else:
-                            if new_collection_source not in earlier_normalized_tweet['collected_via']:
-                                earlier_normalized_tweet['collected_via'].append(new_collection_source)
+                            if (
+                                new_collection_source
+                                not in earlier_normalized_tweet["collected_via"]
+                            ):
+                                earlier_normalized_tweet["collected_via"].append(
+                                    new_collection_source
+                                )
 
                     continue
 