twitwi 0.20.0__py3-none-any.whl → 0.21.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/bluesky/__init__.py +0 -0
- test/bluesky/formatters_test.py +101 -0
- test/bluesky/normalizers_test.py +130 -0
- twitwi/__init__.py +19 -2
- twitwi/anonymizers.py +3 -9
- twitwi/bluesky/__init__.py +16 -0
- twitwi/bluesky/constants.py +19 -0
- twitwi/bluesky/formatters.py +29 -0
- twitwi/bluesky/normalizers.py +686 -0
- twitwi/bluesky/types.py +135 -0
- twitwi/bluesky/utils.py +110 -0
- twitwi/constants.py +323 -349
- twitwi/exceptions.py +8 -1
- twitwi/formatters.py +35 -37
- twitwi/normalizers.py +403 -339
- twitwi/utils.py +46 -18
- twitwi-0.21.1.dist-info/METADATA +436 -0
- twitwi-0.21.1.dist-info/RECORD +22 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/WHEEL +1 -1
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/top_level.txt +1 -0
- twitwi-0.20.0.dist-info/METADATA +0 -156
- twitwi-0.20.0.dist-info/RECORD +0 -13
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/licenses/LICENSE.txt +0 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/zip-safe +0 -0
twitwi/constants.py
CHANGED
|
@@ -4,426 +4,400 @@
|
|
|
4
4
|
#
|
|
5
5
|
# Useful constants used throughout the library.
|
|
6
6
|
#
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
FORMATTED_TWEET_DATETIME_FORMAT =
|
|
7
|
+
SOURCE_DATETIME_FORMAT = "%a %b %d %H:%M:%S +0000 %Y"
|
|
8
|
+
SOURCE_DATETIME_FORMAT_V2 = "%Y-%m-%dT%H:%M:%S.%fZ"
|
|
9
|
+
FORMATTED_TWEET_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
|
|
10
|
+
|
|
11
|
+
FORMATTED_FULL_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
|
|
10
12
|
|
|
11
13
|
# More details on Twitter's tweets metadata can be read here: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
|
|
12
14
|
TWEET_FIELDS = [
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
# 'filter_level',
|
|
19
|
-
|
|
20
|
-
# 'withheld_copyright',
|
|
21
|
-
# 'withheld_scope',
|
|
22
|
-
# 'withheld_countries',
|
|
23
|
-
# 'truncated',
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# 'source',
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
# 'user_utcoffset',
|
|
45
|
-
# 'user_timezone',
|
|
46
|
-
# 'user_lang',
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
15
|
+
"id", # digital ID
|
|
16
|
+
"timestamp_utc", # UNIX timestamp of creation - UTC time
|
|
17
|
+
"local_time", # ISO datetime of creation - local time
|
|
18
|
+
"user_screen_name", # author's user text ID (@user) (at collection time)
|
|
19
|
+
"text", # message's text content
|
|
20
|
+
# 'filter_level', # maximum value of the filter_level parameter which may be used and still stream this Tweet
|
|
21
|
+
"possibly_sensitive", # whether a link present in the message might contain sensitive content according to Twitter
|
|
22
|
+
# 'withheld_copyright', # whether the tweet might be censored by Twitter following copyright requests, ignorable
|
|
23
|
+
# 'withheld_scope', # whether the content withheld is the 'status' or a 'user', ignorable
|
|
24
|
+
# 'withheld_countries', # list of ISO country codes in which the message is withheld, separated by |, ignorable
|
|
25
|
+
# 'truncated', # whether the tweet is bigger than 140 characters, obsolete
|
|
26
|
+
"retweet_count", # number of retweets of the message (at collection time)
|
|
27
|
+
"like_count", # number of likes of the message (at collection time)
|
|
28
|
+
"reply_count", # number of answers to the message, dropped by Twitter (since Oct 17, now charged), unreliable and ignorable
|
|
29
|
+
"impression_count", # number of impressions generated by the message (at collection time)
|
|
30
|
+
"lang", # language of the message automatically identified by Twitter's algorithms (equals 'und' when no language could be detected)
|
|
31
|
+
"to_username", # text ID of the user the message is answering to
|
|
32
|
+
"to_userid", # digital ID of the user the message is answering to
|
|
33
|
+
"to_tweetid", # digital ID of the tweet the message is answering to
|
|
34
|
+
# 'source', # medium used by the user to post the message, now exported in source_name and source_url fields
|
|
35
|
+
"source_name", # name of the medium used to post the message
|
|
36
|
+
"source_url", # link to the medium used to post the message
|
|
37
|
+
"user_location", # location declared in the user's profile (at collection time)
|
|
38
|
+
"lat", # latitude of messages geolocalized
|
|
39
|
+
"lng", # longitude of messages geolocalized
|
|
40
|
+
"user_id", # author's user digital ID
|
|
41
|
+
"user_name", # author's detailed textual name (at collection time)
|
|
42
|
+
"user_verified", # whether the author's account is certified
|
|
43
|
+
"user_description", # description given in the author's profile (at collection time)
|
|
44
|
+
"user_url", # link to a website given in the author's profile (at collection time)
|
|
45
|
+
"user_image", # link to the image avatar of the author's profile (at collection time)
|
|
46
|
+
# 'user_utcoffset', # time offset due to the user's timezone, dropped by Twitter (since May 2018), ignorable
|
|
47
|
+
# 'user_timezone', # timezone declared in the user's profile, dropped by Twitter (since May 2018), ignorable
|
|
48
|
+
# 'user_lang', # language declared in the user's profile (at collection time), dropped by Twitter (since May 2019), ignorable
|
|
49
|
+
"user_tweets", # number of tweets sent by the user (at collection time)
|
|
50
|
+
"user_followers", # number of users following the author (at collection time)
|
|
51
|
+
"user_friends", # number of users the author is following (at collection time)
|
|
52
|
+
"user_likes", # number of likes the author has expressed (at collection time)
|
|
53
|
+
"user_lists", # number of users lists the author has been included in (at collection time)
|
|
54
|
+
"user_created_at", # ISO datetime of creation of the author's account
|
|
55
|
+
"user_timestamp_utc", # UNIX timestamp of creation of the author's account - UTC time
|
|
56
|
+
"collected_via", # How we received the message: 'stream', 'search', 'retweet' (the original tweet was
|
|
57
|
+
# contained in the retweet metadata), 'quote' (the original tweet was contained in
|
|
58
|
+
# the quote metadata), 'thread' (the tweet is part of the same conversation as a
|
|
59
|
+
# tweet collected via search or stream). If the message was collected via multiple
|
|
60
|
+
# ways, they are separated by |
|
|
61
|
+
"match_query", # whether the tweet was retrieved because it matches the query, or whether it was
|
|
62
|
+
# collected via 'quote' or 'thread'
|
|
63
|
+
"retweeted_id", # digital ID of the retweeted message
|
|
64
|
+
"retweeted_user", # text ID of the user who authored the retweeted message
|
|
65
|
+
"retweeted_user_id", # digital ID of the user who authored the retweeted message
|
|
66
|
+
"retweeted_timestamp_utc", # UNIX timestamp of creation of the retweeted message - UTC time
|
|
67
|
+
"quoted_id", # digital ID of the quoted message
|
|
68
|
+
"quoted_user", # text ID of the user who authored the quoted message
|
|
69
|
+
"quoted_user_id", # digital ID of the user who authored the quoted message
|
|
70
|
+
"quoted_timestamp_utc", # UNIX timestamp of creation of the quoted message - UTC time
|
|
71
|
+
"collection_time", # ISO datetime of message collection - local time
|
|
72
|
+
"url", # url of the tweet (to get a view of the message directly on Twitter)
|
|
73
|
+
"place_country_code", # if the tweet has an associated 'place', country code of that place
|
|
74
|
+
"place_name", # if the tweet has an associated 'place', name of that place
|
|
75
|
+
"place_type", # if the tweet has an associated 'place', type of that place ('city', 'admin', etc.)
|
|
76
|
+
"place_coordinates", # if the tweet has an associated 'place', coordinates of that place, separated by |
|
|
77
|
+
"links", # list of links included in the text content, with redirections resolved, separated by |
|
|
78
|
+
"domains", # list of domain names in the links fields, separated by |
|
|
79
|
+
"media_urls", # list of links to images/videos embedded, separated by |
|
|
80
|
+
"media_files", # list of filenames of images/videos embedded and downloaded, separated by |, ignorable when media collection isn't enabled
|
|
81
|
+
"media_types", # list of media types (photo, video, animated gif), separated by |
|
|
82
|
+
"media_alt_texts", # list of alternative texts (image descriptions), separated by |
|
|
83
|
+
"mentioned_names", # list of text IDs of users mentioned, separated by |
|
|
84
|
+
"mentioned_ids", # list of digital IDs of users mentioned, separated by |
|
|
85
|
+
"hashtags", # list of hashtags used, lowercased, separated by |
|
|
84
86
|
]
|
|
85
87
|
|
|
86
88
|
TWEET_FIELDS_TCAT = [
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
89
|
+
"id",
|
|
90
|
+
"time",
|
|
91
|
+
"created_at",
|
|
92
|
+
"from_user_name",
|
|
93
|
+
"text",
|
|
94
|
+
"filter_level",
|
|
95
|
+
"possibly_sensitive",
|
|
96
|
+
"withheld_copyright",
|
|
97
|
+
"withheld_scope",
|
|
98
|
+
"truncated",
|
|
99
|
+
"retweet_count",
|
|
100
|
+
"favorite_count",
|
|
101
|
+
"lang",
|
|
102
|
+
"to_user_name",
|
|
103
|
+
"in_reply_to_status_id",
|
|
104
|
+
"quoted_status_id",
|
|
105
|
+
"source",
|
|
106
|
+
"location",
|
|
107
|
+
"lat",
|
|
108
|
+
"lng",
|
|
109
|
+
"from_user_id",
|
|
110
|
+
"from_user_realname",
|
|
111
|
+
"from_user_verified",
|
|
112
|
+
"from_user_description",
|
|
113
|
+
"from_user_url",
|
|
114
|
+
"from_user_profile_image_url",
|
|
115
|
+
"from_user_utcoffset",
|
|
116
|
+
"from_user_timezone",
|
|
117
|
+
"from_user_lang",
|
|
118
|
+
"from_user_tweetcount",
|
|
119
|
+
"from_user_followercount",
|
|
120
|
+
"from_user_friendcount",
|
|
121
|
+
"from_user_favourites_count",
|
|
122
|
+
"from_user_listed",
|
|
123
|
+
"from_user_withheld_scope",
|
|
124
|
+
"from_user_created_at",
|
|
125
|
+
"urls",
|
|
126
|
+
"urls_expanded",
|
|
127
|
+
"urls_followed",
|
|
128
|
+
"domains",
|
|
129
|
+
"HTTP status code",
|
|
130
|
+
"media_id",
|
|
131
|
+
"media_urls",
|
|
132
|
+
"media_type",
|
|
133
|
+
"media_indice_start",
|
|
134
|
+
"media_indice_end",
|
|
135
|
+
"photo_sizes_width",
|
|
136
|
+
"photo_sizes_height",
|
|
137
|
+
"photo_resize",
|
|
138
|
+
"mentions",
|
|
139
|
+
"hashtags",
|
|
138
140
|
]
|
|
139
141
|
|
|
140
142
|
GAZOU_TO_TCAT = {
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
'media_types': 'media_type'
|
|
143
|
+
"identical_fields": {
|
|
144
|
+
"id": "id",
|
|
145
|
+
"timestamp_utc": "time",
|
|
146
|
+
"local_time": "created_at",
|
|
147
|
+
"user_screen_name": "from_user_name",
|
|
148
|
+
"text": "text",
|
|
149
|
+
"possibly_sensitive": "possibly_sensitive",
|
|
150
|
+
"retweet_count": "retweet_count",
|
|
151
|
+
"like_count": "favorite_count",
|
|
152
|
+
"lang": "lang",
|
|
153
|
+
"to_username": "to_user_name",
|
|
154
|
+
"to_userid": "to_user_id",
|
|
155
|
+
"to_tweetid": "in_reply_to_status_id",
|
|
156
|
+
"quoted_id": "quoted_status_id",
|
|
157
|
+
"user_location": "location",
|
|
158
|
+
"lat": "lat",
|
|
159
|
+
"lng": "lng",
|
|
160
|
+
"user_id": "from_user_id",
|
|
161
|
+
"user_name": "from_user_realname",
|
|
162
|
+
"user_verified": "from_user_verified",
|
|
163
|
+
"user_description": "from_user_description",
|
|
164
|
+
"user_url": "from_user_url",
|
|
165
|
+
"user_image": "from_user_profile_image_url",
|
|
166
|
+
"user_tweets": "from_user_tweetcount",
|
|
167
|
+
"user_followers": "from_user_followercount",
|
|
168
|
+
"user_friends": "from_user_friendcount",
|
|
169
|
+
"user_likes": "from_user_favourites_count",
|
|
170
|
+
"user_lists": "from_user_listed",
|
|
171
|
+
"user_created_at": "from_user_created_at",
|
|
172
|
+
"links": "urls_expanded",
|
|
173
|
+
"domains": "domains",
|
|
174
|
+
"mentioned_ids": "mentions",
|
|
175
|
+
"hashtags": "hashtags",
|
|
176
|
+
"media_urls": "media_urls",
|
|
177
|
+
"media_types": "media_type",
|
|
177
178
|
},
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
179
|
+
"modified_fields": ["source"],
|
|
180
|
+
"removed_fields": [
|
|
181
|
+
"filter_level",
|
|
182
|
+
"witheld_copyright",
|
|
183
|
+
"withheld_scope",
|
|
184
|
+
"truncated",
|
|
185
|
+
"from_user_utcoffset",
|
|
186
|
+
"from_user_timezone",
|
|
187
|
+
"from_user_lang",
|
|
188
|
+
"from_user_withheld_scope",
|
|
189
|
+
"urls",
|
|
190
|
+
"media_id",
|
|
191
|
+
"media_indice_start",
|
|
192
|
+
"media_indice_end",
|
|
193
|
+
"photo_sizes_width",
|
|
194
|
+
"photo_sizes_height",
|
|
195
|
+
"photo_resize",
|
|
181
196
|
],
|
|
182
|
-
|
|
183
|
-
'removed_fields': [
|
|
184
|
-
'filter_level',
|
|
185
|
-
'witheld_copyright',
|
|
186
|
-
'withheld_scope',
|
|
187
|
-
'truncated',
|
|
188
|
-
'from_user_utcoffset',
|
|
189
|
-
'from_user_timezone',
|
|
190
|
-
'from_user_lang',
|
|
191
|
-
'from_user_withheld_scope',
|
|
192
|
-
'urls',
|
|
193
|
-
'media_id',
|
|
194
|
-
'media_indice_start',
|
|
195
|
-
'media_indice_end',
|
|
196
|
-
'photo_sizes_width',
|
|
197
|
-
'photo_sizes_height',
|
|
198
|
-
'photo_resize',
|
|
199
|
-
]
|
|
200
197
|
}
|
|
201
198
|
|
|
202
199
|
TWEET_PLURAL_FIELDS = {
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
200
|
+
"links",
|
|
201
|
+
"urls_expanded",
|
|
202
|
+
"domains",
|
|
203
|
+
"hashtags",
|
|
204
|
+
"collected_via",
|
|
205
|
+
"media_urls",
|
|
206
|
+
"media_files",
|
|
207
|
+
"media_types",
|
|
208
|
+
"media_alt_texts",
|
|
209
|
+
"mentioned_names",
|
|
210
|
+
"mentioned_ids",
|
|
211
|
+
"mentions",
|
|
215
212
|
}
|
|
216
213
|
|
|
217
|
-
TWEET_BOOLEAN_FIELDS = {
|
|
218
|
-
'possibly_sensitive',
|
|
219
|
-
'user_verified',
|
|
220
|
-
'match_query'
|
|
221
|
-
}
|
|
214
|
+
TWEET_BOOLEAN_FIELDS = {"possibly_sensitive", "user_verified", "match_query"}
|
|
222
215
|
|
|
223
216
|
# More details on Twitter's users metadata can be read here: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object
|
|
224
217
|
USER_FIELDS = [
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
# 'lang', # dropped from tweet objects only by Twitter (since May
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
# 'utc_offset', # dropped by Twitter (since May
|
|
234
|
-
# 'time_zone', # dropped by Twitter (since May
|
|
235
|
-
|
|
236
|
-
# 'geo_enabled', # dropped by Twitter (since May
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
# 'is_translator', # dropped by Twitter (since May
|
|
245
|
-
# 'translator_type', # dropped by Twitter (since May
|
|
246
|
-
# 'is_translation_enabled', # dropped by Twitter (since May
|
|
218
|
+
"id",
|
|
219
|
+
"screen_name",
|
|
220
|
+
"name",
|
|
221
|
+
"description",
|
|
222
|
+
"url",
|
|
223
|
+
# 'lang', # dropped from tweet objects only by Twitter (since May 2019)
|
|
224
|
+
"timestamp_utc",
|
|
225
|
+
"local_time",
|
|
226
|
+
# 'utc_offset', # dropped by Twitter (since May 2018), ignorable
|
|
227
|
+
# 'time_zone', # dropped by Twitter (since May 2018), ignorable
|
|
228
|
+
"location",
|
|
229
|
+
# 'geo_enabled', # dropped by Twitter (since May 2019), ignorable
|
|
230
|
+
"verified",
|
|
231
|
+
"protected",
|
|
232
|
+
"tweets",
|
|
233
|
+
"followers",
|
|
234
|
+
"friends",
|
|
235
|
+
"likes",
|
|
236
|
+
"lists",
|
|
237
|
+
# 'is_translator', # dropped by Twitter (since May 2019), ignorable
|
|
238
|
+
# 'translator_type', # dropped by Twitter (since May 2019), ignorable
|
|
239
|
+
# 'is_translation_enabled', # dropped by Twitter (since May 2019), ignorable
|
|
247
240
|
# 'default_profile',
|
|
248
241
|
# 'default_profile_image',
|
|
249
|
-
# 'has_extended_profile', # dropped by Twitter (since May
|
|
250
|
-
# 'profile_image_url', # dropped by Twitter (since May
|
|
251
|
-
|
|
242
|
+
# 'has_extended_profile', # dropped by Twitter (since May 2019), ignorable
|
|
243
|
+
# 'profile_image_url', # dropped by Twitter (since May 2019), ignorable
|
|
244
|
+
"image",
|
|
252
245
|
# 'profile_banner_url',
|
|
253
|
-
# 'profile_use_background_image', # dropped by Twitter (since May
|
|
254
|
-
# 'profile_background_image_url', # dropped by Twitter (since May
|
|
255
|
-
# 'profile_background_image_url_https', # dropped by Twitter (since May
|
|
256
|
-
# 'profile_background_tile', # dropped by Twitter (since May
|
|
257
|
-
# 'profile_background_color', # dropped by Twitter (since May
|
|
258
|
-
# 'profile_link_color', # dropped by Twitter (since May
|
|
259
|
-
# 'profile_text_color', # dropped by Twitter (since May
|
|
260
|
-
# 'profile_sidebar_fill_color', # dropped by Twitter (since May
|
|
261
|
-
# 'profile_sidebar_border_color' # dropped by Twitter (since May
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
246
|
+
# 'profile_use_background_image', # dropped by Twitter (since May 2019), ignorable
|
|
247
|
+
# 'profile_background_image_url', # dropped by Twitter (since May 2019), ignorable
|
|
248
|
+
# 'profile_background_image_url_https', # dropped by Twitter (since May 2019), ignorable
|
|
249
|
+
# 'profile_background_tile', # dropped by Twitter (since May 2019), ignorable
|
|
250
|
+
# 'profile_background_color', # dropped by Twitter (since May 2019), ignorable
|
|
251
|
+
# 'profile_link_color', # dropped by Twitter (since May 2019), ignorable
|
|
252
|
+
# 'profile_text_color', # dropped by Twitter (since May 2019), ignorable
|
|
253
|
+
# 'profile_sidebar_fill_color', # dropped by Twitter (since May 2019), ignorable
|
|
254
|
+
# 'profile_sidebar_border_color' # dropped by Twitter (since May 2019), ignorable
|
|
255
|
+
"default_profile",
|
|
256
|
+
"default_profile_image",
|
|
257
|
+
"witheld_in_countries",
|
|
258
|
+
"witheld_scope",
|
|
266
259
|
]
|
|
267
260
|
|
|
268
|
-
USER_PLURAL_FIELDS = {
|
|
269
|
-
'witheld_in_countries'
|
|
270
|
-
}
|
|
261
|
+
USER_PLURAL_FIELDS = {"witheld_in_countries"}
|
|
271
262
|
|
|
272
263
|
USER_BOOLEAN_FIELDS = {
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
264
|
+
"verified",
|
|
265
|
+
"protected",
|
|
266
|
+
"default_profile",
|
|
267
|
+
"default_profile_image",
|
|
277
268
|
}
|
|
278
269
|
|
|
279
270
|
CANONICAL_URL_KWARGS = {
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
271
|
+
"strip_authentication": False,
|
|
272
|
+
"strip_trailing_slash": False,
|
|
273
|
+
"strip_protocol": False,
|
|
274
|
+
"strip_irrelevant_subdomains": False,
|
|
275
|
+
"strip_fragment": False,
|
|
276
|
+
"normalize_amp": False,
|
|
277
|
+
"fix_common_mistakes": False,
|
|
278
|
+
"infer_redirection": False,
|
|
279
|
+
"quoted": True,
|
|
289
280
|
}
|
|
290
281
|
|
|
291
|
-
CANONICAL_HOSTNAME_KWARGS = {
|
|
292
|
-
'normalize_amp': False,
|
|
293
|
-
'infer_redirection': False
|
|
294
|
-
}
|
|
282
|
+
CANONICAL_HOSTNAME_KWARGS = {"normalize_amp": False, "infer_redirection": False}
|
|
295
283
|
|
|
296
284
|
# API v2 constants
|
|
297
285
|
TWEET_FIELDS_V2 = {
|
|
298
|
-
|
|
299
|
-
|
|
286
|
+
"attachments",
|
|
287
|
+
"author_id",
|
|
300
288
|
# NOTE: (2023-04-26) dropping this because we don't use it and it prevents us
|
|
301
289
|
# from being able to get 500 tweets per call using academic v2 API.
|
|
302
290
|
# 'context_annotations',
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
291
|
+
"conversation_id",
|
|
292
|
+
"created_at",
|
|
293
|
+
"entities",
|
|
294
|
+
"geo",
|
|
295
|
+
"id",
|
|
296
|
+
"in_reply_to_user_id",
|
|
297
|
+
"lang",
|
|
298
|
+
"possibly_sensitive",
|
|
299
|
+
"public_metrics",
|
|
300
|
+
"referenced_tweets",
|
|
301
|
+
"reply_settings",
|
|
302
|
+
"source",
|
|
303
|
+
"text",
|
|
304
|
+
"withheld",
|
|
317
305
|
}
|
|
318
306
|
|
|
319
307
|
MEDIA_FIELDS = {
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
308
|
+
"media_key",
|
|
309
|
+
"type",
|
|
310
|
+
"duration_ms",
|
|
311
|
+
"height",
|
|
312
|
+
"preview_image_url",
|
|
313
|
+
"public_metrics",
|
|
314
|
+
"width",
|
|
315
|
+
"alt_text",
|
|
316
|
+
"url",
|
|
317
|
+
"variants",
|
|
330
318
|
}
|
|
331
319
|
|
|
332
|
-
POLL_FIELDS = {
|
|
333
|
-
'id',
|
|
334
|
-
'options',
|
|
335
|
-
'duration_minutes',
|
|
336
|
-
'end_datetime',
|
|
337
|
-
'voting_status'
|
|
338
|
-
}
|
|
320
|
+
POLL_FIELDS = {"id", "options", "duration_minutes", "end_datetime", "voting_status"}
|
|
339
321
|
|
|
340
322
|
PLACE_FIELDS = {
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
323
|
+
"full_name",
|
|
324
|
+
"id",
|
|
325
|
+
"contained_within",
|
|
326
|
+
"country",
|
|
327
|
+
"country_code",
|
|
328
|
+
"geo",
|
|
329
|
+
"name",
|
|
330
|
+
"place_type",
|
|
349
331
|
}
|
|
350
332
|
|
|
351
333
|
USER_FIELDS_V2 = {
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
334
|
+
"id",
|
|
335
|
+
"name",
|
|
336
|
+
"username",
|
|
337
|
+
"created_at",
|
|
338
|
+
"description",
|
|
339
|
+
"entities",
|
|
340
|
+
"location",
|
|
341
|
+
"pinned_tweet_id",
|
|
342
|
+
"profile_image_url",
|
|
343
|
+
"protected",
|
|
344
|
+
"public_metrics",
|
|
345
|
+
"url",
|
|
346
|
+
"verified",
|
|
347
|
+
"withheld",
|
|
366
348
|
}
|
|
367
349
|
|
|
368
350
|
TWEET_EXPANSIONS = {
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
351
|
+
"author_id",
|
|
352
|
+
"referenced_tweets.id",
|
|
353
|
+
"in_reply_to_user_id",
|
|
354
|
+
"attachments.media_keys",
|
|
355
|
+
"attachments.poll_ids",
|
|
356
|
+
"geo.place_id",
|
|
357
|
+
"entities.mentions.username",
|
|
358
|
+
"referenced_tweets.id.author_id",
|
|
377
359
|
}
|
|
378
360
|
|
|
379
361
|
TWEET_PARAMS = {
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
362
|
+
"tweet.fields": ",".join(TWEET_FIELDS_V2),
|
|
363
|
+
"media.fields": ",".join(MEDIA_FIELDS),
|
|
364
|
+
"poll.fields": ",".join(POLL_FIELDS),
|
|
365
|
+
"place.fields": ",".join(PLACE_FIELDS),
|
|
366
|
+
"user.fields": ",".join(USER_FIELDS_V2),
|
|
385
367
|
}
|
|
386
368
|
|
|
387
|
-
USER_EXPANSIONS = {
|
|
388
|
-
'pinned_tweet_id'
|
|
389
|
-
}
|
|
369
|
+
USER_EXPANSIONS = {"pinned_tweet_id"}
|
|
390
370
|
|
|
391
371
|
USER_PARAMS = {
|
|
392
|
-
|
|
393
|
-
|
|
372
|
+
"user.fields": ",".join(USER_FIELDS_V2),
|
|
373
|
+
"tweet.fields": ",".join(TWEET_FIELDS_V2),
|
|
394
374
|
}
|
|
395
375
|
|
|
396
376
|
# Lists
|
|
397
377
|
|
|
398
378
|
LIST_FIELDS = {
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
379
|
+
"created_at",
|
|
380
|
+
"follower_count",
|
|
381
|
+
"member_count",
|
|
382
|
+
"private",
|
|
383
|
+
"description",
|
|
384
|
+
"owner_id",
|
|
405
385
|
}
|
|
406
386
|
|
|
407
|
-
LIST_EXPANSIONS = {
|
|
408
|
-
'owner_id'
|
|
409
|
-
}
|
|
387
|
+
LIST_EXPANSIONS = {"owner_id"}
|
|
410
388
|
|
|
411
389
|
LIST_PARAMS = {
|
|
412
|
-
|
|
413
|
-
|
|
390
|
+
"list.fields": ",".join(LIST_FIELDS),
|
|
391
|
+
"user.fields": ",".join(USER_FIELDS_V2),
|
|
414
392
|
}
|
|
415
393
|
|
|
416
|
-
LIST_TWEETS_EXPANSIONS = {
|
|
417
|
-
'author_id'
|
|
418
|
-
}
|
|
394
|
+
LIST_TWEETS_EXPANSIONS = {"author_id"}
|
|
419
395
|
|
|
420
|
-
LIST_MEMBERS_EXPANSIONS = {
|
|
421
|
-
'pinned_tweet_id'
|
|
422
|
-
}
|
|
396
|
+
LIST_MEMBERS_EXPANSIONS = {"pinned_tweet_id"}
|
|
423
397
|
|
|
424
398
|
LIST_TWEETS_OR_MEMBERS_PARAMS = {
|
|
425
|
-
|
|
426
|
-
|
|
399
|
+
"tweet.fields": ",".join(TWEET_FIELDS_V2),
|
|
400
|
+
"user.fields": ",".join(USER_FIELDS_V2),
|
|
427
401
|
}
|
|
428
402
|
|
|
429
403
|
PRE_SNOWFLAKE_LAST_TWEET_ID = 29700859247
|