twitwi 0.20.0__py3-none-any.whl → 0.21.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/bluesky/__init__.py +0 -0
- test/bluesky/formatters_test.py +101 -0
- test/bluesky/normalizers_test.py +130 -0
- twitwi/__init__.py +19 -2
- twitwi/anonymizers.py +3 -9
- twitwi/bluesky/__init__.py +16 -0
- twitwi/bluesky/constants.py +19 -0
- twitwi/bluesky/formatters.py +29 -0
- twitwi/bluesky/normalizers.py +686 -0
- twitwi/bluesky/types.py +135 -0
- twitwi/bluesky/utils.py +110 -0
- twitwi/constants.py +323 -349
- twitwi/exceptions.py +8 -1
- twitwi/formatters.py +35 -37
- twitwi/normalizers.py +403 -339
- twitwi/utils.py +46 -18
- twitwi-0.21.1.dist-info/METADATA +436 -0
- twitwi-0.21.1.dist-info/RECORD +22 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/WHEEL +1 -1
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/top_level.txt +1 -0
- twitwi-0.20.0.dist-info/METADATA +0 -156
- twitwi-0.20.0.dist-info/RECORD +0 -13
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/licenses/LICENSE.txt +0 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/zip-safe +0 -0
|
@@ -0,0 +1,686 @@
|
|
|
1
|
+
from copy import deepcopy
|
|
2
|
+
from typing import List, Dict, Union, Optional, Literal, overload
|
|
3
|
+
|
|
4
|
+
from twitwi.exceptions import BlueskyPayloadError
|
|
5
|
+
from twitwi.utils import (
|
|
6
|
+
get_collection_time,
|
|
7
|
+
get_dates,
|
|
8
|
+
custom_normalize_url,
|
|
9
|
+
custom_get_normalized_hostname,
|
|
10
|
+
)
|
|
11
|
+
from twitwi.bluesky.utils import (
|
|
12
|
+
validate_post_payload,
|
|
13
|
+
valid_embed_type,
|
|
14
|
+
format_profile_url,
|
|
15
|
+
format_post_url,
|
|
16
|
+
parse_post_url,
|
|
17
|
+
parse_post_uri,
|
|
18
|
+
format_starterpack_url,
|
|
19
|
+
format_media_url,
|
|
20
|
+
)
|
|
21
|
+
from twitwi.bluesky.types import BlueskyProfile, BlueskyPost
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def normalize_profile(data: Dict, locale: Optional[str] = None) -> BlueskyProfile:
    """
    Function "normalizing" a Bluesky profile payload into a flat dict.

    Args:
        data (dict): profile payload json dict. Must at least hold the
            `did`, `handle`, `createdAt`, `postsCount`, `followersCount`
            and `followsCount` keys.
        locale (optional): Timezone information, passed as is to `get_dates`.

    Returns:
        (dict): the normalized profile.

    Raises:
        KeyError: if one of the mandatory keys listed above is missing.

    """

    # Optional parts of the payload: fall back on neutral values instead of
    # failing, the same way it is done for `displayName` and `avatar`.
    associated = data.get("associated") or {}

    pinned_post_data = data.get("pinnedPost")
    pinned_post_uri = None if pinned_post_data is None else pinned_post_data["uri"]

    timestamp_utc, created_at = get_dates(
        data["createdAt"], locale=locale, source="bluesky"
    )

    return {
        "did": data["did"],
        "url": format_profile_url(data["handle"]),
        "handle": data["handle"],
        "display_name": data.get("displayName", ""),
        "created_at": created_at,
        "timestamp_utc": timestamp_utc,
        "description": data.get("description", ""),
        "avatar": data.get("avatar", ""),
        "posts": data["postsCount"],
        "followers": data["followersCount"],
        "follows": data["followsCount"],
        "lists": associated.get("lists", 0),
        "feedgens": associated.get("feedgens", 0),
        "starter_packs": associated.get("starterPacks", 0),
        "banner": data.get("banner", ""),
        "pinned_post_uri": pinned_post_uri,
        "collection_time": get_collection_time(),
    }
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def prepare_native_gif_as_media(gif_data, user_did, source):
    """
    Describe a native gif (external embed) as a media dict.

    Args:
        gif_data (dict): external embed holding at least `title` and `uri`,
            and optionally a `thumb` blob.
        user_did (str): did of the author, used to build the thumbnail url.
        source: forwarded as is to `format_media_url`.

    Returns:
        (dict): media dict with `id`, `type`, `alt`, `url` and `thumb` keys.

    """
    # Without a thumbnail blob there is neither an id nor a thumbnail url
    media_cid = ""
    thumbnail = ""

    if "thumb" in gif_data:
        media_cid = gif_data["thumb"]["ref"]["$link"]
        # Only the second item returned by format_media_url is used here
        _, thumbnail = format_media_url(user_did, media_cid, "image/jpeg", source)

    media = {"id": media_cid, "type": "video/gif"}
    media["alt"] = gif_data["title"]
    media["url"] = gif_data["uri"]
    media["thumb"] = thumbnail
    return media
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def prepare_image_as_media(image_data):
    """
    Describe an image embed item as a media dict.

    Args:
        image_data (dict): item holding an `image` blob (with `ref.$link`
            and `mimeType`) and an `alt` text.

    Returns:
        (dict): media dict with `id`, `type` and `alt` keys.

    """
    blob = image_data["image"]
    media = {"id": blob["ref"]["$link"], "type": blob["mimeType"]}
    media["alt"] = image_data["alt"]
    return media
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def prepare_video_as_media(video_data):
    """
    Describe a video blob as a media dict.

    Args:
        video_data (dict): blob holding `ref.$link` and `mimeType`.

    Returns:
        (dict): media dict with `id` and `type` keys.

    """
    media_id = video_data["ref"]["$link"]
    media_type = video_data["mimeType"]
    return {"id": media_id, "type": media_type}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def process_starterpack_card(embed_data, post):
    """
    Fill the card fields of a post from an embedded starter pack.

    Args:
        embed_data (dict): embedded starter pack, holding its `uri` and
            optionally its `record` and `creator`.
        post (dict): post being normalized.

    Returns:
        (dict): the very same `post` object, with `card_link`, `card_title`,
            `card_description` and `card_thumbnail` set.

    """
    # Warning: mutates post

    card = embed_data.get("record", {})
    creator_did, pack_did = parse_post_uri(embed_data["uri"])

    # Prefer the creator's handle for the link, and use its did otherwise
    creator = embed_data.get("creator", {})
    owner = creator.get("handle") or creator_did
    post["card_link"] = format_starterpack_url(owner, pack_did)

    for post_key, card_key in (
        ("card_title", "name"),
        ("card_description", "description"),
        ("card_thumbnail", "thumb"),
    ):
        post[post_key] = card.get(card_key, "")

    return post
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def process_card_data(embed_data, post):
    """
    Fill the card fields of a post from an external link embed.

    Args:
        embed_data (dict): external embed, holding at least its `uri` and
            optionally `title`, `description` and `thumb`.
        post (dict): post being normalized.

    Returns:
        (dict): the very same `post` object, with `card_link`, `card_title`,
            `card_description` and `card_thumbnail` set.

    """
    # Warning: mutates post

    post.update(
        {
            "card_link": embed_data["uri"],
            "card_title": embed_data.get("title", ""),
            "card_description": embed_data.get("description", ""),
            "card_thumbnail": embed_data.get("thumb", ""),
        }
    )
    return post
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def prepare_quote_data(embed_quote, card_data, post, links):
    """
    Fill the quote fields of a post and pick the data of the quoted post.

    Args:
        embed_quote (dict): reference to the quoted post found in the record
            of the post, holding its `cid` and `uri`.
        card_data (dict or None): hydrated data of the quoted post when the
            payload provides it.
        post (dict): post being normalized, its `url` must already be set.
        links (collection of str): links found so far in the post. It must
            support iteration, membership tests and `remove`.

    Returns:
        (tuple): `(post, quoted_data, links)` where `post` and `links` are
            the given (mutated) objects and `quoted_data` is a deep copy of
            `card_data`, or `None` when it is missing or detached.

    """
    # Warning: mutates post and links

    post["quoted_cid"] = embed_quote["cid"]
    post["quoted_uri"] = embed_quote["uri"]
    post["quoted_user_did"], post["quoted_did"] = parse_post_uri(
        post["quoted_uri"], post["url"]
    )

    # First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
    post["quoted_url"] = format_post_url(post["quoted_user_did"], post["quoted_did"])

    quoted_data = None
    if card_data:
        if card_data.get("detached", False):
            post["quoted_status"] = "detached"

        else:
            quoted_data = deepcopy(card_data)

    # Grab user handle and cleanup links when no quote data but url in text
    if not quoted_data:
        for link in links:
            if link.startswith("https://bsky.app/profile/") and link.endswith(
                post["quoted_did"]
            ):
                # Take better quoted url with user_handle
                post["quoted_url"] = link
                break

        # Remove quoted link from post links
        if post["quoted_url"] in links:
            links.remove(post["quoted_url"])

        # Extract user handle from url, unless the url is built on a did:
        # both did methods used by the AT Protocol (plc & web) are checked so
        # that a did is never stored as a handle
        quoted_url_has_did = any(
            did_method in post["quoted_url"] for did_method in ("did:plc:", "did:web:")
        )
        if not quoted_url_has_did:
            post["quoted_user_handle"], _ = parse_post_url(post["quoted_url"], post["url"])

    return (post, quoted_data, links)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def merge_nested_posts(referenced_posts, nested, source):
    """
    Merge a list of normalized posts into an index of already seen posts.

    Posts are indexed by "<did>_<user_handle>". A post seen for the first
    time is stored as is. A post already indexed is completed with the keys
    it lacked, its `collected_via` sources are accumulated and its
    `match_query` flags are or-ed together.

    Args:
        referenced_posts (dict): index of posts, mutated in place.
        nested (iterable of dict): normalized posts to merge into the index.
        source: identifier of the processed payload, only used as first
            argument of the raised error.

    Returns:
        (dict): the very same `referenced_posts` object.

    Raises:
        BlueskyPayloadError: if both versions of a post disagree on a key
            other than `collected_via`, `match_query` and `collection_time`.

    """
    for new_post in nested:
        ordered_id = "%s_%s" % (new_post["did"], new_post["user_handle"])

        if ordered_id not in referenced_posts:
            referenced_posts[ordered_id] = new_post
            continue

        old_post = referenced_posts[ordered_id]
        for key, new_value in new_post.items():
            if key not in old_post:
                old_post[key] = new_value
                continue

            old_value = old_post[key]
            if old_value == new_value:
                continue

            if key == "collected_via":
                # Only add the sources not known yet, so that a post met
                # several times does not repeat the same source
                old_value += [via for via in new_value if via not in old_value]
            elif key == "match_query":
                old_post[key] = old_value or new_value
            elif key != "collection_time":
                # collection_time legitimately differs between two passes
                # and the first one is kept; anything else is unexpected
                raise BlueskyPayloadError(
                    source,
                    "a nested post appearing twice in the same payload has some diverging metadata for key %s: %s / %s"
                    % (key, old_value, new_value),
                )

    return referenced_posts
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# The overloads below share the parameter names of the implementation and do
# not overlap: a call leaving `extract_referenced_posts` to its default is
# typed as returning a single post, which is what happens at runtime.
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str] = ...,
    extract_referenced_posts: Literal[False] = ...,
    collection_source: Optional[str] = ...,
) -> BlueskyPost: ...


@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str],
    extract_referenced_posts: Literal[True],
    collection_source: Optional[str] = ...,
) -> List[BlueskyPost]: ...


@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str] = ...,
    *,
    extract_referenced_posts: Literal[True],
    collection_source: Optional[str] = ...,
) -> List[BlueskyPost]: ...


def normalize_post(
    payload: Dict,
    locale: Optional[str] = None,
    extract_referenced_posts: bool = False,
    collection_source: Optional[str] = None,
) -> Union[BlueskyPost, List[BlueskyPost]]:
    """
    Function "normalizing" a post as returned by Bluesky's API in order to
    cleanup and optimize some fields.

    Args:
        payload (dict): post or feed payload json dict from Bluesky API.
        locale (pytz.timezone, optional): Timezone for date conversions.
        extract_referenced_posts (bool, optional): Whether to return, in
            addition to the original post, also the full list of posts
            found in the given payload (including the tree of quoted posts
            as well as the parent and root posts of the thread if the post
            comes as an answer to another one). Defaults
            to `False`.
        collection_source (str, optional): string explaining how the post
            was collected. Defaults to `None`.

    Returns:
        (dict or list): Either a single post dict or a list of post dicts if
            `extract_referenced_posts` was set to `True`. In the latter case
            the given post comes last, after the referenced posts.

    Raises:
        BlueskyPayloadError: if the payload is not a dictionary, is rejected
            by `validate_post_payload`, or holds facets, embeds or feed
            reasons of an unexpected kind.

    """

    if not isinstance(payload, dict):
        raise BlueskyPayloadError(
            "UNKNOWN", f"data provided to normalize_post is not a dictionary: {payload}"
        )

    valid, error = validate_post_payload(payload)
    if not valid:
        raise BlueskyPayloadError(
            payload.get("uri", payload.get("post", {}).get("uri", "UNKNOWN")),
            f"data provided to normalize_post is not a standard Bluesky post or feed payload:\n{error}",
        )

    # Feed payloads wrap the post and add thread & repost info around it
    if "post" in payload:
        data = payload["post"]
        reply_data = payload.get("reply")
        repost_data = payload.get("reason")
    else:
        data = payload
        reply_data = None
        repost_data = None

    # Index of the other posts found in the payload, only filled and
    # returned when extract_referenced_posts is set
    referenced_posts: Dict = {}

    if collection_source is None:
        collection_source = data.get("collection_source")

    post = {}

    # Store original text and prepare text for quotes & medias enriched version
    # (the text is handled as bytes since facets are positioned with byte offsets)
    post["original_text"] = data["record"]["text"]
    text = post["original_text"].encode("utf-8")

    # Handle datetime fields
    post["collection_time"] = get_collection_time()
    post["timestamp_utc"], post["local_time"] = get_dates(
        data["record"]["createdAt"], locale=locale, source="bluesky"
    )

    # Handle post/user identifiers
    post["cid"] = data["cid"]
    post["uri"] = data["uri"]
    post["user_did"], post["did"] = parse_post_uri(data["uri"])
    post["user_handle"] = data["author"]["handle"]
    post["user_url"] = format_profile_url(post["user_handle"])
    post["url"] = format_post_url(post["user_handle"], post["did"])

    if post["user_did"] != data["author"]["did"]:
        raise BlueskyPayloadError(
            post["url"],
            "inconsistent user_did between Bluesky post's uri and post's author metadata: %s %s"
            % (data["uri"], data["author"]),
        )

    # Handle user metadata
    # NOTE(review): the "user_diplay_name" key is kept as is, since code
    # outside of this function presumably reads it under that exact name
    post["user_diplay_name"] = data["author"].get("displayName", "")
    post["user_avatar"] = data["author"].get("avatar", "")
    post["user_timestamp_utc"], post["user_created_at"] = get_dates(
        data["author"]["createdAt"], locale=locale, source="bluesky"
    )
    post["user_langs"] = data["record"].get("langs", [])

    if "bridgyOriginalUrl" in data["record"]:
        post["bridgy_original_url"] = data["record"]["bridgyOriginalUrl"]

    # Handle metrics
    post["repost_count"] = data["repostCount"]
    post["reply_count"] = data["replyCount"]
    post["like_count"] = data["likeCount"]
    post["quote_count"] = data["quoteCount"]

    # Handle hashtags, mentions & links from facets
    post["mentioned_user_handles"] = []
    post["mentioned_user_dids"] = []
    hashtags = set()
    links = set()
    links_to_replace = []
    for facet in data["record"].get("facets", []):
        if len(facet["features"]) != 1:
            raise BlueskyPayloadError(
                post["url"],
                "unusual record facet content with more or less than a unique feature: %s"
                % facet,
            )

        feat = facet["features"][0]

        # Hashtags
        if feat["$type"].endswith("#tag"):
            hashtags.add(feat["tag"].strip().lower())

        # Mentions
        elif feat["$type"].endswith("#mention"):
            if feat["did"] not in post["mentioned_user_dids"]:
                post["mentioned_user_dids"].append(feat["did"])

                # Check & fix occasional errored mention positioning
                # example: https://bsky.app/profile/snjcgt.bsky.social/post/3lpmqkkkgp52u
                byteStart = facet["index"]["byteStart"]
                if text[byteStart : byteStart + 1] != b"@":
                    byteStart = text.find(b"@", byteStart)

                # Shift the end of the mention by as much as its start was
                handle = (
                    text[byteStart + 1 : facet["index"]["byteEnd"] + byteStart - facet["index"]["byteStart"]]
                    .strip()
                    .lower()
                    .decode("utf-8")
                )
                post["mentioned_user_handles"].append(handle)

        # Links
        elif feat["$type"].endswith("#link"):
            # Handle native polls: only the link ending with /0 is kept
            # and it is appended to the text instead of being rewritten in it
            if "https://poll.blue/" in feat["uri"]:
                if feat["uri"].endswith("/0"):
                    links.add(custom_normalize_url(feat["uri"]))
                    text += b" %s" % feat["uri"].encode("utf-8")
                continue

            links.add(custom_normalize_url(feat["uri"]))
            # Check & fix occasional errored link positioning
            # example: https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
            byteStart = facet["index"]["byteStart"]
            if b" " in text[byteStart : facet["index"]["byteEnd"]]:
                byteStart = text.find(b"http", byteStart)

            links_to_replace.append(
                {
                    "uri": feat["uri"].encode("utf-8"),
                    "start": byteStart,
                    "end": byteStart - facet["index"]["byteStart"] + facet["index"]["byteEnd"],
                }
            )

        elif feat["$type"].endswith("#bold"):
            pass
        elif feat["$type"].endswith("#option"):
            pass
        else:
            raise BlueskyPayloadError(
                post["url"], "unusual record facet feature $type: %s" % feat
            )
    post["hashtags"] = sorted(hashtags)

    # Rewrite full links within post's text
    # (from the last one to the first one so that earlier offsets remain valid)
    for link in sorted(links_to_replace, key=lambda x: x["start"], reverse=True):
        if link["start"] < 0:
            # The link could not be located within the text: append it
            text = text + b" " + link["uri"]
        else:
            text = text[: link["start"]] + link["uri"] + text[link["end"] :]

    # Handle thread info when applicable
    # Unfortunately posts' payload only provide at uris for these so we do not have the handles
    # We could sometimes resolve them from the mentioned and quote data but that would not handle most cases
    # Issue opened here to have user handles along: https://github.com/bluesky-social/atproto/issues/3722
    if "reply" in data["record"]:
        if "parent" in data["record"]["reply"]:
            post["to_post_cid"] = data["record"]["reply"]["parent"]["cid"]
            post["to_post_uri"] = data["record"]["reply"]["parent"]["uri"]
            post["to_user_did"], post["to_post_did"] = parse_post_uri(
                post["to_post_uri"], post["url"]
            )
            post["to_post_url"] = format_post_url(
                post["to_user_did"], post["to_post_did"]
            )
        if "root" in data["record"]["reply"]:
            post["to_root_post_cid"] = data["record"]["reply"]["root"]["cid"]
            post["to_root_post_uri"] = data["record"]["reply"]["root"]["uri"]
            post["to_root_user_did"], post["to_root_post_did"] = parse_post_uri(
                post["to_root_post_uri"], post["url"]
            )
            post["to_root_post_url"] = format_post_url(
                post["to_root_user_did"], post["to_root_post_did"]
            )

    # Handle quotes & medias
    media_ids = set()
    post["media_urls"] = []
    post["media_thumbnails"] = []
    post["media_types"] = []
    post["media_alt_texts"] = []
    if "embed" in data["record"]:
        embed = data["record"]["embed"]
        quoted_data = None
        media_data = []
        extra_links = []

        if not valid_embed_type(embed["$type"]):
            raise BlueskyPayloadError(
                post["url"], "unusual record embed $type: %s" % embed
            )

        # Links from cards
        if embed["$type"].endswith(".external"):
            link = embed["external"]["uri"]

            # Handle native gifs as medias
            if link.startswith("https://media.tenor.com/"):
                media_data.append(
                    prepare_native_gif_as_media(
                        embed["external"], post["user_did"], post["url"]
                    )
                )

            # Extra card links sometimes missing from facets & text due to manual action in post form
            else:
                extra_links.append(embed["external"]["uri"])
                # Handle link card metadata
                if "embed" in data:
                    post = process_card_data(data["embed"]["external"], post)

        # Images
        if embed["$type"].endswith(".images"):
            media_data.extend([prepare_image_as_media(i) for i in embed["images"]])

        # Video
        if embed["$type"].endswith(".video"):
            media_data.append(prepare_video_as_media(embed["video"]))

        # Quote & Starter-packs
        if embed["$type"].endswith(".record"):
            if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
                # When the payload holds no hydrated view of the starter
                # pack, build the card from the bare uri of the record
                starterpack_view = data.get("embed", {}).get("record") or {
                    "uri": embed["record"]["uri"]
                }
                post = process_starterpack_card(starterpack_view, post)
                if post["card_link"]:
                    extra_links.append(post["card_link"])
            else:
                post, quoted_data, links = prepare_quote_data(
                    embed["record"], data.get("embed", {}).get("record"), post, links
                )

        # Quote with medias
        if embed["$type"].endswith(".recordWithMedia"):
            post, quoted_data, links = prepare_quote_data(
                embed["record"]["record"],
                data.get("embed", {}).get("record", {}).get("record"),
                post,
                links,
            )

            # Links from cards
            if embed["media"]["$type"].endswith(".external"):
                link = embed["media"]["external"]["uri"]

                # Handle native gifs as medias
                if link.startswith("https://media.tenor.com/"):
                    media_data.append(
                        prepare_native_gif_as_media(
                            embed["media"]["external"], post["user_did"], post["url"]
                        )
                    )

                # Extra card links sometimes missing from facets & text due to manual action in post form
                else:
                    extra_links = [link] + extra_links
                    # Handle link card metadata: the card lives under the
                    # "external" key of the media view, when provided
                    media_view = data.get("embed", {}).get("media", {})
                    if "external" in media_view:
                        post = process_card_data(media_view["external"], post)

            # Images
            elif embed["media"]["$type"].endswith(".images"):
                media_data.extend(
                    [prepare_image_as_media(i) for i in embed["media"]["images"]]
                )

            # Video
            elif embed["media"]["$type"].endswith(".video"):
                media_data.append(prepare_video_as_media(embed["media"]["video"]))

            else:
                raise BlueskyPayloadError(
                    post["url"],
                    "unusual record embed media $type from a recordWithMedia: %s"
                    % embed,
                )

        # Process extra links
        for link in extra_links:
            norm_link = custom_normalize_url(link)
            if norm_link not in links:
                links.add(norm_link)
                text += b" " + link.encode("utf-8")

        # Process medias
        for media in media_data:
            if media["id"] not in media_ids:
                media_ids.add(media["id"])
                media_type = media["type"]
                if "url" in media:
                    media_url = media["url"]
                    media_thumb = media["thumb"]
                else:
                    media_url, media_thumb = format_media_url(
                        post["user_did"], media["id"], media_type, post["url"]
                    )
                post["media_urls"].append(media_url)
                post["media_thumbnails"].append(media_thumb)
                post["media_types"].append(media_type)
                post["media_alt_texts"].append(media.get("alt", ""))

                # Rewrite post's text to include links to medias within
                # (videos other than gifs are referenced by their thumbnail)
                text += b" " + (
                    media_thumb if media_type.startswith("video") and not media_type.endswith("/gif") else media_url
                ).encode("utf-8")

        # Process quotes
        if quoted_data and "value" in quoted_data:
            if quoted_data["cid"] != post["quoted_cid"]:
                raise BlueskyPayloadError(
                    post["url"],
                    "inconsistent quote cid found between record.embed.record.cid & embed.record.cid: %s %s"
                    % (post["quoted_cid"], quoted_data),
                )

            # Reshape the quoted post like a regular post payload
            # ("value" -> "record" and "embeds" -> "embed") to normalize it
            quoted_data["record"] = quoted_data["value"]
            del quoted_data["value"]
            if "embeds" in quoted_data and len(quoted_data["embeds"]):
                if len(quoted_data["embeds"]) != 1:
                    raise BlueskyPayloadError(
                        post["url"],
                        "unusual multiple embeds found within a quoted post: %s"
                        % quoted_data["embeds"],
                    )
                quoted_data["embed"] = quoted_data["embeds"][0]
                del quoted_data["embeds"]

            nested = normalize_post(
                quoted_data,
                locale=locale,
                extract_referenced_posts=True,
                collection_source="quote",
            )
            # The quoted post itself comes last, after its own references
            quoted = nested[-1]
            if extract_referenced_posts:
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            # Take better quoted url with user_handle
            post["quoted_url"] = quoted["url"]
            post["quoted_user_handle"] = quoted["user_handle"]
            post["quoted_created_at"] = quoted["local_time"]
            post["quoted_timestamp_utc"] = quoted["timestamp_utc"]

            # Remove quoted link from post links if present in text
            if quoted["url"] in links:
                links.remove(quoted["url"])

            # Rewrite post's text to include quote within (or replace the link to the quote if present)
            quote = (
                "« @%s: %s — %s »"
                % (quoted["user_handle"], quoted["text"], quoted["url"])
            ).encode("utf-8")
            url_lower = quoted["url"].encode("utf-8").lower()
            # bytes.lower() only touches ASCII letters, hence positions
            # found in the lowercased text are valid in the text itself
            url_pos = text.lower().find(url_lower)
            if url_pos != -1:
                # Skip the url using its length in bytes, not in characters
                text = text[:url_pos] + quote + text[url_pos + len(url_lower) :]
            else:
                text += b" " + quote

    # Process links domains
    post["links"] = sorted(links)
    post["domains"] = [custom_get_normalized_hostname(link) for link in post["links"]]

    # Handle threadgates (replies rules)
    # WARNING: quoted posts do not seem to include threadgates info
    # Issue opened about it here: https://github.com/bluesky-social/atproto/issues/3716
    if "threadgate" in data:
        post["replies_rules"] = []
        if "allow" in data["threadgate"]["record"]:
            for rule in data["threadgate"]["record"]["allow"]:
                rule_string = (
                    "allow_from_" + rule["$type"].split("#")[1].split("Rule")[0]
                )
                if rule_string.endswith("_list") and "list" in rule:
                    # A list rule may hold a single list uri as a string:
                    # wrap it so that it is not iterated char by char
                    allowed_lists = rule["list"]
                    if isinstance(allowed_lists, str):
                        allowed_lists = [allowed_lists]
                    for allowed_list in allowed_lists:
                        post["replies_rules"].append(rule_string + ":" + allowed_list)
                else:
                    post["replies_rules"].append(rule_string)
            # An empty list of allowed rules means nobody can reply
            if not data["threadgate"]["record"]["allow"]:
                post["replies_rules"].append("disallow")
        (
            post["replies_rules_timestamp_utc"],
            post["replies_rules_created_at"],
        ) = get_dates(
            data["threadgate"]["record"]["createdAt"], locale=locale, source="bluesky"
        )
        post["hidden_replies_uris"] = data["threadgate"]["record"].get(
            "hiddenReplies", []
        )

    # Handle postgates (quotes rules)
    #
    # Users can forbid others to quote a post, but payloads do not seem to
    # include it yet although the API spec documents it:
    # https://github.com/bluesky-social/atproto/blob/main/lexicons/app/bsky/feed/postgate.json
    # Issue opened about it here: https://github.com/bluesky-social/atproto/issues/3712
    #
    # if "postgate" in data:
    #     if "embeddingRules" in data["postgate"]["record"] and data["postgate"]["record"]["embeddingRules"]:
    #         post["quotes_rule"] = "disallow"
    #     post["quotes_rules_timestamp_utc"], post["quotes_rules_created_at"] = get_dates(data["postgate"]["record"]["createdAt"], locale=locale, source="bluesky")
    #     post["detached_quotes_uris"] = data["postgate"]["record"].get("detachedEmbeddingUris", [])

    # Handle reposts when data comes from a feed
    if repost_data:
        if not repost_data["$type"].endswith("reasonRepost"):
            raise BlueskyPayloadError(
                post["url"],
                "unusual reason for including a post within a feed: %s" % repost_data,
            )

        post["repost_by_user_did"] = repost_data["by"]["did"]
        post["repost_by_user_handle"] = repost_data["by"]["handle"]
        post["repost_timestamp_utc"], post["repost_created_at"] = get_dates(
            repost_data["indexedAt"], locale=locale, source="bluesky"
        )

    post["text"] = text.decode("utf-8")

    if collection_source is not None:
        post["collected_via"] = [collection_source]
        post["match_query"] = collection_source not in ["thread", "quote"]

    if extract_referenced_posts:
        # Handle thread posts when data comes from a feed
        if reply_data:
            if "parent" in reply_data:
                nested = normalize_post(
                    reply_data["parent"],
                    locale=locale,
                    extract_referenced_posts=True,
                    collection_source="thread",
                )
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            # Do not process the root again when it is also the parent
            if "root" in reply_data and (
                "parent" not in reply_data
                or reply_data["parent"]["cid"] != reply_data["root"]["cid"]
            ):
                nested = normalize_post(
                    reply_data["root"],
                    locale=locale,
                    extract_referenced_posts=True,
                    collection_source="thread",
                )
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            if "grandparentAuthor" in reply_data:
                # TODO ? Shall we do anything from that?
                pass

        # Referenced posts come first, sorted by index key, then the post itself
        return [referenced_posts[key] for key in sorted(referenced_posts)] + [
            post
        ]  # type: ignore

    return post  # type: ignore
|