twitwi 0.20.0__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,686 @@
1
+ from copy import deepcopy
2
+ from typing import List, Dict, Union, Optional, Literal, overload
3
+
4
+ from twitwi.exceptions import BlueskyPayloadError
5
+ from twitwi.utils import (
6
+ get_collection_time,
7
+ get_dates,
8
+ custom_normalize_url,
9
+ custom_get_normalized_hostname,
10
+ )
11
+ from twitwi.bluesky.utils import (
12
+ validate_post_payload,
13
+ valid_embed_type,
14
+ format_profile_url,
15
+ format_post_url,
16
+ parse_post_url,
17
+ parse_post_uri,
18
+ format_starterpack_url,
19
+ format_media_url,
20
+ )
21
+ from twitwi.bluesky.types import BlueskyProfile, BlueskyPost
22
+
23
+
24
def normalize_profile(data: Dict, locale: Optional[str] = None) -> BlueskyProfile:
    """
    Function "normalizing" a profile as returned by Bluesky's API into a flat
    dict.

    Args:
        data (dict): profile payload json dict from Bluesky API.
        locale (optional): timezone forwarded to `get_dates` for the
            conversion of the profile's creation date.

    Returns:
        dict: the flattened profile (identifiers, url, display metadata,
            creation dates, counters, pinned post uri and collection time).

    Raises:
        KeyError: if one of the fields read without a default (`did`,
            `handle`, `createdAt`, `postsCount`, `followersCount`,
            `followsCount`) is missing from the payload.

    """
    # User-editable fields are read with a default, consistently with
    # displayName and avatar: presumably the API leaves them out when the user
    # never filled them (see the app.bsky.actor.defs lexicon).
    associated = data.get("associated") or {}

    pinned_post_data = data.get("pinnedPost")
    pinned_post_uri = None
    if pinned_post_data is not None:
        pinned_post_uri = pinned_post_data["uri"]

    timestamp_utc, created_at = get_dates(
        data["createdAt"], locale=locale, source="bluesky"
    )

    return {
        "did": data["did"],
        "url": format_profile_url(data["handle"]),
        "handle": data["handle"],
        "display_name": data.get("displayName", ""),
        "created_at": created_at,
        "timestamp_utc": timestamp_utc,
        "description": data.get("description", ""),
        "avatar": data.get("avatar", ""),
        "posts": data["postsCount"],
        "followers": data["followersCount"],
        "follows": data["followsCount"],
        "lists": associated.get("lists", 0),
        "feedgens": associated.get("feedgens", 0),
        "starter_packs": associated.get("starterPacks", 0),
        "banner": data.get("banner", ""),
        "pinned_post_uri": pinned_post_uri,
        "collection_time": get_collection_time(),
    }
56
+
57
+
58
def prepare_native_gif_as_media(gif_data, user_did, source):
    """
    Build the media dict describing a gif embedded as an external card.

    Args:
        gif_data (dict): the external embed; `title` and `uri` are read
            directly, `thumb` only when present.
        user_did (str): forwarded to `format_media_url` to build the thumbnail url.
        source (str): forwarded to `format_media_url` as its last argument.

    Returns:
        dict: media with keys `id`, `type`, `alt`, `url` and `thumb`.

    """
    if "thumb" not in gif_data:
        # No thumbnail blob: both the id and the thumbnail url stay empty
        cid, thumb_url = "", ""
    else:
        cid = gif_data["thumb"]["ref"]["$link"]
        _, thumb_url = format_media_url(user_did, cid, "image/jpeg", source)

    return dict(
        id=cid,
        type="video/gif",
        alt=gif_data["title"],
        url=gif_data["uri"],
        thumb=thumb_url,
    )
73
+
74
+
75
def prepare_image_as_media(image_data):
    """
    Build the media dict of an embedded image.

    Args:
        image_data (dict): image embed holding an `image` blob (with
            `ref.$link` and `mimeType`) and an `alt` text.

    Returns:
        dict: media with keys `id`, `type` and `alt`.

    """
    blob = image_data["image"]
    return dict(
        id=blob["ref"]["$link"],
        type=blob["mimeType"],
        alt=image_data["alt"],
    )
81
+
82
+
83
def prepare_video_as_media(video_data):
    """
    Build the media dict of an embedded video.

    Args:
        video_data (dict): video blob holding `ref.$link` and `mimeType`.

    Returns:
        dict: media with keys `id` and `type`.

    """
    cid = video_data["ref"]["$link"]
    mime_type = video_data["mimeType"]
    return {"id": cid, "type": mime_type}
88
+
89
+
90
def process_starterpack_card(embed_data, post):
    """
    Fill the card fields of a post from an embedded starter pack.

    Warning: mutates post

    Args:
        embed_data (dict): starter pack embed; `uri` is mandatory while
            `record` and `creator` are optional.
        post (dict): post being normalized.

    Returns:
        dict: the same `post` object, with `card_link`, `card_title`,
            `card_description` and `card_thumbnail` set.

    """
    record = embed_data.get("record", {})
    creator_did, pack_did = parse_post_uri(embed_data["uri"])

    # Use the creator's handle in the link when available, its did otherwise
    creator_handle = embed_data.get("creator", {}).get("handle")
    post["card_link"] = format_starterpack_url(creator_handle or creator_did, pack_did)

    for post_field, record_field in (
        ("card_title", "name"),
        ("card_description", "description"),
        ("card_thumbnail", "thumb"),
    ):
        post[post_field] = record.get(record_field, "")

    return post
100
+
101
+
102
def process_card_data(embed_data, post):
    """
    Fill the card fields of a post from an external link card.

    Warning: mutates post

    Args:
        embed_data (dict): link card; `uri` is mandatory while `title`,
            `description` and `thumb` default to an empty string.
        post (dict): post being normalized.

    Returns:
        dict: the same `post` object, with `card_link`, `card_title`,
            `card_description` and `card_thumbnail` set.

    """
    post["card_link"] = embed_data["uri"]

    for post_field, card_field in (
        ("card_title", "title"),
        ("card_description", "description"),
        ("card_thumbnail", "thumb"),
    ):
        post[post_field] = embed_data.get(card_field, "")

    return post
110
+
111
+
112
def prepare_quote_data(embed_quote, card_data, post, links):
    """
    Store the identifiers of a quoted post and pick the data to normalize it.

    Warning: mutates post and links

    Args:
        embed_quote (dict): quote reference holding the `cid` and `uri` of
            the quoted post.
        card_data (dict or None): full data of the quoted post when
            available; may carry a truthy `detached` flag.
        post (dict): post being normalized (its `url` is forwarded to the
            uri/url parsers).
        links (collection of str): links found so far in the post.

    Returns:
        tuple: `(post, quoted_data, links)` where `quoted_data` is a deep copy
            of `card_data`, or `None` when it is missing or detached.

    """
    post["quoted_cid"] = embed_quote["cid"]
    post["quoted_uri"] = embed_quote["uri"]
    quoted_user_did, quoted_did = parse_post_uri(post["quoted_uri"], post["url"])
    post["quoted_user_did"] = quoted_user_did
    post["quoted_did"] = quoted_did

    # First store ugly quoted url with user did in case full quote data is
    # missing (recursion > 3 or detached quote)
    post["quoted_url"] = format_post_url(quoted_user_did, quoted_did)

    quoted_data = None
    if card_data and card_data.get("detached", False):
        post["quoted_status"] = "detached"
    elif card_data:
        quoted_data = deepcopy(card_data)

    if quoted_data:
        return (post, quoted_data, links)

    # No quote data: grab user handle and cleanup links when the url is in text
    handle_url = next(
        (
            candidate
            for candidate in links
            if candidate.startswith("https://bsky.app/profile/")
            and candidate.endswith(quoted_did)
        ),
        None,
    )
    if handle_url is not None:
        # Take better quoted url with user_handle
        post["quoted_url"] = handle_url

    # Remove quoted link from post links
    if post["quoted_url"] in links:
        links.remove(post["quoted_url"])

    # Extract user handle from url
    if "did:plc:" not in post["quoted_url"]:
        post["quoted_user_handle"], _ = parse_post_url(post["quoted_url"], post["url"])

    return (post, quoted_data, links)
151
+
152
+
153
def merge_nested_posts(referenced_posts, nested, source):
    """
    Merge a list of normalized posts into the index of referenced posts.

    Args:
        referenced_posts (dict): index of the posts met so far, keyed by
            "<did>_<user_handle>". Mutated in place.
        nested (list): normalized posts to merge into the index.
        source: first argument given to `BlueskyPayloadError` on conflict.

    Returns:
        dict: the same `referenced_posts` object.

    Raises:
        BlueskyPayloadError: if a post already indexed comes back with a
            different value for any key other than `collected_via`,
            `match_query` and `collection_time`.

    """
    for candidate in nested:
        post_key = "%s_%s" % (candidate["did"], candidate["user_handle"])

        if post_key not in referenced_posts:
            referenced_posts[post_key] = candidate
            continue

        known = referenced_posts[post_key]
        for field, value in candidate.items():
            if field not in known:
                known[field] = value
                continue

            if known[field] != value:
                if field == "collected_via":
                    # Accumulate every way the post was collected
                    known[field] += value
                elif field == "match_query":
                    known[field] = known[field] or value
                elif field != "collection_time":
                    raise BlueskyPayloadError(
                        source,
                        "a nested post appearing twice in the same payload has some diverging metadata for key %s: %s / %s"
                        % (field, known[field], value),
                    )

    return referenced_posts
175
+
176
+
177
# Typing overloads of normalize_post: the return type depends on the value of
# `extract_referenced_posts`. The first parameter is named `payload` to match
# the implementation, so that keyword calls type-check the way they run.


# Default call, or explicit `extract_referenced_posts=False`: a single post.
# Declared first so that a call omitting the flag resolves to this overload.
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str] = ...,
    extract_referenced_posts: Literal[False] = ...,
    collection_source: Optional[str] = ...,
) -> BlueskyPost: ...


# `extract_referenced_posts=True` given as a keyword: a list of posts.
# The flag has no default here, otherwise a call omitting it would also match.
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str] = ...,
    *,
    extract_referenced_posts: Literal[True],
    collection_source: Optional[str] = ...,
) -> List[BlueskyPost]: ...


# `extract_referenced_posts=True` given positionally (locale is then required).
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str],
    extract_referenced_posts: Literal[True],
    collection_source: Optional[str] = ...,
) -> List[BlueskyPost]: ...
193
+
194
+
195
def normalize_post(
    payload: Dict,
    locale: Optional[str] = None,
    extract_referenced_posts: bool = False,
    collection_source: Optional[str] = None,
) -> Union[BlueskyPost, List[BlueskyPost]]:
    """
    Function "normalizing" a post as returned by Bluesky's API in order to
    cleanup and optimize some fields.

    Args:
        payload (dict): post or feed payload json dict from Bluesky API.
        locale (pytz.timezone, optional): Timezone for date conversions.
        extract_referenced_posts (bool, optional): Whether to return, in
            addition to the original post, also the full list of posts
            found in the given payload (including the tree of quoted posts
            as well as the parent and root posts of the thread if the post
            comes as an answer to another one). Defaults
            to `False`.
        collection_source (str, optional): string explaining how the post
            was collected. Defaults to `None`, in which case the payload's
            own `collection_source` key is used when present.

    Returns:
        (dict or list): Either a single post dict or a list of post dicts if
            `extract_referenced_posts` was set to `True`. In the latter case
            the referenced posts come first, sorted by their
            "<did>_<user_handle>" key, and the main post comes last.

    Raises:
        BlueskyPayloadError: if the payload is not a dict, does not pass
            `validate_post_payload`, or holds inconsistent or unexpected
            content (identifiers, facets, embeds, feed reason).

    """

    if not isinstance(payload, dict):
        raise BlueskyPayloadError(
            "UNKNOWN", f"data provided to normalize_post is not a dictionary: {payload}"
        )

    valid, error = validate_post_payload(payload)
    if not valid:
        raise BlueskyPayloadError(
            payload.get("uri", payload.get("post", {}).get("uri", "UNKNOWN")),
            f"data provided to normalize_post is not a standard Bluesky post or feed payload:\n{error}",
        )

    # Feed payloads wrap the post along with optional reply & repost contexts
    if "post" in payload:
        data = payload["post"]
        reply_data = payload.get("reply")
        repost_data = payload.get("reason")
    else:
        data = payload
        reply_data = None
        repost_data = None

    # Index of the posts referenced by this one (only filled and returned when
    # extract_referenced_posts is set)
    referenced_posts: Dict = {}

    if collection_source is None:
        collection_source = data.get("collection_source")

    post = {}

    # Store original text and prepare text for quotes & medias enriched version
    # (handled as bytes since facets positions are expressed in bytes)
    post["original_text"] = data["record"]["text"]
    text = post["original_text"].encode("utf-8")

    # Handle datetime fields
    post["collection_time"] = get_collection_time()
    post["timestamp_utc"], post["local_time"] = get_dates(
        data["record"]["createdAt"], locale=locale, source="bluesky"
    )

    # Handle post/user identifiers
    post["cid"] = data["cid"]
    post["uri"] = data["uri"]
    post["user_did"], post["did"] = parse_post_uri(data["uri"])
    post["user_handle"] = data["author"]["handle"]
    post["user_url"] = format_profile_url(post["user_handle"])
    post["url"] = format_post_url(post["user_handle"], post["did"])

    if post["user_did"] != data["author"]["did"]:
        raise BlueskyPayloadError(
            post["url"],
            "inconsistent user_did between Bluesky post's uri and post's author metadata: %s %s"
            % (data["uri"], data["author"]),
        )

    # Handle user metadata
    # NOTE(review): "user_diplay_name" looks like a typo of "user_display_name",
    # it is left untouched since it is part of the returned dict's keys
    post["user_diplay_name"] = data["author"].get("displayName", "")
    post["user_avatar"] = data["author"].get("avatar", "")
    post["user_timestamp_utc"], post["user_created_at"] = get_dates(
        data["author"]["createdAt"], locale=locale, source="bluesky"
    )
    post["user_langs"] = data["record"].get("langs", [])

    if "bridgyOriginalUrl" in data["record"]:
        post["bridgy_original_url"] = data["record"]["bridgyOriginalUrl"]

    # Handle metrics
    post["repost_count"] = data["repostCount"]
    post["reply_count"] = data["replyCount"]
    post["like_count"] = data["likeCount"]
    post["quote_count"] = data["quoteCount"]

    # Handle hashtags, mentions & links from facets
    post["mentioned_user_handles"] = []
    post["mentioned_user_dids"] = []
    hashtags = set()
    links = set()
    links_to_replace = []
    for facet in data["record"].get("facets", []):
        if len(facet["features"]) != 1:
            raise BlueskyPayloadError(
                post["url"],
                "unusual record facet content with more or less than a unique feature: %s"
                % facet,
            )

        feat = facet["features"][0]

        # Hashtags
        if feat["$type"].endswith("#tag"):
            hashtags.add(feat["tag"].strip().lower())

        # Mentions
        elif feat["$type"].endswith("#mention"):
            if feat["did"] not in post["mentioned_user_dids"]:
                post["mentioned_user_dids"].append(feat["did"])

                # Check & fix occasional errored mention positioning
                # example: https://bsky.app/profile/snjcgt.bsky.social/post/3lpmqkkkgp52u
                byteStart = facet["index"]["byteStart"]
                if text[byteStart : byteStart + 1] != b"@":
                    byteStart = text.find(b"@", byteStart)

                # The end position is shifted by the same offset as the start
                handle = (
                    text[byteStart + 1 : facet["index"]["byteEnd"] + byteStart - facet["index"]["byteStart"]]
                    .strip()
                    .lower()
                    .decode("utf-8")
                )
                post["mentioned_user_handles"].append(handle)

        # Links
        elif feat["$type"].endswith("#link"):
            # Handle native polls: only the link ending with /0 is kept
            if "https://poll.blue/" in feat["uri"]:
                if feat["uri"].endswith("/0"):
                    links.add(custom_normalize_url(feat["uri"]))
                    text += b" %s" % feat["uri"].encode("utf-8")
                continue

            links.add(custom_normalize_url(feat["uri"]))
            # Check & fix occasional errored link positioning
            # example: https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
            byteStart = facet["index"]["byteStart"]
            if b" " in text[byteStart : facet["index"]["byteEnd"]]:
                byteStart = text.find(b"http", byteStart)

            links_to_replace.append(
                {
                    "uri": feat["uri"].encode("utf-8"),
                    "start": byteStart,
                    "end": byteStart - facet["index"]["byteStart"] + facet["index"]["byteEnd"],
                }
            )

        elif feat["$type"].endswith("#bold"):
            pass
        elif feat["$type"].endswith("#option"):
            pass
        else:
            raise BlueskyPayloadError(
                post["url"], "unusual record facet feature $type: %s" % feat
            )
    post["hashtags"] = sorted(hashtags)

    # Rewrite full links within post's text
    # (from the end of the text so that earlier positions remain valid)
    for link in sorted(links_to_replace, key=lambda x: x["start"], reverse=True):
        if link["start"] < 0:
            # Link could not be located in the text: append it instead
            text = text + b" " + link["uri"]
        else:
            text = text[: link["start"]] + link["uri"] + text[link["end"] :]

    # Handle thread info when applicable
    # Unfortunately posts' payload only provide at uris for these so we do not have the handles
    # We could sometimes resolve them from the mentionned and quote data but that would not handle most cases
    # Issue opened here to have user handles along: https://github.com/bluesky-social/atproto/issues/3722
    if "reply" in data["record"]:
        if "parent" in data["record"]["reply"]:
            post["to_post_cid"] = data["record"]["reply"]["parent"]["cid"]
            post["to_post_uri"] = data["record"]["reply"]["parent"]["uri"]
            post["to_user_did"], post["to_post_did"] = parse_post_uri(
                post["to_post_uri"], post["url"]
            )
            post["to_post_url"] = format_post_url(
                post["to_user_did"], post["to_post_did"]
            )
        if "root" in data["record"]["reply"]:
            post["to_root_post_cid"] = data["record"]["reply"]["root"]["cid"]
            post["to_root_post_uri"] = data["record"]["reply"]["root"]["uri"]
            post["to_root_user_did"], post["to_root_post_did"] = parse_post_uri(
                post["to_root_post_uri"], post["url"]
            )
            post["to_root_post_url"] = format_post_url(
                post["to_root_user_did"], post["to_root_post_did"]
            )

    # Handle quotes & medias
    media_ids = set()
    post["media_urls"] = []
    post["media_thumbnails"] = []
    post["media_types"] = []
    post["media_alt_texts"] = []
    if "embed" in data["record"]:
        embed = data["record"]["embed"]
        quoted_data = None
        media_data = []
        extra_links = []

        if not valid_embed_type(embed["$type"]):
            raise BlueskyPayloadError(
                post["url"], "unusual record embed $type: %s" % embed
            )

        # Links from cards
        if embed["$type"].endswith(".external"):
            link = embed["external"]["uri"]

            # Handle native gifs as medias
            if link.startswith("https://media.tenor.com/"):
                media_data.append(
                    prepare_native_gif_as_media(
                        embed["external"], post["user_did"], post["url"]
                    )
                )

            # Extra card links sometimes missing from facets & text due to manual action in post form
            else:
                extra_links.append(embed["external"]["uri"])
                # Handle link card metadata
                if "embed" in data:
                    post = process_card_data(data["embed"]["external"], post)

        # Images
        if embed["$type"].endswith(".images"):
            media_data.extend([prepare_image_as_media(i) for i in embed["images"]])

        # Video
        if embed["$type"].endswith(".video"):
            media_data.append(prepare_video_as_media(embed["video"]))

        # Quote & Starter-packs
        if embed["$type"].endswith(".record"):
            if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
                # The hydrated view of the starter pack can be missing from
                # the payload, in which case there is no card to extract
                starterpack_view = data.get("embed", {}).get("record")
                if starterpack_view:
                    post = process_starterpack_card(starterpack_view, post)
                    if post["card_link"]:
                        extra_links.append(post["card_link"])
            else:
                post, quoted_data, links = prepare_quote_data(
                    embed["record"], data.get("embed", {}).get("record"), post, links
                )

        # Quote with medias
        if embed["$type"].endswith(".recordWithMedia"):
            post, quoted_data, links = prepare_quote_data(
                embed["record"]["record"],
                data.get("embed", {}).get("record", {}).get("record"),
                post,
                links,
            )

            # Links from cards
            if embed["media"]["$type"].endswith(".external"):
                link = embed["media"]["external"]["uri"]

                # Handle native gifs as medias
                if link.startswith("https://media.tenor.com/"):
                    media_data.append(
                        prepare_native_gif_as_media(
                            embed["media"]["external"], post["user_did"], post["url"]
                        )
                    )

                # Extra card links sometimes missing from facets & text due to manual action in post form
                else:
                    extra_links = [link] + extra_links
                    # Handle link card metadata: the card lives under the
                    # "external" key of the hydrated media
                    if "embed" in data and "external" in data["embed"].get("media", {}):
                        post = process_card_data(
                            data["embed"]["media"]["external"], post
                        )

            # Images
            elif embed["media"]["$type"].endswith(".images"):
                media_data.extend(
                    [prepare_image_as_media(i) for i in embed["media"]["images"]]
                )

            # Video
            elif embed["media"]["$type"].endswith(".video"):
                media_data.append(prepare_video_as_media(embed["media"]["video"]))

            else:
                raise BlueskyPayloadError(
                    post["url"],
                    "unusual record embed media $type from a recordWithMedia: %s"
                    % embed,
                )

        # Process extra links
        for link in extra_links:
            norm_link = custom_normalize_url(link)
            if norm_link not in links:
                links.add(norm_link)
                text += b" " + link.encode("utf-8")

        # Process medias
        for media in media_data:
            if media["id"] not in media_ids:
                media_ids.add(media["id"])
                media_type = media["type"]
                if "url" in media:
                    media_url = media["url"]
                    media_thumb = media["thumb"]
                else:
                    media_url, media_thumb = format_media_url(
                        post["user_did"], media["id"], media_type, post["url"]
                    )
                post["media_urls"].append(media_url)
                post["media_thumbnails"].append(media_thumb)
                post["media_types"].append(media_type)
                post["media_alt_texts"].append(media.get("alt", ""))

                # Rewrite post's text to include links to medias within
                # (the thumbnail for videos other than gifs, the media url otherwise)
                text += b" " + (
                    media_thumb if media_type.startswith("video") and not media_type.endswith("/gif") else media_url
                ).encode("utf-8")

        # Process quotes
        if quoted_data and "value" in quoted_data:
            if quoted_data["cid"] != post["quoted_cid"]:
                raise BlueskyPayloadError(
                    post["url"],
                    "inconsistent quote cid found between record.embed.record.cid & embed.record.cid: %s %s"
                    % (post["quoted_cid"], quoted_data),
                )

            # Reshape the quoted data like a regular post payload before
            # normalizing it recursively
            quoted_data["record"] = quoted_data["value"]
            del quoted_data["value"]
            if "embeds" in quoted_data and len(quoted_data["embeds"]):
                if len(quoted_data["embeds"]) != 1:
                    raise BlueskyPayloadError(
                        post["url"],
                        "unusual multiple embeds found within a quoted post: %s"
                        % quoted_data["embeds"],
                    )
                quoted_data["embed"] = quoted_data["embeds"][0]
                del quoted_data["embeds"]

            nested = normalize_post(
                quoted_data,
                locale=locale,
                extract_referenced_posts=True,
                collection_source="quote",
            )
            # The quoted post itself comes last in the returned list
            quoted = nested[-1]
            if extract_referenced_posts:
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            # Take better quoted url with user_handle
            post["quoted_url"] = quoted["url"]
            post["quoted_user_handle"] = quoted["user_handle"]
            post["quoted_created_at"] = quoted["local_time"]
            post["quoted_timestamp_utc"] = quoted["timestamp_utc"]

            # Remove quoted link from post links if present in text
            if quoted["url"] in links:
                links.remove(quoted["url"])

            # Rewrite post's text to include quote within (or replace the link to the quote if present)
            quote = (
                "« @%s: %s — %s »"
                % (quoted["user_handle"], quoted["text"], quoted["url"])
            ).encode("utf-8")
            url_lower = quoted["url"].encode("utf-8").lower()
            text_lower = text.lower()
            if url_lower in text_lower:
                url_pos = text_lower.find(url_lower)
                # text is bytes: skip the byte length of the url, not its str length
                text = text[:url_pos] + quote + text[url_pos + len(url_lower) :]
            else:
                text += b" " + quote

    # Process links domains
    post["links"] = sorted(links)
    post["domains"] = [custom_get_normalized_hostname(link) for link in post["links"]]

    # Handle threadgates (replies rules)
    # WARNING: quoted posts do not seem to include threadgates info
    # Issue opened about it here: https://github.com/bluesky-social/atproto/issues/3716
    if "threadgate" in data:
        post["replies_rules"] = []
        if "allow" in data["threadgate"]["record"]:
            for rule in data["threadgate"]["record"]["allow"]:
                rule_string = (
                    "allow_from_" + rule["$type"].split("#")[1].split("Rule")[0]
                )
                if rule_string.endswith("_list") and "list" in rule:
                    for allowed_list in rule["list"]:
                        post["replies_rules"].append(rule_string + ":" + allowed_list)
                else:
                    post["replies_rules"].append(rule_string)
            # An empty list of allow rules is reported as "disallow"
            if not data["threadgate"]["record"]["allow"]:
                post["replies_rules"].append("disallow")
        (
            post["replies_rules_timestamp_utc"],
            post["replies_rules_created_at"],
        ) = get_dates(
            data["threadgate"]["record"]["createdAt"], locale=locale, source="bluesky"
        )
        post["hidden_replies_uris"] = data["threadgate"]["record"].get(
            "hiddenReplies", []
        )

    # Handle postgates (quotes rules)
    #
    # Users can forbid others to quote a post, but payloads do not seem to
    # include it yet although the API spec documents it:
    # https://github.com/bluesky-social/atproto/blob/main/lexicons/app/bsky/feed/postgate.json
    # Issue opened about it here: https://github.com/bluesky-social/atproto/issues/3712
    #
    # if "postgate" in data:
    #     if "embeddingRules" in data["postgate"]["record"] and data["postgate"]["record"]["embeddingRules"]:
    #         post["quotes_rule"] = "disallow"
    #     post["quotes_rules_timestamp_utc"], post["quotes_rules_created_at"] = get_dates(data["postgate"]["record"]["createdAt"], locale=locale, source="bluesky")
    #     post["detached_quotes_uris"] = data["postgate"]["record"].get("detachedEmbeddingUris", [])

    # Handle reposts when data comes from a feed
    if repost_data:
        if not repost_data["$type"].endswith("reasonRepost"):
            raise BlueskyPayloadError(
                post["url"],
                "unusual reason for including a post within a feed: %s" % repost_data,
            )

        post["repost_by_user_did"] = repost_data["by"]["did"]
        post["repost_by_user_handle"] = repost_data["by"]["handle"]
        post["repost_timestamp_utc"], post["repost_created_at"] = get_dates(
            repost_data["indexedAt"], locale=locale, source="bluesky"
        )

    post["text"] = text.decode("utf-8")

    if collection_source is not None:
        post["collected_via"] = [collection_source]
        post["match_query"] = collection_source not in ["thread", "quote"]

    if extract_referenced_posts:
        # Handle thread posts when data comes from a feed
        if reply_data:
            if "parent" in reply_data:
                nested = normalize_post(
                    reply_data["parent"],
                    locale=locale,
                    extract_referenced_posts=True,
                    collection_source="thread",
                )
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            # The root is skipped when it is the parent already handled above
            if "root" in reply_data and (
                "parent" not in reply_data
                or reply_data["parent"]["cid"] != reply_data["root"]["cid"]
            ):
                nested = normalize_post(
                    reply_data["root"],
                    locale=locale,
                    extract_referenced_posts=True,
                    collection_source="thread",
                )
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            if "grandparentAuthor" in reply_data:
                # TODO ? Shall we do anything from that?
                pass

        # Referenced posts first (sorted by their key), main post last
        return [referenced_posts[did] for did in sorted(referenced_posts.keys())] + [
            post
        ]  # type: ignore

    return post  # type: ignore