twitwi 0.19.2__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,641 @@
1
+ from copy import deepcopy
2
+ from typing import List, Dict, Union, Optional, Literal, overload
3
+
4
+ from twitwi.exceptions import BlueskyPayloadError
5
+ from twitwi.utils import (
6
+ get_collection_time,
7
+ get_dates,
8
+ custom_normalize_url,
9
+ custom_get_normalized_hostname,
10
+ )
11
+ from twitwi.bluesky.utils import (
12
+ validate_post_payload,
13
+ valid_embed_type,
14
+ format_profile_url,
15
+ format_post_url,
16
+ parse_post_url,
17
+ parse_post_uri,
18
+ format_media_url,
19
+ )
20
+ from twitwi.bluesky.types import BlueskyProfile, BlueskyPost
21
+
22
+
23
def normalize_profile(data: Dict, locale: Optional[str] = None) -> BlueskyProfile:
    """
    Flatten a profile payload returned by Bluesky's API into a profile dict.

    Args:
        data (dict): profile payload. "did", "handle", "createdAt", the three
            "*Count" counters and the "associated" sub-dict are read with
            plain indexing and are therefore required.
        locale (optional): forwarded as-is to `get_dates` for the conversion
            of the account creation date.

    Returns:
        dict: the normalized profile, stamped with the current collection time.

    Raises:
        KeyError: if one of the required keys listed above is missing.

    """
    associated = data["associated"]

    # The pinned post is optional: only its uri is kept when present
    pinned_post_data = data.get("pinnedPost")
    pinned_post_uri = None
    if pinned_post_data is not None:
        pinned_post_uri = pinned_post_data["uri"]

    timestamp_utc, created_at = get_dates(
        data["createdAt"], locale=locale, source="bluesky"
    )

    return {
        "did": data["did"],
        "url": format_profile_url(data["handle"]),
        "handle": data["handle"],
        # displayName, description, avatar and banner are left out of the
        # payload when the account never set them, so they fall back to an
        # empty string instead of raising a KeyError
        "display_name": data.get("displayName", ""),
        "created_at": created_at,
        "timestamp_utc": timestamp_utc,
        "description": data.get("description", ""),
        "avatar": data.get("avatar", ""),
        "posts": data["postsCount"],
        "followers": data["followersCount"],
        "follows": data["followsCount"],
        "lists": associated["lists"],
        "feedgens": associated["feedgens"],
        "starter_packs": associated["starterPacks"],
        "banner": data.get("banner", ""),
        "pinned_post_uri": pinned_post_uri,
        "collection_time": get_collection_time(),
    }
55
+
56
+
57
def prepare_native_gif_as_media(gif_data, user_did, source):
    """
    Describe a native gif card as a media dict.

    The media id is the cid of the card's thumbnail blob, the url is the
    card's own uri, and the thumbnail url is the second value returned by
    `format_media_url` for that blob requested as "image/jpeg".
    """
    thumb_cid = gif_data["thumb"]["ref"]["$link"]
    _unused_url, thumb_url = format_media_url(
        user_did, thumb_cid, "image/jpeg", source
    )

    media = {"id": thumb_cid, "type": "video/gif"}
    media["alt"] = gif_data["title"]
    media["url"] = gif_data["uri"]
    media["thumb"] = thumb_url
    return media
67
+
68
+
69
def prepare_image_as_media(image_data):
    """
    Describe an embedded image as a media dict holding the blob's cid
    ("id"), its mime type ("type") and the image's alt text ("alt").
    """
    blob = image_data["image"]

    media = {"id": blob["ref"]["$link"]}
    media["type"] = blob["mimeType"]
    media["alt"] = image_data["alt"]
    return media
75
+
76
+
77
def prepare_video_as_media(video_data):
    """
    Describe an embedded video blob as a media dict holding its cid ("id")
    and its mime type ("type").
    """
    blob_cid = video_data["ref"]["$link"]
    mime_type = video_data["mimeType"]
    return dict(id=blob_cid, type=mime_type)
82
+
83
+
84
def process_card_data(embed_data, post):
    """
    Copy a link card's metadata onto the post and return that same post.

    Warning: mutates post. The card uri is mandatory, whereas title,
    description and thumbnail default to an empty string when absent.
    """
    post["card_link"] = embed_data["uri"]

    optional_fields = (
        ("card_title", "title"),
        ("card_description", "description"),
        ("card_thumbnail", "thumb"),
    )
    for post_key, card_key in optional_fields:
        post[post_key] = embed_data.get(card_key, "")

    return post
92
+
93
+
94
def prepare_quote_data(embed_quote, card_data, post, links):
    """
    Fill the post's quote identifiers and work out whether the full quoted
    post is available for further normalization.

    Warning: mutates post and links.

    Args:
        embed_quote (dict): reference to the quoted post as found in the
            record's embed; its "cid" and "uri" are read.
        card_data (dict or None): hydrated view of the quoted post, when the
            payload provides one.
        post (dict): post being normalized; its "url" must already be set.
        links: collection of the links found so far in the post.

    Returns:
        tuple: (post, quoted_data, links) where quoted_data is a deep copy of
            card_data, or None when card_data is missing or flagged as detached.

    """
    post["quoted_cid"] = embed_quote["cid"]
    post["quoted_uri"] = embed_quote["uri"]
    post["quoted_user_did"], post["quoted_did"] = parse_post_uri(
        post["quoted_uri"], post["url"]
    )

    # First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
    post["quoted_url"] = format_post_url(post["quoted_user_did"], post["quoted_did"])

    quoted_data = None
    if card_data:
        if card_data.get("detached", False):
            post["quoted_status"] = "detached"

        else:
            # Copied so that later rewrites of the quote do not alter the payload
            quoted_data = deepcopy(card_data)

    # Grab user handle and cleanup links when no quote data but url in text
    if not quoted_data:
        for link in links:
            if link.startswith("https://bsky.app/profile/") and link.endswith(
                post["quoted_did"]
            ):
                # Take better quoted url with user_handle
                post["quoted_url"] = link
                break

        # Remove quoted link from post links, but only when it is really
        # there: if no link of the text matched, quoted_url still holds the
        # did-based fallback url and removing it blindly would raise
        if post["quoted_url"] in links:
            links.remove(post["quoted_url"])

        # Extract user handle from url
        post["quoted_user_handle"], _ = parse_post_url(post["quoted_url"], post["url"])

    return (post, quoted_data, links)
131
+
132
+
133
def merge_nested_posts(referenced_posts, nested, source):
    """
    Fold a list of normalized posts into the index of already referenced
    posts, and return that index.

    Posts are indexed by "<did>_<user_handle>". When a post is already
    indexed, its missing keys are completed, "collected_via" values are
    concatenated and "match_query" values are or-ed. A differing
    "collection_time" is ignored, while any other diverging value raises a
    BlueskyPayloadError attributed to `source`.
    """
    for candidate in nested:
        post_key = "%s_%s" % (candidate["did"], candidate["user_handle"])

        # First time this post is met: index it as is
        if post_key not in referenced_posts:
            referenced_posts[post_key] = candidate
            continue

        known = referenced_posts[post_key]
        for field, incoming in candidate.items():
            if field not in known:
                known[field] = incoming
                continue

            if known[field] != incoming:
                if field == "collected_via":
                    known[field] += incoming
                elif field == "match_query":
                    known[field] = known[field] or incoming
                elif field != "collection_time":
                    raise BlueskyPayloadError(
                        source,
                        "a nested post appearing twice in the same payload has some diverging metadata for key %s: %s / %s"
                        % (field, known[field], incoming),
                    )

    return referenced_posts
155
+
156
+
157
# Typing-only signatures of normalize_post. They use the same parameter names
# as the implementation (the first one is `payload`), so that keyword calls
# accepted by a type checker are also accepted at runtime.


# Default call, or explicit `extract_referenced_posts=False`: a single post.
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str] = ...,
    extract_referenced_posts: Literal[False] = ...,
    collection_source: Optional[str] = ...,
) -> BlueskyPost: ...


# `extract_referenced_posts=True` given as a keyword: a list of posts. The
# parameter has no default here, otherwise a call omitting it (which runs with
# `False`) would also match this signature and be typed as returning a list.
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str] = ...,
    *,
    extract_referenced_posts: Literal[True],
    collection_source: Optional[str] = ...,
) -> List[BlueskyPost]: ...


# `True` given positionally, which requires `locale` to be given as well.
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str],
    extract_referenced_posts: Literal[True],
    collection_source: Optional[str] = ...,
) -> List[BlueskyPost]: ...


# Fallback for a flag only known as a plain bool at type-checking time.
@overload
def normalize_post(
    payload: Dict,
    locale: Optional[str] = ...,
    extract_referenced_posts: bool = ...,
    collection_source: Optional[str] = ...,
) -> Union[BlueskyPost, List[BlueskyPost]]: ...
173
+
174
+
175
def normalize_post(
    payload: Dict,
    locale: Optional[str] = None,
    extract_referenced_posts: bool = False,
    collection_source: Optional[str] = None,
) -> Union[BlueskyPost, List[BlueskyPost]]:
    """
    Function "normalizing" a post as returned by Bluesky's API in order to
    cleanup and optimize some fields.

    Args:
        payload (dict): post or feed payload json dict from Bluesky API. A
            feed payload is recognized by its "post" key and may also hold
            "reply" and "reason" entries.
        locale (optional): Timezone for date conversions, forwarded as-is
            to `get_dates`.
        extract_referenced_posts (bool, optional): Whether to return, in
            addition to the original post, also the full list of posts
            found in the given payload (including the tree of quoted posts
            as well as the parent and root posts of the thread if the post
            comes as an answer to another one). Defaults
            to `False`.
        collection_source (str, optional): string explaining how the post
            was collected. When `None`, the post payload's own
            "collection_source" entry is used if any. Defaults to `None`.

    Returns:
        (dict or list): Either a single post dict or a list of post dicts if
            `extract_referenced_posts` was set to `True`. In the latter case
            the referenced posts come first, sorted by their
            "<did>_<user_handle>" key, and the main post comes last.

    Raises:
        BlueskyPayloadError: if the payload is not a dict, is rejected by
            `validate_post_payload`, or holds inconsistent or unexpected
            content (author did, facets, embed types, quote cid, feed
            reason, diverging duplicates of a referenced post).

    """

    if not isinstance(payload, dict):
        raise BlueskyPayloadError(
            "UNKNOWN", f"data provided to normalize_post is not a dictionary: {payload}"
        )

    valid, error = validate_post_payload(payload)
    if not valid:
        raise BlueskyPayloadError(
            payload.get("uri", payload.get("post", {}).get("uri", "UNKNOWN")),
            f"data provided to normalize_post is not a standard Bluesky post or feed payload:\n{error}",
        )

    # Feed payloads wrap the post and may come with thread and repost info
    if "post" in payload:
        data = payload["post"]
        reply_data = payload.get("reply")
        repost_data = payload.get("reason")
    else:
        data = payload
        reply_data = None
        repost_data = None

    # Index of the posts referenced by this one (quotes, thread parent and
    # root), only filled and returned when extract_referenced_posts is set
    referenced_posts: Dict = {}

    if collection_source is None:
        collection_source = data.get("collection_source")

    post = {}

    # Store original text and prepare text for quotes & medias enriched version
    # (kept as utf-8 bytes since facets positions are expressed in bytes)
    post["original_text"] = data["record"]["text"]
    text = post["original_text"].encode("utf-8")

    # Handle datetime fields
    post["collection_time"] = get_collection_time()
    post["timestamp_utc"], post["local_time"] = get_dates(
        data["record"]["createdAt"], locale=locale, source="bluesky"
    )

    # Handle post/user identifiers
    post["cid"] = data["cid"]
    post["uri"] = data["uri"]
    post["user_did"], post["did"] = parse_post_uri(data["uri"])
    post["user_handle"] = data["author"]["handle"]
    post["user_url"] = format_profile_url(post["user_handle"])
    post["url"] = format_post_url(post["user_handle"], post["did"])

    if post["user_did"] != data["author"]["did"]:
        raise BlueskyPayloadError(
            post["url"],
            "inconsistent user_did between Bluesky post's uri and post's author metadata: %s %s"
            % (data["uri"], data["author"]),
        )

    # Handle user metadata
    # displayName and avatar are left out of the payload when the author never
    # set them, hence the empty string fallbacks.
    # NOTE: the "user_diplay_name" key spelling is part of the output format
    post["user_diplay_name"] = data["author"].get("displayName", "")
    post["user_avatar"] = data["author"].get("avatar", "")
    post["user_timestamp_utc"], post["user_created_at"] = get_dates(
        data["author"]["createdAt"], locale=locale, source="bluesky"
    )
    post["user_langs"] = data["record"].get("langs", [])

    if "bridgyOriginalUrl" in data["record"]:
        post["bridgy_original_url"] = data["record"]["bridgyOriginalUrl"]

    # Handle metrics
    post["repost_count"] = data["repostCount"]
    post["reply_count"] = data["replyCount"]
    post["like_count"] = data["likeCount"]
    post["quote_count"] = data["quoteCount"]

    # Handle hashtags, mentions & links from facets
    post["mentioned_user_handles"] = []
    post["mentioned_user_dids"] = []
    hashtags = set()
    links = set()
    links_to_replace = []
    for facet in data["record"].get("facets", []):
        if len(facet["features"]) != 1:
            raise BlueskyPayloadError(
                post["url"],
                "unusual record facet content with more or less than a unique feature: %s"
                % facet,
            )

        feat = facet["features"][0]

        # Hashtags
        if feat["$type"].endswith("#tag"):
            hashtags.add(feat["tag"].strip().lower())

        # Mentions
        elif feat["$type"].endswith("#mention"):
            # Each mentioned user is only kept once, handles being stored in
            # the same order as dids
            if feat["did"] not in post["mentioned_user_dids"]:
                post["mentioned_user_dids"].append(feat["did"])

                # Check & fix occasional errored mention positioning
                # example: https://bsky.app/profile/snjcgt.bsky.social/post/3lpmqkkkgp52u
                byteStart = facet["index"]["byteStart"]
                if text[byteStart : byteStart + 1] != b"@":
                    byteStart = text.find(b"@", byteStart)

                # The end position is shifted by the same offset as the start
                byteEnd = (
                    facet["index"]["byteEnd"] + byteStart - facet["index"]["byteStart"]
                )
                handle = text[byteStart + 1 : byteEnd].strip().lower().decode("utf-8")
                post["mentioned_user_handles"].append(handle)

        # Links
        elif feat["$type"].endswith("#link"):
            links.add(custom_normalize_url(feat["uri"]))
            links_to_replace.append(
                {
                    "uri": feat["uri"].encode("utf-8"),
                    "start": facet["index"]["byteStart"],
                    "end": facet["index"]["byteEnd"],
                }
            )

        else:
            raise BlueskyPayloadError(
                post["url"], "unusual record facet feature $type: %s" % feat
            )
    post["hashtags"] = sorted(hashtags)

    # Rewrite full links within post's text
    # (from the last one to the first one so that earlier positions stay valid)
    for link in sorted(links_to_replace, key=lambda x: x["start"], reverse=True):
        text = text[: link["start"]] + link["uri"] + text[link["end"] :]

    # Handle thread info when applicable
    # Unfortunately posts' payload only provide at uris for these so we do not have the handles
    # We could sometimes resolve them from the mentioned and quote data but that would not handle most cases
    # Issue opened here to have user handles along: https://github.com/bluesky-social/atproto/issues/3722
    if "reply" in data["record"]:
        if "parent" in data["record"]["reply"]:
            post["to_post_cid"] = data["record"]["reply"]["parent"]["cid"]
            post["to_post_uri"] = data["record"]["reply"]["parent"]["uri"]
            post["to_user_did"], post["to_post_did"] = parse_post_uri(
                post["to_post_uri"], post["url"]
            )
            post["to_post_url"] = format_post_url(
                post["to_user_did"], post["to_post_did"]
            )
        if "root" in data["record"]["reply"]:
            post["to_root_post_cid"] = data["record"]["reply"]["root"]["cid"]
            post["to_root_post_uri"] = data["record"]["reply"]["root"]["uri"]
            post["to_root_user_did"], post["to_root_post_did"] = parse_post_uri(
                post["to_root_post_uri"], post["url"]
            )
            post["to_root_post_url"] = format_post_url(
                post["to_root_user_did"], post["to_root_post_did"]
            )

    # Handle quotes & medias
    media_ids = set()
    post["media_urls"] = []
    post["media_thumbnails"] = []
    post["media_types"] = []
    post["media_alt_texts"] = []
    if "embed" in data["record"]:
        embed = data["record"]["embed"]
        quoted_data = None
        media_data = []
        extra_links = []

        if not valid_embed_type(embed["$type"]):
            raise BlueskyPayloadError(
                post["url"], "unusual record embed $type: %s" % embed
            )

        # Links from cards
        if embed["$type"].endswith(".external"):
            link = embed["external"]["uri"]

            # Handle native gifs as medias
            if link.startswith("https://media.tenor.com/"):
                media_data.append(
                    prepare_native_gif_as_media(
                        embed["external"], post["user_did"], post["url"]
                    )
                )

            # Extra card links sometimes missing from facets & text due to manual action in post form
            else:
                extra_links.append(embed["external"]["uri"])
                # Handle link card metadata
                if "embed" in data:
                    post = process_card_data(data["embed"]["external"], post)

        # Images
        if embed["$type"].endswith(".images"):
            media_data.extend([prepare_image_as_media(i) for i in embed["images"]])

        # Video
        if embed["$type"].endswith(".video"):
            media_data.append(prepare_video_as_media(embed["video"]))

        # Quote
        if embed["$type"].endswith(".record"):
            post, quoted_data, links = prepare_quote_data(
                embed["record"], data.get("embed", {}).get("record"), post, links
            )

        # Quote with medias
        if embed["$type"].endswith(".recordWithMedia"):
            post, quoted_data, links = prepare_quote_data(
                embed["record"]["record"],
                data.get("embed", {}).get("record", {}).get("record"),
                post,
                links,
            )

            # Links from cards
            if embed["media"]["$type"].endswith(".external"):
                link = embed["media"]["external"]["uri"]

                # Handle native gifs as medias
                if link.startswith("https://media.tenor.com/"):
                    media_data.append(
                        prepare_native_gif_as_media(
                            embed["media"]["external"], post["user_did"], post["url"]
                        )
                    )

                # Extra card links sometimes missing from facets & text due to manual action in post form
                else:
                    extra_links = [link] + extra_links
                    # Handle link card metadata: the card is read from the
                    # "external" entry of the embed's "media", so this is the
                    # key whose presence has to be checked
                    if "embed" in data and "external" in data["embed"].get(
                        "media", {}
                    ):
                        post = process_card_data(
                            data["embed"]["media"]["external"], post
                        )

            # Images
            elif embed["media"]["$type"].endswith(".images"):
                media_data.extend(
                    [prepare_image_as_media(i) for i in embed["media"]["images"]]
                )

            # Video
            elif embed["media"]["$type"].endswith(".video"):
                media_data.append(prepare_video_as_media(embed["media"]["video"]))

            else:
                raise BlueskyPayloadError(
                    post["url"],
                    "unusual record embed media $type from a recordWithMedia: %s"
                    % embed,
                )

        # Process extra links
        for link in extra_links:
            norm_link = custom_normalize_url(link)
            if norm_link not in links:
                links.add(norm_link)
                text += b" " + link.encode("utf-8")

        # Process medias
        for media in media_data:
            if media["id"] not in media_ids:
                media_ids.add(media["id"])
                media_type = media["type"]
                # Native gifs already come with their url and thumbnail
                if "url" in media:
                    media_url = media["url"]
                    media_thumb = media["thumb"]
                else:
                    media_url, media_thumb = format_media_url(
                        post["user_did"], media["id"], media_type, post["url"]
                    )
                post["media_urls"].append(media_url)
                post["media_thumbnails"].append(media_thumb)
                post["media_types"].append(media_type)
                post["media_alt_texts"].append(media.get("alt", ""))

                # Rewrite post's text to include links to medias within
                # (videos other than gifs are represented by their thumbnail)
                if media_type.startswith("video") and not media_type.endswith("/gif"):
                    text += b" " + media_thumb.encode("utf-8")
                else:
                    text += b" " + media_url.encode("utf-8")

        # Process quotes
        if quoted_data:
            if quoted_data["cid"] != post["quoted_cid"]:
                raise BlueskyPayloadError(
                    post["url"],
                    "inconsistent quote cid found between record.embed.record.cid & embed.record.cid: %s %s"
                    % (post["quoted_cid"], quoted_data),
                )

            # Reshape the quote view like a regular post payload ("value"
            # becomes "record", the single item of "embeds" becomes "embed")
            # so that it can be normalized recursively
            quoted_data["record"] = quoted_data["value"]
            del quoted_data["value"]
            if "embeds" in quoted_data and len(quoted_data["embeds"]):
                if len(quoted_data["embeds"]) != 1:
                    raise BlueskyPayloadError(
                        post["url"],
                        "unusual multiple embeds found within a quoted post: %s"
                        % quoted_data["embeds"],
                    )
                quoted_data["embed"] = quoted_data["embeds"][0]
                del quoted_data["embeds"]

            nested = normalize_post(
                quoted_data,
                locale=locale,
                extract_referenced_posts=True,
                collection_source="quote",
            )
            # The quoted post itself always comes last in the returned list
            quoted = nested[-1]
            if extract_referenced_posts:
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            # Take better quoted url with user_handle
            post["quoted_url"] = quoted["url"]
            post["quoted_user_handle"] = quoted["user_handle"]
            post["quoted_created_at"] = quoted["local_time"]
            post["quoted_timestamp_utc"] = quoted["timestamp_utc"]

            # Remove quoted link from post links if present in text
            if quoted["url"] in links:
                links.remove(quoted["url"])

            # Rewrite post's text to include quote within (or replace the link to the quote if present)
            quote = (
                "« @%s: %s — %s »"
                % (quoted["user_handle"], quoted["text"], quoted["url"])
            ).encode("utf-8")
            url_lower = quoted["url"].encode("utf-8").lower()
            text_lower = text.lower()
            if url_lower in text_lower:
                url_pos = text_lower.find(url_lower)
                # text is a bytes buffer: skip the url's length in bytes, which
                # differs from its length in characters for non-ascii urls
                text = text[:url_pos] + quote + text[url_pos + len(url_lower) :]
            else:
                text += b" " + quote

    # Process links domains
    post["links"] = sorted(links)
    post["domains"] = [custom_get_normalized_hostname(link) for link in post["links"]]

    # Handle threadgates (replies rules)
    # WARNING: quoted posts do not seem to include threadgates info
    # Issue opened about it here: https://github.com/bluesky-social/atproto/issues/3716
    if "threadgate" in data:
        post["replies_rules"] = []
        if "allow" in data["threadgate"]["record"]:
            for rule in data["threadgate"]["record"]["allow"]:
                rule_string = (
                    "allow_from_" + rule["$type"].split("#")[1].split("Rule")[0]
                )
                if rule_string.endswith("_list") and "list" in rule:
                    allowed_lists = rule["list"]
                    # A rule may hold a single list uri as a plain string
                    # (presumably the usual case - TODO confirm against the
                    # threadgate lexicon), which must not be iterated
                    # character by character
                    if isinstance(allowed_lists, str):
                        allowed_lists = [allowed_lists]
                    for allowed_list in allowed_lists:
                        post["replies_rules"].append(rule_string + ":" + allowed_list)
                else:
                    post["replies_rules"].append(rule_string)
            # An "allow" entry without any rule means nobody can reply
            if not data["threadgate"]["record"]["allow"]:
                post["replies_rules"].append("disallow")
        (
            post["replies_rules_timestamp_utc"],
            post["replies_rules_created_at"],
        ) = get_dates(
            data["threadgate"]["record"]["createdAt"], locale=locale, source="bluesky"
        )
        post["hidden_replies_uris"] = data["threadgate"]["record"].get(
            "hiddenReplies", []
        )

    # Handle postgates (quotes rules)
    #
    # Users can forbid others to quote a post, but payloads do not seem to
    # include it yet although the API spec documents it:
    # https://github.com/bluesky-social/atproto/blob/main/lexicons/app/bsky/feed/postgate.json
    # Issue opened about it here: https://github.com/bluesky-social/atproto/issues/3712
    #
    # if "postgate" in data:
    #     if "embeddingRules" in data["postgate"]["record"] and data["postgate"]["record"]["embeddingRules"]:
    #         post["quotes_rule"] = "disallow"
    #     post["quotes_rules_timestamp_utc"], post["quotes_rules_created_at"] = get_dates(data["postgate"]["record"]["createdAt"], locale=locale, source="bluesky")
    #     post["detached_quotes_uris"] = data["postgate"]["record"].get("detachedEmbeddingUris", [])

    # Handle reposts when data comes from a feed
    if repost_data:
        if not repost_data["$type"].endswith("reasonRepost"):
            raise BlueskyPayloadError(
                post["url"],
                "unusual reason for including a post within a feed: %s" % repost_data,
            )

        post["repost_by_user_did"] = repost_data["by"]["did"]
        post["repost_by_user_handle"] = repost_data["by"]["handle"]
        post["repost_timestamp_utc"], post["repost_created_at"] = get_dates(
            repost_data["indexedAt"], locale=locale, source="bluesky"
        )

    post["text"] = text.decode("utf-8")

    if collection_source is not None:
        post["collected_via"] = [collection_source]
        # Posts only reached through a thread or a quote did not match the query themselves
        post["match_query"] = collection_source not in ["thread", "quote"]

    if extract_referenced_posts:
        # Handle thread posts when data comes from a feed
        if reply_data:
            if "parent" in reply_data:
                nested = normalize_post(
                    reply_data["parent"],
                    locale=locale,
                    extract_referenced_posts=True,
                    collection_source="thread",
                )
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            # The root is skipped when it is the parent which was just handled
            if "root" in reply_data and (
                "parent" not in reply_data
                or reply_data["parent"]["cid"] != reply_data["root"]["cid"]
            ):
                nested = normalize_post(
                    reply_data["root"],
                    locale=locale,
                    extract_referenced_posts=True,
                    collection_source="thread",
                )
                referenced_posts = merge_nested_posts(
                    referenced_posts, nested, post["url"]
                )

            if "grandparentAuthor" in reply_data:
                # TODO ? Shall we do anything from that?
                pass

        # Referenced posts first, sorted by their key, then the main post
        return [referenced_posts[key] for key in sorted(referenced_posts)] + [
            post
        ]  # type: ignore

    return post  # type: ignore