PyPI - twitwi - Versions diffs - 0.23.0__py3-none-any.whl → 0.24.0__py3-none-any.whl - Mend

twitwi 0.23.0-py3-none-any.whl → 0.24.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

test/bluesky/formatters_test.py +4 -2
twitwi/bluesky/normalizers.py +379 -76
twitwi/bluesky/utils.py +30 -14
twitwi/exceptions.py +1 -1
twitwi/formatters.py +16 -3
twitwi/utils.py +15 -9
{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/METADATA +3 -3
{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/RECORD +12 -12
{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/WHEEL +1 -1
{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/licenses/LICENSE.txt +0 -0
{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/top_level.txt +0 -0
{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/zip-safe +0 -0

test/bluesky/formatters_test.py CHANGED Viewed

@@ -112,7 +112,9 @@ class TestFormatters:
         for source in normalized_posts:
             for post in source:
-                writer.writerow(format_post_as_csv_row(post))
+                writer.writerow(
+                    format_post_as_csv_row(post, allow_erroneous_plurals=True)
+                )
         if OVERWRITE_TESTS:
             written = buffer.getvalue()
@@ -140,7 +142,7 @@ class TestFormatters:
         for source in normalized_posts:
             for post in source:
-                transform_post_into_csv_dict(post)
+                transform_post_into_csv_dict(post, allow_erroneous_plurals=True)
                 writer.writerow(post)
         with open_resource("bluesky-posts-export.csv") as f:

twitwi/bluesky/normalizers.py CHANGED Viewed

@@ -99,14 +99,33 @@ def prepare_native_gif_as_media(gif_data, user_did, source):
     }
-def prepare_image_as_media(image_data):
-    if "ref" not in image_data["image"] or "$link" not in image_data["image"]["ref"]:
-        image_id = image_data["image"]["cid"]
+def prepare_image_as_media(image_data, source):
+    if isinstance(image_data["image"], str):
+        # As in this post: https://bsky.app/profile/did:plc:xafmeedgq77f6smn6kmalasr/post/3lcnxglm3o62z
+        image_type = "image/jpeg"
+        image_id = image_data["image"]
+    elif isinstance(image_data["image"], dict):
+        image_type = image_data["image"]["mimeType"]
+        if (
+            "ref" not in image_data["image"]
+            or "$link" not in image_data["image"]["ref"]
+        ):
+            # As in this post: https://bsky.app/profile/testjuan06.bsky.social/post/3ljkzygywso2b
+            if "link" in image_data["image"]:
+                image_id = image_data["image"]["link"]
+            elif "cid" in image_data["image"]:
+                image_id = image_data["image"]["cid"]
+            else:
+                raise BlueskyPayloadError(
+                    source, "Unable to find image id in image data: %s" % image_data
+                )
+        else:
+            image_id = image_data["image"]["ref"]["$link"]
     else:
-        image_id = image_data["image"]["ref"]["$link"]
+        raise BlueskyPayloadError(source, "Unable to parse image data: %s" % image_data)
     return {
         "id": image_id,
-        "type": image_data["image"]["mimeType"],
+        "type": image_type,
         "alt": image_data["alt"],
     }
@@ -140,7 +159,15 @@ def process_card_data(embed_data, post):
     post["card_link"] = embed_data["uri"]
     post["card_title"] = embed_data.get("title", "")
     post["card_description"] = embed_data.get("description", "")
-    post["card_thumbnail"] = embed_data.get("thumb", "")
+    if isinstance(embed_data.get("thumb"), dict) and embed_data["thumb"].get(
+        "ref", {}
+    ).get("$link"):
+        media_cid = embed_data["thumb"]["ref"]["$link"]
+        post["card_thumbnail"] = (
+            f"https://cdn.bsky.app/img/feed_thumbnail/plain/{post['user_did']}/{media_cid}@jpeg"
+        )
+    else:
+        post["card_thumbnail"] = embed_data.get("thumb", "")
     return post
@@ -308,6 +335,10 @@ def normalize_post(
     post["timestamp_utc"], post["local_time"] = get_dates(
         data["record"]["createdAt"], locale=locale, source="bluesky"
     )
+    # Completing year with less than 4 digits as in some posts: https://bsky.app/profile/koro.icu/post/3kbpuogc6fz2o
+    # len 26 example: '2023-06-15T12:34:56.789000'
+    while len(post["local_time"]) < 26 and len(post["local_time"].split("-")[0]) < 4:
+        post["local_time"] = "0" + post["local_time"]
     post["indexed_at_utc"] = data["indexedAt"]
     # Handle post/user identifiers
@@ -316,7 +347,11 @@ def normalize_post(
     post["user_did"], post["did"] = parse_post_uri(data["uri"])
     post["user_handle"] = data["author"]["handle"]
     post["user_url"] = format_profile_url(post["user_handle"])
-    post["url"] = format_post_url(post["user_handle"], post["did"])
+    # example: https://bsky.app/profile/did:plc:n5pm4vggu475okayqvqipkoh/post/3lmdcgp3a7cnd
+    if post["user_handle"] == "handle.invalid":
+        post["url"] = format_post_url(post["user_did"], post["did"])
+    else:
+        post["url"] = format_post_url(post["user_handle"], post["did"])
     if post["user_did"] != data["author"]["did"]:
         raise BlueskyPayloadError(
@@ -350,19 +385,91 @@ def normalize_post(
     hashtags = set()
     links = set()
     links_to_replace = []
+    media_data = []
+    extra_links = []
+    post["media_urls"] = []
     for facet in data["record"].get("facets", []):
         if len(facet["features"]) != 1:
-            raise BlueskyPayloadError(
-                post["url"],
-                "unusual record facet content with more or less than a unique feature: %s"
-                % facet,
-            )
+            raising_error = False
+            for feat in facet["features"]:
+                # Already handled linkcards separately below
+                if feat["$type"].endswith("#linkcard"):
+                    continue
+                # If there are links, we register them and do not replace anything in original text
+                # as we don't have position for each link
+                # example: https://bsky.app/profile/77cupons.bsky.social/post/3latbufuvqw25
+                elif feat["$type"].endswith("#link") and "uri" in feat:
+                    link = safe_normalize_url(feat["uri"])
+                    if is_url(link):
+                        links.add(link)
+                        links_to_replace.append(
+                            {"uri": feat["uri"].encode("utf-8"), "start": -1, "end": -1}
+                        )
+                elif feat["$type"].lower().endswith("#tag"):
+                    hashtags.add(feat["tag"].strip().lower())
+                # As in this post: https://bsky.app/profile/havehashad.com/post/3ki3rk5ytqd2e
+                elif feat["$type"].endswith("#image") and "uri" in feat:
+                    post["media_urls"].append(safe_normalize_url(feat["uri"]))
+                else:
+                    raising_error = True
+            if raising_error:
+                raise BlueskyPayloadError(
+                    post["url"],
+                    "unusual record facet content with more or less than a unique feature: %s"
+                    % facet,
+                )
+            continue
         feat = facet["features"][0]
+        lower_feat_type = feat["$type"].lower()
         # Hashtags
-        if feat["$type"].endswith("#tag") or feat["$type"].endswith("#hashtag"):
-            hashtags.add(feat["tag"].strip().lower())
+        if (
+            lower_feat_type.endswith("#tag")
+            or lower_feat_type.endswith(".tag")
+            or lower_feat_type.endswith("#hashtag")
+            or lower_feat_type == "facettag"
+        ):
+            # Some posts have the full text in the "text" field of the hashtag feature
+            if "text" in feat:
+                for tag in feat["text"].split("#"):
+                    if tag.strip():
+                        hashtags.add(tag.strip().lower())
+            # some posts have "hashtag" instead of "tag" field
+            # example: https://bsky.app/profile/did:plc:jrodn6nnfuwzm2zxbxbpzgot/post/3lhwag3mzoo2k
+            else:
+                if "tag" in feat:
+                    tag = feat["tag"].strip().lower()
+                elif "hashtag" in feat:
+                    tag = feat["hashtag"].strip().lower()
+                # Somehow no tag found, we'll try to get it in the text slice
+                # example: https://bsky.app/profile/did:plc:p6yojdpa5iatdk3ttaty2zu2/post/3knvsl6h4x22i
+                elif len(feat) == 1:
+                    byteStart = facet["index"]["byteStart"]
+                    if text[byteStart : byteStart + 1] == b"#":
+                        byteEnd = facet["index"]["byteEnd"]
+                        try:
+                            tag = (
+                                text[byteStart:byteEnd]
+                                .decode("utf-8")
+                                .strip()
+                                .lstrip("#")
+                                .lower()
+                            )
+                        except UnicodeDecodeError:
+                            raise BlueskyPayloadError(
+                                post["url"],
+                                "unable to decode utf-8 slice for hashtag extraction: %s"
+                                % facet,
+                            )
+                    else:
+                        raise BlueskyPayloadError(
+                            post["url"],
+                            "unable to extract hashtag from text slice: %s" % facet,
+                        )
+                hashtags.add(tag)
         # Mentions
         elif feat["$type"].endswith("#mention"):
@@ -392,12 +499,23 @@ def normalize_post(
                     ]
                     .strip()
                     .lower()
-                    .decode("utf-8")
                 )
+                while byteEnd >= byteStart:
+                    try:
+                        handle.decode("utf-8")
+                        break
+                    except UnicodeDecodeError:
+                        handle = handle[:-1]
+                        continue
+                handle = handle.decode("utf-8")
                 post["mentioned_user_handles"].append(handle)
         # Links
-        elif feat["$type"].endswith("#link"):
+        elif (
+            feat["$type"].endswith("#link")
+            or feat["$type"].endswith(".link")
+            or feat["$type"].endswith(".url")
+        ):
             # Handle native polls
             if "https://poll.blue/" in feat["uri"]:
                 if feat["uri"].endswith("/0"):
@@ -420,57 +538,100 @@ def normalize_post(
             byteStart = facet["index"]["byteStart"]
             byteEnd = facet["index"]["byteEnd"]
-            if not text[byteStart:byteEnd].startswith(b"http"):
-                new_byteStart = text.find(b"http", byteStart, byteEnd)
+            # Skip overlapping links cases
+            # examples: https://bsky.app/profile/researchtrend.ai/post/3lbieylwwxs2b
+            #           https://bsky.app/profile/dj-cyberspace.otoskey.tarbin.net.ap.brid.gy/post/3lchg3plpdjp2
+            for elt in links_to_replace:
+                if (byteStart >= elt["start"] and byteStart <= elt["end"]) or (
+                    byteEnd >= elt["start"] and byteEnd <= elt["end"]
+                ):
+                    # Overlapping links, we skip this one
+                    byteStart = -1
+                    byteEnd = -1
+                    break
+            # Meaning we will try to fix the link position
+            if byteStart != -1 or byteEnd != -1:
+                # It appears that some links end before they start... Bluesky please: what's going on?
+                # example: https://bsky.app/profile/ondarockwebzine.bsky.social/post/3lqxxejza6o2t
+                # if int(byteEnd) < int(byteStart) or byteStart < 0:
+                if int(byteEnd) < int(byteStart):
+                    byteStart = -1
+                    byteEnd = -1
+                # There are mentionned links which are positionned after the end of the text,
+                # so we put them at the end of the original text
+                elif byteStart >= len(post["original_text"].encode("utf-8")):
+                    byteStart = -1
+                    byteEnd = -1
+                elif not text[byteStart:byteEnd].startswith(b"http"):
+                    new_byteStart = text.find(b"http", byteStart, byteEnd)
+                    # means that the link is shifted, like on this post:
+                    # https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
+                    if new_byteStart != -1:
+                        byteStart = new_byteStart
+                        # Find the index of the first space character after byteStart in case the link is a personalized one
+                        # but still with the link in it (somehow existing in some posts, such as this one:
+                        # https://bsky.app/profile/did:plc:rkphrshyfiqe4n2hz5vj56ig/post/3ltmljz5blca2)
+                        # In this case, we don't want to touch the position of the link given in the payload
+                        byteEnd = min(
+                            byteStart
+                            - facet["index"]["byteStart"]
+                            + facet["index"]["byteEnd"],
+                            len(post["original_text"].encode("utf-8")),
+                        )
+                        for i in range(byteStart, byteEnd):
+                            if chr(text[i]).isspace():
+                                byteStart = facet["index"]["byteStart"]
+                        byteEnd = (
+                            byteStart
+                            - facet["index"]["byteStart"]
+                            + facet["index"]["byteEnd"]
+                        )
-                # means that the link is shifted, like on this post:
-                # https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
-                if new_byteStart != -1:
-                    byteStart = new_byteStart
+                    # means that the link is a "personalized" one like on this post:
+                    # https://bsky.app/profile/newyork.activitypub.awakari.com.ap.brid.gy/post/3ln33tx7bpdu2
+                    else:
+                        # we're looking for a link which could be valid if we add "https://" at the beginning,
+                        # as in some cases the "http(s)://" part is missing in the post text
+                        for starting in range(byteEnd - byteStart):
+                            try:
+                                if is_url(
+                                    "https://"
+                                    + text[
+                                        byteStart + starting : byteEnd + starting
+                                    ].decode("utf-8")
+                                ):
+                                    byteStart += starting
+                                    break
+                            except UnicodeDecodeError:
+                                pass
+                        # If we did not find any valid link, we just keep the original position as it is
+                        # meaning that we have a personalized link like in the example above
+                        # Extend byteEnd to the right until we find a valid utf-8 ending,
+                        # as in some cases the link is longer than the position given in the payload
+                        # and it gets cut in the middle of a utf-8 char, leading to UnicodeDecodeError
+                        # example: https://bsky.app/profile/radiogaspesie.bsky.social/post/3lmkzhvhtta22
+                        while byteEnd <= len(post["original_text"].encode("utf-8")):
+                            try:
+                                text[byteStart:byteEnd].decode("utf-8")
+                                break
+                            except UnicodeDecodeError:
+                                byteEnd += 1
+                                continue
-                    # Find the index of the first space character after byteStart in case the link is a personalized one
-                    # but still with the link in it (somehow existing in some posts, such as this one:
-                    # https://bsky.app/profile/did:plc:rkphrshyfiqe4n2hz5vj56ig/post/3ltmljz5blca2)
-                    # In this case, we don't want to touch the position of the link given in the payload
-                    byteEnd = min(
-                        byteStart
-                        - facet["index"]["byteStart"]
-                        + facet["index"]["byteEnd"],
-                        len(post["original_text"].encode("utf-8")),
-                    )
-                    for i in range(byteStart, byteEnd):
-                        if chr(text[i]).isspace():
-                            byteStart = facet["index"]["byteStart"]
-                    byteEnd = (
-                        byteStart
-                        - facet["index"]["byteStart"]
-                        + facet["index"]["byteEnd"]
-                    )
+                        # Meaning that we did not find a valid utf-8 ending, so we reset byteEnd to its original value
+                        if byteEnd > len(post["original_text"].encode("utf-8")):
+                            byteEnd = facet["index"]["byteEnd"]
-                # means that the link is a "personalized" one like on this post:
-                # https://bsky.app/profile/newyork.activitypub.awakari.com.ap.brid.gy/post/3ln33tx7bpdu2
+                        byteEnd += byteStart - facet["index"]["byteStart"]
                 else:
-                    # we're looking for a link which could be valid if we add "https://" at the beginning,
-                    # as in some cases the "http(s)://" part is missing in the post text
-                    for starting in range(byteEnd - byteStart):
-                        try:
-                            if is_url(
-                                "https://"
-                                + text[
-                                    byteStart + starting : byteEnd + starting
-                                ].decode("utf-8")
-                            ):
-                                byteStart += starting
-                                break
-                        except UnicodeDecodeError:
-                            pass
-                    # If we did not find any valid link, we just keep the original position as it is
-                    # meaning that we have a personalized link like in the example above
-                    # Extend byteEnd to the right until we find a valid utf-8 ending,
-                    # as in some cases the link is longer than the position given in the payload
-                    # and it gets cut in the middle of a utf-8 char, leading to UnicodeDecodeError
-                    # example: https://bsky.app/profile/radiogaspesie.bsky.social/post/3lmkzhvhtta22
+                    # Handling case of errored byteEnd in the end of the text
+                    # example: https://bsky.app/profile/twif.bsky.social/post/3lm4izkvbfm2r
                     while byteEnd <= len(post["original_text"].encode("utf-8")):
                         try:
                             text[byteStart:byteEnd].decode("utf-8")
@@ -482,8 +643,6 @@ def normalize_post(
                     if byteEnd > len(post["original_text"].encode("utf-8")):
                         byteEnd = facet["index"]["byteEnd"]
-                    byteEnd += byteStart - facet["index"]["byteStart"]
             # In some cases, the link is completely wrong in the post text,
             # like in this post: https://bsky.app/profile/sudetsoleil.bsky.social/post/3ljf3h74wee2m
             # So we chose to not replace anything in the text in this case
@@ -500,10 +659,66 @@ def normalize_post(
                 pass
                 # raise UnicodeDecodeError(e.encoding, e.object, e.start, e.end, f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start:e.end]}")
-        elif feat["$type"].endswith("#bold"):
+        elif any(
+            feat["$type"].endswith(suffix)
+            for suffix in [
+                "#bold",
+                "#italic",
+                "#underline",
+                "#option",
+                "#encrypt",
+                "#text",
+            ]
+        ):
             pass
-        elif feat["$type"].endswith("#option"):
+        # Bluesky seems to use format features for some internal purposes, but we ignore them
+        # e.g.: https://bsky.app/profile/ferromar.bsky.social/post/3lzyfaixayd2g
+        elif feat["$type"].endswith("format"):
             pass
+        # Not normal feature type, but still existing in some posts
+        # Note that external features aren't visible on the Bluesky app, only external embeds are
+        # e.g.: https://bsky.app/profile/did:plc:4qvb4dpkg6tkbzym77j6jcm4/post/3lbjktt6tw52h
+        elif feat["$type"].endswith("external"):
+            link = feat["external"]["uri"]
+            # Handle native gifs as medias
+            if link.startswith("https://media.tenor.com/"):
+                media_data.append(
+                    prepare_native_gif_as_media(
+                        feat["external"], post["user_did"], post["url"]
+                    )
+                )
+            # Extra card links sometimes missing from facets & text due to manual action in post form
+            else:
+                extra_links.append(link)
+            if isinstance(feat["external"].get("thumb"), dict):
+                post = process_card_data(feat["external"], post)
+        # Some people share code snippets using third party apps
+        # e.g.: https://bsky.app/profile/alexdln.com/post/3mbwzgrymow2o
+        elif (
+            "#" in feat["$type"]
+            and feat["$type"].split("#")[1].startswith("code")
+            and "code" in feat
+        ):
+            language = (
+                feat["$type"].split("#")[1].split(".")[1]
+                if "." in feat["$type"].split("#")[1]
+                else "plain"
+            )
+            text += (
+                b"\n```"
+                + language.encode("utf-8")
+                + b"\n"
+                + feat["code"].encode("utf-8")
+                + b"\n```\n"
+            )
+        # We chose to ignore non Bluesky features for now (e.g. personalized features)
+        # example: https://bsky.app/profile/poll.blue/post/3kmuqjkkozh2r
+        elif "bsky" not in feat["$type"]:
+            continue
         else:
             raise BlueskyPayloadError(
                 post["url"], "unusual record facet feature $type: %s" % feat
@@ -543,21 +758,61 @@ def normalize_post(
     # Handle quotes & medias
     media_ids = set()
-    post["media_urls"] = []
     post["media_thumbnails"] = []
     post["media_types"] = []
     post["media_alt_texts"] = []
     if "embed" in data["record"]:
         embed = data["record"]["embed"]
         quoted_data = None
-        media_data = []
-        extra_links = []
         if not valid_embed_type(embed["$type"]):
+            if "bsky" in embed["$type"]:
+                raise BlueskyPayloadError(
+                    post["url"], "unusual record embed $type: %s" % embed
+                )
+            # Ignore non Bluesky embeds for now (e.g. personalized embeds)
+        # Empty embed (not usual, but seen in the Bluesky jungle, e.g.
+        # https://bsky.app/profile/did:plc:na6u3avvaz2x5wyzqrnviqiz/post/3lzf5qi2ra62k
+        # https://bsky.app/profile/dangelodario.it/post/3l3inqifqj42p
+        # or https://bsky.app/profile/soirilab.bsky.social/post/3lywaa7vhsu2c)
+        if embed["$type"].endswith(".post") or embed["$type"] == "N/A":
+            # Some posts have extra keys in their empty embed, certainly personalized ones.
+            # Personalized quote (not visible on Bluesky for the example)
+            # example: https://bsky.app/profile/jacksmithsocial.bsky.social/post/3lbca2nxy4f2a
+            if embed.get("$type") == "app.bsky.feed.post" and embed.get(
+                "record", {}
+            ).get("uri"):
+                post, quoted_data, links = prepare_quote_data(
+                    embed["record"], data.get("embed", {}).get("record"), post, links
+                )
+            # for the other ones we know up to now, we want to ignore them
+            # e.g.: https://bsky.app/profile/granmouse.bsky.social/post/3lwvh5xd2xk2p
+            #       https://bsky.app/profile/flyingaubrey.bsky.social/post/3lxngessntk2p
+            elif len(embed.keys()) > 1 and embed.get("type") not in ["private", "list"]:
+                raise BlueskyPayloadError(
+                    post["url"],
+                    "unusual empty record embed with extra keys: %s" % embed,
+                )
+            # Nothing to do for empty embed
+        if (
+            embed["$type"].endswith(".embed")
+            and len(embed.keys()) > 2
+            and len(embed.get("images")) > 0
+        ):
             raise BlueskyPayloadError(
-                post["url"], "unusual record embed $type: %s" % embed
+                post["url"], "unusual empty record embed with extra keys: %s" % embed
             )
+        # Links from links embed
+        # e.g.: https://bsky.app/profile/sacredatoz.bsky.social/post/3lrqvemv7qe2f
+        if embed["$type"].endswith(".links"):
+            for link in embed["links"]:
+                extra_links.append(link)
         # Links from cards
         if embed["$type"].endswith(".external"):
             link = embed["external"]["uri"]
@@ -577,13 +832,48 @@ def normalize_post(
                 if "embed" in data:
                     post = process_card_data(data["embed"]["external"], post)
+        # Not visible images
+        # examples: https://bsky.app/profile/lubosmichalik.bsky.social/post/3ltjvxsaej62c
+        #           https://bsky.app/profile/lubosmichalik.bsky.social/post/3ltjvz52x7s2m
+        if embed["$type"].endswith(".viewImages"):
+            if "images" in embed:
+                for i in embed["images"]:
+                    post["media_urls"].append(
+                        i.get("viewImage", {}).get("thumb", {}).get("uri", "")
+                    )
+            elif "viewImage" in embed:
+                for i in embed["viewImage"]:
+                    if "viewImage" in i:
+                        sub_image = "viewImage"
+                    elif "image" in i:
+                        sub_image = "image"
+                    else:
+                        raise BlueskyPayloadError(
+                            post["url"],
+                            "unusual viewImages embed content: %s" % embed,
+                        )
+                    post["media_urls"].append(
+                        i[sub_image].get("thumb", {}).get("uri", "")
+                    )
         # Images
-        if embed["$type"].endswith(".images"):
-            media_data.extend([prepare_image_as_media(i) for i in embed["images"]])
+        if embed["$type"].endswith(".images") or embed["$type"].endswith("image"):
+            media_data.extend(
+                [prepare_image_as_media(i, post["url"]) for i in embed["images"]]
+            )
         # Video
         if embed["$type"].endswith(".video"):
             media_data.append(prepare_video_as_media(embed["video"]))
+        elif embed["$type"].endswith(".videos"):
+            for elt in embed["videos"]:
+                media_data.append(prepare_video_as_media(elt["video"]))
+        elif embed["$type"].endswith(".media"):
+            if isinstance(embed["media"], dict):
+                media_data.append(prepare_video_as_media(embed["media"]["video"]))
+            elif isinstance(embed["media"], list):
+                for elt in embed["media"]:
+                    media_data.append(prepare_video_as_media(elt["media"]))
         # Quote & Starter-packs
         if embed["$type"].endswith(".record"):
@@ -631,13 +921,21 @@ def normalize_post(
             # Images
             elif embed["media"]["$type"].endswith(".images"):
                 media_data.extend(
-                    [prepare_image_as_media(i) for i in embed["media"]["images"]]
+                    [
+                        prepare_image_as_media(i, post["url"])
+                        for i in embed["media"]["images"]
+                    ]
                 )
             # Video
             elif embed["media"]["$type"].endswith(".video"):
                 media_data.append(prepare_video_as_media(embed["media"]["video"]))
+            # A personalized record with media embed type, but video unavailable
+            # e.g.: https://bsky.app/profile/meteolatorregassa.bsky.social/post/3lhoxazzptj2b
+            elif embed["media"]["$type"].endswith("#media"):
+                pass
             else:
                 raise BlueskyPayloadError(
                     post["url"],
@@ -751,8 +1049,13 @@ def normalize_post(
                     "allow_from_" + rule["$type"].split("#")[1].split("Rule")[0]
                 )
                 if rule_string.endswith("_list") and "list" in rule:
-                    for allowed_list in rule["list"]:
-                        post["replies_rules"].append(rule_string + ":" + allowed_list)
+                    if isinstance(rule["list"], str):
+                        post["replies_rules"].append(rule_string + ":" + rule["list"])
+                    else:
+                        for allowed_list in rule["list"]:
+                            post["replies_rules"].append(
+                                rule_string + ":" + allowed_list
+                            )
                 else:
                     post["replies_rules"].append(rule_string)
             if not data["threadgate"]["record"]["allow"]:

twitwi/bluesky/utils.py CHANGED Viewed

@@ -37,7 +37,9 @@ def validate_post_payload(data):
                 post["record"],
             )
-    if post["record"].get("$type") != "app.bsky.feed.post":
+    # Splitting by '#' to ignore possible suffixes in $type
+    # e.g. https://bsky.app/profile/did:plc:k6acu4chiwkixvdedcmdgmal/post/3lagdncjsu22y
+    if post["record"].get("$type").split("#")[0] != "app.bsky.feed.post":
         return False, "payload's record $type is not a post: %s" % post["record"].get(
             "$type"
         )
@@ -56,7 +58,7 @@ def validate_post_payload(data):
 re_embed_types = re.compile(
-    r"\.(record|recordWithMedia|images|video|external)(?:#.*)?$"
+    r"(?:\.(?:record|recordWithMedia|images|videos?|external|post|embed|links|media|file|viewImages)(?:#.*)?|N\/A|image)$"
 )
@@ -88,17 +90,25 @@ def parse_post_url(url, source):
 def parse_post_uri(uri, source=None):
     """Returns a tuple of (author_did, post_did) from an at:// post URI"""
-    known_splits = [
-        "/app.bsky.feed.post/",
-        "/app.bsky.graph.starterpack/",
-        "/app.bsky.feed.generator/",
-        "/app.bsky.graph.list/",
-    ]
+    # known_splits = [
+    #     "/app.bsky.feed.post/",
+    #     "/app.bsky.graph.starterpack/",
+    #     "/app.bsky.feed.generator/",
+    #     "/app.bsky.graph.list/",
+    #     "/app.bsky.graph.follow/", # This one is often found when a post is an anwser to a deleted post (e.g. https://bsky.app/profile/sydney-chat.bsky.social/post/3ltsph6kxfl25)
+    # ]
+    # if uri.startswith("at://"):
+    #     for split in known_splits:
+    #         if split in uri:
+    #             return uri[5:].split(split)
+    # There's too much variability in the post URIs, and we cannot be exhaustive,
+    # so we do with the simple approach:
     if uri.startswith("at://"):
-        for split in known_splits:
-            if split in uri:
-                return uri[5:].split(split)
+        # Using maxsplit=3 to avoid issues if future uris contain more slashes
+        author_did, _, post_did = uri[5:].split("/", 3)
+        return author_did, post_did
     raise BlueskyPayloadError(source or uri, f"{uri} is not a usual Bluesky post uri")
@@ -112,18 +122,24 @@ def format_media_url(user_did, media_cid, mime_type, source):
     if mime_type.startswith("image"):
         media_url = f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@{media_type}"
         media_thumb = f"https://cdn.bsky.app/img/feed_thumbnail/plain/{user_did}/{media_cid}@{media_type}"
-    elif mime_type.startswith("video"):
+    elif (
+        mime_type.startswith("video")
+        or mime_type == "application/xml"
+        or mime_type == "*/*"
+    ):
         media_url = f"https://video.bsky.app/watch/{user_did}/{media_cid}/playlist.m3u8"
         media_thumb = (
             f"https://video.bsky.app/watch/{user_did}/{media_cid}/thumbnail.jpg"
         )
-    elif mime_type in ["application/octet-stream", "text/plain"]:
+    elif any(mt in mime_type for mt in ["octet-stream", "text/plain", "text/html"]):
         media_url = (
             f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@jpeg"
         )
         media_thumb = (
             f"https://cdn.bsky.app/img/feed_thumbnail/plain/{user_did}/{media_cid}@jpeg"
         )
+    elif "empty" in mime_type:
+        media_url, media_thumb = "", ""
     else:
         raise BlueskyPayloadError(source, f"{mime_type} is an unusual media mimeType")
     return media_url, media_thumb

twitwi/exceptions.py CHANGED Viewed

@@ -21,4 +21,4 @@ class BlueskyPayloadError(TwitwiError):
     def __init__(self, source, message):
         self.source = source
         self.message = message
-        super().__init__(f"Error while processing Bluesky post {source}:\n{message}")
+        super().__init__(f"Error while processing Bluesky post {source}.\n{message}")

twitwi/formatters.py CHANGED Viewed

@@ -52,7 +52,9 @@ def make_transform_into_csv_dict(plural_fields, boolean_fields):
 def make_format_as_csv_row(fields, plural_fields, boolean_fields):
-    def format_field_for_csv(field, item, item_id=None, plural_separator="|"):
+    def format_field_for_csv(
+        field, item, item_id=None, plural_separator="|", allow_erroneous_plurals=False
+    ):
         if field == "id" and item_id is not None:
             return item_id
@@ -63,6 +65,11 @@ def make_format_as_csv_row(fields, plural_fields, boolean_fields):
             if field == "links":
                 v = item.get("proper_links", v)
+            # Clean None values that may have slipped in, such as in the 'domains' field when
+            # normalizing this Bluesky post: https://bsky.app/profile/did:plc:cs5qjcmnntogoahrrsagmg2z/post/3lvqhn7raq62v
+            if allow_erroneous_plurals:
+                v = [element if element is not None else "" for element in v]
             return plural_separator.join(v)
         if field in boolean_fields:
@@ -70,10 +77,16 @@ def make_format_as_csv_row(fields, plural_fields, boolean_fields):
         return item.get(field, "")
-    def format_item_as_csv_row(item, item_id=None, plural_separator="|"):
+    def format_item_as_csv_row(
+        item, item_id=None, plural_separator="|", allow_erroneous_plurals=False
+    ):
         return [
             format_field_for_csv(
-                field, item, item_id=item_id, plural_separator=plural_separator
+                field,
+                item,
+                item_id=item_id,
+                plural_separator=plural_separator,
+                allow_erroneous_plurals=allow_erroneous_plurals,
             )
             for field in fields
         ]

twitwi/utils.py CHANGED Viewed

@@ -61,7 +61,9 @@ def get_dates(
         locale = UTC_TIMEZONE
     # Let's pray we never see a negative year...
-    year_zero = date_str.startswith("0000")
+    year_zero = date_str.startswith("0000") or all(
+        c == "0" for c in date_str.split("-")[0]
+    )
     try:
         parsed_datetime = datetime.strptime(
@@ -84,26 +86,30 @@ def get_dates(
         utc_datetime = UTC_TIMEZONE.localize(parsed_datetime)
     locale_datetime = utc_datetime.astimezone(locale)
+    formatted_date_str = datetime.strftime(
+        locale_datetime,
+        FORMATTED_FULL_DATETIME_FORMAT
+        if source == "bluesky"
+        else FORMATTED_TWEET_DATETIME_FORMAT,
+    )
     timestamp = int(utc_datetime.timestamp())
     if year_zero:
         # Subtract one year (year 0001 is not a leap year) in seconds
         timestamp -= 31536000
+        # Doing like so using split because on ubuntu, datetime.strftime on year with less than 4 digits
+        # only returns 1 digit for year 0 (e.g. "0-05-12...") instead of 4 digits ("0000-05-12..."),
+        # whereas on macOS and Windows it returns 4 digits.
+        formatted_date_str = "0000-" + formatted_date_str.split("-", 1)[1]
     if millisecond_timestamp:
         timestamp *= 1000
         timestamp += utc_datetime.microsecond / 1000
-    formatted_date_str = datetime.strftime(
-        locale_datetime,
-        FORMATTED_FULL_DATETIME_FORMAT
-        if source == "bluesky"
-        else FORMATTED_TWEET_DATETIME_FORMAT,
-    )
     return (
         int(timestamp),
-        formatted_date_str if not year_zero else "0" + formatted_date_str[1:],
+        formatted_date_str,
     )

{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: twitwi
-Version: 0.23.0
+Version: 0.24.0
 Summary: A collection of Twitter-related helper functions for python.
 Home-page: http://github.com/medialab/twitwi
 Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
@@ -260,7 +260,7 @@ List of a Bluesky user profile's normalized field names. Useful to declare heade
 ### PARTIAL_PROFILE_FIELDS
-List of a Bluesky user partial profile's (retrieved from [`app.bsky.graph.getFollowers` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-graph-get-followers#responses) for example) normalized field names. Useful to declare headers with csv writers. Be careful not to confuse with [PROFILE_FIELDS](#profile_fields) which correspond to the full version of the profile data, retrieved from [`app.bsky.actor.getProfiles` HTTP endpoint](docs.bsky.app/docs/api/app-bsky-actor-get-profiles#responses) for example.
+List of a Bluesky user partial profile's (retrieved from [`app.bsky.graph.getFollowers` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-graph-get-followers#responses) for example) normalized field names. Useful to declare headers with csv writers. Be careful not to confuse with [PROFILE_FIELDS](#profile_fields) which correspond to the full version of the profile data, retrieved from [`app.bsky.actor.getProfiles` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-actor-get-profiles#responses) for example.
 ### POST_FIELDS
@@ -277,7 +277,7 @@ Will return datetimes as UTC but can take an optional second `locale` argument a
 * **data** *(dict)*: user profile data payload coming from Twitter API v1.1 or v2.
 * **locale** *(pytz.timezone as str, optional)*: timezone used to convert dates. If not given, will default to UTC.
 * **pure** *(bool, optional)*: whether to allow the function to mutate its original `data` argument. Defaults to `True`.
 ### normalize_tweet
 Function taking a nested dict describing a tweet from Twitter's JSON payload (API v1.1) and returning a flat "normalized" dict composed of all [TWEET_FIELDS](#tweet_fields) keys.

{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/RECORD RENAMED Viewed

@@ -1,22 +1,22 @@
 test/bluesky/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/bluesky/formatters_test.py,sha256=dMpRV_IuStZAnXhJGKnYsi0tb4BaSTSU4JVfihU1aQs,5002
+test/bluesky/formatters_test.py,sha256=kUXoLNEep-mGRwLN0y5DqB9pAorV0PkVKMm_uVIvAQQ,5100
 test/bluesky/normalizers_test.py,sha256=R4NziqErGW5MBdQEZ1vNxLGNRvJTyGnXfqo0v5gBCgw,5662
 twitwi/__init__.py,sha256=y0bAx9gE3THtlWE1YpXDIhGwqJ5_I8DCStWyyiiXJkw,1095
 twitwi/anonymizers.py,sha256=nkl6HL1BWLz00wJ060XSbqjN5JF8pvcpEPnRXt70TUY,1588
 twitwi/constants.py,sha256=fvqCngJIGyz5CpdVWbcAfjmE3_kvcx9giN0rEljL7OU,16001
-twitwi/exceptions.py,sha256=OCIDagu2ErDyOGWunRBCK3O62TnzFpIMQ9gS8l9EALQ,696
-twitwi/formatters.py,sha256=yn14AsrGAUw8rShOnYJvoMbzdWpfTeSs0P0ZPNTwhLU,3142
+twitwi/exceptions.py,sha256=xUikeIRmFcptQFlKGKXkbH9vbcQlQL3sviknhvSTcmw,696
+twitwi/formatters.py,sha256=pwI4UYPDFUzjRPE9B36k8tK-Va-k0HFLwvmc8aIc8P0,3681
 twitwi/normalizers.py,sha256=CWUK-XwhcEjLDjWH_qb6E03WZKsbIcwiRAVUjwXKQho,28438
-twitwi/utils.py,sha256=ruyqTx9JELRiE4-Svhaeo02KrsdHrrHJNqbGRWMmuAs,4421
+twitwi/utils.py,sha256=PPmbeMlKbHMTg07PgI4A0HRZw2QGuvCOGcP_FtqMyHQ,4774
 twitwi/bluesky/__init__.py,sha256=SqeHZUzL2U9UpL3EB33vaowQWaKXSPkvsAkasRqmFpY,694
 twitwi/bluesky/constants.py,sha256=CPkTIrDwyRWpkFTbaee1oFm_LWGj2WIC7A6xEGqDGB4,573
 twitwi/bluesky/formatters.py,sha256=L_yROAPcBECifCGiFAGYFJwLq6re8UlJNoZ7R2DXm5g,1025
-twitwi/bluesky/normalizers.py,sha256=AsOX3d4FsMn-GPvo-0oA7cZQwqAxQNbLq1ajbnXe7bk,33976
+twitwi/bluesky/normalizers.py,sha256=m4oNWJt8eZK2iVREPIKC42yw3YNpZo3pf4OQGZz_1i8,48611
 twitwi/bluesky/types.py,sha256=INe6R8eOqrOooWn25dtk61-Wqd_pUDwb737R7jY_vkc,13915
-twitwi/bluesky/utils.py,sha256=mFL1h_Mqay66UGEUlzweO_0TzbqS51oNE2TKoT2xf-4,3969
-twitwi-0.23.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
-twitwi-0.23.0.dist-info/METADATA,sha256=05Mq7RsXYLpVK4aTX3zAUMcPYdpd8UBPOc81Z9_FYQw,21365
-twitwi-0.23.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-twitwi-0.23.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
-twitwi-0.23.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-twitwi-0.23.0.dist-info/RECORD,,
+twitwi/bluesky/utils.py,sha256=zIofl7UHmIr0JgjoXRK3ekovkri3CVOvQvo8PmFrWGg,4895
+twitwi-0.24.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
+twitwi-0.24.0.dist-info/METADATA,sha256=4cGwKAsqA9kXkG713fx0lLfoCb2znbLiTsqm-n_wI4g,21365
+twitwi-0.24.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+twitwi-0.24.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
+twitwi-0.24.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+twitwi-0.24.0.dist-info/RECORD,,

{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/licenses/LICENSE.txt RENAMED Viewed

File without changes

{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

{twitwi-0.23.0.dist-info → twitwi-0.24.0.dist-info}/zip-safe RENAMED Viewed

File without changes

twitwi 0.23.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

twitwi 0.23.0__py3-none-any.whl → 0.24.0__py3-none-any.whl