PyPI - twitwi - Versions diffs - 0.22.0__py3-none-any.whl → 0.23.0__py3-none-any.whl - Mend

twitwi 0.22.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

twitwi/bluesky/normalizers.py +159 -62
twitwi/bluesky/types.py +2 -0
twitwi/bluesky/utils.py +2 -2
twitwi/utils.py +22 -7
{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/METADATA +1 -1
{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/RECORD +10 -10
{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/WHEEL +0 -0
{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/licenses/LICENSE.txt +0 -0
{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/top_level.txt +0 -0
{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/zip-safe +0 -0

twitwi/bluesky/normalizers.py CHANGED Viewed

@@ -122,13 +122,15 @@ def process_starterpack_card(embed_data, post):
     # Warning: mutates post
     card = embed_data.get("record", {})
-    creator_did, pack_did = parse_post_uri(embed_data["uri"])
-    post["card_link"] = format_starterpack_url(
-        embed_data.get("creator", {}).get("handle") or creator_did, pack_did
-    )
-    post["card_title"] = card.get("name", "")
-    post["card_description"] = card.get("description", "")
-    post["card_thumbnail"] = card.get("thumb", "")
+    if "uri" in embed_data:
+        creator_did, pack_did = parse_post_uri(embed_data["uri"])
+        post["card_link"] = format_starterpack_url(
+            embed_data.get("creator", {}).get("handle") or creator_did, pack_did
+        )
+    if card:
+        post["card_title"] = card.get("name", "")
+        post["card_description"] = card.get("description", "")
+        post["card_thumbnail"] = card.get("thumb", "")
     return post
@@ -145,49 +147,55 @@ def process_card_data(embed_data, post):
 def prepare_quote_data(embed_quote, card_data, post, links):
     # Warning: mutates post and links
+    quoted_data = None
     post["quoted_cid"] = embed_quote["cid"]
     post["quoted_uri"] = embed_quote["uri"]
-    post["quoted_user_did"], post["quoted_did"] = parse_post_uri(
-        post["quoted_uri"], post["url"]
-    )
-    # First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
-    # Handling special posts types (only lists for now, for example: https://bsky.app/profile/lanana421.bsky.social/lists/3lxdgjtpqhf2z)
-    if "/app.bsky.graph.list/" in post["quoted_uri"]:
-        post_splitter = "/lists/"
+    # Sometimes quoted post is not found, even if uri and cid are given
+    # example: https://bsky.app/profile/takobiotech.masto.bike.ap.brid.gy/post/3lc6r7nzil6m2
+    if card_data and card_data.get("notFound"):
+        post["quoted_status"] = "notFound"
     else:
-        post_splitter = "/post/"
-    post["quoted_url"] = format_post_url(
-        post["quoted_user_did"], post["quoted_did"], post_splitter=post_splitter
-    )
-    quoted_data = None
-    if card_data:
-        if card_data.get("detached", False):
-            post["quoted_status"] = "detached"
+        post["quoted_user_did"], post["quoted_did"] = parse_post_uri(
+            post["quoted_uri"], post["url"]
+        )
+        # First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
+        # Handling special posts types (only lists for now, for example: https://bsky.app/profile/lanana421.bsky.social/lists/3lxdgjtpqhf2z)
+        if "/app.bsky.graph.list/" in post["quoted_uri"]:
+            post_splitter = "/lists/"
         else:
-            quoted_data = deepcopy(card_data)
+            post_splitter = "/post/"
+        post["quoted_url"] = format_post_url(
+            post["quoted_user_did"], post["quoted_did"], post_splitter=post_splitter
+        )
-    # Grab user handle and cleanup links when no quote data but url in text
-    if not quoted_data:
-        for link in links:
-            if link.startswith("https://bsky.app/profile/") and link.endswith(
-                post["quoted_did"]
-            ):
-                # Take better quoted url with user_handle
-                post["quoted_url"] = link
-                break
-        # Remove quoted link from post links
-        if post["quoted_url"] in links:
-            links.remove(post["quoted_url"])
-        # Extract user handle from url
-        if "did:plc:" not in post["quoted_url"]:
-            post["quoted_user_handle"], _ = parse_post_url(
-                post["quoted_url"], post["url"]
-            )
+        if card_data:
+            if card_data.get("detached"):
+                post["quoted_status"] = "detached"
+            else:
+                quoted_data = deepcopy(card_data)
+        # Grab user handle and cleanup links when no quote data but url in text
+        if not quoted_data:
+            for link in links:
+                if link.startswith("https://bsky.app/profile/") and link.endswith(
+                    post["quoted_did"]
+                ):
+                    # Take better quoted url with user_handle
+                    post["quoted_url"] = link
+                    break
+            # Remove quoted link from post links
+            if post["quoted_url"] in links:
+                links.remove(post["quoted_url"])
+            # Extract user handle from url
+            if "did:plc:" not in post["quoted_url"]:
+                post["quoted_user_handle"], _ = parse_post_url(
+                    post["quoted_url"], post["url"]
+                )
     return (post, quoted_data, links)
@@ -300,6 +308,7 @@ def normalize_post(
     post["timestamp_utc"], post["local_time"] = get_dates(
         data["record"]["createdAt"], locale=locale, source="bluesky"
     )
+    post["indexed_at_utc"] = data["indexedAt"]
     # Handle post/user identifiers
     post["cid"] = data["cid"]
@@ -332,6 +341,8 @@ def normalize_post(
     post["reply_count"] = data["replyCount"]
     post["like_count"] = data["likeCount"]
     post["quote_count"] = data["quoteCount"]
+    # When a post cites another, the cited post doesn't have the bookmarkCount field
+    post["bookmark_count"] = data.get("bookmarkCount")
     # Handle hashtags, mentions & links from facets
     post["mentioned_user_handles"] = []
@@ -361,12 +372,21 @@ def normalize_post(
                 # Check & fix occasional errored mention positioning
                 # example: https://bsky.app/profile/snjcgt.bsky.social/post/3lpmqkkkgp52u
                 byteStart = facet["index"]["byteStart"]
+                byteEnd = facet["index"]["byteEnd"]
                 if text[byteStart : byteStart + 1] != b"@":
                     byteStart = text.find(b"@", byteStart)
+                # in some cases, the errored positioning is before the position given
+                # example: https://bsky.app/profile/springer.springernature.com/post/3lovyad4nt324
+                if byteStart == -1 or byteStart > byteEnd:
+                    # When decrementing byteStart, we also decrement byteEnd (see below)
+                    # shifting the slice to extract the mention correctly
+                    byteStart = facet["index"]["byteStart"] - 1
+                    # to extend the size of the mention, which is somehow 1 char too short because of the '@'
+                    byteEnd += 1
                 handle = (
                     text[
-                        byteStart + 1 : facet["index"]["byteEnd"]
+                        byteStart + 1 : byteEnd
                         + byteStart
                         - facet["index"]["byteStart"]
                     ]
@@ -398,21 +418,87 @@ def normalize_post(
             # examples: https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
             #           https://bsky.app/profile/clustz.com/post/3lqfi7mnto52w
             byteStart = facet["index"]["byteStart"]
+            byteEnd = facet["index"]["byteEnd"]
-            if not text[byteStart : facet["index"]["byteEnd"]].startswith(b"http"):
-                new_byteStart = text.find(b"http", byteStart, facet["index"]["byteEnd"])
+            if not text[byteStart:byteEnd].startswith(b"http"):
+                new_byteStart = text.find(b"http", byteStart, byteEnd)
+                # means that the link is shifted, like on this post:
+                # https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
                 if new_byteStart != -1:
                     byteStart = new_byteStart
-            links_to_replace.append(
-                {
-                    "uri": feat["uri"].encode("utf-8"),
-                    "start": byteStart,
-                    "end": byteStart
-                    - facet["index"]["byteStart"]
-                    + facet["index"]["byteEnd"],
-                }
-            )
+                    # Find the index of the first space character after byteStart in case the link is a personalized one
+                    # but still with the link in it (somehow existing in some posts, such as this one:
+                    # https://bsky.app/profile/did:plc:rkphrshyfiqe4n2hz5vj56ig/post/3ltmljz5blca2)
+                    # In this case, we don't want to touch the position of the link given in the payload
+                    byteEnd = min(
+                        byteStart
+                        - facet["index"]["byteStart"]
+                        + facet["index"]["byteEnd"],
+                        len(post["original_text"].encode("utf-8")),
+                    )
+                    for i in range(byteStart, byteEnd):
+                        if chr(text[i]).isspace():
+                            byteStart = facet["index"]["byteStart"]
+                    byteEnd = (
+                        byteStart
+                        - facet["index"]["byteStart"]
+                        + facet["index"]["byteEnd"]
+                    )
+                # means that the link is a "personalized" one like on this post:
+                # https://bsky.app/profile/newyork.activitypub.awakari.com.ap.brid.gy/post/3ln33tx7bpdu2
+                else:
+                    # we're looking for a link which could be valid if we add "https://" at the beginning,
+                    # as in some cases the "http(s)://" part is missing in the post text
+                    for starting in range(byteEnd - byteStart):
+                        try:
+                            if is_url(
+                                "https://"
+                                + text[
+                                    byteStart + starting : byteEnd + starting
+                                ].decode("utf-8")
+                            ):
+                                byteStart += starting
+                                break
+                        except UnicodeDecodeError:
+                            pass
+                    # If we did not find any valid link, we just keep the original position as it is
+                    # meaning that we have a personalized link like in the example above
+                    # Extend byteEnd to the right until we find a valid utf-8 ending,
+                    # as in some cases the link is longer than the position given in the payload
+                    # and it gets cut in the middle of a utf-8 char, leading to UnicodeDecodeError
+                    # example: https://bsky.app/profile/radiogaspesie.bsky.social/post/3lmkzhvhtta22
+                    while byteEnd <= len(post["original_text"].encode("utf-8")):
+                        try:
+                            text[byteStart:byteEnd].decode("utf-8")
+                            break
+                        except UnicodeDecodeError:
+                            byteEnd += 1
+                            continue
+                    if byteEnd > len(post["original_text"].encode("utf-8")):
+                        byteEnd = facet["index"]["byteEnd"]
+                    byteEnd += byteStart - facet["index"]["byteStart"]
+            # In some cases, the link is completely wrong in the post text,
+            # like in this post: https://bsky.app/profile/sudetsoleil.bsky.social/post/3ljf3h74wee2m
+            # So we chose to not replace anything in the text in this case
+            try:
+                text[byteStart:byteEnd].decode("utf-8")
+                links_to_replace.append(
+                    {
+                        "uri": feat["uri"].encode("utf-8"),
+                        "start": byteStart,
+                        "end": byteEnd,
+                    }
+                )
+            except UnicodeDecodeError:
+                pass
+                # raise UnicodeDecodeError(e.encoding, e.object, e.start, e.end, f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start:e.end]}")
         elif feat["$type"].endswith("#bold"):
             pass
@@ -503,9 +589,9 @@ def normalize_post(
         if embed["$type"].endswith(".record"):
             if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
                 post = process_starterpack_card(
-                    data.get("embed", {}).get("record"), post
+                    data.get("embed", {}).get("record", {}), post
                 )
-                if post["card_link"]:
+                if post.get("card_link"):
                     extra_links.append(post["card_link"])
             else:
                 post, quoted_data, links = prepare_quote_data(
@@ -594,11 +680,13 @@ def normalize_post(
         # Process quotes
         if quoted_data and "value" in quoted_data:
-            if quoted_data["cid"] != post["quoted_cid"]:
+            # We're checking on the uri as the cid can be different in some cases,
+            # and the uri seems to be unique for each post
+            if quoted_data["uri"] != post["quoted_uri"]:
                 raise BlueskyPayloadError(
                     post["url"],
-                    "inconsistent quote cid found between record.embed.record.cid & embed.record.cid: %s %s"
-                    % (post["quoted_cid"], quoted_data),
+                    "inconsistent quote uri found between record.embed.record.uri & embed.record.uri: %s %s"
+                    % (post["quoted_uri"], quoted_data),
                 )
             quoted_data["record"] = quoted_data["value"]
@@ -706,7 +794,16 @@ def normalize_post(
             repost_data["indexedAt"], locale=locale, source="bluesky"
         )
-    post["text"] = text.decode("utf-8")
+    try:
+        post["text"] = text.decode("utf-8")
+    except UnicodeDecodeError as e:
+        raise UnicodeDecodeError(
+            e.encoding,
+            e.object,
+            e.start,
+            e.end,
+            f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start : e.end]}",
+        )
     if collection_source is not None:
         post["collected_via"] = [collection_source]

twitwi/bluesky/types.py CHANGED Viewed

@@ -50,6 +50,7 @@ class BlueskyPost(TypedDict):
     # Datetime fields
     timestamp_utc: int                  # Unix UTC timestamp of when the post was submitted
     local_time: str                     # datetime (potentially timezoned) of when the post was submitted
+    indexed_at_utc: str                 # datetime (NOT timezoned, for reuse of the Bluesky API) of when the post was indexed by the Bluesky service
     # Author identifying fields
     user_did: str                       # persistent long-term identifier of the account who authored the post
@@ -64,6 +65,7 @@ class BlueskyPost(TypedDict):
     like_count: int                     # total number of likes received by the post (at collection time)
     reply_count: int                    # total number of replies received by the post (at collection time)
     quote_count: int                    # total number of posts the post was quoted into (at collection time)
+    bookmark_count: Optional[int]       # total number of bookmarks received by the post (at collection time)
     # Extra field
     bridgy_original_url: Optional[str]  # source of the original post when it was posted from another platform such as Mastodon via the Bridgy connection tool

twitwi/bluesky/utils.py CHANGED Viewed

@@ -75,7 +75,7 @@ def format_post_url(user_handle_or_did, post_did, post_splitter="/post/"):
 def parse_post_url(url, source):
     """Returns a tuple of (author_handle/did, post_did) from an https://bsky.app post URL"""
-    known_splits = ["/post/", "/lists/"]
+    known_splits = ["/post/", "/lists/", "/feed/"]
     if url.startswith("https://bsky.app/profile/"):
         for split in known_splits:
@@ -117,7 +117,7 @@ def format_media_url(user_did, media_cid, mime_type, source):
         media_thumb = (
             f"https://video.bsky.app/watch/{user_did}/{media_cid}/thumbnail.jpg"
         )
-    elif mime_type == "application/octet-stream":
+    elif mime_type in ["application/octet-stream", "text/plain"]:
         media_url = (
             f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@jpeg"
         )

twitwi/utils.py CHANGED Viewed

@@ -60,6 +60,9 @@ def get_dates(
     if locale is None:
         locale = UTC_TIMEZONE
+    # Let's pray we never see a negative year...
+    year_zero = date_str.startswith("0000")
     try:
         parsed_datetime = datetime.strptime(
             date_str,
@@ -68,7 +71,13 @@ def get_dates(
     except ValueError as e:
         if source != "bluesky":
             raise e
-        parsed_datetime = parse_date(date_str)
+        # Yes, it seems that some people were active in year 0...
+        # see by yourself: https://bsky.app/profile/koro.icu/post/3kbpuogc6fz2o
+        if year_zero:
+            date_str_fixed = "0001" + date_str[4:]
+            parsed_datetime = parse_date(date_str_fixed)
+        else:
+            parsed_datetime = parse_date(date_str)
     utc_datetime = parsed_datetime
     if not parsed_datetime.tzinfo:
@@ -77,18 +86,24 @@ def get_dates(
     timestamp = int(utc_datetime.timestamp())
+    if year_zero:
+        # Subtract one year (year 0001 is not a leap year) in seconds
+        timestamp -= 31536000
     if millisecond_timestamp:
         timestamp *= 1000
         timestamp += utc_datetime.microsecond / 1000
+    formatted_date_str = datetime.strftime(
+        locale_datetime,
+        FORMATTED_FULL_DATETIME_FORMAT
+        if source == "bluesky"
+        else FORMATTED_TWEET_DATETIME_FORMAT,
+    )
     return (
         int(timestamp),
-        datetime.strftime(
-            locale_datetime,
-            FORMATTED_FULL_DATETIME_FORMAT
-            if source == "bluesky"
-            else FORMATTED_TWEET_DATETIME_FORMAT,
-        ),
+        formatted_date_str if not year_zero else "0" + formatted_date_str[1:],
     )

{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: twitwi
-Version: 0.22.0
+Version: 0.23.0
 Summary: A collection of Twitter-related helper functions for python.
 Home-page: http://github.com/medialab/twitwi
 Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou

{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/RECORD RENAMED Viewed

@@ -7,16 +7,16 @@ twitwi/constants.py,sha256=fvqCngJIGyz5CpdVWbcAfjmE3_kvcx9giN0rEljL7OU,16001
 twitwi/exceptions.py,sha256=OCIDagu2ErDyOGWunRBCK3O62TnzFpIMQ9gS8l9EALQ,696
 twitwi/formatters.py,sha256=yn14AsrGAUw8rShOnYJvoMbzdWpfTeSs0P0ZPNTwhLU,3142
 twitwi/normalizers.py,sha256=CWUK-XwhcEjLDjWH_qb6E03WZKsbIcwiRAVUjwXKQho,28438
-twitwi/utils.py,sha256=f02cMx19Sr_GvJQf_0jTIERGLq1oC3znnPQxE__rlFc,3838
+twitwi/utils.py,sha256=ruyqTx9JELRiE4-Svhaeo02KrsdHrrHJNqbGRWMmuAs,4421
 twitwi/bluesky/__init__.py,sha256=SqeHZUzL2U9UpL3EB33vaowQWaKXSPkvsAkasRqmFpY,694
 twitwi/bluesky/constants.py,sha256=CPkTIrDwyRWpkFTbaee1oFm_LWGj2WIC7A6xEGqDGB4,573
 twitwi/bluesky/formatters.py,sha256=L_yROAPcBECifCGiFAGYFJwLq6re8UlJNoZ7R2DXm5g,1025
-twitwi/bluesky/normalizers.py,sha256=1tt4q9dKhCLuNhB-Qn8YGSHILvgu-JNIRnfumwkEAe4,28422
-twitwi/bluesky/types.py,sha256=WUxfyA5fc68qURGh7bxiDlIBFgdbyysRRdvHLoXwWlA,13656
-twitwi/bluesky/utils.py,sha256=9il8t_qkKCmGQ-MDkF5qahxKV1Qsmwzul_1VzzD-jH4,3943
-twitwi-0.22.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
-twitwi-0.22.0.dist-info/METADATA,sha256=QggjTIdvTg15eWBDNuug7rcZINc2Uh6choMNQeeFoNM,21365
-twitwi-0.22.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-twitwi-0.22.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
-twitwi-0.22.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-twitwi-0.22.0.dist-info/RECORD,,
+twitwi/bluesky/normalizers.py,sha256=AsOX3d4FsMn-GPvo-0oA7cZQwqAxQNbLq1ajbnXe7bk,33976
+twitwi/bluesky/types.py,sha256=INe6R8eOqrOooWn25dtk61-Wqd_pUDwb737R7jY_vkc,13915
+twitwi/bluesky/utils.py,sha256=mFL1h_Mqay66UGEUlzweO_0TzbqS51oNE2TKoT2xf-4,3969
+twitwi-0.23.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
+twitwi-0.23.0.dist-info/METADATA,sha256=05Mq7RsXYLpVK4aTX3zAUMcPYdpd8UBPOc81Z9_FYQw,21365
+twitwi-0.23.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+twitwi-0.23.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
+twitwi-0.23.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+twitwi-0.23.0.dist-info/RECORD,,

{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/licenses/LICENSE.txt RENAMED Viewed

File without changes

{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

{twitwi-0.22.0.dist-info → twitwi-0.23.0.dist-info}/zip-safe RENAMED Viewed

File without changes

twitwi 0.22.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

twitwi 0.22.0__py3-none-any.whl → 0.23.0__py3-none-any.whl