twitwi 0.20.0__py3-none-any.whl → 0.21.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/bluesky/__init__.py +0 -0
- test/bluesky/formatters_test.py +101 -0
- test/bluesky/normalizers_test.py +130 -0
- twitwi/__init__.py +19 -2
- twitwi/anonymizers.py +3 -9
- twitwi/bluesky/__init__.py +16 -0
- twitwi/bluesky/constants.py +19 -0
- twitwi/bluesky/formatters.py +29 -0
- twitwi/bluesky/normalizers.py +686 -0
- twitwi/bluesky/types.py +135 -0
- twitwi/bluesky/utils.py +110 -0
- twitwi/constants.py +323 -349
- twitwi/exceptions.py +8 -1
- twitwi/formatters.py +35 -37
- twitwi/normalizers.py +403 -339
- twitwi/utils.py +46 -18
- twitwi-0.21.1.dist-info/METADATA +436 -0
- twitwi-0.21.1.dist-info/RECORD +22 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/WHEEL +1 -1
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/top_level.txt +1 -0
- twitwi-0.20.0.dist-info/METADATA +0 -156
- twitwi-0.20.0.dist-info/RECORD +0 -13
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/licenses/LICENSE.txt +0 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.1.dist-info}/zip-safe +0 -0
twitwi/bluesky/types.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
from typing import TypedDict, List, Optional
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# To know more about Bluesky IDs:
|
|
5
|
+
# - https://docs.bsky.app/docs/advanced-guides/resolving-identities
|
|
6
|
+
# - https://bluesky.idunno.dev/docs/commonTerms.html
|
|
7
|
+
|
|
8
|
+
class BlueskyProfile(TypedDict):
    """Normalized description of a Bluesky account.

    All counters are measured at collection time.
    """

    # Identity
    did: str  # permanent, long-lived identifier of the account
    url: str  # web URL at which the profile can be viewed
    handle: str  # changeable human-readable username (typically username.bsky.social or username.com)
    display_name: Optional[str]  # changeable human-readable name of the account
    description: str  # short self-description written by the account owner

    # Counters (at collection time)
    posts: int  # number of posts published by the account
    followers: int  # number of accounts following this account
    follows: int  # number of accounts this account follows
    lists: int  # number of lists created by the account
    feedgens: int  # number of custom feeds created by the account
    starter_packs: int  # number of starter packs created by the account

    # Visuals and pinned content
    avatar: Optional[str]  # URL of the account's avatar image
    banner: str  # URL of the account's profile banner image
    pinned_post_uri: Optional[str]  # ATProto internal URI of the post the account may have pinned at the top of its profile

    # Dates
    created_at: str  # account creation datetime (possibly timezoned)
    timestamp_utc: int  # account creation time as a Unix UTC timestamp
    collection_time: Optional[str]  # datetime (possibly timezoned) at which the data was normalized
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class BlueskyPost(TypedDict):
    """Normalized description of a Bluesky post.

    Fields are grouped by theme. Commented-out fields document data that is
    not available from posts payloads and are kept as a reminder.
    """

    # Identifying fields
    cid: str  # internal content identifier of the post
    did: str  # persistent long-term identifier of the post
    uri: str  # ATProto's internal URI to the post
    url: str  # URL of the post accessible on the web

    # Datetime fields
    timestamp_utc: int  # Unix UTC timestamp of when the post was submitted
    local_time: str  # datetime (potentially timezoned) of when the post was submitted

    # Author identifying fields
    user_did: str  # persistent long-term identifier of the account who authored the post
    user_handle: str  # updatable human-readable username of the account who authored the post

    # Content fields
    text: str  # reprocessed complete text of the post, including full links, full text of the quoted post (recursively up to 3 posts) and links to images or videos included within
    original_text: str  # original text of the post as returned by the Bluesky API

    # Metrics fields
    repost_count: int  # total number of reposts of the post (at collection time)
    like_count: int  # total number of likes received by the post (at collection time)
    reply_count: int  # total number of replies received by the post (at collection time)
    quote_count: int  # total number of posts the post was quoted into (at collection time)

    # Extra field
    bridgy_original_url: Optional[str]  # source of the original post when it was posted from another platform such as Mastodon via the Bridgy connection tool

    # Author metadata fields
    user_url: str  # URL of the profile accessible on the web of the account who authored the post
    # NOTE(review): the key below is misspelled ("diplay"); it is kept unchanged
    # because renaming it would change the schema — presumably other modules
    # rely on this exact key, verify before renaming.
    user_diplay_name: str  # updatable human-readable name of the account who authored the post
    # user_description: str         # not available from posts payloads
    # user_posts: int               # not available from posts payloads
    # user_followers: int           # not available from posts payloads
    # user_follows: int             # not available from posts payloads
    # user_lists: int               # not available from posts payloads
    user_langs: List[str]  # languages in which the author of the posts usually writes posts (declarative)
    user_avatar: Optional[str]  # URL to the image serving as avatar to the user who authored the post
    user_created_at: str  # datetime (potentially timezoned) of when the user who authored the post created the account
    user_timestamp_utc: int  # Unix UTC timestamp of when the user who authored the post created the account

    # Parent post identifying fields
    # (if the post comes in a conversation as an answer to another post)
    to_post_cid: Optional[str]  # internal content identifier of the parent post
    to_post_did: Optional[str]  # persistent long-term identifier of the parent post
    to_post_uri: Optional[str]  # ATProto's internal URI of the parent post
    to_post_url: Optional[str]  # URL of the parent post on the web
    to_user_did: Optional[str]  # persistent long-term identifier of the account who authored the parent post
    # to_user_handle: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3722

    # Root conversation post identifying fields
    # (if the post comes in a conversation, this post is the first one that initiated the thread)
    to_root_post_cid: Optional[str]  # internal content identifier of the root conversation post
    to_root_post_did: Optional[str]  # persistent long-term identifier of the root conversation post
    to_root_post_uri: Optional[str]  # ATProto's internal URI of the root conversation post
    to_root_post_url: Optional[str]  # URL of the root conversation post on the web
    to_root_user_did: Optional[str]  # persistent long-term identifier of the account who authored the root conversation post
    # to_root_user_handle: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3722

    # Repost metadata fields
    # Contrary to Twitter where a retweet is a new tweet with its own ID, reposting on Bluesky only adds a flag to the post saying it was reposted by a specific user at a specific time. These are available for instance when collecting a user's feed.
    repost_by_user_did: Optional[str]  # persistent long-term identifier of the account who reposted the post
    repost_by_user_handle: Optional[str]  # updatable human-readable username of the account who reposted the post
    repost_created_at: Optional[str]  # datetime (potentially timezoned) of when the repost was done
    repost_timestamp_utc: Optional[int]  # Unix UTC timestamp of when the repost was done

    # Quoted post metadata fields
    # (when a post embeds another one)
    quoted_cid: Optional[str]  # internal content identifier of the quoted post
    quoted_did: Optional[str]  # persistent long-term identifier of the quoted post
    quoted_uri: Optional[str]  # ATProto's internal URI to the quoted post
    quoted_url: Optional[str]  # URL of the quoted post accessible on the web
    quoted_user_did: Optional[str]  # persistent long-term identifier of the account who authored the quoted post
    quoted_user_handle: Optional[str]  # updatable human-readable username of the account who authored the quoted post
    quoted_created_at: Optional[str]  # datetime (potentially timezoned) of when the quoted post was submitted
    quoted_timestamp_utc: Optional[int]  # Unix UTC timestamp of when the quoted post was submitted
    quoted_status: Optional[str]  # empty or "detached" when the author of the quoted post intentionally required the quoting post not to be accessible from their own

    # Embedded elements metadata fields
    links: List[str]  # list of URLs of all links shared within the post (including potentially the embedded card detailed below, but not the link to a potential quoted post)
    domains: List[str]  # list of domains of the links shared within the post (here a domain refers to a full hostname, including subdomains, for instance bluesky.com or medialab.sciencespo.fr)
    card_link: Optional[str]  # URL of the link displayed as a card within the post if any
    card_title: Optional[str]  # title of the webpage corresponding to the link displayed as a card within the post if any
    card_description: Optional[str]  # description of the webpage corresponding to the link displayed as a card within the post if any
    card_thumbnail: Optional[str]  # image displayed as an illustration of the webpage corresponding to the link displayed as a card within the post if any
    media_urls: List[str]  # list of URLs to all medias (images, videos, gifs) embedded in the post
    media_thumbnails: List[str]  # list of URLs to small thumbnail version of all medias (images, videos, gifs) embedded in the post
    media_types: List[str]  # MIME types (such as image/jpeg, image/gif, video/mp4, etc.) of all medias (images, videos, gifs) embedded in the post
    media_alt_texts: List[str]  # description texts of all medias (images, videos, gifs) embedded in the post
    mentioned_user_dids: List[str]  # list of all persistent long-term identifiers of the accounts addressed within the post (does not include users to which the post replied)
    mentioned_user_handles: List[str]  # list of all updatable human-readable usernames of the accounts addressed within the post (does not include users to which the post replied)
    hashtags: List[str]  # list of all unique lowercased hashtags found within the post's text

    # Conversation rules fields
    replies_rules: Optional[List[str]]  # list of specific conversation rules set by the author for the current post (can be one or a combination of: disallow, allow_from_follower, allow_from_following, allow_from_mention, or allow_from_list: followed by a list of user DIDs)
    replies_rules_created_at: Optional[str]  # datetime (potentially timezoned) of when the user set the replies_rules
    replies_rules_timestamp_utc: Optional[int]  # Unix UTC timestamp of when the user set the replies_rules
    hidden_replies_uris: Optional[List[str]]  # list of ATProto's internal URIs to posts which replied to the post, but were intentionally marked as hidden by the current post's author
    # quotes_rule: Optional[str]                    # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
    # quotes_rules_created_at: Optional[str]        # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
    # quotes_rules_timestamp_utc: Optional[int]     # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
    # detached_quotes: Optional[List[str]]          # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712

    # Extra fields linked to the data collection and processing
    collection_time: Optional[str]  # datetime (potentially timezoned) of when the data was normalized
    collected_via: Optional[List[str]]  # extra field added by the normalization process to express how the data collection was run, will be "quote" or "thread" when a post was grabbed as a referenced post within a really collected post using the "extract_referenced_posts" option of "normalize_post"
    match_query: Optional[bool]  # extra field added by the normalization process to express whether the post was an intentionally collected one or only came as a referenced post within a really collected post using the "extract_referenced_posts" option of "normalize_post"
|
twitwi/bluesky/utils.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from twitwi.exceptions import BlueskyPayloadError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Keys that must be present at the top level of a post payload.
valid_post_keys = [
    "cid",
    "uri",
    "author",
    "record",
    "replyCount",
    "repostCount",
    "likeCount",
    "quoteCount",
]

# Keys that must be present in the payload's "record" dictionary.
valid_record_keys = ["$type", "createdAt", "text"]

# Keys that must be present in the payload's "author" dictionary.
valid_author_keys = ["did", "handle", "createdAt"]


def validate_post_payload(data):
    """Check that a payload has the minimal structure expected of a post.

    The post is read from ``data["post"]`` when that key exists, otherwise
    ``data`` itself is treated as the post.

    Args:
        data: the payload to check.

    Returns:
        A ``(is_valid, error_message)`` tuple: ``(True, None)`` when every
        check passes, otherwise ``(False, reason)`` for the first failure.
    """
    # Report malformed input through the normal return channel instead of
    # crashing on ``.get`` or on the ``in`` tests below.
    if not isinstance(data, dict):
        return False, f"payload is not a dictionary: {data}"

    post = data.get("post", data)
    if not isinstance(post, dict):
        return False, f"payload's post is not a dictionary: {post}"

    for key in valid_post_keys:
        if key not in post:
            return False, f"key {key} is missing from payload: {post}"

    # f-strings are used throughout: "%s" % value raises TypeError when the
    # value happens to be a multi-element tuple.
    record = post["record"]
    if not isinstance(record, dict):
        return False, f"payload's record is not a dictionary: {record}"

    for key in valid_record_keys:
        if key not in record:
            return False, f"key {key} is missing from payload's record: {record}"

    record_type = record.get("$type")
    if record_type != "app.bsky.feed.post":
        return False, f"payload's record $type is not a post: {record_type}"

    author = post["author"]
    if not isinstance(author, dict):
        return False, f"payload's author is not a dictionary: {author}"

    for key in valid_author_keys:
        if key not in author:
            return False, f"key {key} is missing from payload's author: {author}"

    return True, None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Matches a type string whose last dot-separated segment is one of the listed embed kinds.
re_embed_types = re.compile(r"\.(record|recordWithMedia|images|video|external)$")


def valid_embed_type(embed_type):
    """Search `embed_type` for a recognised embed suffix.

    Returns the regex match object (truthy) when the string ends with one of
    the suffixes listed in `re_embed_types`, or None otherwise.
    """
    found = re_embed_types.search(embed_type)
    return found
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def format_profile_url(user_handle_or_did):
    """Build the bsky.app web URL of a profile from the account's handle or DID."""
    return "https://bsky.app/profile/{}".format(user_handle_or_did)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def format_post_url(user_handle_or_did, post_did):
    """Build the bsky.app web URL of a post from its author's handle or DID and the post identifier."""
    return "https://bsky.app/profile/{}/post/{}".format(user_handle_or_did, post_did)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def parse_post_url(url, source):
    """Split an https://bsky.app post URL into its author and post parts.

    Args:
        url: a URL shaped like
            ``https://bsky.app/profile/<handle or did>/post/<post id>``.
        source: passed as first argument to BlueskyPayloadError on failure.

    Returns:
        A two-element list ``[author_handle_or_did, post_did]``.

    Raises:
        BlueskyPayloadError: when the URL does not start with the bsky.app
            profile prefix or has no ``/post/`` part after it.
    """
    prefix = "https://bsky.app/profile/"
    separator = "/post/"

    # Both conditions are required, so the URL is rejected when EITHER fails
    # (the previous `and` only rejected URLs failing both). The separator is
    # searched after the prefix so it cannot overlap the prefix's last slash.
    remainder = url[len(prefix):]
    if not url.startswith(prefix) or separator not in remainder:
        raise BlueskyPayloadError(source, f"{url} is not a usual Bluesky post url")

    # maxsplit=1 guarantees exactly two elements for callers that unpack.
    return remainder.split(separator, 1)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def parse_post_uri(uri, source=None):
    """Split an at:// post (or starter pack) URI into author and record parts.

    Args:
        uri: an ATProto URI shaped like
            ``at://<author did>/app.bsky.feed.post/<post id>`` or
            ``at://<author did>/app.bsky.graph.starterpack/<record id>``.
        source: passed as first argument to BlueskyPayloadError on failure;
            the URI itself is used when it is not given.

    Returns:
        A two-element list ``[author_did, record_did]``.

    Raises:
        BlueskyPayloadError: when the URI does not start with ``at://`` or
            refers to neither a post nor a starter pack.
    """
    prefix = "at://"

    if uri.startswith(prefix):
        remainder = uri[len(prefix):]
        # Starter packs are tried first, as before. Requiring both the scheme
        # AND a known collection fixes the previous `and` guard, which let
        # any at:// URI through and returned a one-element list.
        for collection in ("/app.bsky.graph.starterpack/", "/app.bsky.feed.post/"):
            if collection in remainder:
                # maxsplit=1 guarantees exactly two elements for callers that unpack.
                return remainder.split(collection, 1)

    raise BlueskyPayloadError(source or uri, f"{uri} is not a usual Bluesky post uri")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def format_starterpack_url(user_handle_or_did, record_did):
    """Build the bsky.app web URL of a starter pack from its creator's handle or DID and the record identifier."""
    return "https://bsky.app/starter-pack/{}/{}".format(user_handle_or_did, record_did)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def format_media_url(user_did, media_cid, mime_type, source):
    """Build the full-size and thumbnail URLs of an image or video.

    Args:
        user_did: identifier of the account, inserted in the URL path.
        media_cid: identifier of the media, inserted in the URL path.
        mime_type: MIME type of the media, e.g. ``image/jpeg`` or ``video/mp4``.
        source: passed as first argument to BlueskyPayloadError on failure.

    Returns:
        A ``(media_url, media_thumbnail_url)`` tuple.

    Raises:
        BlueskyPayloadError: when the MIME type has no subtype or is neither
            an image nor a video.
    """
    mime_parts = mime_type.split("/")

    # A MIME type without a subtype used to raise a bare IndexError before
    # reaching the intended error below; it now falls through to that error.
    if len(mime_parts) >= 2:
        if mime_type.startswith("image"):
            media_type = mime_parts[1]
            return (
                f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@{media_type}",
                f"https://cdn.bsky.app/img/feed_thumbnail/plain/{user_did}/{media_cid}@{media_type}",
            )

        if mime_type.startswith("video"):
            return (
                f"https://video.bsky.app/watch/{user_did}/{media_cid}/playlist.m3u8",
                f"https://video.bsky.app/watch/{user_did}/{media_cid}/thumbnail.jpg",
            )

    # Message fixed: it used to read "an usual", the opposite of what is meant.
    raise BlueskyPayloadError(source, f"{mime_type} is an unusual media mimeType")
|