twitwi 0.21.1__tar.gz → 0.21.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {twitwi-0.21.1/twitwi.egg-info → twitwi-0.21.2}/PKG-INFO +2 -3
- {twitwi-0.21.1 → twitwi-0.21.2}/README.md +1 -2
- {twitwi-0.21.1 → twitwi-0.21.2}/setup.py +1 -1
- {twitwi-0.21.1 → twitwi-0.21.2}/test/bluesky/normalizers_test.py +12 -2
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/bluesky/normalizers.py +29 -10
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/bluesky/types.py +13 -13
- {twitwi-0.21.1 → twitwi-0.21.2/twitwi.egg-info}/PKG-INFO +2 -3
- {twitwi-0.21.1 → twitwi-0.21.2}/LICENSE.txt +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/setup.cfg +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/test/bluesky/__init__.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/test/bluesky/formatters_test.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/__init__.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/anonymizers.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/bluesky/__init__.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/bluesky/constants.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/bluesky/formatters.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/bluesky/utils.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/constants.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/exceptions.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/formatters.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/normalizers.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi/utils.py +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi.egg-info/SOURCES.txt +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi.egg-info/dependency_links.txt +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi.egg-info/requires.txt +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi.egg-info/top_level.txt +0 -0
- {twitwi-0.21.1 → twitwi-0.21.2}/twitwi.egg-info/zip-safe +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: twitwi
|
|
3
|
-
Version: 0.21.1
|
|
3
|
+
Version: 0.21.2
|
|
4
4
|
Summary: A collection of Twitter-related helper functions for python.
|
|
5
5
|
Home-page: http://github.com/medialab/twitwi
|
|
6
6
|
Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
|
|
@@ -95,7 +95,7 @@ for post_data in posts_payload_from_API:
|
|
|
95
95
|
|
|
96
96
|
# Then, saving normalized profiles into a CSV using DictWriter:
|
|
97
97
|
|
|
98
|
-
|
|
98
|
+
import csv
|
|
99
99
|
from twitwi.bluesky.constants import POST_FIELDS
|
|
100
100
|
from twitwi.bluesky import transform_post_into_csv_dict
|
|
101
101
|
|
|
@@ -108,7 +108,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
|
|
|
108
108
|
|
|
109
109
|
# Or using the basic CSV writer:
|
|
110
110
|
|
|
111
|
-
from csv import writer
|
|
112
111
|
from twitwi.bluesky import format_post_as_csv_row
|
|
113
112
|
|
|
114
113
|
with open("normalized_bluesky_posts.csv", "w") as f:
|
|
@@ -68,7 +68,7 @@ for post_data in posts_payload_from_API:
|
|
|
68
68
|
|
|
69
69
|
# Then, saving normalized profiles into a CSV using DictWriter:
|
|
70
70
|
|
|
71
|
-
|
|
71
|
+
import csv
|
|
72
72
|
from twitwi.bluesky.constants import POST_FIELDS
|
|
73
73
|
from twitwi.bluesky import transform_post_into_csv_dict
|
|
74
74
|
|
|
@@ -81,7 +81,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
|
|
|
81
81
|
|
|
82
82
|
# Or using the basic CSV writer:
|
|
83
83
|
|
|
84
|
-
from csv import writer
|
|
85
84
|
from twitwi.bluesky import format_post_as_csv_row
|
|
86
85
|
|
|
87
86
|
with open("normalized_bluesky_posts.csv", "w") as f:
|
|
@@ -5,7 +5,7 @@ with open("./README.md", "r") as f:
|
|
|
5
5
|
|
|
6
6
|
setup(
|
|
7
7
|
name="twitwi",
|
|
8
|
-
version="0.21.1",
|
|
8
|
+
version="0.21.2",
|
|
9
9
|
description="A collection of Twitter-related helper functions for python.",
|
|
10
10
|
long_description=long_description,
|
|
11
11
|
long_description_content_type="text/markdown",
|
|
@@ -15,6 +15,8 @@ OVERWRITE_TESTS = False
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
FAKE_COLLECTION_TIME = "2025-01-01T00:00:00.000000"
|
|
18
|
+
|
|
19
|
+
|
|
18
20
|
def set_fake_collection_time(dico):
|
|
19
21
|
if "collection_time" in dico:
|
|
20
22
|
dico["collection_time"] = FAKE_COLLECTION_TIME
|
|
@@ -47,7 +49,9 @@ class TestNormalizers:
|
|
|
47
49
|
if OVERWRITE_TESTS:
|
|
48
50
|
from test.utils import dump_json_resource
|
|
49
51
|
|
|
50
|
-
normalized_profiles = [
|
|
52
|
+
normalized_profiles = [
|
|
53
|
+
set_fake_collection_time(fn(profile)) for profile in profiles
|
|
54
|
+
]
|
|
51
55
|
dump_json_resource(normalized_profiles, "bluesky-normalized-profiles.json")
|
|
52
56
|
|
|
53
57
|
expected = get_json_resource("bluesky-normalized-profiles.json")
|
|
@@ -79,7 +83,13 @@ class TestNormalizers:
|
|
|
79
83
|
if OVERWRITE_TESTS:
|
|
80
84
|
from test.utils import dump_json_resource
|
|
81
85
|
|
|
82
|
-
normalized_posts = [
|
|
86
|
+
normalized_posts = [
|
|
87
|
+
[
|
|
88
|
+
set_fake_collection_time(p)
|
|
89
|
+
for p in fn(post, extract_referenced_posts=True)
|
|
90
|
+
]
|
|
91
|
+
for post in posts
|
|
92
|
+
]
|
|
83
93
|
dump_json_resource(normalized_posts, "bluesky-normalized-posts.json")
|
|
84
94
|
|
|
85
95
|
expected = get_json_resource("bluesky-normalized-posts.json")
|
|
@@ -73,8 +73,12 @@ def prepare_native_gif_as_media(gif_data, user_did, source):
|
|
|
73
73
|
|
|
74
74
|
|
|
75
75
|
def prepare_image_as_media(image_data):
|
|
76
|
+
if "ref" not in image_data["image"] or "$link" not in image_data["image"]["ref"]:
|
|
77
|
+
image_id = image_data["image"]["cid"]
|
|
78
|
+
else:
|
|
79
|
+
image_id = image_data["image"]["ref"]["$link"]
|
|
76
80
|
return {
|
|
77
|
-
"id":
|
|
81
|
+
"id": image_id,
|
|
78
82
|
"type": image_data["image"]["mimeType"],
|
|
79
83
|
"alt": image_data["alt"],
|
|
80
84
|
}
|
|
@@ -92,7 +96,9 @@ def process_starterpack_card(embed_data, post):
|
|
|
92
96
|
|
|
93
97
|
card = embed_data.get("record", {})
|
|
94
98
|
creator_did, pack_did = parse_post_uri(embed_data["uri"])
|
|
95
|
-
post["card_link"] = format_starterpack_url(
|
|
99
|
+
post["card_link"] = format_starterpack_url(
|
|
100
|
+
embed_data.get("creator", {}).get("handle") or creator_did, pack_did
|
|
101
|
+
)
|
|
96
102
|
post["card_title"] = card.get("name", "")
|
|
97
103
|
post["card_description"] = card.get("description", "")
|
|
98
104
|
post["card_thumbnail"] = card.get("thumb", "")
|
|
@@ -145,7 +151,9 @@ def prepare_quote_data(embed_quote, card_data, post, links):
|
|
|
145
151
|
|
|
146
152
|
# Extract user handle from url
|
|
147
153
|
if "did:plc:" not in post["quoted_url"]:
|
|
148
|
-
post["quoted_user_handle"], _ = parse_post_url(
|
|
154
|
+
post["quoted_user_handle"], _ = parse_post_url(
|
|
155
|
+
post["quoted_url"], post["url"]
|
|
156
|
+
)
|
|
149
157
|
|
|
150
158
|
return (post, quoted_data, links)
|
|
151
159
|
|
|
@@ -176,7 +184,7 @@ def merge_nested_posts(referenced_posts, nested, source):
|
|
|
176
184
|
|
|
177
185
|
@overload
|
|
178
186
|
def normalize_post(
|
|
179
|
-
|
|
187
|
+
payload: Dict,
|
|
180
188
|
locale: Optional[str] = ...,
|
|
181
189
|
extract_referenced_posts: Literal[True] = ...,
|
|
182
190
|
collection_source: Optional[str] = ...,
|
|
@@ -185,7 +193,7 @@ def normalize_post(
|
|
|
185
193
|
|
|
186
194
|
@overload
|
|
187
195
|
def normalize_post(
|
|
188
|
-
|
|
196
|
+
payload: Dict,
|
|
189
197
|
locale: Optional[str] = ...,
|
|
190
198
|
extract_referenced_posts: Literal[False] = ...,
|
|
191
199
|
collection_source: Optional[str] = ...,
|
|
@@ -308,7 +316,7 @@ def normalize_post(
|
|
|
308
316
|
feat = facet["features"][0]
|
|
309
317
|
|
|
310
318
|
# Hashtags
|
|
311
|
-
if feat["$type"].endswith("#tag"):
|
|
319
|
+
if feat["$type"].endswith("#tag") or feat["$type"].endswith("#hashtag"):
|
|
312
320
|
hashtags.add(feat["tag"].strip().lower())
|
|
313
321
|
|
|
314
322
|
# Mentions
|
|
@@ -323,7 +331,11 @@ def normalize_post(
|
|
|
323
331
|
byteStart = text.find(b"@", byteStart)
|
|
324
332
|
|
|
325
333
|
handle = (
|
|
326
|
-
text[
|
|
334
|
+
text[
|
|
335
|
+
byteStart + 1 : facet["index"]["byteEnd"]
|
|
336
|
+
+ byteStart
|
|
337
|
+
- facet["index"]["byteStart"]
|
|
338
|
+
]
|
|
327
339
|
.strip()
|
|
328
340
|
.lower()
|
|
329
341
|
.decode("utf-8")
|
|
@@ -350,7 +362,9 @@ def normalize_post(
|
|
|
350
362
|
{
|
|
351
363
|
"uri": feat["uri"].encode("utf-8"),
|
|
352
364
|
"start": byteStart,
|
|
353
|
-
"end": byteStart
|
|
365
|
+
"end": byteStart
|
|
366
|
+
- facet["index"]["byteStart"]
|
|
367
|
+
+ facet["index"]["byteEnd"],
|
|
354
368
|
}
|
|
355
369
|
)
|
|
356
370
|
|
|
@@ -442,7 +456,9 @@ def normalize_post(
|
|
|
442
456
|
# Quote & Starter-packs
|
|
443
457
|
if embed["$type"].endswith(".record"):
|
|
444
458
|
if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
|
|
445
|
-
post = process_starterpack_card(
|
|
459
|
+
post = process_starterpack_card(
|
|
460
|
+
data.get("embed", {}).get("record"), post
|
|
461
|
+
)
|
|
446
462
|
if post["card_link"]:
|
|
447
463
|
extra_links.append(post["card_link"])
|
|
448
464
|
else:
|
|
@@ -523,7 +539,10 @@ def normalize_post(
|
|
|
523
539
|
|
|
524
540
|
# Rewrite post's text to include links to medias within
|
|
525
541
|
text += b" " + (
|
|
526
|
-
media_thumb
|
|
542
|
+
media_thumb
|
|
543
|
+
if media_type.startswith("video")
|
|
544
|
+
and not media_type.endswith("/gif")
|
|
545
|
+
else media_url
|
|
527
546
|
).encode("utf-8")
|
|
528
547
|
|
|
529
548
|
# Process quotes
|
|
@@ -64,7 +64,7 @@ class BlueskyPost(TypedDict):
|
|
|
64
64
|
# user_lists: int # not available from posts payloads
|
|
65
65
|
user_langs: List[str] # languages in which the author of the posts usually writes posts (declarative)
|
|
66
66
|
user_avatar: Optional[str] # URL to the image serving as avatar to the user who authored the post
|
|
67
|
-
user_created_at: str # datetime (potentially timezoned)
|
|
67
|
+
user_created_at: str # datetime (potentially timezoned) of when the user who authored the post created the account
|
|
68
68
|
user_timestamp_utc: int # Unix UTC timestamp of when the user who authored the post created the account
|
|
69
69
|
|
|
70
70
|
# Parent post identifying fields
|
|
@@ -102,27 +102,27 @@ class BlueskyPost(TypedDict):
|
|
|
102
102
|
quoted_user_handle: Optional[str] # updatable human-readable username of the account who authored the quoted post
|
|
103
103
|
quoted_created_at: Optional[int] # datetime (potentially timezoned) of when the quoted post was submitted
|
|
104
104
|
quoted_timestamp_utc: Optional[int] # Unix UTC timestamp of when the quoted post was submitted
|
|
105
|
-
quoted_status: Optional[str] # empty or "detached" when the author of the quoted post intentionnally required the quoting post not to
|
|
105
|
+
quoted_status: Optional[str] # empty or "detached" when the author of the quoted post intentionnally required the quoting post not to appear in the list of this post's quotes
|
|
106
106
|
|
|
107
107
|
# Embedded elements metadata fields
|
|
108
108
|
links: List[str] # list of URLs of all links shared within the post (including potentially the embedded card detailed below, but not the link to a potential quoted post)
|
|
109
|
-
domains: List[str] # list of domains of the links shared within the post (here a domain
|
|
109
|
+
domains: List[str] # list of domains of the links shared within the post (here a domain refers to a full hostname, including subdomains, for instance bluesky.com or medialab.sciencespo.fr)
|
|
110
110
|
card_link: Optional[str] # URL of the link displayed as a card within the post if any
|
|
111
111
|
card_title: Optional[str] # title of the webpage corresponding to the linkg diplayed as a card within the post if any
|
|
112
112
|
card_description: Optional[str] # description of the webpage corresponding to the linkg diplayed as a card within the post if any
|
|
113
|
-
card_thumbnail: Optional[str] # image displayed as an illustration of the webpage corresponding to the
|
|
114
|
-
media_urls: List[str] # list of URLs to all
|
|
115
|
-
media_thumbnails: List[str] # list of URLs to small thumbnail version of all
|
|
116
|
-
media_types: List[str] # MIME types (such as image/jpeg, image/gif, video/mp4, etc.) of all
|
|
117
|
-
media_alt_texts: List[str] # description texts of all
|
|
118
|
-
mentioned_user_dids: List[str] # list of all persistent long-term
|
|
119
|
-
mentioned_user_handles: List[str] # list of all updatable human-readable
|
|
113
|
+
card_thumbnail: Optional[str] # image displayed as an illustration of the webpage corresponding to the link diplayed as a card within the post if any
|
|
114
|
+
media_urls: List[str] # list of URLs to all media (images, videos, gifs) embedded in the post
|
|
115
|
+
media_thumbnails: List[str] # list of URLs to small thumbnail version of all media (images, videos, gifs) embedded in the post
|
|
116
|
+
media_types: List[str] # MIME types (such as image/jpeg, image/gif, video/mp4, etc.) of all media (images, videos, gifs) embedded in the post
|
|
117
|
+
media_alt_texts: List[str] # description texts of all media (images, videos, gifs) embedded in the post
|
|
118
|
+
mentioned_user_dids: List[str] # list of all persistent long-term identifiers of the accounts adressed within the post (does not include users to which the post replied)
|
|
119
|
+
mentioned_user_handles: List[str] # list of all updatable human-readable usernames of the accounts adressed within the post (does not include users to which the post replied)
|
|
120
120
|
hashtags: List[str] # list of all unique lowercased hashtags found within the post's text
|
|
121
121
|
|
|
122
122
|
# Conversation rules fields
|
|
123
123
|
replies_rules: Optional[List[str]] # list of specific conversation rules set by the author for the current post (can be one or a combination of: disallow, allow_from_follower, allow_from_following, allow_from_mention, or allow_from_list: followed by a list of user DIDs)
|
|
124
124
|
replies_rules_created_at: Optional[str] # datetime (potentially timezoned) of when the user set the replies_rules
|
|
125
|
-
replies_rules_timestamp_utc: Optional[int] # Unix UTC timestamp of when the
|
|
125
|
+
replies_rules_timestamp_utc: Optional[int] # Unix UTC timestamp of when the user set the replies_rules
|
|
126
126
|
hidden_replies_uris: Optional[List[str]] # list of ATProto's internal URIs to posts who replied to the post, but where intentionnally marked as hidden by the current post's author
|
|
127
127
|
# quotes_rule: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
|
|
128
128
|
# quotes_rules_created_at: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
|
|
@@ -131,5 +131,5 @@ class BlueskyPost(TypedDict):
|
|
|
131
131
|
|
|
132
132
|
# Extra fields linked to the data collection and processing
|
|
133
133
|
collection_time: Optional[str] # datetime (potentially timezoned) of when the data was normalized
|
|
134
|
-
collected_via: Optional[List[str]] # extra field added by the normalization process to express how the data collection was ran, will be "quote" or "thread" when a post was grabbed as a referenced post within
|
|
135
|
-
match_query: Optional[bool] # extra field added by the normalization process to express whether the post was an intentionnally collected one or only came as a referenced post within
|
|
134
|
+
collected_via: Optional[List[str]] # extra field added by the normalization process to express how the data collection was ran, will be "quote" or "thread" when a post was grabbed as a referenced post within the originally collected post using the "extract_referenced_posts" option of "normalize_post"
|
|
135
|
+
match_query: Optional[bool] # extra field added by the normalization process to express whether the post was an intentionnally collected one or only came as a referenced post within the originally collected post using the "extract_referenced_posts" option of "normalize_post"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: twitwi
|
|
3
|
-
Version: 0.21.1
|
|
3
|
+
Version: 0.21.2
|
|
4
4
|
Summary: A collection of Twitter-related helper functions for python.
|
|
5
5
|
Home-page: http://github.com/medialab/twitwi
|
|
6
6
|
Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
|
|
@@ -95,7 +95,7 @@ for post_data in posts_payload_from_API:
|
|
|
95
95
|
|
|
96
96
|
# Then, saving normalized profiles into a CSV using DictWriter:
|
|
97
97
|
|
|
98
|
-
|
|
98
|
+
import csv
|
|
99
99
|
from twitwi.bluesky.constants import POST_FIELDS
|
|
100
100
|
from twitwi.bluesky import transform_post_into_csv_dict
|
|
101
101
|
|
|
@@ -108,7 +108,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
|
|
|
108
108
|
|
|
109
109
|
# Or using the basic CSV writer:
|
|
110
110
|
|
|
111
|
-
from csv import writer
|
|
112
111
|
from twitwi.bluesky import format_post_as_csv_row
|
|
113
112
|
|
|
114
113
|
with open("normalized_bluesky_posts.csv", "w") as f:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|