twitwi 0.21.0__tar.gz → 0.21.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {twitwi-0.21.0/twitwi.egg-info → twitwi-0.21.2}/PKG-INFO +3 -3
- {twitwi-0.21.0 → twitwi-0.21.2}/README.md +1 -2
- {twitwi-0.21.0 → twitwi-0.21.2}/setup.py +2 -2
- {twitwi-0.21.0 → twitwi-0.21.2}/test/bluesky/normalizers_test.py +12 -2
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/normalizers.py +86 -22
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/types.py +16 -16
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/utils.py +8 -1
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/constants.py +0 -1
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/utils.py +6 -5
- {twitwi-0.21.0 → twitwi-0.21.2/twitwi.egg-info}/PKG-INFO +3 -3
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/requires.txt +1 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/LICENSE.txt +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/setup.cfg +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/test/bluesky/__init__.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/test/bluesky/formatters_test.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/__init__.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/anonymizers.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/__init__.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/constants.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/formatters.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/exceptions.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/formatters.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/normalizers.py +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/SOURCES.txt +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/dependency_links.txt +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/top_level.txt +0 -0
- {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/zip-safe +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: twitwi
|
|
3
|
-
Version: 0.21.0
|
|
3
|
+
Version: 0.21.2
|
|
4
4
|
Summary: A collection of Twitter-related helper functions for python.
|
|
5
5
|
Home-page: http://github.com/medialab/twitwi
|
|
6
6
|
Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
|
|
@@ -12,6 +12,7 @@ Description-Content-Type: text/markdown
|
|
|
12
12
|
License-File: LICENSE.txt
|
|
13
13
|
Requires-Dist: pytz>=2019.3
|
|
14
14
|
Requires-Dist: ural>=0.31.1
|
|
15
|
+
Requires-Dist: python-dateutil>=2.9.0
|
|
15
16
|
Dynamic: author
|
|
16
17
|
Dynamic: author-email
|
|
17
18
|
Dynamic: description
|
|
@@ -94,7 +95,7 @@ for post_data in posts_payload_from_API:
|
|
|
94
95
|
|
|
95
96
|
# Then, saving normalized profiles into a CSV using DictWriter:
|
|
96
97
|
|
|
97
|
-
|
|
98
|
+
import csv
|
|
98
99
|
from twitwi.bluesky.constants import POST_FIELDS
|
|
99
100
|
from twitwi.bluesky import transform_post_into_csv_dict
|
|
100
101
|
|
|
@@ -107,7 +108,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
|
|
|
107
108
|
|
|
108
109
|
# Or using the basic CSV writer:
|
|
109
110
|
|
|
110
|
-
from csv import writer
|
|
111
111
|
from twitwi.bluesky import format_post_as_csv_row
|
|
112
112
|
|
|
113
113
|
with open("normalized_bluesky_posts.csv", "w") as f:
|
|
@@ -68,7 +68,7 @@ for post_data in posts_payload_from_API:
|
|
|
68
68
|
|
|
69
69
|
# Then, saving normalized profiles into a CSV using DictWriter:
|
|
70
70
|
|
|
71
|
-
|
|
71
|
+
import csv
|
|
72
72
|
from twitwi.bluesky.constants import POST_FIELDS
|
|
73
73
|
from twitwi.bluesky import transform_post_into_csv_dict
|
|
74
74
|
|
|
@@ -81,7 +81,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
|
|
|
81
81
|
|
|
82
82
|
# Or using the basic CSV writer:
|
|
83
83
|
|
|
84
|
-
from csv import writer
|
|
85
84
|
from twitwi.bluesky import format_post_as_csv_row
|
|
86
85
|
|
|
87
86
|
with open("normalized_bluesky_posts.csv", "w") as f:
|
|
@@ -5,7 +5,7 @@ with open("./README.md", "r") as f:
|
|
|
5
5
|
|
|
6
6
|
setup(
|
|
7
7
|
name="twitwi",
|
|
8
|
-
version="0.21.0",
|
|
8
|
+
version="0.21.2",
|
|
9
9
|
description="A collection of Twitter-related helper functions for python.",
|
|
10
10
|
long_description=long_description,
|
|
11
11
|
long_description_content_type="text/markdown",
|
|
@@ -17,6 +17,6 @@ setup(
|
|
|
17
17
|
python_requires=">=3.8",
|
|
18
18
|
packages=find_packages(exclude=["scripts", "test"]),
|
|
19
19
|
package_data={"docs": ["README.md"]},
|
|
20
|
-
install_requires=["pytz>=2019.3", "ural>=0.31.1"],
|
|
20
|
+
install_requires=["pytz>=2019.3", "ural>=0.31.1", "python-dateutil>=2.9.0"],
|
|
21
21
|
zip_safe=True,
|
|
22
22
|
)
|
|
@@ -15,6 +15,8 @@ OVERWRITE_TESTS = False
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
FAKE_COLLECTION_TIME = "2025-01-01T00:00:00.000000"
|
|
18
|
+
|
|
19
|
+
|
|
18
20
|
def set_fake_collection_time(dico):
|
|
19
21
|
if "collection_time" in dico:
|
|
20
22
|
dico["collection_time"] = FAKE_COLLECTION_TIME
|
|
@@ -47,7 +49,9 @@ class TestNormalizers:
|
|
|
47
49
|
if OVERWRITE_TESTS:
|
|
48
50
|
from test.utils import dump_json_resource
|
|
49
51
|
|
|
50
|
-
normalized_profiles = [
|
|
52
|
+
normalized_profiles = [
|
|
53
|
+
set_fake_collection_time(fn(profile)) for profile in profiles
|
|
54
|
+
]
|
|
51
55
|
dump_json_resource(normalized_profiles, "bluesky-normalized-profiles.json")
|
|
52
56
|
|
|
53
57
|
expected = get_json_resource("bluesky-normalized-profiles.json")
|
|
@@ -79,7 +83,13 @@ class TestNormalizers:
|
|
|
79
83
|
if OVERWRITE_TESTS:
|
|
80
84
|
from test.utils import dump_json_resource
|
|
81
85
|
|
|
82
|
-
normalized_posts = [
|
|
86
|
+
normalized_posts = [
|
|
87
|
+
[
|
|
88
|
+
set_fake_collection_time(p)
|
|
89
|
+
for p in fn(post, extract_referenced_posts=True)
|
|
90
|
+
]
|
|
91
|
+
for post in posts
|
|
92
|
+
]
|
|
83
93
|
dump_json_resource(normalized_posts, "bluesky-normalized-posts.json")
|
|
84
94
|
|
|
85
95
|
expected = get_json_resource("bluesky-normalized-posts.json")
|
|
@@ -15,6 +15,7 @@ from twitwi.bluesky.utils import (
|
|
|
15
15
|
format_post_url,
|
|
16
16
|
parse_post_url,
|
|
17
17
|
parse_post_uri,
|
|
18
|
+
format_starterpack_url,
|
|
18
19
|
format_media_url,
|
|
19
20
|
)
|
|
20
21
|
from twitwi.bluesky.types import BlueskyProfile, BlueskyPost
|
|
@@ -37,11 +38,11 @@ def normalize_profile(data: Dict, locale: Optional[str] = None) -> BlueskyProfil
|
|
|
37
38
|
"did": data["did"],
|
|
38
39
|
"url": format_profile_url(data["handle"]),
|
|
39
40
|
"handle": data["handle"],
|
|
40
|
-
"display_name": data
|
|
41
|
+
"display_name": data.get("displayName", ""),
|
|
41
42
|
"created_at": created_at,
|
|
42
43
|
"timestamp_utc": timestamp_utc,
|
|
43
44
|
"description": data["description"],
|
|
44
|
-
"avatar": data
|
|
45
|
+
"avatar": data.get("avatar", ""),
|
|
45
46
|
"posts": data["postsCount"],
|
|
46
47
|
"followers": data["followersCount"],
|
|
47
48
|
"follows": data["followsCount"],
|
|
@@ -55,8 +56,13 @@ def normalize_profile(data: Dict, locale: Optional[str] = None) -> BlueskyProfil
|
|
|
55
56
|
|
|
56
57
|
|
|
57
58
|
def prepare_native_gif_as_media(gif_data, user_did, source):
|
|
58
|
-
|
|
59
|
-
|
|
59
|
+
if "thumb" in gif_data:
|
|
60
|
+
media_cid = gif_data["thumb"]["ref"]["$link"]
|
|
61
|
+
_, thumbnail = format_media_url(user_did, media_cid, "image/jpeg", source)
|
|
62
|
+
else:
|
|
63
|
+
media_cid = ""
|
|
64
|
+
thumbnail = ""
|
|
65
|
+
|
|
60
66
|
return {
|
|
61
67
|
"id": media_cid,
|
|
62
68
|
"type": "video/gif",
|
|
@@ -67,8 +73,12 @@ def prepare_native_gif_as_media(gif_data, user_did, source):
|
|
|
67
73
|
|
|
68
74
|
|
|
69
75
|
def prepare_image_as_media(image_data):
|
|
76
|
+
if "ref" not in image_data["image"] or "$link" not in image_data["image"]["ref"]:
|
|
77
|
+
image_id = image_data["image"]["cid"]
|
|
78
|
+
else:
|
|
79
|
+
image_id = image_data["image"]["ref"]["$link"]
|
|
70
80
|
return {
|
|
71
|
-
"id":
|
|
81
|
+
"id": image_id,
|
|
72
82
|
"type": image_data["image"]["mimeType"],
|
|
73
83
|
"alt": image_data["alt"],
|
|
74
84
|
}
|
|
@@ -81,6 +91,20 @@ def prepare_video_as_media(video_data):
|
|
|
81
91
|
}
|
|
82
92
|
|
|
83
93
|
|
|
94
|
+
def process_starterpack_card(embed_data, post):
|
|
95
|
+
# Warning: mutates post
|
|
96
|
+
|
|
97
|
+
card = embed_data.get("record", {})
|
|
98
|
+
creator_did, pack_did = parse_post_uri(embed_data["uri"])
|
|
99
|
+
post["card_link"] = format_starterpack_url(
|
|
100
|
+
embed_data.get("creator", {}).get("handle") or creator_did, pack_did
|
|
101
|
+
)
|
|
102
|
+
post["card_title"] = card.get("name", "")
|
|
103
|
+
post["card_description"] = card.get("description", "")
|
|
104
|
+
post["card_thumbnail"] = card.get("thumb", "")
|
|
105
|
+
return post
|
|
106
|
+
|
|
107
|
+
|
|
84
108
|
def process_card_data(embed_data, post):
|
|
85
109
|
# Warning: mutates post
|
|
86
110
|
|
|
@@ -122,10 +146,14 @@ def prepare_quote_data(embed_quote, card_data, post, links):
|
|
|
122
146
|
break
|
|
123
147
|
|
|
124
148
|
# Remove quoted link from post links
|
|
125
|
-
|
|
149
|
+
if post["quoted_url"] in links:
|
|
150
|
+
links.remove(post["quoted_url"])
|
|
126
151
|
|
|
127
152
|
# Extract user handle from url
|
|
128
|
-
|
|
153
|
+
if "did:plc:" not in post["quoted_url"]:
|
|
154
|
+
post["quoted_user_handle"], _ = parse_post_url(
|
|
155
|
+
post["quoted_url"], post["url"]
|
|
156
|
+
)
|
|
129
157
|
|
|
130
158
|
return (post, quoted_data, links)
|
|
131
159
|
|
|
@@ -156,7 +184,7 @@ def merge_nested_posts(referenced_posts, nested, source):
|
|
|
156
184
|
|
|
157
185
|
@overload
|
|
158
186
|
def normalize_post(
|
|
159
|
-
|
|
187
|
+
payload: Dict,
|
|
160
188
|
locale: Optional[str] = ...,
|
|
161
189
|
extract_referenced_posts: Literal[True] = ...,
|
|
162
190
|
collection_source: Optional[str] = ...,
|
|
@@ -165,7 +193,7 @@ def normalize_post(
|
|
|
165
193
|
|
|
166
194
|
@overload
|
|
167
195
|
def normalize_post(
|
|
168
|
-
|
|
196
|
+
payload: Dict,
|
|
169
197
|
locale: Optional[str] = ...,
|
|
170
198
|
extract_referenced_posts: Literal[False] = ...,
|
|
171
199
|
collection_source: Optional[str] = ...,
|
|
@@ -255,8 +283,8 @@ def normalize_post(
|
|
|
255
283
|
)
|
|
256
284
|
|
|
257
285
|
# Handle user metadata
|
|
258
|
-
post["user_diplay_name"] = data["author"]
|
|
259
|
-
post["user_avatar"] = data["author"]
|
|
286
|
+
post["user_diplay_name"] = data["author"].get("displayName", "")
|
|
287
|
+
post["user_avatar"] = data["author"].get("avatar", "")
|
|
260
288
|
post["user_timestamp_utc"], post["user_created_at"] = get_dates(
|
|
261
289
|
data["author"]["createdAt"], locale=locale, source="bluesky"
|
|
262
290
|
)
|
|
@@ -288,7 +316,7 @@ def normalize_post(
|
|
|
288
316
|
feat = facet["features"][0]
|
|
289
317
|
|
|
290
318
|
# Hashtags
|
|
291
|
-
if feat["$type"].endswith("#tag"):
|
|
319
|
+
if feat["$type"].endswith("#tag") or feat["$type"].endswith("#hashtag"):
|
|
292
320
|
hashtags.add(feat["tag"].strip().lower())
|
|
293
321
|
|
|
294
322
|
# Mentions
|
|
@@ -303,7 +331,11 @@ def normalize_post(
|
|
|
303
331
|
byteStart = text.find(b"@", byteStart)
|
|
304
332
|
|
|
305
333
|
handle = (
|
|
306
|
-
text[
|
|
334
|
+
text[
|
|
335
|
+
byteStart + 1 : facet["index"]["byteEnd"]
|
|
336
|
+
+ byteStart
|
|
337
|
+
- facet["index"]["byteStart"]
|
|
338
|
+
]
|
|
307
339
|
.strip()
|
|
308
340
|
.lower()
|
|
309
341
|
.decode("utf-8")
|
|
@@ -312,15 +344,34 @@ def normalize_post(
|
|
|
312
344
|
|
|
313
345
|
# Links
|
|
314
346
|
elif feat["$type"].endswith("#link"):
|
|
347
|
+
# Handle native polls
|
|
348
|
+
if "https://poll.blue/" in feat["uri"]:
|
|
349
|
+
if feat["uri"].endswith("/0"):
|
|
350
|
+
links.add(custom_normalize_url(feat["uri"]))
|
|
351
|
+
text += b" %s" % feat["uri"].encode("utf-8")
|
|
352
|
+
continue
|
|
353
|
+
|
|
315
354
|
links.add(custom_normalize_url(feat["uri"]))
|
|
355
|
+
# Check & fix occasional errored link positioning
|
|
356
|
+
# example: https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
|
|
357
|
+
byteStart = facet["index"]["byteStart"]
|
|
358
|
+
if b" " in text[byteStart : facet["index"]["byteEnd"]]:
|
|
359
|
+
byteStart = text.find(b"http", byteStart)
|
|
360
|
+
|
|
316
361
|
links_to_replace.append(
|
|
317
362
|
{
|
|
318
363
|
"uri": feat["uri"].encode("utf-8"),
|
|
319
|
-
"start":
|
|
320
|
-
"end":
|
|
364
|
+
"start": byteStart,
|
|
365
|
+
"end": byteStart
|
|
366
|
+
- facet["index"]["byteStart"]
|
|
367
|
+
+ facet["index"]["byteEnd"],
|
|
321
368
|
}
|
|
322
369
|
)
|
|
323
370
|
|
|
371
|
+
elif feat["$type"].endswith("#bold"):
|
|
372
|
+
pass
|
|
373
|
+
elif feat["$type"].endswith("#option"):
|
|
374
|
+
pass
|
|
324
375
|
else:
|
|
325
376
|
raise BlueskyPayloadError(
|
|
326
377
|
post["url"], "unusual record facet feature $type: %s" % feat
|
|
@@ -329,7 +380,10 @@ def normalize_post(
|
|
|
329
380
|
|
|
330
381
|
# Rewrite full links within post's text
|
|
331
382
|
for link in sorted(links_to_replace, key=lambda x: x["start"], reverse=True):
|
|
332
|
-
|
|
383
|
+
if link["start"] < 0:
|
|
384
|
+
text = text + b" " + link["uri"]
|
|
385
|
+
else:
|
|
386
|
+
text = text[: link["start"]] + link["uri"] + text[link["end"] :]
|
|
333
387
|
|
|
334
388
|
# Handle thread info when applicable
|
|
335
389
|
# Unfortunately posts' payload only provide at uris for these so we do not have the handles
|
|
@@ -399,11 +453,18 @@ def normalize_post(
|
|
|
399
453
|
if embed["$type"].endswith(".video"):
|
|
400
454
|
media_data.append(prepare_video_as_media(embed["video"]))
|
|
401
455
|
|
|
402
|
-
# Quote
|
|
456
|
+
# Quote & Starter-packs
|
|
403
457
|
if embed["$type"].endswith(".record"):
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
458
|
+
if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
|
|
459
|
+
post = process_starterpack_card(
|
|
460
|
+
data.get("embed", {}).get("record"), post
|
|
461
|
+
)
|
|
462
|
+
if post["card_link"]:
|
|
463
|
+
extra_links.append(post["card_link"])
|
|
464
|
+
else:
|
|
465
|
+
post, quoted_data, links = prepare_quote_data(
|
|
466
|
+
embed["record"], data.get("embed", {}).get("record"), post, links
|
|
467
|
+
)
|
|
407
468
|
|
|
408
469
|
# Quote with medias
|
|
409
470
|
if embed["$type"].endswith(".recordWithMedia"):
|
|
@@ -478,11 +539,14 @@ def normalize_post(
|
|
|
478
539
|
|
|
479
540
|
# Rewrite post's text to include links to medias within
|
|
480
541
|
text += b" " + (
|
|
481
|
-
media_thumb
|
|
542
|
+
media_thumb
|
|
543
|
+
if media_type.startswith("video")
|
|
544
|
+
and not media_type.endswith("/gif")
|
|
545
|
+
else media_url
|
|
482
546
|
).encode("utf-8")
|
|
483
547
|
|
|
484
548
|
# Process quotes
|
|
485
|
-
if quoted_data:
|
|
549
|
+
if quoted_data and "value" in quoted_data:
|
|
486
550
|
if quoted_data["cid"] != post["quoted_cid"]:
|
|
487
551
|
raise BlueskyPayloadError(
|
|
488
552
|
post["url"],
|
|
@@ -9,7 +9,7 @@ class BlueskyProfile(TypedDict):
|
|
|
9
9
|
did: str # persistent long-term identifier of the account
|
|
10
10
|
url: str # URL of the profile accessible on the web
|
|
11
11
|
handle: str # updatable human-readable username of the account (usually like username.bsky.social or username.com)
|
|
12
|
-
display_name: str
|
|
12
|
+
display_name: Optional[str] # updatable human-readable name of the account
|
|
13
13
|
description: str # profile short description written by the user
|
|
14
14
|
posts: int # total number of posts submitted by the user (at collection time)
|
|
15
15
|
followers: int # total number of followers of the user (at collection time)
|
|
@@ -17,7 +17,7 @@ class BlueskyProfile(TypedDict):
|
|
|
17
17
|
lists: int # total number of lists created by the user (at collection time)
|
|
18
18
|
feedgens: int # total number of custom feeds created by the user (at collection time)
|
|
19
19
|
starter_packs: int # total number of starter packs created by the user (at collection time)
|
|
20
|
-
avatar: str
|
|
20
|
+
avatar: Optional[str] # URL to the image serving as avatar to the user
|
|
21
21
|
banner: str # URL to the image serving as profile banner to the user
|
|
22
22
|
pinned_post_uri: Optional[str] # ATProto's internal URI to the post potentially pinned by the user to appear at the top of his posts on his profile
|
|
23
23
|
created_at: str # datetime (potentially timezoned) of when the user created the account
|
|
@@ -63,8 +63,8 @@ class BlueskyPost(TypedDict):
|
|
|
63
63
|
# user_follows: int # not available from posts payloads
|
|
64
64
|
# user_lists: int # not available from posts payloads
|
|
65
65
|
user_langs: List[str] # languages in which the author of the posts usually writes posts (declarative)
|
|
66
|
-
user_avatar: str
|
|
67
|
-
user_created_at: str # datetime (potentially timezoned)
|
|
66
|
+
user_avatar: Optional[str] # URL to the image serving as avatar to the user who authored the post
|
|
67
|
+
user_created_at: str # datetime (potentially timezoned) of when the user who authored the post created the account
|
|
68
68
|
user_timestamp_utc: int # Unix UTC timestamp of when the user who authored the post created the account
|
|
69
69
|
|
|
70
70
|
# Parent post identifying fields
|
|
@@ -102,27 +102,27 @@ class BlueskyPost(TypedDict):
|
|
|
102
102
|
quoted_user_handle: Optional[str] # updatable human-readable username of the account who authored the quoted post
|
|
103
103
|
quoted_created_at: Optional[int] # datetime (potentially timezoned) of when the quoted post was submitted
|
|
104
104
|
quoted_timestamp_utc: Optional[int] # Unix UTC timestamp of when the quoted post was submitted
|
|
105
|
-
quoted_status: Optional[str] # empty or "detached" when the author of the quoted post intentionnally required the quoting post not to
|
|
105
|
+
quoted_status: Optional[str] # empty or "detached" when the author of the quoted post intentionnally required the quoting post not to appear in the list of this post's quotes
|
|
106
106
|
|
|
107
107
|
# Embedded elements metadata fields
|
|
108
108
|
links: List[str] # list of URLs of all links shared within the post (including potentially the embedded card detailed below, but not the link to a potential quoted post)
|
|
109
|
-
domains: List[str] # list of domains of the links shared within the post (here a domain
|
|
109
|
+
domains: List[str] # list of domains of the links shared within the post (here a domain refers to a full hostname, including subdomains, for instance bluesky.com or medialab.sciencespo.fr)
|
|
110
110
|
card_link: Optional[str] # URL of the link displayed as a card within the post if any
|
|
111
111
|
card_title: Optional[str] # title of the webpage corresponding to the linkg diplayed as a card within the post if any
|
|
112
112
|
card_description: Optional[str] # description of the webpage corresponding to the linkg diplayed as a card within the post if any
|
|
113
|
-
card_thumbnail: Optional[str] # image displayed as an illustration of the webpage corresponding to the
|
|
114
|
-
media_urls: List[str] # list of URLs to all
|
|
115
|
-
media_thumbnails: List[str] # list of URLs to small thumbnail version of all
|
|
116
|
-
media_types: List[str] # MIME types (such as image/jpeg, image/gif, video/mp4, etc.) of all
|
|
117
|
-
media_alt_texts: List[str] # description texts of all
|
|
118
|
-
mentioned_user_dids: List[str] # list of all persistent long-term
|
|
119
|
-
mentioned_user_handles: List[str] # list of all updatable human-readable
|
|
113
|
+
card_thumbnail: Optional[str] # image displayed as an illustration of the webpage corresponding to the link diplayed as a card within the post if any
|
|
114
|
+
media_urls: List[str] # list of URLs to all media (images, videos, gifs) embedded in the post
|
|
115
|
+
media_thumbnails: List[str] # list of URLs to small thumbnail version of all media (images, videos, gifs) embedded in the post
|
|
116
|
+
media_types: List[str] # MIME types (such as image/jpeg, image/gif, video/mp4, etc.) of all media (images, videos, gifs) embedded in the post
|
|
117
|
+
media_alt_texts: List[str] # description texts of all media (images, videos, gifs) embedded in the post
|
|
118
|
+
mentioned_user_dids: List[str] # list of all persistent long-term identifiers of the accounts adressed within the post (does not include users to which the post replied)
|
|
119
|
+
mentioned_user_handles: List[str] # list of all updatable human-readable usernames of the accounts adressed within the post (does not include users to which the post replied)
|
|
120
120
|
hashtags: List[str] # list of all unique lowercased hashtags found within the post's text
|
|
121
121
|
|
|
122
122
|
# Conversation rules fields
|
|
123
123
|
replies_rules: Optional[List[str]] # list of specific conversation rules set by the author for the current post (can be one or a combination of: disallow, allow_from_follower, allow_from_following, allow_from_mention, or allow_from_list: followed by a list of user DIDs)
|
|
124
124
|
replies_rules_created_at: Optional[str] # datetime (potentially timezoned) of when the user set the replies_rules
|
|
125
|
-
replies_rules_timestamp_utc: Optional[int] # Unix UTC timestamp of when the
|
|
125
|
+
replies_rules_timestamp_utc: Optional[int] # Unix UTC timestamp of when the user set the replies_rules
|
|
126
126
|
hidden_replies_uris: Optional[List[str]] # list of ATProto's internal URIs to posts who replied to the post, but where intentionnally marked as hidden by the current post's author
|
|
127
127
|
# quotes_rule: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
|
|
128
128
|
# quotes_rules_created_at: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
|
|
@@ -131,5 +131,5 @@ class BlueskyPost(TypedDict):
|
|
|
131
131
|
|
|
132
132
|
# Extra fields linked to the data collection and processing
|
|
133
133
|
collection_time: Optional[str] # datetime (potentially timezoned) of when the data was normalized
|
|
134
|
-
collected_via: Optional[List[str]] # extra field added by the normalization process to express how the data collection was ran, will be "quote" or "thread" when a post was grabbed as a referenced post within
|
|
135
|
-
match_query: Optional[bool] # extra field added by the normalization process to express whether the post was an intentionnally collected one or only came as a referenced post within
|
|
134
|
+
collected_via: Optional[List[str]] # extra field added by the normalization process to express how the data collection was ran, will be "quote" or "thread" when a post was grabbed as a referenced post within the originally collected post using the "extract_referenced_posts" option of "normalize_post"
|
|
135
|
+
match_query: Optional[bool] # extra field added by the normalization process to express whether the post was an intentionnally collected one or only came as a referenced post within the originally collected post using the "extract_referenced_posts" option of "normalize_post"
|
|
@@ -17,7 +17,7 @@ valid_post_keys = [
|
|
|
17
17
|
valid_record_keys = ["$type", "createdAt", "text"]
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
valid_author_keys = ["did", "handle", "
|
|
20
|
+
valid_author_keys = ["did", "handle", "createdAt"]
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def validate_post_payload(data):
|
|
@@ -81,6 +81,9 @@ def parse_post_url(url, source):
|
|
|
81
81
|
def parse_post_uri(uri, source=None):
|
|
82
82
|
"""Returns a tuple of (author_did, post_did) from an at:// post URI"""
|
|
83
83
|
|
|
84
|
+
if uri.startswith("at://") and "/app.bsky.graph.starterpack/" in uri:
|
|
85
|
+
return uri[5:].split("/app.bsky.graph.starterpack/")
|
|
86
|
+
|
|
84
87
|
if not uri.startswith("at://") and "/app.bsky.feed.post/" not in uri:
|
|
85
88
|
raise BlueskyPayloadError(
|
|
86
89
|
source or uri, f"{uri} is not a usual Bluesky post uri"
|
|
@@ -88,6 +91,10 @@ def parse_post_uri(uri, source=None):
|
|
|
88
91
|
return uri[5:].split("/app.bsky.feed.post/")
|
|
89
92
|
|
|
90
93
|
|
|
94
|
+
def format_starterpack_url(user_handle_or_did, record_did):
|
|
95
|
+
return f"https://bsky.app/starter-pack/{user_handle_or_did}/{record_did}"
|
|
96
|
+
|
|
97
|
+
|
|
91
98
|
def format_media_url(user_did, media_cid, mime_type, source):
|
|
92
99
|
media_type = mime_type.split("/")[1]
|
|
93
100
|
if mime_type.startswith("image"):
|
|
@@ -6,7 +6,6 @@
|
|
|
6
6
|
#
|
|
7
7
|
SOURCE_DATETIME_FORMAT = "%a %b %d %H:%M:%S +0000 %Y"
|
|
8
8
|
SOURCE_DATETIME_FORMAT_V2 = "%Y-%m-%dT%H:%M:%S.%fZ"
|
|
9
|
-
SOURCE_DATETIME_FORMAT_V3 = "%Y-%m-%dT%H:%M:%SZ"
|
|
10
9
|
FORMATTED_TWEET_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
|
|
11
10
|
|
|
12
11
|
FORMATTED_FULL_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
# Miscellaneous utility functions.
|
|
6
6
|
#
|
|
7
7
|
from pytz import timezone
|
|
8
|
+
from dateutil.parser import parse as parse_date
|
|
8
9
|
from ural import normalize_url, get_normalized_hostname
|
|
9
10
|
from functools import partial
|
|
10
11
|
from datetime import datetime
|
|
@@ -12,7 +13,6 @@ from datetime import datetime
|
|
|
12
13
|
from twitwi.constants import (
|
|
13
14
|
SOURCE_DATETIME_FORMAT,
|
|
14
15
|
SOURCE_DATETIME_FORMAT_V2,
|
|
15
|
-
SOURCE_DATETIME_FORMAT_V3,
|
|
16
16
|
FORMATTED_TWEET_DATETIME_FORMAT,
|
|
17
17
|
FORMATTED_FULL_DATETIME_FORMAT,
|
|
18
18
|
CANONICAL_URL_KWARGS,
|
|
@@ -47,12 +47,13 @@ def get_dates(date_str, locale=None, source="v1"):
|
|
|
47
47
|
SOURCE_DATETIME_FORMAT if source == "v1" else SOURCE_DATETIME_FORMAT_V2,
|
|
48
48
|
)
|
|
49
49
|
except ValueError as e:
|
|
50
|
-
if source
|
|
51
|
-
parsed_datetime = datetime.strptime(date_str, SOURCE_DATETIME_FORMAT_V3)
|
|
52
|
-
else:
|
|
50
|
+
if source != "bluesky":
|
|
53
51
|
raise e
|
|
52
|
+
parsed_datetime = parse_date(date_str)
|
|
54
53
|
|
|
55
|
-
utc_datetime =
|
|
54
|
+
utc_datetime = parsed_datetime
|
|
55
|
+
if not parsed_datetime.tzinfo:
|
|
56
|
+
utc_datetime = UTC_TIMEZONE.localize(parsed_datetime)
|
|
56
57
|
locale_datetime = utc_datetime.astimezone(locale)
|
|
57
58
|
|
|
58
59
|
return (
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: twitwi
|
|
3
|
-
Version: 0.21.0
|
|
3
|
+
Version: 0.21.2
|
|
4
4
|
Summary: A collection of Twitter-related helper functions for python.
|
|
5
5
|
Home-page: http://github.com/medialab/twitwi
|
|
6
6
|
Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
|
|
@@ -12,6 +12,7 @@ Description-Content-Type: text/markdown
|
|
|
12
12
|
License-File: LICENSE.txt
|
|
13
13
|
Requires-Dist: pytz>=2019.3
|
|
14
14
|
Requires-Dist: ural>=0.31.1
|
|
15
|
+
Requires-Dist: python-dateutil>=2.9.0
|
|
15
16
|
Dynamic: author
|
|
16
17
|
Dynamic: author-email
|
|
17
18
|
Dynamic: description
|
|
@@ -94,7 +95,7 @@ for post_data in posts_payload_from_API:
|
|
|
94
95
|
|
|
95
96
|
# Then, saving normalized profiles into a CSV using DictWriter:
|
|
96
97
|
|
|
97
|
-
|
|
98
|
+
import csv
|
|
98
99
|
from twitwi.bluesky.constants import POST_FIELDS
|
|
99
100
|
from twitwi.bluesky import transform_post_into_csv_dict
|
|
100
101
|
|
|
@@ -107,7 +108,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
|
|
|
107
108
|
|
|
108
109
|
# Or using the basic CSV writer:
|
|
109
110
|
|
|
110
|
-
from csv import writer
|
|
111
111
|
from twitwi.bluesky import format_post_as_csv_row
|
|
112
112
|
|
|
113
113
|
with open("normalized_bluesky_posts.csv", "w") as f:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|