twitwi 0.22.1__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/bluesky/formatters_test.py +4 -2
- twitwi/bluesky/normalizers.py +484 -91
- twitwi/bluesky/types.py +2 -0
- twitwi/bluesky/utils.py +31 -15
- twitwi/exceptions.py +1 -1
- twitwi/formatters.py +16 -3
- twitwi/utils.py +28 -7
- {twitwi-0.22.1.dist-info → twitwi-0.24.0.dist-info}/METADATA +3 -3
- twitwi-0.24.0.dist-info/RECORD +22 -0
- {twitwi-0.22.1.dist-info → twitwi-0.24.0.dist-info}/WHEEL +1 -1
- twitwi-0.22.1.dist-info/RECORD +0 -22
- {twitwi-0.22.1.dist-info → twitwi-0.24.0.dist-info}/licenses/LICENSE.txt +0 -0
- {twitwi-0.22.1.dist-info → twitwi-0.24.0.dist-info}/top_level.txt +0 -0
- {twitwi-0.22.1.dist-info → twitwi-0.24.0.dist-info}/zip-safe +0 -0
test/bluesky/formatters_test.py
CHANGED
|
@@ -112,7 +112,9 @@ class TestFormatters:
|
|
|
112
112
|
|
|
113
113
|
for source in normalized_posts:
|
|
114
114
|
for post in source:
|
|
115
|
-
writer.writerow(
|
|
115
|
+
writer.writerow(
|
|
116
|
+
format_post_as_csv_row(post, allow_erroneous_plurals=True)
|
|
117
|
+
)
|
|
116
118
|
|
|
117
119
|
if OVERWRITE_TESTS:
|
|
118
120
|
written = buffer.getvalue()
|
|
@@ -140,7 +142,7 @@ class TestFormatters:
|
|
|
140
142
|
|
|
141
143
|
for source in normalized_posts:
|
|
142
144
|
for post in source:
|
|
143
|
-
transform_post_into_csv_dict(post)
|
|
145
|
+
transform_post_into_csv_dict(post, allow_erroneous_plurals=True)
|
|
144
146
|
writer.writerow(post)
|
|
145
147
|
|
|
146
148
|
with open_resource("bluesky-posts-export.csv") as f:
|
twitwi/bluesky/normalizers.py
CHANGED
|
@@ -99,14 +99,33 @@ def prepare_native_gif_as_media(gif_data, user_did, source):
|
|
|
99
99
|
}
|
|
100
100
|
|
|
101
101
|
|
|
102
|
-
def prepare_image_as_media(image_data):
|
|
103
|
-
if
|
|
104
|
-
|
|
102
|
+
def prepare_image_as_media(image_data, source):
|
|
103
|
+
if isinstance(image_data["image"], str):
|
|
104
|
+
# As in this post: https://bsky.app/profile/did:plc:xafmeedgq77f6smn6kmalasr/post/3lcnxglm3o62z
|
|
105
|
+
image_type = "image/jpeg"
|
|
106
|
+
image_id = image_data["image"]
|
|
107
|
+
elif isinstance(image_data["image"], dict):
|
|
108
|
+
image_type = image_data["image"]["mimeType"]
|
|
109
|
+
if (
|
|
110
|
+
"ref" not in image_data["image"]
|
|
111
|
+
or "$link" not in image_data["image"]["ref"]
|
|
112
|
+
):
|
|
113
|
+
# As in this post: https://bsky.app/profile/testjuan06.bsky.social/post/3ljkzygywso2b
|
|
114
|
+
if "link" in image_data["image"]:
|
|
115
|
+
image_id = image_data["image"]["link"]
|
|
116
|
+
elif "cid" in image_data["image"]:
|
|
117
|
+
image_id = image_data["image"]["cid"]
|
|
118
|
+
else:
|
|
119
|
+
raise BlueskyPayloadError(
|
|
120
|
+
source, "Unable to find image id in image data: %s" % image_data
|
|
121
|
+
)
|
|
122
|
+
else:
|
|
123
|
+
image_id = image_data["image"]["ref"]["$link"]
|
|
105
124
|
else:
|
|
106
|
-
|
|
125
|
+
raise BlueskyPayloadError(source, "Unable to parse image data: %s" % image_data)
|
|
107
126
|
return {
|
|
108
127
|
"id": image_id,
|
|
109
|
-
"type":
|
|
128
|
+
"type": image_type,
|
|
110
129
|
"alt": image_data["alt"],
|
|
111
130
|
}
|
|
112
131
|
|
|
@@ -122,13 +141,15 @@ def process_starterpack_card(embed_data, post):
|
|
|
122
141
|
# Warning: mutates post
|
|
123
142
|
|
|
124
143
|
card = embed_data.get("record", {})
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
144
|
+
if "uri" in embed_data:
|
|
145
|
+
creator_did, pack_did = parse_post_uri(embed_data["uri"])
|
|
146
|
+
post["card_link"] = format_starterpack_url(
|
|
147
|
+
embed_data.get("creator", {}).get("handle") or creator_did, pack_did
|
|
148
|
+
)
|
|
149
|
+
if card:
|
|
150
|
+
post["card_title"] = card.get("name", "")
|
|
151
|
+
post["card_description"] = card.get("description", "")
|
|
152
|
+
post["card_thumbnail"] = card.get("thumb", "")
|
|
132
153
|
return post
|
|
133
154
|
|
|
134
155
|
|
|
@@ -138,56 +159,70 @@ def process_card_data(embed_data, post):
|
|
|
138
159
|
post["card_link"] = embed_data["uri"]
|
|
139
160
|
post["card_title"] = embed_data.get("title", "")
|
|
140
161
|
post["card_description"] = embed_data.get("description", "")
|
|
141
|
-
|
|
162
|
+
if isinstance(embed_data.get("thumb"), dict) and embed_data["thumb"].get(
|
|
163
|
+
"ref", {}
|
|
164
|
+
).get("$link"):
|
|
165
|
+
media_cid = embed_data["thumb"]["ref"]["$link"]
|
|
166
|
+
post["card_thumbnail"] = (
|
|
167
|
+
f"https://cdn.bsky.app/img/feed_thumbnail/plain/{post['user_did']}/{media_cid}@jpeg"
|
|
168
|
+
)
|
|
169
|
+
else:
|
|
170
|
+
post["card_thumbnail"] = embed_data.get("thumb", "")
|
|
142
171
|
return post
|
|
143
172
|
|
|
144
173
|
|
|
145
174
|
def prepare_quote_data(embed_quote, card_data, post, links):
|
|
146
175
|
# Warning: mutates post and links
|
|
147
176
|
|
|
177
|
+
quoted_data = None
|
|
178
|
+
|
|
148
179
|
post["quoted_cid"] = embed_quote["cid"]
|
|
149
180
|
post["quoted_uri"] = embed_quote["uri"]
|
|
150
|
-
post
|
|
151
|
-
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
# First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
|
|
155
|
-
# Handling special posts types (only lists for now, for example: https://bsky.app/profile/lanana421.bsky.social/lists/3lxdgjtpqhf2z)
|
|
156
|
-
if "/app.bsky.graph.list/" in post["quoted_uri"]:
|
|
157
|
-
post_splitter = "/lists/"
|
|
181
|
+
# Sometimes quoted post is not found, even if uri and cid are given
|
|
182
|
+
# example: https://bsky.app/profile/takobiotech.masto.bike.ap.brid.gy/post/3lc6r7nzil6m2
|
|
183
|
+
if card_data and card_data.get("notFound"):
|
|
184
|
+
post["quoted_status"] = "notFound"
|
|
158
185
|
else:
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
)
|
|
163
|
-
|
|
164
|
-
quoted_data = None
|
|
165
|
-
if card_data:
|
|
166
|
-
if card_data.get("detached", False):
|
|
167
|
-
post["quoted_status"] = "detached"
|
|
186
|
+
post["quoted_user_did"], post["quoted_did"] = parse_post_uri(
|
|
187
|
+
post["quoted_uri"], post["url"]
|
|
188
|
+
)
|
|
168
189
|
|
|
190
|
+
# First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
|
|
191
|
+
# Handling special posts types (only lists for now, for example: https://bsky.app/profile/lanana421.bsky.social/lists/3lxdgjtpqhf2z)
|
|
192
|
+
if "/app.bsky.graph.list/" in post["quoted_uri"]:
|
|
193
|
+
post_splitter = "/lists/"
|
|
169
194
|
else:
|
|
170
|
-
|
|
195
|
+
post_splitter = "/post/"
|
|
196
|
+
post["quoted_url"] = format_post_url(
|
|
197
|
+
post["quoted_user_did"], post["quoted_did"], post_splitter=post_splitter
|
|
198
|
+
)
|
|
171
199
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
200
|
+
if card_data:
|
|
201
|
+
if card_data.get("detached"):
|
|
202
|
+
post["quoted_status"] = "detached"
|
|
203
|
+
|
|
204
|
+
else:
|
|
205
|
+
quoted_data = deepcopy(card_data)
|
|
206
|
+
|
|
207
|
+
# Grab user handle and cleanup links when no quote data but url in text
|
|
208
|
+
if not quoted_data:
|
|
209
|
+
for link in links:
|
|
210
|
+
if link.startswith("https://bsky.app/profile/") and link.endswith(
|
|
211
|
+
post["quoted_did"]
|
|
212
|
+
):
|
|
213
|
+
# Take better quoted url with user_handle
|
|
214
|
+
post["quoted_url"] = link
|
|
215
|
+
break
|
|
216
|
+
|
|
217
|
+
# Remove quoted link from post links
|
|
218
|
+
if post["quoted_url"] in links:
|
|
219
|
+
links.remove(post["quoted_url"])
|
|
220
|
+
|
|
221
|
+
# Extract user handle from url
|
|
222
|
+
if "did:plc:" not in post["quoted_url"]:
|
|
223
|
+
post["quoted_user_handle"], _ = parse_post_url(
|
|
224
|
+
post["quoted_url"], post["url"]
|
|
225
|
+
)
|
|
191
226
|
|
|
192
227
|
return (post, quoted_data, links)
|
|
193
228
|
|
|
@@ -300,6 +335,11 @@ def normalize_post(
|
|
|
300
335
|
post["timestamp_utc"], post["local_time"] = get_dates(
|
|
301
336
|
data["record"]["createdAt"], locale=locale, source="bluesky"
|
|
302
337
|
)
|
|
338
|
+
# Left-pad years that have fewer than 4 digits, as seen in some posts: https://bsky.app/profile/koro.icu/post/3kbpuogc6fz2o
|
|
339
|
+
# len 26 example: '2023-06-15T12:34:56.789000'
|
|
340
|
+
while len(post["local_time"]) < 26 and len(post["local_time"].split("-")[0]) < 4:
|
|
341
|
+
post["local_time"] = "0" + post["local_time"]
|
|
342
|
+
post["indexed_at_utc"] = data["indexedAt"]
|
|
303
343
|
|
|
304
344
|
# Handle post/user identifiers
|
|
305
345
|
post["cid"] = data["cid"]
|
|
@@ -307,7 +347,11 @@ def normalize_post(
|
|
|
307
347
|
post["user_did"], post["did"] = parse_post_uri(data["uri"])
|
|
308
348
|
post["user_handle"] = data["author"]["handle"]
|
|
309
349
|
post["user_url"] = format_profile_url(post["user_handle"])
|
|
310
|
-
|
|
350
|
+
# example: https://bsky.app/profile/did:plc:n5pm4vggu475okayqvqipkoh/post/3lmdcgp3a7cnd
|
|
351
|
+
if post["user_handle"] == "handle.invalid":
|
|
352
|
+
post["url"] = format_post_url(post["user_did"], post["did"])
|
|
353
|
+
else:
|
|
354
|
+
post["url"] = format_post_url(post["user_handle"], post["did"])
|
|
311
355
|
|
|
312
356
|
if post["user_did"] != data["author"]["did"]:
|
|
313
357
|
raise BlueskyPayloadError(
|
|
@@ -332,6 +376,8 @@ def normalize_post(
|
|
|
332
376
|
post["reply_count"] = data["replyCount"]
|
|
333
377
|
post["like_count"] = data["likeCount"]
|
|
334
378
|
post["quote_count"] = data["quoteCount"]
|
|
379
|
+
# When a post cites another, the cited post doesn't have the bookmarkCount field
|
|
380
|
+
post["bookmark_count"] = data.get("bookmarkCount")
|
|
335
381
|
|
|
336
382
|
# Handle hashtags, mentions & links from facets
|
|
337
383
|
post["mentioned_user_handles"] = []
|
|
@@ -339,19 +385,91 @@ def normalize_post(
|
|
|
339
385
|
hashtags = set()
|
|
340
386
|
links = set()
|
|
341
387
|
links_to_replace = []
|
|
388
|
+
media_data = []
|
|
389
|
+
extra_links = []
|
|
390
|
+
post["media_urls"] = []
|
|
342
391
|
for facet in data["record"].get("facets", []):
|
|
343
392
|
if len(facet["features"]) != 1:
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
393
|
+
raising_error = False
|
|
394
|
+
for feat in facet["features"]:
|
|
395
|
+
# Already handled linkcards separately below
|
|
396
|
+
if feat["$type"].endswith("#linkcard"):
|
|
397
|
+
continue
|
|
398
|
+
|
|
399
|
+
# If there are links, we register them and do not replace anything in original text
|
|
400
|
+
# as we don't have position for each link
|
|
401
|
+
# example: https://bsky.app/profile/77cupons.bsky.social/post/3latbufuvqw25
|
|
402
|
+
elif feat["$type"].endswith("#link") and "uri" in feat:
|
|
403
|
+
link = safe_normalize_url(feat["uri"])
|
|
404
|
+
if is_url(link):
|
|
405
|
+
links.add(link)
|
|
406
|
+
links_to_replace.append(
|
|
407
|
+
{"uri": feat["uri"].encode("utf-8"), "start": -1, "end": -1}
|
|
408
|
+
)
|
|
409
|
+
elif feat["$type"].lower().endswith("#tag"):
|
|
410
|
+
hashtags.add(feat["tag"].strip().lower())
|
|
411
|
+
# As in this post: https://bsky.app/profile/havehashad.com/post/3ki3rk5ytqd2e
|
|
412
|
+
elif feat["$type"].endswith("#image") and "uri" in feat:
|
|
413
|
+
post["media_urls"].append(safe_normalize_url(feat["uri"]))
|
|
414
|
+
else:
|
|
415
|
+
raising_error = True
|
|
416
|
+
|
|
417
|
+
if raising_error:
|
|
418
|
+
raise BlueskyPayloadError(
|
|
419
|
+
post["url"],
|
|
420
|
+
"unusual record facet content with more or less than a unique feature: %s"
|
|
421
|
+
% facet,
|
|
422
|
+
)
|
|
423
|
+
continue
|
|
349
424
|
|
|
350
425
|
feat = facet["features"][0]
|
|
426
|
+
lower_feat_type = feat["$type"].lower()
|
|
351
427
|
|
|
352
428
|
# Hashtags
|
|
353
|
-
if
|
|
354
|
-
|
|
429
|
+
if (
|
|
430
|
+
lower_feat_type.endswith("#tag")
|
|
431
|
+
or lower_feat_type.endswith(".tag")
|
|
432
|
+
or lower_feat_type.endswith("#hashtag")
|
|
433
|
+
or lower_feat_type == "facettag"
|
|
434
|
+
):
|
|
435
|
+
# Some posts have the full text in the "text" field of the hashtag feature
|
|
436
|
+
if "text" in feat:
|
|
437
|
+
for tag in feat["text"].split("#"):
|
|
438
|
+
if tag.strip():
|
|
439
|
+
hashtags.add(tag.strip().lower())
|
|
440
|
+
# some posts have "hashtag" instead of "tag" field
|
|
441
|
+
# example: https://bsky.app/profile/did:plc:jrodn6nnfuwzm2zxbxbpzgot/post/3lhwag3mzoo2k
|
|
442
|
+
else:
|
|
443
|
+
if "tag" in feat:
|
|
444
|
+
tag = feat["tag"].strip().lower()
|
|
445
|
+
elif "hashtag" in feat:
|
|
446
|
+
tag = feat["hashtag"].strip().lower()
|
|
447
|
+
# Somehow no tag found, we'll try to get it in the text slice
|
|
448
|
+
# example: https://bsky.app/profile/did:plc:p6yojdpa5iatdk3ttaty2zu2/post/3knvsl6h4x22i
|
|
449
|
+
elif len(feat) == 1:
|
|
450
|
+
byteStart = facet["index"]["byteStart"]
|
|
451
|
+
if text[byteStart : byteStart + 1] == b"#":
|
|
452
|
+
byteEnd = facet["index"]["byteEnd"]
|
|
453
|
+
try:
|
|
454
|
+
tag = (
|
|
455
|
+
text[byteStart:byteEnd]
|
|
456
|
+
.decode("utf-8")
|
|
457
|
+
.strip()
|
|
458
|
+
.lstrip("#")
|
|
459
|
+
.lower()
|
|
460
|
+
)
|
|
461
|
+
except UnicodeDecodeError:
|
|
462
|
+
raise BlueskyPayloadError(
|
|
463
|
+
post["url"],
|
|
464
|
+
"unable to decode utf-8 slice for hashtag extraction: %s"
|
|
465
|
+
% facet,
|
|
466
|
+
)
|
|
467
|
+
else:
|
|
468
|
+
raise BlueskyPayloadError(
|
|
469
|
+
post["url"],
|
|
470
|
+
"unable to extract hashtag from text slice: %s" % facet,
|
|
471
|
+
)
|
|
472
|
+
hashtags.add(tag)
|
|
355
473
|
|
|
356
474
|
# Mentions
|
|
357
475
|
elif feat["$type"].endswith("#mention"):
|
|
@@ -361,23 +479,43 @@ def normalize_post(
|
|
|
361
479
|
# Check & fix occasional errored mention positioning
|
|
362
480
|
# example: https://bsky.app/profile/snjcgt.bsky.social/post/3lpmqkkkgp52u
|
|
363
481
|
byteStart = facet["index"]["byteStart"]
|
|
482
|
+
byteEnd = facet["index"]["byteEnd"]
|
|
364
483
|
if text[byteStart : byteStart + 1] != b"@":
|
|
365
484
|
byteStart = text.find(b"@", byteStart)
|
|
485
|
+
# in some cases, the errored positioning is before the position given
|
|
486
|
+
# example: https://bsky.app/profile/springer.springernature.com/post/3lovyad4nt324
|
|
487
|
+
if byteStart == -1 or byteStart > byteEnd:
|
|
488
|
+
# When decrementing byteStart, we also decrement byteEnd (see below)
|
|
489
|
+
# shifting the slice to extract the mention correctly
|
|
490
|
+
byteStart = facet["index"]["byteStart"] - 1
|
|
491
|
+
# to extend the size of the mention, which is somehow 1 char too short because of the '@'
|
|
492
|
+
byteEnd += 1
|
|
366
493
|
|
|
367
494
|
handle = (
|
|
368
495
|
text[
|
|
369
|
-
byteStart + 1 :
|
|
496
|
+
byteStart + 1 : byteEnd
|
|
370
497
|
+ byteStart
|
|
371
498
|
- facet["index"]["byteStart"]
|
|
372
499
|
]
|
|
373
500
|
.strip()
|
|
374
501
|
.lower()
|
|
375
|
-
.decode("utf-8")
|
|
376
502
|
)
|
|
503
|
+
while byteEnd >= byteStart:
|
|
504
|
+
try:
|
|
505
|
+
handle.decode("utf-8")
|
|
506
|
+
break
|
|
507
|
+
except UnicodeDecodeError:
|
|
508
|
+
handle = handle[:-1]
|
|
509
|
+
continue
|
|
510
|
+
handle = handle.decode("utf-8")
|
|
377
511
|
post["mentioned_user_handles"].append(handle)
|
|
378
512
|
|
|
379
513
|
# Links
|
|
380
|
-
elif
|
|
514
|
+
elif (
|
|
515
|
+
feat["$type"].endswith("#link")
|
|
516
|
+
or feat["$type"].endswith(".link")
|
|
517
|
+
or feat["$type"].endswith(".url")
|
|
518
|
+
):
|
|
381
519
|
# Handle native polls
|
|
382
520
|
if "https://poll.blue/" in feat["uri"]:
|
|
383
521
|
if feat["uri"].endswith("/0"):
|
|
@@ -398,26 +536,189 @@ def normalize_post(
|
|
|
398
536
|
# examples: https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
|
|
399
537
|
# https://bsky.app/profile/clustz.com/post/3lqfi7mnto52w
|
|
400
538
|
byteStart = facet["index"]["byteStart"]
|
|
539
|
+
byteEnd = facet["index"]["byteEnd"]
|
|
540
|
+
|
|
541
|
+
# Skip overlapping links cases
|
|
542
|
+
# examples: https://bsky.app/profile/researchtrend.ai/post/3lbieylwwxs2b
|
|
543
|
+
# https://bsky.app/profile/dj-cyberspace.otoskey.tarbin.net.ap.brid.gy/post/3lchg3plpdjp2
|
|
544
|
+
for elt in links_to_replace:
|
|
545
|
+
if (byteStart >= elt["start"] and byteStart <= elt["end"]) or (
|
|
546
|
+
byteEnd >= elt["start"] and byteEnd <= elt["end"]
|
|
547
|
+
):
|
|
548
|
+
# Overlapping links, we skip this one
|
|
549
|
+
byteStart = -1
|
|
550
|
+
byteEnd = -1
|
|
551
|
+
break
|
|
552
|
+
|
|
553
|
+
# Meaning we will try to fix the link position
|
|
554
|
+
if byteStart != -1 or byteEnd != -1:
|
|
555
|
+
# It appears that some links end before they start... Bluesky please: what's going on?
|
|
556
|
+
# example: https://bsky.app/profile/ondarockwebzine.bsky.social/post/3lqxxejza6o2t
|
|
557
|
+
# if int(byteEnd) < int(byteStart) or byteStart < 0:
|
|
558
|
+
if int(byteEnd) < int(byteStart):
|
|
559
|
+
byteStart = -1
|
|
560
|
+
byteEnd = -1
|
|
561
|
+
|
|
562
|
+
# Some mentioned links are positioned after the end of the text,
|
|
563
|
+
# so we put them at the end of the original text
|
|
564
|
+
elif byteStart >= len(post["original_text"].encode("utf-8")):
|
|
565
|
+
byteStart = -1
|
|
566
|
+
byteEnd = -1
|
|
567
|
+
|
|
568
|
+
elif not text[byteStart:byteEnd].startswith(b"http"):
|
|
569
|
+
new_byteStart = text.find(b"http", byteStart, byteEnd)
|
|
570
|
+
|
|
571
|
+
# means that the link is shifted, like on this post:
|
|
572
|
+
# https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
|
|
573
|
+
if new_byteStart != -1:
|
|
574
|
+
byteStart = new_byteStart
|
|
575
|
+
|
|
576
|
+
# Find the index of the first space character after byteStart in case the link is a personalized one
|
|
577
|
+
# but still with the link in it (somehow existing in some posts, such as this one:
|
|
578
|
+
# https://bsky.app/profile/did:plc:rkphrshyfiqe4n2hz5vj56ig/post/3ltmljz5blca2)
|
|
579
|
+
# In this case, we don't want to touch the position of the link given in the payload
|
|
580
|
+
byteEnd = min(
|
|
581
|
+
byteStart
|
|
582
|
+
- facet["index"]["byteStart"]
|
|
583
|
+
+ facet["index"]["byteEnd"],
|
|
584
|
+
len(post["original_text"].encode("utf-8")),
|
|
585
|
+
)
|
|
586
|
+
for i in range(byteStart, byteEnd):
|
|
587
|
+
if chr(text[i]).isspace():
|
|
588
|
+
byteStart = facet["index"]["byteStart"]
|
|
589
|
+
byteEnd = (
|
|
590
|
+
byteStart
|
|
591
|
+
- facet["index"]["byteStart"]
|
|
592
|
+
+ facet["index"]["byteEnd"]
|
|
593
|
+
)
|
|
401
594
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
595
|
+
# means that the link is a "personalized" one like on this post:
|
|
596
|
+
# https://bsky.app/profile/newyork.activitypub.awakari.com.ap.brid.gy/post/3ln33tx7bpdu2
|
|
597
|
+
else:
|
|
598
|
+
# we're looking for a link which could be valid if we add "https://" at the beginning,
|
|
599
|
+
# as in some cases the "http(s)://" part is missing in the post text
|
|
600
|
+
for starting in range(byteEnd - byteStart):
|
|
601
|
+
try:
|
|
602
|
+
if is_url(
|
|
603
|
+
"https://"
|
|
604
|
+
+ text[
|
|
605
|
+
byteStart + starting : byteEnd + starting
|
|
606
|
+
].decode("utf-8")
|
|
607
|
+
):
|
|
608
|
+
byteStart += starting
|
|
609
|
+
break
|
|
610
|
+
except UnicodeDecodeError:
|
|
611
|
+
pass
|
|
612
|
+
# If we did not find any valid link, we just keep the original position as it is
|
|
613
|
+
# meaning that we have a personalized link like in the example above
|
|
614
|
+
|
|
615
|
+
# Extend byteEnd to the right until we find a valid utf-8 ending,
|
|
616
|
+
# as in some cases the link is longer than the position given in the payload
|
|
617
|
+
# and it gets cut in the middle of a utf-8 char, leading to UnicodeDecodeError
|
|
618
|
+
# example: https://bsky.app/profile/radiogaspesie.bsky.social/post/3lmkzhvhtta22
|
|
619
|
+
while byteEnd <= len(post["original_text"].encode("utf-8")):
|
|
620
|
+
try:
|
|
621
|
+
text[byteStart:byteEnd].decode("utf-8")
|
|
622
|
+
break
|
|
623
|
+
except UnicodeDecodeError:
|
|
624
|
+
byteEnd += 1
|
|
625
|
+
continue
|
|
626
|
+
|
|
627
|
+
# Meaning that we did not find a valid utf-8 ending, so we reset byteEnd to its original value
|
|
628
|
+
if byteEnd > len(post["original_text"].encode("utf-8")):
|
|
629
|
+
byteEnd = facet["index"]["byteEnd"]
|
|
630
|
+
|
|
631
|
+
byteEnd += byteStart - facet["index"]["byteStart"]
|
|
632
|
+
else:
|
|
633
|
+
# Handling case of errored byteEnd in the end of the text
|
|
634
|
+
# example: https://bsky.app/profile/twif.bsky.social/post/3lm4izkvbfm2r
|
|
635
|
+
while byteEnd <= len(post["original_text"].encode("utf-8")):
|
|
636
|
+
try:
|
|
637
|
+
text[byteStart:byteEnd].decode("utf-8")
|
|
638
|
+
break
|
|
639
|
+
except UnicodeDecodeError:
|
|
640
|
+
byteEnd += 1
|
|
641
|
+
continue
|
|
642
|
+
|
|
643
|
+
if byteEnd > len(post["original_text"].encode("utf-8")):
|
|
644
|
+
byteEnd = facet["index"]["byteEnd"]
|
|
645
|
+
|
|
646
|
+
# In some cases, the link is completely wrong in the post text,
|
|
647
|
+
# like in this post: https://bsky.app/profile/sudetsoleil.bsky.social/post/3ljf3h74wee2m
|
|
648
|
+
# So we chose to not replace anything in the text in this case
|
|
649
|
+
try:
|
|
650
|
+
text[byteStart:byteEnd].decode("utf-8")
|
|
651
|
+
links_to_replace.append(
|
|
652
|
+
{
|
|
653
|
+
"uri": feat["uri"].encode("utf-8"),
|
|
654
|
+
"start": byteStart,
|
|
655
|
+
"end": byteEnd,
|
|
656
|
+
}
|
|
657
|
+
)
|
|
658
|
+
except UnicodeDecodeError:
|
|
659
|
+
pass
|
|
660
|
+
# raise UnicodeDecodeError(e.encoding, e.object, e.start, e.end, f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start:e.end]}")
|
|
661
|
+
|
|
662
|
+
elif any(
|
|
663
|
+
feat["$type"].endswith(suffix)
|
|
664
|
+
for suffix in [
|
|
665
|
+
"#bold",
|
|
666
|
+
"#italic",
|
|
667
|
+
"#underline",
|
|
668
|
+
"#option",
|
|
669
|
+
"#encrypt",
|
|
670
|
+
"#text",
|
|
671
|
+
]
|
|
672
|
+
):
|
|
418
673
|
pass
|
|
419
|
-
|
|
674
|
+
# Bluesky seems to use format features for some internal purposes, but we ignore them
|
|
675
|
+
# e.g.: https://bsky.app/profile/ferromar.bsky.social/post/3lzyfaixayd2g
|
|
676
|
+
elif feat["$type"].endswith("format"):
|
|
420
677
|
pass
|
|
678
|
+
# Not normal feature type, but still existing in some posts
|
|
679
|
+
# Note that external features aren't visible on the Bluesky app, only external embeds are
|
|
680
|
+
# e.g.: https://bsky.app/profile/did:plc:4qvb4dpkg6tkbzym77j6jcm4/post/3lbjktt6tw52h
|
|
681
|
+
elif feat["$type"].endswith("external"):
|
|
682
|
+
link = feat["external"]["uri"]
|
|
683
|
+
|
|
684
|
+
# Handle native gifs as medias
|
|
685
|
+
if link.startswith("https://media.tenor.com/"):
|
|
686
|
+
media_data.append(
|
|
687
|
+
prepare_native_gif_as_media(
|
|
688
|
+
feat["external"], post["user_did"], post["url"]
|
|
689
|
+
)
|
|
690
|
+
)
|
|
691
|
+
# Extra card links sometimes missing from facets & text due to manual action in post form
|
|
692
|
+
else:
|
|
693
|
+
extra_links.append(link)
|
|
694
|
+
|
|
695
|
+
if isinstance(feat["external"].get("thumb"), dict):
|
|
696
|
+
post = process_card_data(feat["external"], post)
|
|
697
|
+
|
|
698
|
+
# Some people share code snippets using third party apps
|
|
699
|
+
# e.g.: https://bsky.app/profile/alexdln.com/post/3mbwzgrymow2o
|
|
700
|
+
elif (
|
|
701
|
+
"#" in feat["$type"]
|
|
702
|
+
and feat["$type"].split("#")[1].startswith("code")
|
|
703
|
+
and "code" in feat
|
|
704
|
+
):
|
|
705
|
+
language = (
|
|
706
|
+
feat["$type"].split("#")[1].split(".")[1]
|
|
707
|
+
if "." in feat["$type"].split("#")[1]
|
|
708
|
+
else "plain"
|
|
709
|
+
)
|
|
710
|
+
text += (
|
|
711
|
+
b"\n```"
|
|
712
|
+
+ language.encode("utf-8")
|
|
713
|
+
+ b"\n"
|
|
714
|
+
+ feat["code"].encode("utf-8")
|
|
715
|
+
+ b"\n```\n"
|
|
716
|
+
)
|
|
717
|
+
|
|
718
|
+
# We chose to ignore non Bluesky features for now (e.g. personalized features)
|
|
719
|
+
# example: https://bsky.app/profile/poll.blue/post/3kmuqjkkozh2r
|
|
720
|
+
elif "bsky" not in feat["$type"]:
|
|
721
|
+
continue
|
|
421
722
|
else:
|
|
422
723
|
raise BlueskyPayloadError(
|
|
423
724
|
post["url"], "unusual record facet feature $type: %s" % feat
|
|
@@ -457,21 +758,61 @@ def normalize_post(
|
|
|
457
758
|
|
|
458
759
|
# Handle quotes & medias
|
|
459
760
|
media_ids = set()
|
|
460
|
-
post["media_urls"] = []
|
|
461
761
|
post["media_thumbnails"] = []
|
|
462
762
|
post["media_types"] = []
|
|
463
763
|
post["media_alt_texts"] = []
|
|
464
764
|
if "embed" in data["record"]:
|
|
465
765
|
embed = data["record"]["embed"]
|
|
466
766
|
quoted_data = None
|
|
467
|
-
media_data = []
|
|
468
|
-
extra_links = []
|
|
469
767
|
|
|
470
768
|
if not valid_embed_type(embed["$type"]):
|
|
769
|
+
if "bsky" in embed["$type"]:
|
|
770
|
+
raise BlueskyPayloadError(
|
|
771
|
+
post["url"], "unusual record embed $type: %s" % embed
|
|
772
|
+
)
|
|
773
|
+
# Ignore non Bluesky embeds for now (e.g. personalized embeds)
|
|
774
|
+
|
|
775
|
+
# Empty embed (not usual, but seen in the Bluesky jungle, e.g.
|
|
776
|
+
# https://bsky.app/profile/did:plc:na6u3avvaz2x5wyzqrnviqiz/post/3lzf5qi2ra62k
|
|
777
|
+
# https://bsky.app/profile/dangelodario.it/post/3l3inqifqj42p
|
|
778
|
+
# or https://bsky.app/profile/soirilab.bsky.social/post/3lywaa7vhsu2c)
|
|
779
|
+
if embed["$type"].endswith(".post") or embed["$type"] == "N/A":
|
|
780
|
+
# Some posts have extra keys in their empty embed, certainly personalized ones.
|
|
781
|
+
|
|
782
|
+
# Personalized quote (not visible on Bluesky for the example)
|
|
783
|
+
# example: https://bsky.app/profile/jacksmithsocial.bsky.social/post/3lbca2nxy4f2a
|
|
784
|
+
if embed.get("$type") == "app.bsky.feed.post" and embed.get(
|
|
785
|
+
"record", {}
|
|
786
|
+
).get("uri"):
|
|
787
|
+
post, quoted_data, links = prepare_quote_data(
|
|
788
|
+
embed["record"], data.get("embed", {}).get("record"), post, links
|
|
789
|
+
)
|
|
790
|
+
|
|
791
|
+
# for the other ones we know up to now, we want to ignore them
|
|
792
|
+
# e.g.: https://bsky.app/profile/granmouse.bsky.social/post/3lwvh5xd2xk2p
|
|
793
|
+
# https://bsky.app/profile/flyingaubrey.bsky.social/post/3lxngessntk2p
|
|
794
|
+
elif len(embed.keys()) > 1 and embed.get("type") not in ["private", "list"]:
|
|
795
|
+
raise BlueskyPayloadError(
|
|
796
|
+
post["url"],
|
|
797
|
+
"unusual empty record embed with extra keys: %s" % embed,
|
|
798
|
+
)
|
|
799
|
+
# Nothing to do for empty embed
|
|
800
|
+
|
|
801
|
+
if (
|
|
802
|
+
embed["$type"].endswith(".embed")
|
|
803
|
+
and len(embed.keys()) > 2
|
|
804
|
+
and len(embed.get("images")) > 0
|
|
805
|
+
):
|
|
471
806
|
raise BlueskyPayloadError(
|
|
472
|
-
post["url"], "unusual record embed
|
|
807
|
+
post["url"], "unusual empty record embed with extra keys: %s" % embed
|
|
473
808
|
)
|
|
474
809
|
|
|
810
|
+
# Links from links embed
|
|
811
|
+
# e.g.: https://bsky.app/profile/sacredatoz.bsky.social/post/3lrqvemv7qe2f
|
|
812
|
+
if embed["$type"].endswith(".links"):
|
|
813
|
+
for link in embed["links"]:
|
|
814
|
+
extra_links.append(link)
|
|
815
|
+
|
|
475
816
|
# Links from cards
|
|
476
817
|
if embed["$type"].endswith(".external"):
|
|
477
818
|
link = embed["external"]["uri"]
|
|
@@ -491,21 +832,56 @@ def normalize_post(
|
|
|
491
832
|
if "embed" in data:
|
|
492
833
|
post = process_card_data(data["embed"]["external"], post)
|
|
493
834
|
|
|
835
|
+
# Not visible images
|
|
836
|
+
# examples: https://bsky.app/profile/lubosmichalik.bsky.social/post/3ltjvxsaej62c
|
|
837
|
+
# https://bsky.app/profile/lubosmichalik.bsky.social/post/3ltjvz52x7s2m
|
|
838
|
+
if embed["$type"].endswith(".viewImages"):
|
|
839
|
+
if "images" in embed:
|
|
840
|
+
for i in embed["images"]:
|
|
841
|
+
post["media_urls"].append(
|
|
842
|
+
i.get("viewImage", {}).get("thumb", {}).get("uri", "")
|
|
843
|
+
)
|
|
844
|
+
elif "viewImage" in embed:
|
|
845
|
+
for i in embed["viewImage"]:
|
|
846
|
+
if "viewImage" in i:
|
|
847
|
+
sub_image = "viewImage"
|
|
848
|
+
elif "image" in i:
|
|
849
|
+
sub_image = "image"
|
|
850
|
+
else:
|
|
851
|
+
raise BlueskyPayloadError(
|
|
852
|
+
post["url"],
|
|
853
|
+
"unusual viewImages embed content: %s" % embed,
|
|
854
|
+
)
|
|
855
|
+
post["media_urls"].append(
|
|
856
|
+
i[sub_image].get("thumb", {}).get("uri", "")
|
|
857
|
+
)
|
|
858
|
+
|
|
494
859
|
# Images
|
|
495
|
-
if embed["$type"].endswith(".images"):
|
|
496
|
-
media_data.extend(
|
|
860
|
+
if embed["$type"].endswith(".images") or embed["$type"].endswith("image"):
|
|
861
|
+
media_data.extend(
|
|
862
|
+
[prepare_image_as_media(i, post["url"]) for i in embed["images"]]
|
|
863
|
+
)
|
|
497
864
|
|
|
498
865
|
# Video
|
|
499
866
|
if embed["$type"].endswith(".video"):
|
|
500
867
|
media_data.append(prepare_video_as_media(embed["video"]))
|
|
868
|
+
elif embed["$type"].endswith(".videos"):
|
|
869
|
+
for elt in embed["videos"]:
|
|
870
|
+
media_data.append(prepare_video_as_media(elt["video"]))
|
|
871
|
+
elif embed["$type"].endswith(".media"):
|
|
872
|
+
if isinstance(embed["media"], dict):
|
|
873
|
+
media_data.append(prepare_video_as_media(embed["media"]["video"]))
|
|
874
|
+
elif isinstance(embed["media"], list):
|
|
875
|
+
for elt in embed["media"]:
|
|
876
|
+
media_data.append(prepare_video_as_media(elt["media"]))
|
|
501
877
|
|
|
502
878
|
# Quote & Starter-packs
|
|
503
879
|
if embed["$type"].endswith(".record"):
|
|
504
880
|
if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
|
|
505
881
|
post = process_starterpack_card(
|
|
506
|
-
data.get("embed", {}).get("record"), post
|
|
882
|
+
data.get("embed", {}).get("record", {}), post
|
|
507
883
|
)
|
|
508
|
-
if post
|
|
884
|
+
if post.get("card_link"):
|
|
509
885
|
extra_links.append(post["card_link"])
|
|
510
886
|
else:
|
|
511
887
|
post, quoted_data, links = prepare_quote_data(
|
|
@@ -545,13 +921,21 @@ def normalize_post(
|
|
|
545
921
|
# Images
|
|
546
922
|
elif embed["media"]["$type"].endswith(".images"):
|
|
547
923
|
media_data.extend(
|
|
548
|
-
[
|
|
924
|
+
[
|
|
925
|
+
prepare_image_as_media(i, post["url"])
|
|
926
|
+
for i in embed["media"]["images"]
|
|
927
|
+
]
|
|
549
928
|
)
|
|
550
929
|
|
|
551
930
|
# Video
|
|
552
931
|
elif embed["media"]["$type"].endswith(".video"):
|
|
553
932
|
media_data.append(prepare_video_as_media(embed["media"]["video"]))
|
|
554
933
|
|
|
934
|
+
# A personalized record with media embed type, but video unavailable
|
|
935
|
+
# e.g.: https://bsky.app/profile/meteolatorregassa.bsky.social/post/3lhoxazzptj2b
|
|
936
|
+
elif embed["media"]["$type"].endswith("#media"):
|
|
937
|
+
pass
|
|
938
|
+
|
|
555
939
|
else:
|
|
556
940
|
raise BlueskyPayloadError(
|
|
557
941
|
post["url"],
|
|
@@ -596,7 +980,7 @@ def normalize_post(
|
|
|
596
980
|
if quoted_data and "value" in quoted_data:
|
|
597
981
|
# We're checking on the uri as the cid can be different in some cases,
|
|
598
982
|
# and the uri seems to be unique for each post
|
|
599
|
-
if quoted_data["uri"] != post["quoted_uri"]:
|
|
983
|
+
if quoted_data["uri"] != post["quoted_uri"]:
|
|
600
984
|
raise BlueskyPayloadError(
|
|
601
985
|
post["url"],
|
|
602
986
|
"inconsistent quote uri found between record.embed.record.uri & embed.record.uri: %s %s"
|
|
@@ -665,8 +1049,13 @@ def normalize_post(
|
|
|
665
1049
|
"allow_from_" + rule["$type"].split("#")[1].split("Rule")[0]
|
|
666
1050
|
)
|
|
667
1051
|
if rule_string.endswith("_list") and "list" in rule:
|
|
668
|
-
|
|
669
|
-
post["replies_rules"].append(rule_string + ":" +
|
|
1052
|
+
if isinstance(rule["list"], str):
|
|
1053
|
+
post["replies_rules"].append(rule_string + ":" + rule["list"])
|
|
1054
|
+
else:
|
|
1055
|
+
for allowed_list in rule["list"]:
|
|
1056
|
+
post["replies_rules"].append(
|
|
1057
|
+
rule_string + ":" + allowed_list
|
|
1058
|
+
)
|
|
670
1059
|
else:
|
|
671
1060
|
post["replies_rules"].append(rule_string)
|
|
672
1061
|
if not data["threadgate"]["record"]["allow"]:
|
|
@@ -712,7 +1101,11 @@ def normalize_post(
|
|
|
712
1101
|
post["text"] = text.decode("utf-8")
|
|
713
1102
|
except UnicodeDecodeError as e:
|
|
714
1103
|
raise UnicodeDecodeError(
|
|
715
|
-
|
|
1104
|
+
e.encoding,
|
|
1105
|
+
e.object,
|
|
1106
|
+
e.start,
|
|
1107
|
+
e.end,
|
|
1108
|
+
f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start : e.end]}",
|
|
716
1109
|
)
|
|
717
1110
|
|
|
718
1111
|
if collection_source is not None:
|
twitwi/bluesky/types.py
CHANGED
|
@@ -50,6 +50,7 @@ class BlueskyPost(TypedDict):
|
|
|
50
50
|
# Datetime fields
|
|
51
51
|
timestamp_utc: int # Unix UTC timestamp of when the post was submitted
|
|
52
52
|
local_time: str # datetime (potentially timezoned) of when the post was submitted
|
|
53
|
+
indexed_at_utc: str # datetime (NOT timezoned, for reuse of the Bluesky API) of when the post was indexed by the Bluesky service
|
|
53
54
|
|
|
54
55
|
# Author identifying fields
|
|
55
56
|
user_did: str # persistent long-term identifier of the account who authored the post
|
|
@@ -64,6 +65,7 @@ class BlueskyPost(TypedDict):
|
|
|
64
65
|
like_count: int # total number of likes received by the post (at collection time)
|
|
65
66
|
reply_count: int # total number of replies received by the post (at collection time)
|
|
66
67
|
quote_count: int # total number of posts the post was quoted into (at collection time)
|
|
68
|
+
bookmark_count: Optional[int] # total number of bookmarks received by the post (at collection time)
|
|
67
69
|
|
|
68
70
|
# Extra field
|
|
69
71
|
bridgy_original_url: Optional[str] # source of the original post when it was posted from another platform such as Mastodon via the Bridgy connection tool
|
twitwi/bluesky/utils.py
CHANGED
|
@@ -37,7 +37,9 @@ def validate_post_payload(data):
|
|
|
37
37
|
post["record"],
|
|
38
38
|
)
|
|
39
39
|
|
|
40
|
-
|
|
40
|
+
# Splitting by '#' to ignore possible suffixes in $type
|
|
41
|
+
# e.g. https://bsky.app/profile/did:plc:k6acu4chiwkixvdedcmdgmal/post/3lagdncjsu22y
|
|
42
|
+
if post["record"].get("$type").split("#")[0] != "app.bsky.feed.post":
|
|
41
43
|
return False, "payload's record $type is not a post: %s" % post["record"].get(
|
|
42
44
|
"$type"
|
|
43
45
|
)
|
|
@@ -56,7 +58,7 @@ def validate_post_payload(data):
|
|
|
56
58
|
|
|
57
59
|
|
|
58
60
|
re_embed_types = re.compile(
|
|
59
|
-
r"
|
|
61
|
+
r"(?:\.(?:record|recordWithMedia|images|videos?|external|post|embed|links|media|file|viewImages)(?:#.*)?|N\/A|image)$"
|
|
60
62
|
)
|
|
61
63
|
|
|
62
64
|
|
|
@@ -75,7 +77,7 @@ def format_post_url(user_handle_or_did, post_did, post_splitter="/post/"):
|
|
|
75
77
|
def parse_post_url(url, source):
|
|
76
78
|
"""Returns a tuple of (author_handle/did, post_did) from an https://bsky.app post URL"""
|
|
77
79
|
|
|
78
|
-
known_splits = ["/post/", "/lists/"]
|
|
80
|
+
known_splits = ["/post/", "/lists/", "/feed/"]
|
|
79
81
|
|
|
80
82
|
if url.startswith("https://bsky.app/profile/"):
|
|
81
83
|
for split in known_splits:
|
|
@@ -88,17 +90,25 @@ def parse_post_url(url, source):
|
|
|
88
90
|
def parse_post_uri(uri, source=None):
    """Returns a tuple of (author_did, post_did) from an at:// post URI

    The uri is expected to look like `at://<author_did>/<collection>/<record_key>`.
    The collection segment (`app.bsky.feed.post`, `app.bsky.graph.starterpack`,
    `app.bsky.feed.generator`, `app.bsky.graph.list`, `app.bsky.graph.follow`, ...)
    is deliberately ignored: there is too much variability in the post URIs to
    list the collections exhaustively.

    `source` is only used to label the error; when omitted, the uri itself is used.

    Raises BlueskyPayloadError when the uri does not start with `at://` or does
    not hold the three expected segments.
    """

    if uri.startswith("at://"):
        # maxsplit=2 yields at most 3 parts, so if future uris contain more
        # slashes the remainder stays in post_did instead of breaking the unpacking
        parts = uri[5:].split("/", 2)

        if len(parts) == 3:
            author_did, _, post_did = parts
            return author_did, post_did

    raise BlueskyPayloadError(source or uri, f"{uri} is not a usual Bluesky post uri")
|
|
104
114
|
|
|
@@ -112,18 +122,24 @@ def format_media_url(user_did, media_cid, mime_type, source):
|
|
|
112
122
|
if mime_type.startswith("image"):
|
|
113
123
|
media_url = f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@{media_type}"
|
|
114
124
|
media_thumb = f"https://cdn.bsky.app/img/feed_thumbnail/plain/{user_did}/{media_cid}@{media_type}"
|
|
115
|
-
elif
|
|
125
|
+
elif (
|
|
126
|
+
mime_type.startswith("video")
|
|
127
|
+
or mime_type == "application/xml"
|
|
128
|
+
or mime_type == "*/*"
|
|
129
|
+
):
|
|
116
130
|
media_url = f"https://video.bsky.app/watch/{user_did}/{media_cid}/playlist.m3u8"
|
|
117
131
|
media_thumb = (
|
|
118
132
|
f"https://video.bsky.app/watch/{user_did}/{media_cid}/thumbnail.jpg"
|
|
119
133
|
)
|
|
120
|
-
elif mime_type
|
|
134
|
+
elif any(mt in mime_type for mt in ["octet-stream", "text/plain", "text/html"]):
|
|
121
135
|
media_url = (
|
|
122
136
|
f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@jpeg"
|
|
123
137
|
)
|
|
124
138
|
media_thumb = (
|
|
125
139
|
f"https://cdn.bsky.app/img/feed_thumbnail/plain/{user_did}/{media_cid}@jpeg"
|
|
126
140
|
)
|
|
141
|
+
elif "empty" in mime_type:
|
|
142
|
+
media_url, media_thumb = "", ""
|
|
127
143
|
else:
|
|
128
144
|
raise BlueskyPayloadError(source, f"{mime_type} is an unusual media mimeType")
|
|
129
145
|
return media_url, media_thumb
|
twitwi/exceptions.py
CHANGED
|
@@ -21,4 +21,4 @@ class BlueskyPayloadError(TwitwiError):
|
|
|
21
21
|
def __init__(self, source, message):
    """Keep the faulty post's source and the error details, then build the exception text."""
    # Both values stay available as attributes so callers can inspect them
    # without having to parse the formatted text
    self.message = message
    self.source = source

    description = f"Error while processing Bluesky post {source}.\n{message}"
    super().__init__(description)
|
twitwi/formatters.py
CHANGED
|
@@ -52,7 +52,9 @@ def make_transform_into_csv_dict(plural_fields, boolean_fields):
|
|
|
52
52
|
|
|
53
53
|
|
|
54
54
|
def make_format_as_csv_row(fields, plural_fields, boolean_fields):
|
|
55
|
-
def format_field_for_csv(
|
|
55
|
+
def format_field_for_csv(
|
|
56
|
+
field, item, item_id=None, plural_separator="|", allow_erroneous_plurals=False
|
|
57
|
+
):
|
|
56
58
|
if field == "id" and item_id is not None:
|
|
57
59
|
return item_id
|
|
58
60
|
|
|
@@ -63,6 +65,11 @@ def make_format_as_csv_row(fields, plural_fields, boolean_fields):
|
|
|
63
65
|
if field == "links":
|
|
64
66
|
v = item.get("proper_links", v)
|
|
65
67
|
|
|
68
|
+
# Clean None values that may have slipped in, such as in the 'domains' field when
|
|
69
|
+
# normalizing this Bluesky post: https://bsky.app/profile/did:plc:cs5qjcmnntogoahrrsagmg2z/post/3lvqhn7raq62v
|
|
70
|
+
if allow_erroneous_plurals:
|
|
71
|
+
v = [element if element is not None else "" for element in v]
|
|
72
|
+
|
|
66
73
|
return plural_separator.join(v)
|
|
67
74
|
|
|
68
75
|
if field in boolean_fields:
|
|
@@ -70,10 +77,16 @@ def make_format_as_csv_row(fields, plural_fields, boolean_fields):
|
|
|
70
77
|
|
|
71
78
|
return item.get(field, "")
|
|
72
79
|
|
|
73
|
-
def format_item_as_csv_row(
|
|
80
|
+
def format_item_as_csv_row(
|
|
81
|
+
item, item_id=None, plural_separator="|", allow_erroneous_plurals=False
|
|
82
|
+
):
|
|
74
83
|
return [
|
|
75
84
|
format_field_for_csv(
|
|
76
|
-
field,
|
|
85
|
+
field,
|
|
86
|
+
item,
|
|
87
|
+
item_id=item_id,
|
|
88
|
+
plural_separator=plural_separator,
|
|
89
|
+
allow_erroneous_plurals=allow_erroneous_plurals,
|
|
77
90
|
)
|
|
78
91
|
for field in fields
|
|
79
92
|
]
|
twitwi/utils.py
CHANGED
|
@@ -60,6 +60,11 @@ def get_dates(
|
|
|
60
60
|
if locale is None:
|
|
61
61
|
locale = UTC_TIMEZONE
|
|
62
62
|
|
|
63
|
+
# Let's pray we never see a negative year...
|
|
64
|
+
year_zero = date_str.startswith("0000") or all(
|
|
65
|
+
c == "0" for c in date_str.split("-")[0]
|
|
66
|
+
)
|
|
67
|
+
|
|
63
68
|
try:
|
|
64
69
|
parsed_datetime = datetime.strptime(
|
|
65
70
|
date_str,
|
|
@@ -68,27 +73,43 @@ def get_dates(
|
|
|
68
73
|
except ValueError as e:
|
|
69
74
|
if source != "bluesky":
|
|
70
75
|
raise e
|
|
71
|
-
|
|
76
|
+
# Yes, it seems that some people were active in year 0...
|
|
77
|
+
# see by yourself: https://bsky.app/profile/koro.icu/post/3kbpuogc6fz2o
|
|
78
|
+
if year_zero:
|
|
79
|
+
date_str_fixed = "0001" + date_str[4:]
|
|
80
|
+
parsed_datetime = parse_date(date_str_fixed)
|
|
81
|
+
else:
|
|
82
|
+
parsed_datetime = parse_date(date_str)
|
|
72
83
|
|
|
73
84
|
utc_datetime = parsed_datetime
|
|
74
85
|
if not parsed_datetime.tzinfo:
|
|
75
86
|
utc_datetime = UTC_TIMEZONE.localize(parsed_datetime)
|
|
76
87
|
locale_datetime = utc_datetime.astimezone(locale)
|
|
77
88
|
|
|
89
|
+
formatted_date_str = datetime.strftime(
|
|
90
|
+
locale_datetime,
|
|
91
|
+
FORMATTED_FULL_DATETIME_FORMAT
|
|
92
|
+
if source == "bluesky"
|
|
93
|
+
else FORMATTED_TWEET_DATETIME_FORMAT,
|
|
94
|
+
)
|
|
95
|
+
|
|
78
96
|
timestamp = int(utc_datetime.timestamp())
|
|
79
97
|
|
|
98
|
+
if year_zero:
|
|
99
|
+
# Subtract one year (year 0001 is not a leap year) in seconds
|
|
100
|
+
timestamp -= 31536000
|
|
101
|
+
# Doing like so using split because on ubuntu, datetime.strftime on year with less than 4 digits
|
|
102
|
+
# only returns 1 digit for year 0 (e.g. "0-05-12...") instead of 4 digits ("0000-05-12..."),
|
|
103
|
+
# whereas on macOS and Windows it returns 4 digits.
|
|
104
|
+
formatted_date_str = "0000-" + formatted_date_str.split("-", 1)[1]
|
|
105
|
+
|
|
80
106
|
if millisecond_timestamp:
|
|
81
107
|
timestamp *= 1000
|
|
82
108
|
timestamp += utc_datetime.microsecond / 1000
|
|
83
109
|
|
|
84
110
|
return (
|
|
85
111
|
int(timestamp),
|
|
86
|
-
|
|
87
|
-
locale_datetime,
|
|
88
|
-
FORMATTED_FULL_DATETIME_FORMAT
|
|
89
|
-
if source == "bluesky"
|
|
90
|
-
else FORMATTED_TWEET_DATETIME_FORMAT,
|
|
91
|
-
),
|
|
112
|
+
formatted_date_str,
|
|
92
113
|
)
|
|
93
114
|
|
|
94
115
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: twitwi
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.24.0
|
|
4
4
|
Summary: A collection of Twitter-related helper functions for python.
|
|
5
5
|
Home-page: http://github.com/medialab/twitwi
|
|
6
6
|
Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
|
|
@@ -260,7 +260,7 @@ List of a Bluesky user profile's normalized field names. Useful to declare heade
|
|
|
260
260
|
|
|
261
261
|
### PARTIAL_PROFILE_FIELDS
|
|
262
262
|
|
|
263
|
-
List of a Bluesky user partial profile's (retrieved from [`app.bsky.graph.getFollowers` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-graph-get-followers#responses) for example) normalized field names. Useful to declare headers with csv writers. Be careful not to confuse with [PROFILE_FIELDS](#profile_fields) which correspond to the full version of the profile data, retrieved from [`app.bsky.actor.getProfiles` HTTP endpoint](docs.bsky.app/docs/api/app-bsky-actor-get-profiles#responses) for example.
|
|
263
|
+
List of a Bluesky user partial profile's (retrieved from [`app.bsky.graph.getFollowers` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-graph-get-followers#responses) for example) normalized field names. Useful to declare headers with csv writers. Be careful not to confuse with [PROFILE_FIELDS](#profile_fields) which correspond to the full version of the profile data, retrieved from [`app.bsky.actor.getProfiles` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-actor-get-profiles#responses) for example.
|
|
264
264
|
|
|
265
265
|
### POST_FIELDS
|
|
266
266
|
|
|
@@ -277,7 +277,7 @@ Will return datetimes as UTC but can take an optional second `locale` argument a
|
|
|
277
277
|
* **data** *(dict)*: user profile data payload coming from Twitter API v1.1 or v2.
|
|
278
278
|
* **locale** *(pytz.timezone as str, optional)*: timezone used to convert dates. If not given, will default to UTC.
|
|
279
279
|
* **pure** *(bool, optional)*: whether to allow the function to mutate its original `data` argument. Defaults to `True`.
|
|
280
|
-
|
|
280
|
+
|
|
281
281
|
### normalize_tweet
|
|
282
282
|
|
|
283
283
|
Function taking a nested dict describing a tweet from Twitter's JSON payload (API v1.1) and returning a flat "normalized" dict composed of all [TWEET_FIELDS](#tweet_fields) keys.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
test/bluesky/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
test/bluesky/formatters_test.py,sha256=kUXoLNEep-mGRwLN0y5DqB9pAorV0PkVKMm_uVIvAQQ,5100
|
|
3
|
+
test/bluesky/normalizers_test.py,sha256=R4NziqErGW5MBdQEZ1vNxLGNRvJTyGnXfqo0v5gBCgw,5662
|
|
4
|
+
twitwi/__init__.py,sha256=y0bAx9gE3THtlWE1YpXDIhGwqJ5_I8DCStWyyiiXJkw,1095
|
|
5
|
+
twitwi/anonymizers.py,sha256=nkl6HL1BWLz00wJ060XSbqjN5JF8pvcpEPnRXt70TUY,1588
|
|
6
|
+
twitwi/constants.py,sha256=fvqCngJIGyz5CpdVWbcAfjmE3_kvcx9giN0rEljL7OU,16001
|
|
7
|
+
twitwi/exceptions.py,sha256=xUikeIRmFcptQFlKGKXkbH9vbcQlQL3sviknhvSTcmw,696
|
|
8
|
+
twitwi/formatters.py,sha256=pwI4UYPDFUzjRPE9B36k8tK-Va-k0HFLwvmc8aIc8P0,3681
|
|
9
|
+
twitwi/normalizers.py,sha256=CWUK-XwhcEjLDjWH_qb6E03WZKsbIcwiRAVUjwXKQho,28438
|
|
10
|
+
twitwi/utils.py,sha256=PPmbeMlKbHMTg07PgI4A0HRZw2QGuvCOGcP_FtqMyHQ,4774
|
|
11
|
+
twitwi/bluesky/__init__.py,sha256=SqeHZUzL2U9UpL3EB33vaowQWaKXSPkvsAkasRqmFpY,694
|
|
12
|
+
twitwi/bluesky/constants.py,sha256=CPkTIrDwyRWpkFTbaee1oFm_LWGj2WIC7A6xEGqDGB4,573
|
|
13
|
+
twitwi/bluesky/formatters.py,sha256=L_yROAPcBECifCGiFAGYFJwLq6re8UlJNoZ7R2DXm5g,1025
|
|
14
|
+
twitwi/bluesky/normalizers.py,sha256=m4oNWJt8eZK2iVREPIKC42yw3YNpZo3pf4OQGZz_1i8,48611
|
|
15
|
+
twitwi/bluesky/types.py,sha256=INe6R8eOqrOooWn25dtk61-Wqd_pUDwb737R7jY_vkc,13915
|
|
16
|
+
twitwi/bluesky/utils.py,sha256=zIofl7UHmIr0JgjoXRK3ekovkri3CVOvQvo8PmFrWGg,4895
|
|
17
|
+
twitwi-0.24.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
|
|
18
|
+
twitwi-0.24.0.dist-info/METADATA,sha256=4cGwKAsqA9kXkG713fx0lLfoCb2znbLiTsqm-n_wI4g,21365
|
|
19
|
+
twitwi-0.24.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
20
|
+
twitwi-0.24.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
|
|
21
|
+
twitwi-0.24.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
22
|
+
twitwi-0.24.0.dist-info/RECORD,,
|
twitwi-0.22.1.dist-info/RECORD
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
test/bluesky/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
test/bluesky/formatters_test.py,sha256=dMpRV_IuStZAnXhJGKnYsi0tb4BaSTSU4JVfihU1aQs,5002
|
|
3
|
-
test/bluesky/normalizers_test.py,sha256=R4NziqErGW5MBdQEZ1vNxLGNRvJTyGnXfqo0v5gBCgw,5662
|
|
4
|
-
twitwi/__init__.py,sha256=y0bAx9gE3THtlWE1YpXDIhGwqJ5_I8DCStWyyiiXJkw,1095
|
|
5
|
-
twitwi/anonymizers.py,sha256=nkl6HL1BWLz00wJ060XSbqjN5JF8pvcpEPnRXt70TUY,1588
|
|
6
|
-
twitwi/constants.py,sha256=fvqCngJIGyz5CpdVWbcAfjmE3_kvcx9giN0rEljL7OU,16001
|
|
7
|
-
twitwi/exceptions.py,sha256=OCIDagu2ErDyOGWunRBCK3O62TnzFpIMQ9gS8l9EALQ,696
|
|
8
|
-
twitwi/formatters.py,sha256=yn14AsrGAUw8rShOnYJvoMbzdWpfTeSs0P0ZPNTwhLU,3142
|
|
9
|
-
twitwi/normalizers.py,sha256=CWUK-XwhcEjLDjWH_qb6E03WZKsbIcwiRAVUjwXKQho,28438
|
|
10
|
-
twitwi/utils.py,sha256=f02cMx19Sr_GvJQf_0jTIERGLq1oC3znnPQxE__rlFc,3838
|
|
11
|
-
twitwi/bluesky/__init__.py,sha256=SqeHZUzL2U9UpL3EB33vaowQWaKXSPkvsAkasRqmFpY,694
|
|
12
|
-
twitwi/bluesky/constants.py,sha256=CPkTIrDwyRWpkFTbaee1oFm_LWGj2WIC7A6xEGqDGB4,573
|
|
13
|
-
twitwi/bluesky/formatters.py,sha256=L_yROAPcBECifCGiFAGYFJwLq6re8UlJNoZ7R2DXm5g,1025
|
|
14
|
-
twitwi/bluesky/normalizers.py,sha256=MtS1UW9Chi8zzs8VBfyXxDQaeIrEEXeNy3GBvgG_ivc,28759
|
|
15
|
-
twitwi/bluesky/types.py,sha256=WUxfyA5fc68qURGh7bxiDlIBFgdbyysRRdvHLoXwWlA,13656
|
|
16
|
-
twitwi/bluesky/utils.py,sha256=9il8t_qkKCmGQ-MDkF5qahxKV1Qsmwzul_1VzzD-jH4,3943
|
|
17
|
-
twitwi-0.22.1.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
|
|
18
|
-
twitwi-0.22.1.dist-info/METADATA,sha256=1B2fgQqdo1CpPDe0g-NWJzqQ5XqZZEkRl1osx1bxht8,21365
|
|
19
|
-
twitwi-0.22.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
-
twitwi-0.22.1.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
|
|
21
|
-
twitwi-0.22.1.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
22
|
-
twitwi-0.22.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|