twitwi 0.22.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -122,13 +122,15 @@ def process_starterpack_card(embed_data, post):
122
122
  # Warning: mutates post
123
123
 
124
124
  card = embed_data.get("record", {})
125
- creator_did, pack_did = parse_post_uri(embed_data["uri"])
126
- post["card_link"] = format_starterpack_url(
127
- embed_data.get("creator", {}).get("handle") or creator_did, pack_did
128
- )
129
- post["card_title"] = card.get("name", "")
130
- post["card_description"] = card.get("description", "")
131
- post["card_thumbnail"] = card.get("thumb", "")
125
+ if "uri" in embed_data:
126
+ creator_did, pack_did = parse_post_uri(embed_data["uri"])
127
+ post["card_link"] = format_starterpack_url(
128
+ embed_data.get("creator", {}).get("handle") or creator_did, pack_did
129
+ )
130
+ if card:
131
+ post["card_title"] = card.get("name", "")
132
+ post["card_description"] = card.get("description", "")
133
+ post["card_thumbnail"] = card.get("thumb", "")
132
134
  return post
133
135
 
134
136
 
@@ -145,49 +147,55 @@ def process_card_data(embed_data, post):
145
147
  def prepare_quote_data(embed_quote, card_data, post, links):
146
148
  # Warning: mutates post and links
147
149
 
150
+ quoted_data = None
151
+
148
152
  post["quoted_cid"] = embed_quote["cid"]
149
153
  post["quoted_uri"] = embed_quote["uri"]
150
- post["quoted_user_did"], post["quoted_did"] = parse_post_uri(
151
- post["quoted_uri"], post["url"]
152
- )
153
-
154
- # First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
155
- # Handling special posts types (only lists for now, for example: https://bsky.app/profile/lanana421.bsky.social/lists/3lxdgjtpqhf2z)
156
- if "/app.bsky.graph.list/" in post["quoted_uri"]:
157
- post_splitter = "/lists/"
154
+ # Sometimes the quoted post is not found, even if uri and cid are given
155
+ # example: https://bsky.app/profile/takobiotech.masto.bike.ap.brid.gy/post/3lc6r7nzil6m2
156
+ if card_data and card_data.get("notFound"):
157
+ post["quoted_status"] = "notFound"
158
158
  else:
159
- post_splitter = "/post/"
160
- post["quoted_url"] = format_post_url(
161
- post["quoted_user_did"], post["quoted_did"], post_splitter=post_splitter
162
- )
163
-
164
- quoted_data = None
165
- if card_data:
166
- if card_data.get("detached", False):
167
- post["quoted_status"] = "detached"
159
+ post["quoted_user_did"], post["quoted_did"] = parse_post_uri(
160
+ post["quoted_uri"], post["url"]
161
+ )
168
162
 
163
+ # First store ugly quoted url with user did in case full quote data is missing (recursion > 3 or detached quote)
164
+ # Handling special posts types (only lists for now, for example: https://bsky.app/profile/lanana421.bsky.social/lists/3lxdgjtpqhf2z)
165
+ if "/app.bsky.graph.list/" in post["quoted_uri"]:
166
+ post_splitter = "/lists/"
169
167
  else:
170
- quoted_data = deepcopy(card_data)
168
+ post_splitter = "/post/"
169
+ post["quoted_url"] = format_post_url(
170
+ post["quoted_user_did"], post["quoted_did"], post_splitter=post_splitter
171
+ )
171
172
 
172
- # Grab user handle and cleanup links when no quote data but url in text
173
- if not quoted_data:
174
- for link in links:
175
- if link.startswith("https://bsky.app/profile/") and link.endswith(
176
- post["quoted_did"]
177
- ):
178
- # Take better quoted url with user_handle
179
- post["quoted_url"] = link
180
- break
181
-
182
- # Remove quoted link from post links
183
- if post["quoted_url"] in links:
184
- links.remove(post["quoted_url"])
185
-
186
- # Extract user handle from url
187
- if "did:plc:" not in post["quoted_url"]:
188
- post["quoted_user_handle"], _ = parse_post_url(
189
- post["quoted_url"], post["url"]
190
- )
173
+ if card_data:
174
+ if card_data.get("detached"):
175
+ post["quoted_status"] = "detached"
176
+
177
+ else:
178
+ quoted_data = deepcopy(card_data)
179
+
180
+ # Grab user handle and cleanup links when no quote data but url in text
181
+ if not quoted_data:
182
+ for link in links:
183
+ if link.startswith("https://bsky.app/profile/") and link.endswith(
184
+ post["quoted_did"]
185
+ ):
186
+ # Take better quoted url with user_handle
187
+ post["quoted_url"] = link
188
+ break
189
+
190
+ # Remove quoted link from post links
191
+ if post["quoted_url"] in links:
192
+ links.remove(post["quoted_url"])
193
+
194
+ # Extract user handle from url
195
+ if "did:plc:" not in post["quoted_url"]:
196
+ post["quoted_user_handle"], _ = parse_post_url(
197
+ post["quoted_url"], post["url"]
198
+ )
191
199
 
192
200
  return (post, quoted_data, links)
193
201
 
@@ -300,6 +308,7 @@ def normalize_post(
300
308
  post["timestamp_utc"], post["local_time"] = get_dates(
301
309
  data["record"]["createdAt"], locale=locale, source="bluesky"
302
310
  )
311
+ post["indexed_at_utc"] = data["indexedAt"]
303
312
 
304
313
  # Handle post/user identifiers
305
314
  post["cid"] = data["cid"]
@@ -332,6 +341,8 @@ def normalize_post(
332
341
  post["reply_count"] = data["replyCount"]
333
342
  post["like_count"] = data["likeCount"]
334
343
  post["quote_count"] = data["quoteCount"]
344
+ # When a post cites another, the cited post doesn't have the bookmarkCount field
345
+ post["bookmark_count"] = data.get("bookmarkCount")
335
346
 
336
347
  # Handle hashtags, mentions & links from facets
337
348
  post["mentioned_user_handles"] = []
@@ -361,12 +372,21 @@ def normalize_post(
361
372
  # Check & fix occasional errored mention positioning
362
373
  # example: https://bsky.app/profile/snjcgt.bsky.social/post/3lpmqkkkgp52u
363
374
  byteStart = facet["index"]["byteStart"]
375
+ byteEnd = facet["index"]["byteEnd"]
364
376
  if text[byteStart : byteStart + 1] != b"@":
365
377
  byteStart = text.find(b"@", byteStart)
378
+ # in some cases, the errored positioning is before the position given
379
+ # example: https://bsky.app/profile/springer.springernature.com/post/3lovyad4nt324
380
+ if byteStart == -1 or byteStart > byteEnd:
381
+ # When decrementing byteStart, we also decrement byteEnd (see below)
382
+ # shifting the slice to extract the mention correctly
383
+ byteStart = facet["index"]["byteStart"] - 1
384
+ # to extend the size of the mention, which is somehow 1 char too short because of the '@'
385
+ byteEnd += 1
366
386
 
367
387
  handle = (
368
388
  text[
369
- byteStart + 1 : facet["index"]["byteEnd"]
389
+ byteStart + 1 : byteEnd
370
390
  + byteStart
371
391
  - facet["index"]["byteStart"]
372
392
  ]
@@ -398,21 +418,87 @@ def normalize_post(
398
418
  # examples: https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
399
419
  # https://bsky.app/profile/clustz.com/post/3lqfi7mnto52w
400
420
  byteStart = facet["index"]["byteStart"]
421
+ byteEnd = facet["index"]["byteEnd"]
401
422
 
402
- if not text[byteStart : facet["index"]["byteEnd"]].startswith(b"http"):
403
- new_byteStart = text.find(b"http", byteStart, facet["index"]["byteEnd"])
423
+ if not text[byteStart:byteEnd].startswith(b"http"):
424
+ new_byteStart = text.find(b"http", byteStart, byteEnd)
425
+
426
+ # means that the link is shifted, like on this post:
427
+ # https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
404
428
  if new_byteStart != -1:
405
429
  byteStart = new_byteStart
406
430
 
407
- links_to_replace.append(
408
- {
409
- "uri": feat["uri"].encode("utf-8"),
410
- "start": byteStart,
411
- "end": byteStart
412
- - facet["index"]["byteStart"]
413
- + facet["index"]["byteEnd"],
414
- }
415
- )
431
+ # Find the index of the first space character after byteStart in case the link is a personalized one
432
+ # but still contains the link itself (this does occur in some posts, such as this one:
433
+ # https://bsky.app/profile/did:plc:rkphrshyfiqe4n2hz5vj56ig/post/3ltmljz5blca2)
434
+ # In this case, we don't want to touch the position of the link given in the payload
435
+ byteEnd = min(
436
+ byteStart
437
+ - facet["index"]["byteStart"]
438
+ + facet["index"]["byteEnd"],
439
+ len(post["original_text"].encode("utf-8")),
440
+ )
441
+ for i in range(byteStart, byteEnd):
442
+ if chr(text[i]).isspace():
443
+ byteStart = facet["index"]["byteStart"]
444
+ byteEnd = (
445
+ byteStart
446
+ - facet["index"]["byteStart"]
447
+ + facet["index"]["byteEnd"]
448
+ )
449
+
450
+ # means that the link is a "personalized" one like on this post:
451
+ # https://bsky.app/profile/newyork.activitypub.awakari.com.ap.brid.gy/post/3ln33tx7bpdu2
452
+ else:
453
+ # we're looking for a link which could be valid if we add "https://" at the beginning,
454
+ # as in some cases the "http(s)://" part is missing in the post text
455
+ for starting in range(byteEnd - byteStart):
456
+ try:
457
+ if is_url(
458
+ "https://"
459
+ + text[
460
+ byteStart + starting : byteEnd + starting
461
+ ].decode("utf-8")
462
+ ):
463
+ byteStart += starting
464
+ break
465
+ except UnicodeDecodeError:
466
+ pass
467
+ # If we did not find any valid link, we just keep the original position as it is
468
+ # meaning that we have a personalized link like in the example above
469
+
470
+ # Extend byteEnd to the right until we find a valid utf-8 ending,
471
+ # as in some cases the link is longer than the position given in the payload
472
+ # and it gets cut in the middle of a utf-8 char, leading to UnicodeDecodeError
473
+ # example: https://bsky.app/profile/radiogaspesie.bsky.social/post/3lmkzhvhtta22
474
+ while byteEnd <= len(post["original_text"].encode("utf-8")):
475
+ try:
476
+ text[byteStart:byteEnd].decode("utf-8")
477
+ break
478
+ except UnicodeDecodeError:
479
+ byteEnd += 1
480
+ continue
481
+
482
+ if byteEnd > len(post["original_text"].encode("utf-8")):
483
+ byteEnd = facet["index"]["byteEnd"]
484
+
485
+ byteEnd += byteStart - facet["index"]["byteStart"]
486
+
487
+ # In some cases, the link is completely wrong in the post text,
488
+ # like in this post: https://bsky.app/profile/sudetsoleil.bsky.social/post/3ljf3h74wee2m
489
+ # So we choose not to replace anything in the text in this case
490
+ try:
491
+ text[byteStart:byteEnd].decode("utf-8")
492
+ links_to_replace.append(
493
+ {
494
+ "uri": feat["uri"].encode("utf-8"),
495
+ "start": byteStart,
496
+ "end": byteEnd,
497
+ }
498
+ )
499
+ except UnicodeDecodeError:
500
+ pass
501
+ # raise UnicodeDecodeError(e.encoding, e.object, e.start, e.end, f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start:e.end]}")
416
502
 
417
503
  elif feat["$type"].endswith("#bold"):
418
504
  pass
@@ -503,9 +589,9 @@ def normalize_post(
503
589
  if embed["$type"].endswith(".record"):
504
590
  if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
505
591
  post = process_starterpack_card(
506
- data.get("embed", {}).get("record"), post
592
+ data.get("embed", {}).get("record", {}), post
507
593
  )
508
- if post["card_link"]:
594
+ if post.get("card_link"):
509
595
  extra_links.append(post["card_link"])
510
596
  else:
511
597
  post, quoted_data, links = prepare_quote_data(
@@ -594,11 +680,13 @@ def normalize_post(
594
680
 
595
681
  # Process quotes
596
682
  if quoted_data and "value" in quoted_data:
597
- if quoted_data["cid"] != post["quoted_cid"]:
683
+ # We're checking on the uri as the cid can be different in some cases,
684
+ # and the uri seems to be unique for each post
685
+ if quoted_data["uri"] != post["quoted_uri"]:
598
686
  raise BlueskyPayloadError(
599
687
  post["url"],
600
- "inconsistent quote cid found between record.embed.record.cid & embed.record.cid: %s %s"
601
- % (post["quoted_cid"], quoted_data),
688
+ "inconsistent quote uri found between record.embed.record.uri & embed.record.uri: %s %s"
689
+ % (post["quoted_uri"], quoted_data),
602
690
  )
603
691
 
604
692
  quoted_data["record"] = quoted_data["value"]
@@ -706,7 +794,16 @@ def normalize_post(
706
794
  repost_data["indexedAt"], locale=locale, source="bluesky"
707
795
  )
708
796
 
709
- post["text"] = text.decode("utf-8")
797
+ try:
798
+ post["text"] = text.decode("utf-8")
799
+ except UnicodeDecodeError as e:
800
+ raise UnicodeDecodeError(
801
+ e.encoding,
802
+ e.object,
803
+ e.start,
804
+ e.end,
805
+ f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start : e.end]}",
806
+ )
710
807
 
711
808
  if collection_source is not None:
712
809
  post["collected_via"] = [collection_source]
twitwi/bluesky/types.py CHANGED
@@ -50,6 +50,7 @@ class BlueskyPost(TypedDict):
50
50
  # Datetime fields
51
51
  timestamp_utc: int # Unix UTC timestamp of when the post was submitted
52
52
  local_time: str # datetime (potentially timezoned) of when the post was submitted
53
+ indexed_at_utc: str # datetime (NOT timezoned, for reuse of the Bluesky API) of when the post was indexed by the Bluesky service
53
54
 
54
55
  # Author identifying fields
55
56
  user_did: str # persistent long-term identifier of the account who authored the post
@@ -64,6 +65,7 @@ class BlueskyPost(TypedDict):
64
65
  like_count: int # total number of likes received by the post (at collection time)
65
66
  reply_count: int # total number of replies received by the post (at collection time)
66
67
  quote_count: int # total number of posts the post was quoted into (at collection time)
68
+ bookmark_count: Optional[int] # total number of bookmarks received by the post (at collection time)
67
69
 
68
70
  # Extra field
69
71
  bridgy_original_url: Optional[str] # source of the original post when it was posted from another platform such as Mastodon via the Bridgy connection tool
twitwi/bluesky/utils.py CHANGED
@@ -75,7 +75,7 @@ def format_post_url(user_handle_or_did, post_did, post_splitter="/post/"):
75
75
  def parse_post_url(url, source):
76
76
  """Returns a tuple of (author_handle/did, post_did) from an https://bsky.app post URL"""
77
77
 
78
- known_splits = ["/post/", "/lists/"]
78
+ known_splits = ["/post/", "/lists/", "/feed/"]
79
79
 
80
80
  if url.startswith("https://bsky.app/profile/"):
81
81
  for split in known_splits:
@@ -117,7 +117,7 @@ def format_media_url(user_did, media_cid, mime_type, source):
117
117
  media_thumb = (
118
118
  f"https://video.bsky.app/watch/{user_did}/{media_cid}/thumbnail.jpg"
119
119
  )
120
- elif mime_type == "application/octet-stream":
120
+ elif mime_type in ["application/octet-stream", "text/plain"]:
121
121
  media_url = (
122
122
  f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@jpeg"
123
123
  )
twitwi/utils.py CHANGED
@@ -60,6 +60,9 @@ def get_dates(
60
60
  if locale is None:
61
61
  locale = UTC_TIMEZONE
62
62
 
63
+ # Let's pray we never see a negative year...
64
+ year_zero = date_str.startswith("0000")
65
+
63
66
  try:
64
67
  parsed_datetime = datetime.strptime(
65
68
  date_str,
@@ -68,7 +71,13 @@ def get_dates(
68
71
  except ValueError as e:
69
72
  if source != "bluesky":
70
73
  raise e
71
- parsed_datetime = parse_date(date_str)
74
+ # Yes, it seems that some people were active in year 0...
75
+ # see for yourself: https://bsky.app/profile/koro.icu/post/3kbpuogc6fz2o
76
+ if year_zero:
77
+ date_str_fixed = "0001" + date_str[4:]
78
+ parsed_datetime = parse_date(date_str_fixed)
79
+ else:
80
+ parsed_datetime = parse_date(date_str)
72
81
 
73
82
  utc_datetime = parsed_datetime
74
83
  if not parsed_datetime.tzinfo:
@@ -77,18 +86,24 @@ def get_dates(
77
86
 
78
87
  timestamp = int(utc_datetime.timestamp())
79
88
 
89
+ if year_zero:
90
+ # Subtract one year (year 0001 is not a leap year) in seconds
91
+ timestamp -= 31536000
92
+
80
93
  if millisecond_timestamp:
81
94
  timestamp *= 1000
82
95
  timestamp += utc_datetime.microsecond / 1000
83
96
 
97
+ formatted_date_str = datetime.strftime(
98
+ locale_datetime,
99
+ FORMATTED_FULL_DATETIME_FORMAT
100
+ if source == "bluesky"
101
+ else FORMATTED_TWEET_DATETIME_FORMAT,
102
+ )
103
+
84
104
  return (
85
105
  int(timestamp),
86
- datetime.strftime(
87
- locale_datetime,
88
- FORMATTED_FULL_DATETIME_FORMAT
89
- if source == "bluesky"
90
- else FORMATTED_TWEET_DATETIME_FORMAT,
91
- ),
106
+ formatted_date_str if not year_zero else "0" + formatted_date_str[1:],
92
107
  )
93
108
 
94
109
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: twitwi
3
- Version: 0.22.0
3
+ Version: 0.23.0
4
4
  Summary: A collection of Twitter-related helper functions for python.
5
5
  Home-page: http://github.com/medialab/twitwi
6
6
  Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
@@ -7,16 +7,16 @@ twitwi/constants.py,sha256=fvqCngJIGyz5CpdVWbcAfjmE3_kvcx9giN0rEljL7OU,16001
7
7
  twitwi/exceptions.py,sha256=OCIDagu2ErDyOGWunRBCK3O62TnzFpIMQ9gS8l9EALQ,696
8
8
  twitwi/formatters.py,sha256=yn14AsrGAUw8rShOnYJvoMbzdWpfTeSs0P0ZPNTwhLU,3142
9
9
  twitwi/normalizers.py,sha256=CWUK-XwhcEjLDjWH_qb6E03WZKsbIcwiRAVUjwXKQho,28438
10
- twitwi/utils.py,sha256=f02cMx19Sr_GvJQf_0jTIERGLq1oC3znnPQxE__rlFc,3838
10
+ twitwi/utils.py,sha256=ruyqTx9JELRiE4-Svhaeo02KrsdHrrHJNqbGRWMmuAs,4421
11
11
  twitwi/bluesky/__init__.py,sha256=SqeHZUzL2U9UpL3EB33vaowQWaKXSPkvsAkasRqmFpY,694
12
12
  twitwi/bluesky/constants.py,sha256=CPkTIrDwyRWpkFTbaee1oFm_LWGj2WIC7A6xEGqDGB4,573
13
13
  twitwi/bluesky/formatters.py,sha256=L_yROAPcBECifCGiFAGYFJwLq6re8UlJNoZ7R2DXm5g,1025
14
- twitwi/bluesky/normalizers.py,sha256=1tt4q9dKhCLuNhB-Qn8YGSHILvgu-JNIRnfumwkEAe4,28422
15
- twitwi/bluesky/types.py,sha256=WUxfyA5fc68qURGh7bxiDlIBFgdbyysRRdvHLoXwWlA,13656
16
- twitwi/bluesky/utils.py,sha256=9il8t_qkKCmGQ-MDkF5qahxKV1Qsmwzul_1VzzD-jH4,3943
17
- twitwi-0.22.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
18
- twitwi-0.22.0.dist-info/METADATA,sha256=QggjTIdvTg15eWBDNuug7rcZINc2Uh6choMNQeeFoNM,21365
19
- twitwi-0.22.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- twitwi-0.22.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
21
- twitwi-0.22.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
22
- twitwi-0.22.0.dist-info/RECORD,,
14
+ twitwi/bluesky/normalizers.py,sha256=AsOX3d4FsMn-GPvo-0oA7cZQwqAxQNbLq1ajbnXe7bk,33976
15
+ twitwi/bluesky/types.py,sha256=INe6R8eOqrOooWn25dtk61-Wqd_pUDwb737R7jY_vkc,13915
16
+ twitwi/bluesky/utils.py,sha256=mFL1h_Mqay66UGEUlzweO_0TzbqS51oNE2TKoT2xf-4,3969
17
+ twitwi-0.23.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
18
+ twitwi-0.23.0.dist-info/METADATA,sha256=05Mq7RsXYLpVK4aTX3zAUMcPYdpd8UBPOc81Z9_FYQw,21365
19
+ twitwi-0.23.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ twitwi-0.23.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
21
+ twitwi-0.23.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
22
+ twitwi-0.23.0.dist-info/RECORD,,