twitwi 0.23.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,7 +112,9 @@ class TestFormatters:
112
112
 
113
113
  for source in normalized_posts:
114
114
  for post in source:
115
- writer.writerow(format_post_as_csv_row(post))
115
+ writer.writerow(
116
+ format_post_as_csv_row(post, allow_erroneous_plurals=True)
117
+ )
116
118
 
117
119
  if OVERWRITE_TESTS:
118
120
  written = buffer.getvalue()
@@ -140,7 +142,7 @@ class TestFormatters:
140
142
 
141
143
  for source in normalized_posts:
142
144
  for post in source:
143
- transform_post_into_csv_dict(post)
145
+ transform_post_into_csv_dict(post, allow_erroneous_plurals=True)
144
146
  writer.writerow(post)
145
147
 
146
148
  with open_resource("bluesky-posts-export.csv") as f:
@@ -99,14 +99,33 @@ def prepare_native_gif_as_media(gif_data, user_did, source):
99
99
  }
100
100
 
101
101
 
102
- def prepare_image_as_media(image_data):
103
- if "ref" not in image_data["image"] or "$link" not in image_data["image"]["ref"]:
104
- image_id = image_data["image"]["cid"]
102
+ def prepare_image_as_media(image_data, source):
103
+ if isinstance(image_data["image"], str):
104
+ # As in this post: https://bsky.app/profile/did:plc:xafmeedgq77f6smn6kmalasr/post/3lcnxglm3o62z
105
+ image_type = "image/jpeg"
106
+ image_id = image_data["image"]
107
+ elif isinstance(image_data["image"], dict):
108
+ image_type = image_data["image"]["mimeType"]
109
+ if (
110
+ "ref" not in image_data["image"]
111
+ or "$link" not in image_data["image"]["ref"]
112
+ ):
113
+ # As in this post: https://bsky.app/profile/testjuan06.bsky.social/post/3ljkzygywso2b
114
+ if "link" in image_data["image"]:
115
+ image_id = image_data["image"]["link"]
116
+ elif "cid" in image_data["image"]:
117
+ image_id = image_data["image"]["cid"]
118
+ else:
119
+ raise BlueskyPayloadError(
120
+ source, "Unable to find image id in image data: %s" % image_data
121
+ )
122
+ else:
123
+ image_id = image_data["image"]["ref"]["$link"]
105
124
  else:
106
- image_id = image_data["image"]["ref"]["$link"]
125
+ raise BlueskyPayloadError(source, "Unable to parse image data: %s" % image_data)
107
126
  return {
108
127
  "id": image_id,
109
- "type": image_data["image"]["mimeType"],
128
+ "type": image_type,
110
129
  "alt": image_data["alt"],
111
130
  }
112
131
 
@@ -140,7 +159,15 @@ def process_card_data(embed_data, post):
140
159
  post["card_link"] = embed_data["uri"]
141
160
  post["card_title"] = embed_data.get("title", "")
142
161
  post["card_description"] = embed_data.get("description", "")
143
- post["card_thumbnail"] = embed_data.get("thumb", "")
162
+ if isinstance(embed_data.get("thumb"), dict) and embed_data["thumb"].get(
163
+ "ref", {}
164
+ ).get("$link"):
165
+ media_cid = embed_data["thumb"]["ref"]["$link"]
166
+ post["card_thumbnail"] = (
167
+ f"https://cdn.bsky.app/img/feed_thumbnail/plain/{post['user_did']}/{media_cid}@jpeg"
168
+ )
169
+ else:
170
+ post["card_thumbnail"] = embed_data.get("thumb", "")
144
171
  return post
145
172
 
146
173
 
@@ -308,6 +335,10 @@ def normalize_post(
308
335
  post["timestamp_utc"], post["local_time"] = get_dates(
309
336
  data["record"]["createdAt"], locale=locale, source="bluesky"
310
337
  )
338
+ # Left-pad the year with zeros when it has fewer than 4 digits, as seen in some posts: https://bsky.app/profile/koro.icu/post/3kbpuogc6fz2o
339
+ # len 26 example: '2023-06-15T12:34:56.789000'
340
+ while len(post["local_time"]) < 26 and len(post["local_time"].split("-")[0]) < 4:
341
+ post["local_time"] = "0" + post["local_time"]
311
342
  post["indexed_at_utc"] = data["indexedAt"]
312
343
 
313
344
  # Handle post/user identifiers
@@ -316,7 +347,11 @@ def normalize_post(
316
347
  post["user_did"], post["did"] = parse_post_uri(data["uri"])
317
348
  post["user_handle"] = data["author"]["handle"]
318
349
  post["user_url"] = format_profile_url(post["user_handle"])
319
- post["url"] = format_post_url(post["user_handle"], post["did"])
350
+ # example: https://bsky.app/profile/did:plc:n5pm4vggu475okayqvqipkoh/post/3lmdcgp3a7cnd
351
+ if post["user_handle"] == "handle.invalid":
352
+ post["url"] = format_post_url(post["user_did"], post["did"])
353
+ else:
354
+ post["url"] = format_post_url(post["user_handle"], post["did"])
320
355
 
321
356
  if post["user_did"] != data["author"]["did"]:
322
357
  raise BlueskyPayloadError(
@@ -350,19 +385,91 @@ def normalize_post(
350
385
  hashtags = set()
351
386
  links = set()
352
387
  links_to_replace = []
388
+ media_data = []
389
+ extra_links = []
390
+ post["media_urls"] = []
353
391
  for facet in data["record"].get("facets", []):
354
392
  if len(facet["features"]) != 1:
355
- raise BlueskyPayloadError(
356
- post["url"],
357
- "unusual record facet content with more or less than a unique feature: %s"
358
- % facet,
359
- )
393
+ raising_error = False
394
+ for feat in facet["features"]:
395
+ # Already handled linkcards separately below
396
+ if feat["$type"].endswith("#linkcard"):
397
+ continue
398
+
399
+ # If there are links, we register them and do not replace anything in original text
400
+ # as we don't have position for each link
401
+ # example: https://bsky.app/profile/77cupons.bsky.social/post/3latbufuvqw25
402
+ elif feat["$type"].endswith("#link") and "uri" in feat:
403
+ link = safe_normalize_url(feat["uri"])
404
+ if is_url(link):
405
+ links.add(link)
406
+ links_to_replace.append(
407
+ {"uri": feat["uri"].encode("utf-8"), "start": -1, "end": -1}
408
+ )
409
+ elif feat["$type"].lower().endswith("#tag"):
410
+ hashtags.add(feat["tag"].strip().lower())
411
+ # As in this post: https://bsky.app/profile/havehashad.com/post/3ki3rk5ytqd2e
412
+ elif feat["$type"].endswith("#image") and "uri" in feat:
413
+ post["media_urls"].append(safe_normalize_url(feat["uri"]))
414
+ else:
415
+ raising_error = True
416
+
417
+ if raising_error:
418
+ raise BlueskyPayloadError(
419
+ post["url"],
420
+ "unusual record facet content with more or less than a unique feature: %s"
421
+ % facet,
422
+ )
423
+ continue
360
424
 
361
425
  feat = facet["features"][0]
426
+ lower_feat_type = feat["$type"].lower()
362
427
 
363
428
  # Hashtags
364
- if feat["$type"].endswith("#tag") or feat["$type"].endswith("#hashtag"):
365
- hashtags.add(feat["tag"].strip().lower())
429
+ if (
430
+ lower_feat_type.endswith("#tag")
431
+ or lower_feat_type.endswith(".tag")
432
+ or lower_feat_type.endswith("#hashtag")
433
+ or lower_feat_type == "facettag"
434
+ ):
435
+ # Some posts have the full text in the "text" field of the hashtag feature
436
+ if "text" in feat:
437
+ for tag in feat["text"].split("#"):
438
+ if tag.strip():
439
+ hashtags.add(tag.strip().lower())
440
+ # some posts have "hashtag" instead of "tag" field
441
+ # example: https://bsky.app/profile/did:plc:jrodn6nnfuwzm2zxbxbpzgot/post/3lhwag3mzoo2k
442
+ else:
443
+ if "tag" in feat:
444
+ tag = feat["tag"].strip().lower()
445
+ elif "hashtag" in feat:
446
+ tag = feat["hashtag"].strip().lower()
447
+ # Somehow no tag found, we'll try to get it in the text slice
448
+ # example: https://bsky.app/profile/did:plc:p6yojdpa5iatdk3ttaty2zu2/post/3knvsl6h4x22i
449
+ elif len(feat) == 1:
450
+ byteStart = facet["index"]["byteStart"]
451
+ if text[byteStart : byteStart + 1] == b"#":
452
+ byteEnd = facet["index"]["byteEnd"]
453
+ try:
454
+ tag = (
455
+ text[byteStart:byteEnd]
456
+ .decode("utf-8")
457
+ .strip()
458
+ .lstrip("#")
459
+ .lower()
460
+ )
461
+ except UnicodeDecodeError:
462
+ raise BlueskyPayloadError(
463
+ post["url"],
464
+ "unable to decode utf-8 slice for hashtag extraction: %s"
465
+ % facet,
466
+ )
467
+ else:
468
+ raise BlueskyPayloadError(
469
+ post["url"],
470
+ "unable to extract hashtag from text slice: %s" % facet,
471
+ )
472
+ hashtags.add(tag)
366
473
 
367
474
  # Mentions
368
475
  elif feat["$type"].endswith("#mention"):
@@ -392,12 +499,23 @@ def normalize_post(
392
499
  ]
393
500
  .strip()
394
501
  .lower()
395
- .decode("utf-8")
396
502
  )
503
+ while byteEnd >= byteStart:
504
+ try:
505
+ handle.decode("utf-8")
506
+ break
507
+ except UnicodeDecodeError:
508
+ handle = handle[:-1]
509
+ continue
510
+ handle = handle.decode("utf-8")
397
511
  post["mentioned_user_handles"].append(handle)
398
512
 
399
513
  # Links
400
- elif feat["$type"].endswith("#link"):
514
+ elif (
515
+ feat["$type"].endswith("#link")
516
+ or feat["$type"].endswith(".link")
517
+ or feat["$type"].endswith(".url")
518
+ ):
401
519
  # Handle native polls
402
520
  if "https://poll.blue/" in feat["uri"]:
403
521
  if feat["uri"].endswith("/0"):
@@ -420,57 +538,100 @@ def normalize_post(
420
538
  byteStart = facet["index"]["byteStart"]
421
539
  byteEnd = facet["index"]["byteEnd"]
422
540
 
423
- if not text[byteStart:byteEnd].startswith(b"http"):
424
- new_byteStart = text.find(b"http", byteStart, byteEnd)
541
+ # Skip overlapping links cases
542
+ # examples: https://bsky.app/profile/researchtrend.ai/post/3lbieylwwxs2b
543
+ # https://bsky.app/profile/dj-cyberspace.otoskey.tarbin.net.ap.brid.gy/post/3lchg3plpdjp2
544
+ for elt in links_to_replace:
545
+ if (byteStart >= elt["start"] and byteStart <= elt["end"]) or (
546
+ byteEnd >= elt["start"] and byteEnd <= elt["end"]
547
+ ):
548
+ # Overlapping links, we skip this one
549
+ byteStart = -1
550
+ byteEnd = -1
551
+ break
552
+
553
+ # Meaning we will try to fix the link position
554
+ if byteStart != -1 or byteEnd != -1:
555
+ # It appears that some links end before they start... Bluesky please: what's going on?
556
+ # example: https://bsky.app/profile/ondarockwebzine.bsky.social/post/3lqxxejza6o2t
557
+ # if int(byteEnd) < int(byteStart) or byteStart < 0:
558
+ if int(byteEnd) < int(byteStart):
559
+ byteStart = -1
560
+ byteEnd = -1
561
+
562
+ # There are mentioned links which are positioned after the end of the text,
563
+ # so we put them at the end of the original text
564
+ elif byteStart >= len(post["original_text"].encode("utf-8")):
565
+ byteStart = -1
566
+ byteEnd = -1
567
+
568
+ elif not text[byteStart:byteEnd].startswith(b"http"):
569
+ new_byteStart = text.find(b"http", byteStart, byteEnd)
570
+
571
+ # means that the link is shifted, like on this post:
572
+ # https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
573
+ if new_byteStart != -1:
574
+ byteStart = new_byteStart
575
+
576
+ # Find the index of the first space character after byteStart in case the link is a personalized one
577
+ # but still with the link in it (somehow existing in some posts, such as this one:
578
+ # https://bsky.app/profile/did:plc:rkphrshyfiqe4n2hz5vj56ig/post/3ltmljz5blca2)
579
+ # In this case, we don't want to touch the position of the link given in the payload
580
+ byteEnd = min(
581
+ byteStart
582
+ - facet["index"]["byteStart"]
583
+ + facet["index"]["byteEnd"],
584
+ len(post["original_text"].encode("utf-8")),
585
+ )
586
+ for i in range(byteStart, byteEnd):
587
+ if chr(text[i]).isspace():
588
+ byteStart = facet["index"]["byteStart"]
589
+ byteEnd = (
590
+ byteStart
591
+ - facet["index"]["byteStart"]
592
+ + facet["index"]["byteEnd"]
593
+ )
425
594
 
426
- # means that the link is shifted, like on this post:
427
- # https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
428
- if new_byteStart != -1:
429
- byteStart = new_byteStart
595
+ # means that the link is a "personalized" one like on this post:
596
+ # https://bsky.app/profile/newyork.activitypub.awakari.com.ap.brid.gy/post/3ln33tx7bpdu2
597
+ else:
598
+ # we're looking for a link which could be valid if we add "https://" at the beginning,
599
+ # as in some cases the "http(s)://" part is missing in the post text
600
+ for starting in range(byteEnd - byteStart):
601
+ try:
602
+ if is_url(
603
+ "https://"
604
+ + text[
605
+ byteStart + starting : byteEnd + starting
606
+ ].decode("utf-8")
607
+ ):
608
+ byteStart += starting
609
+ break
610
+ except UnicodeDecodeError:
611
+ pass
612
+ # If we did not find any valid link, we just keep the original position as it is
613
+ # meaning that we have a personalized link like in the example above
614
+
615
+ # Extend byteEnd to the right until we find a valid utf-8 ending,
616
+ # as in some cases the link is longer than the position given in the payload
617
+ # and it gets cut in the middle of a utf-8 char, leading to UnicodeDecodeError
618
+ # example: https://bsky.app/profile/radiogaspesie.bsky.social/post/3lmkzhvhtta22
619
+ while byteEnd <= len(post["original_text"].encode("utf-8")):
620
+ try:
621
+ text[byteStart:byteEnd].decode("utf-8")
622
+ break
623
+ except UnicodeDecodeError:
624
+ byteEnd += 1
625
+ continue
430
626
 
431
- # Find the index of the first space character after byteStart in case the link is a personalized one
432
- # but still with the link in it (somehow existing in some posts, such as this one:
433
- # https://bsky.app/profile/did:plc:rkphrshyfiqe4n2hz5vj56ig/post/3ltmljz5blca2)
434
- # In this case, we don't want to touch the position of the link given in the payload
435
- byteEnd = min(
436
- byteStart
437
- - facet["index"]["byteStart"]
438
- + facet["index"]["byteEnd"],
439
- len(post["original_text"].encode("utf-8")),
440
- )
441
- for i in range(byteStart, byteEnd):
442
- if chr(text[i]).isspace():
443
- byteStart = facet["index"]["byteStart"]
444
- byteEnd = (
445
- byteStart
446
- - facet["index"]["byteStart"]
447
- + facet["index"]["byteEnd"]
448
- )
627
+ # Meaning that we did not find a valid utf-8 ending, so we reset byteEnd to its original value
628
+ if byteEnd > len(post["original_text"].encode("utf-8")):
629
+ byteEnd = facet["index"]["byteEnd"]
449
630
 
450
- # means that the link is a "personalized" one like on this post:
451
- # https://bsky.app/profile/newyork.activitypub.awakari.com.ap.brid.gy/post/3ln33tx7bpdu2
631
+ byteEnd += byteStart - facet["index"]["byteStart"]
452
632
  else:
453
- # we're looking for a link which could be valid if we add "https://" at the beginning,
454
- # as in some cases the "http(s)://" part is missing in the post text
455
- for starting in range(byteEnd - byteStart):
456
- try:
457
- if is_url(
458
- "https://"
459
- + text[
460
- byteStart + starting : byteEnd + starting
461
- ].decode("utf-8")
462
- ):
463
- byteStart += starting
464
- break
465
- except UnicodeDecodeError:
466
- pass
467
- # If we did not find any valid link, we just keep the original position as it is
468
- # meaning that we have a personalized link like in the example above
469
-
470
- # Extend byteEnd to the right until we find a valid utf-8 ending,
471
- # as in some cases the link is longer than the position given in the payload
472
- # and it gets cut in the middle of a utf-8 char, leading to UnicodeDecodeError
473
- # example: https://bsky.app/profile/radiogaspesie.bsky.social/post/3lmkzhvhtta22
633
+ # Handling case of errored byteEnd in the end of the text
634
+ # example: https://bsky.app/profile/twif.bsky.social/post/3lm4izkvbfm2r
474
635
  while byteEnd <= len(post["original_text"].encode("utf-8")):
475
636
  try:
476
637
  text[byteStart:byteEnd].decode("utf-8")
@@ -482,8 +643,6 @@ def normalize_post(
482
643
  if byteEnd > len(post["original_text"].encode("utf-8")):
483
644
  byteEnd = facet["index"]["byteEnd"]
484
645
 
485
- byteEnd += byteStart - facet["index"]["byteStart"]
486
-
487
646
  # In some cases, the link is completely wrong in the post text,
488
647
  # like in this post: https://bsky.app/profile/sudetsoleil.bsky.social/post/3ljf3h74wee2m
489
648
  # So we chose to not replace anything in the text in this case
@@ -500,10 +659,66 @@ def normalize_post(
500
659
  pass
501
660
  # raise UnicodeDecodeError(e.encoding, e.object, e.start, e.end, f"{e.reason} in post {post['url']}.\nText to decode: {text}\nSlice of text to decode: {text[e.start:e.end]}")
502
661
 
503
- elif feat["$type"].endswith("#bold"):
662
+ elif any(
663
+ feat["$type"].endswith(suffix)
664
+ for suffix in [
665
+ "#bold",
666
+ "#italic",
667
+ "#underline",
668
+ "#option",
669
+ "#encrypt",
670
+ "#text",
671
+ ]
672
+ ):
504
673
  pass
505
- elif feat["$type"].endswith("#option"):
674
+ # Bluesky seems to use format features for some internal purposes, but we ignore them
675
+ # e.g.: https://bsky.app/profile/ferromar.bsky.social/post/3lzyfaixayd2g
676
+ elif feat["$type"].endswith("format"):
506
677
  pass
678
+ # Not normal feature type, but still existing in some posts
679
+ # Note that external features aren't visible on the Bluesky app, only external embeds are
680
+ # e.g.: https://bsky.app/profile/did:plc:4qvb4dpkg6tkbzym77j6jcm4/post/3lbjktt6tw52h
681
+ elif feat["$type"].endswith("external"):
682
+ link = feat["external"]["uri"]
683
+
684
+ # Handle native gifs as medias
685
+ if link.startswith("https://media.tenor.com/"):
686
+ media_data.append(
687
+ prepare_native_gif_as_media(
688
+ feat["external"], post["user_did"], post["url"]
689
+ )
690
+ )
691
+ # Extra card links sometimes missing from facets & text due to manual action in post form
692
+ else:
693
+ extra_links.append(link)
694
+
695
+ if isinstance(feat["external"].get("thumb"), dict):
696
+ post = process_card_data(feat["external"], post)
697
+
698
+ # Some people share code snippets using third party apps
699
+ # e.g.: https://bsky.app/profile/alexdln.com/post/3mbwzgrymow2o
700
+ elif (
701
+ "#" in feat["$type"]
702
+ and feat["$type"].split("#")[1].startswith("code")
703
+ and "code" in feat
704
+ ):
705
+ language = (
706
+ feat["$type"].split("#")[1].split(".")[1]
707
+ if "." in feat["$type"].split("#")[1]
708
+ else "plain"
709
+ )
710
+ text += (
711
+ b"\n```"
712
+ + language.encode("utf-8")
713
+ + b"\n"
714
+ + feat["code"].encode("utf-8")
715
+ + b"\n```\n"
716
+ )
717
+
718
+ # We chose to ignore non Bluesky features for now (e.g. personalized features)
719
+ # example: https://bsky.app/profile/poll.blue/post/3kmuqjkkozh2r
720
+ elif "bsky" not in feat["$type"]:
721
+ continue
507
722
  else:
508
723
  raise BlueskyPayloadError(
509
724
  post["url"], "unusual record facet feature $type: %s" % feat
@@ -543,21 +758,61 @@ def normalize_post(
543
758
 
544
759
  # Handle quotes & medias
545
760
  media_ids = set()
546
- post["media_urls"] = []
547
761
  post["media_thumbnails"] = []
548
762
  post["media_types"] = []
549
763
  post["media_alt_texts"] = []
550
764
  if "embed" in data["record"]:
551
765
  embed = data["record"]["embed"]
552
766
  quoted_data = None
553
- media_data = []
554
- extra_links = []
555
767
 
556
768
  if not valid_embed_type(embed["$type"]):
769
+ if "bsky" in embed["$type"]:
770
+ raise BlueskyPayloadError(
771
+ post["url"], "unusual record embed $type: %s" % embed
772
+ )
773
+ # Ignore non Bluesky embeds for now (e.g. personalized embeds)
774
+
775
+ # Empty embed (not usual, but seen in the Bluesky jungle, e.g.
776
+ # https://bsky.app/profile/did:plc:na6u3avvaz2x5wyzqrnviqiz/post/3lzf5qi2ra62k
777
+ # https://bsky.app/profile/dangelodario.it/post/3l3inqifqj42p
778
+ # or https://bsky.app/profile/soirilab.bsky.social/post/3lywaa7vhsu2c)
779
+ if embed["$type"].endswith(".post") or embed["$type"] == "N/A":
780
+ # Some posts have extra keys in their empty embed, certainly personalized ones.
781
+
782
+ # Personalized quote (not visible on Bluesky for the example)
783
+ # example: https://bsky.app/profile/jacksmithsocial.bsky.social/post/3lbca2nxy4f2a
784
+ if embed.get("$type") == "app.bsky.feed.post" and embed.get(
785
+ "record", {}
786
+ ).get("uri"):
787
+ post, quoted_data, links = prepare_quote_data(
788
+ embed["record"], data.get("embed", {}).get("record"), post, links
789
+ )
790
+
791
+ # for the other ones we know up to now, we want to ignore them
792
+ # e.g.: https://bsky.app/profile/granmouse.bsky.social/post/3lwvh5xd2xk2p
793
+ # https://bsky.app/profile/flyingaubrey.bsky.social/post/3lxngessntk2p
794
+ elif len(embed.keys()) > 1 and embed.get("type") not in ["private", "list"]:
795
+ raise BlueskyPayloadError(
796
+ post["url"],
797
+ "unusual empty record embed with extra keys: %s" % embed,
798
+ )
799
+ # Nothing to do for empty embed
800
+
801
+ if (
802
+ embed["$type"].endswith(".embed")
803
+ and len(embed.keys()) > 2
804
+ and len(embed.get("images")) > 0
805
+ ):
557
806
  raise BlueskyPayloadError(
558
- post["url"], "unusual record embed $type: %s" % embed
807
+ post["url"], "unusual empty record embed with extra keys: %s" % embed
559
808
  )
560
809
 
810
+ # Links from links embed
811
+ # e.g.: https://bsky.app/profile/sacredatoz.bsky.social/post/3lrqvemv7qe2f
812
+ if embed["$type"].endswith(".links"):
813
+ for link in embed["links"]:
814
+ extra_links.append(link)
815
+
561
816
  # Links from cards
562
817
  if embed["$type"].endswith(".external"):
563
818
  link = embed["external"]["uri"]
@@ -577,13 +832,48 @@ def normalize_post(
577
832
  if "embed" in data:
578
833
  post = process_card_data(data["embed"]["external"], post)
579
834
 
835
+ # Not visible images
836
+ # examples: https://bsky.app/profile/lubosmichalik.bsky.social/post/3ltjvxsaej62c
837
+ # https://bsky.app/profile/lubosmichalik.bsky.social/post/3ltjvz52x7s2m
838
+ if embed["$type"].endswith(".viewImages"):
839
+ if "images" in embed:
840
+ for i in embed["images"]:
841
+ post["media_urls"].append(
842
+ i.get("viewImage", {}).get("thumb", {}).get("uri", "")
843
+ )
844
+ elif "viewImage" in embed:
845
+ for i in embed["viewImage"]:
846
+ if "viewImage" in i:
847
+ sub_image = "viewImage"
848
+ elif "image" in i:
849
+ sub_image = "image"
850
+ else:
851
+ raise BlueskyPayloadError(
852
+ post["url"],
853
+ "unusual viewImages embed content: %s" % embed,
854
+ )
855
+ post["media_urls"].append(
856
+ i[sub_image].get("thumb", {}).get("uri", "")
857
+ )
858
+
580
859
  # Images
581
- if embed["$type"].endswith(".images"):
582
- media_data.extend([prepare_image_as_media(i) for i in embed["images"]])
860
+ if embed["$type"].endswith(".images") or embed["$type"].endswith("image"):
861
+ media_data.extend(
862
+ [prepare_image_as_media(i, post["url"]) for i in embed["images"]]
863
+ )
583
864
 
584
865
  # Video
585
866
  if embed["$type"].endswith(".video"):
586
867
  media_data.append(prepare_video_as_media(embed["video"]))
868
+ elif embed["$type"].endswith(".videos"):
869
+ for elt in embed["videos"]:
870
+ media_data.append(prepare_video_as_media(elt["video"]))
871
+ elif embed["$type"].endswith(".media"):
872
+ if isinstance(embed["media"], dict):
873
+ media_data.append(prepare_video_as_media(embed["media"]["video"]))
874
+ elif isinstance(embed["media"], list):
875
+ for elt in embed["media"]:
876
+ media_data.append(prepare_video_as_media(elt["media"]))
587
877
 
588
878
  # Quote & Starter-packs
589
879
  if embed["$type"].endswith(".record"):
@@ -631,13 +921,21 @@ def normalize_post(
631
921
  # Images
632
922
  elif embed["media"]["$type"].endswith(".images"):
633
923
  media_data.extend(
634
- [prepare_image_as_media(i) for i in embed["media"]["images"]]
924
+ [
925
+ prepare_image_as_media(i, post["url"])
926
+ for i in embed["media"]["images"]
927
+ ]
635
928
  )
636
929
 
637
930
  # Video
638
931
  elif embed["media"]["$type"].endswith(".video"):
639
932
  media_data.append(prepare_video_as_media(embed["media"]["video"]))
640
933
 
934
+ # A personalized record with media embed type, but video unavailable
935
+ # e.g.: https://bsky.app/profile/meteolatorregassa.bsky.social/post/3lhoxazzptj2b
936
+ elif embed["media"]["$type"].endswith("#media"):
937
+ pass
938
+
641
939
  else:
642
940
  raise BlueskyPayloadError(
643
941
  post["url"],
@@ -751,8 +1049,13 @@ def normalize_post(
751
1049
  "allow_from_" + rule["$type"].split("#")[1].split("Rule")[0]
752
1050
  )
753
1051
  if rule_string.endswith("_list") and "list" in rule:
754
- for allowed_list in rule["list"]:
755
- post["replies_rules"].append(rule_string + ":" + allowed_list)
1052
+ if isinstance(rule["list"], str):
1053
+ post["replies_rules"].append(rule_string + ":" + rule["list"])
1054
+ else:
1055
+ for allowed_list in rule["list"]:
1056
+ post["replies_rules"].append(
1057
+ rule_string + ":" + allowed_list
1058
+ )
756
1059
  else:
757
1060
  post["replies_rules"].append(rule_string)
758
1061
  if not data["threadgate"]["record"]["allow"]:
twitwi/bluesky/utils.py CHANGED
@@ -37,7 +37,9 @@ def validate_post_payload(data):
37
37
  post["record"],
38
38
  )
39
39
 
40
- if post["record"].get("$type") != "app.bsky.feed.post":
40
+ # Splitting by '#' to ignore possible suffixes in $type
41
+ # e.g. https://bsky.app/profile/did:plc:k6acu4chiwkixvdedcmdgmal/post/3lagdncjsu22y
42
+ if post["record"].get("$type").split("#")[0] != "app.bsky.feed.post":
41
43
  return False, "payload's record $type is not a post: %s" % post["record"].get(
42
44
  "$type"
43
45
  )
@@ -56,7 +58,7 @@ def validate_post_payload(data):
56
58
 
57
59
 
58
60
  re_embed_types = re.compile(
59
- r"\.(record|recordWithMedia|images|video|external)(?:#.*)?$"
61
+ r"(?:\.(?:record|recordWithMedia|images|videos?|external|post|embed|links|media|file|viewImages)(?:#.*)?|N\/A|image)$"
60
62
  )
61
63
 
62
64
 
@@ -88,17 +90,25 @@ def parse_post_url(url, source):
88
90
  def parse_post_uri(uri, source=None):
89
91
  """Returns a tuple of (author_did, post_did) from an at:// post URI"""
90
92
 
91
- known_splits = [
92
- "/app.bsky.feed.post/",
93
- "/app.bsky.graph.starterpack/",
94
- "/app.bsky.feed.generator/",
95
- "/app.bsky.graph.list/",
96
- ]
97
-
93
+ # known_splits = [
94
+ # "/app.bsky.feed.post/",
95
+ # "/app.bsky.graph.starterpack/",
96
+ # "/app.bsky.feed.generator/",
97
+ # "/app.bsky.graph.list/",
98
+ # "/app.bsky.graph.follow/", # This one is often found when a post is an answer to a deleted post (e.g. https://bsky.app/profile/sydney-chat.bsky.social/post/3ltsph6kxfl25)
99
+ # ]
100
+
101
+ # if uri.startswith("at://"):
102
+ # for split in known_splits:
103
+ # if split in uri:
104
+ # return uri[5:].split(split)
105
+
106
+ # There's too much variability in the post URIs, and we cannot be exhaustive,
107
+ # so we do with the simple approach:
98
108
  if uri.startswith("at://"):
99
- for split in known_splits:
100
- if split in uri:
101
- return uri[5:].split(split)
109
+ # Using maxsplit=3 to avoid issues if future uris contain more slashes
110
+ author_did, _, post_did = uri[5:].split("/", 3)
111
+ return author_did, post_did
102
112
 
103
113
  raise BlueskyPayloadError(source or uri, f"{uri} is not a usual Bluesky post uri")
104
114
 
@@ -112,18 +122,24 @@ def format_media_url(user_did, media_cid, mime_type, source):
112
122
  if mime_type.startswith("image"):
113
123
  media_url = f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@{media_type}"
114
124
  media_thumb = f"https://cdn.bsky.app/img/feed_thumbnail/plain/{user_did}/{media_cid}@{media_type}"
115
- elif mime_type.startswith("video"):
125
+ elif (
126
+ mime_type.startswith("video")
127
+ or mime_type == "application/xml"
128
+ or mime_type == "*/*"
129
+ ):
116
130
  media_url = f"https://video.bsky.app/watch/{user_did}/{media_cid}/playlist.m3u8"
117
131
  media_thumb = (
118
132
  f"https://video.bsky.app/watch/{user_did}/{media_cid}/thumbnail.jpg"
119
133
  )
120
- elif mime_type in ["application/octet-stream", "text/plain"]:
134
+ elif any(mt in mime_type for mt in ["octet-stream", "text/plain", "text/html"]):
121
135
  media_url = (
122
136
  f"https://cdn.bsky.app/img/feed_fullsize/plain/{user_did}/{media_cid}@jpeg"
123
137
  )
124
138
  media_thumb = (
125
139
  f"https://cdn.bsky.app/img/feed_thumbnail/plain/{user_did}/{media_cid}@jpeg"
126
140
  )
141
+ elif "empty" in mime_type:
142
+ media_url, media_thumb = "", ""
127
143
  else:
128
144
  raise BlueskyPayloadError(source, f"{mime_type} is an unusual media mimeType")
129
145
  return media_url, media_thumb
twitwi/exceptions.py CHANGED
@@ -21,4 +21,4 @@ class BlueskyPayloadError(TwitwiError):
21
21
  def __init__(self, source, message):
22
22
  self.source = source
23
23
  self.message = message
24
- super().__init__(f"Error while processing Bluesky post {source}:\n{message}")
24
+ super().__init__(f"Error while processing Bluesky post {source}.\n{message}")
twitwi/formatters.py CHANGED
@@ -52,7 +52,9 @@ def make_transform_into_csv_dict(plural_fields, boolean_fields):
52
52
 
53
53
 
54
54
  def make_format_as_csv_row(fields, plural_fields, boolean_fields):
55
- def format_field_for_csv(field, item, item_id=None, plural_separator="|"):
55
+ def format_field_for_csv(
56
+ field, item, item_id=None, plural_separator="|", allow_erroneous_plurals=False
57
+ ):
56
58
  if field == "id" and item_id is not None:
57
59
  return item_id
58
60
 
@@ -63,6 +65,11 @@ def make_format_as_csv_row(fields, plural_fields, boolean_fields):
63
65
  if field == "links":
64
66
  v = item.get("proper_links", v)
65
67
 
68
+ # Clean None values that may have slipped in, such as in the 'domains' field when
69
+ # normalizing this Bluesky post: https://bsky.app/profile/did:plc:cs5qjcmnntogoahrrsagmg2z/post/3lvqhn7raq62v
70
+ if allow_erroneous_plurals:
71
+ v = [element if element is not None else "" for element in v]
72
+
66
73
  return plural_separator.join(v)
67
74
 
68
75
  if field in boolean_fields:
@@ -70,10 +77,16 @@ def make_format_as_csv_row(fields, plural_fields, boolean_fields):
70
77
 
71
78
  return item.get(field, "")
72
79
 
73
- def format_item_as_csv_row(item, item_id=None, plural_separator="|"):
80
+ def format_item_as_csv_row(
81
+ item, item_id=None, plural_separator="|", allow_erroneous_plurals=False
82
+ ):
74
83
  return [
75
84
  format_field_for_csv(
76
- field, item, item_id=item_id, plural_separator=plural_separator
85
+ field,
86
+ item,
87
+ item_id=item_id,
88
+ plural_separator=plural_separator,
89
+ allow_erroneous_plurals=allow_erroneous_plurals,
77
90
  )
78
91
  for field in fields
79
92
  ]
twitwi/utils.py CHANGED
@@ -61,7 +61,9 @@ def get_dates(
61
61
  locale = UTC_TIMEZONE
62
62
 
63
63
  # Let's pray we never see a negative year...
64
- year_zero = date_str.startswith("0000")
64
+ year_zero = date_str.startswith("0000") or all(
65
+ c == "0" for c in date_str.split("-")[0]
66
+ )
65
67
 
66
68
  try:
67
69
  parsed_datetime = datetime.strptime(
@@ -84,26 +86,30 @@ def get_dates(
84
86
  utc_datetime = UTC_TIMEZONE.localize(parsed_datetime)
85
87
  locale_datetime = utc_datetime.astimezone(locale)
86
88
 
89
+ formatted_date_str = datetime.strftime(
90
+ locale_datetime,
91
+ FORMATTED_FULL_DATETIME_FORMAT
92
+ if source == "bluesky"
93
+ else FORMATTED_TWEET_DATETIME_FORMAT,
94
+ )
95
+
87
96
  timestamp = int(utc_datetime.timestamp())
88
97
 
89
98
  if year_zero:
90
99
  # Subtract one year (year 0001 is not a leap year) in seconds
91
100
  timestamp -= 31536000
101
+ # The year is rebuilt with split because, on Ubuntu, datetime.strftime on a year with fewer than 4 digits
102
+ # only returns 1 digit for year 0 (e.g. "0-05-12...") instead of 4 digits ("0000-05-12..."),
103
+ # whereas on macOS and Windows it returns 4 digits.
104
+ formatted_date_str = "0000-" + formatted_date_str.split("-", 1)[1]
92
105
 
93
106
  if millisecond_timestamp:
94
107
  timestamp *= 1000
95
108
  timestamp += utc_datetime.microsecond / 1000
96
109
 
97
- formatted_date_str = datetime.strftime(
98
- locale_datetime,
99
- FORMATTED_FULL_DATETIME_FORMAT
100
- if source == "bluesky"
101
- else FORMATTED_TWEET_DATETIME_FORMAT,
102
- )
103
-
104
110
  return (
105
111
  int(timestamp),
106
- formatted_date_str if not year_zero else "0" + formatted_date_str[1:],
112
+ formatted_date_str,
107
113
  )
108
114
 
109
115
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: twitwi
3
- Version: 0.23.0
3
+ Version: 0.24.0
4
4
  Summary: A collection of Twitter-related helper functions for python.
5
5
  Home-page: http://github.com/medialab/twitwi
6
6
  Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
@@ -260,7 +260,7 @@ List of a Bluesky user profile's normalized field names. Useful to declare heade
260
260
 
261
261
  ### PARTIAL_PROFILE_FIELDS
262
262
 
263
- List of a Bluesky user partial profile's (retrieved from [`app.bsky.graph.getFollowers` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-graph-get-followers#responses) for example) normalized field names. Useful to declare headers with csv writers. Be careful not to confuse with [PROFILE_FIELDS](#profile_fields) which correspond to the full version of the profile data, retrieved from [`app.bsky.actor.getProfiles` HTTP endpoint](docs.bsky.app/docs/api/app-bsky-actor-get-profiles#responses) for example.
263
+ List of the normalized field names of a Bluesky user's partial profile (retrieved from the [`app.bsky.graph.getFollowers` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-graph-get-followers#responses), for example). Useful to declare headers with csv writers. Be careful not to confuse it with [PROFILE_FIELDS](#profile_fields), which corresponds to the full version of the profile data, retrieved from the [`app.bsky.actor.getProfiles` HTTP endpoint](https://docs.bsky.app/docs/api/app-bsky-actor-get-profiles#responses), for example.
264
264
 
265
265
  ### POST_FIELDS
266
266
 
@@ -277,7 +277,7 @@ Will return datetimes as UTC but can take an optional second `locale` argument a
277
277
  * **data** *(dict)*: user profile data payload coming from Twitter API v1.1 or v2.
278
278
  * **locale** *(pytz.timezone as str, optional)*: timezone used to convert dates. If not given, will default to UTC.
279
279
  * **pure** *(bool, optional)*: whether to allow the function to mutate its original `data` argument. Defaults to `True`.
280
-
280
+
281
281
  ### normalize_tweet
282
282
 
283
283
  Function taking a nested dict describing a tweet from Twitter's JSON payload (API v1.1) and returning a flat "normalized" dict composed of all [TWEET_FIELDS](#tweet_fields) keys.
@@ -1,22 +1,22 @@
1
1
  test/bluesky/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- test/bluesky/formatters_test.py,sha256=dMpRV_IuStZAnXhJGKnYsi0tb4BaSTSU4JVfihU1aQs,5002
2
+ test/bluesky/formatters_test.py,sha256=kUXoLNEep-mGRwLN0y5DqB9pAorV0PkVKMm_uVIvAQQ,5100
3
3
  test/bluesky/normalizers_test.py,sha256=R4NziqErGW5MBdQEZ1vNxLGNRvJTyGnXfqo0v5gBCgw,5662
4
4
  twitwi/__init__.py,sha256=y0bAx9gE3THtlWE1YpXDIhGwqJ5_I8DCStWyyiiXJkw,1095
5
5
  twitwi/anonymizers.py,sha256=nkl6HL1BWLz00wJ060XSbqjN5JF8pvcpEPnRXt70TUY,1588
6
6
  twitwi/constants.py,sha256=fvqCngJIGyz5CpdVWbcAfjmE3_kvcx9giN0rEljL7OU,16001
7
- twitwi/exceptions.py,sha256=OCIDagu2ErDyOGWunRBCK3O62TnzFpIMQ9gS8l9EALQ,696
8
- twitwi/formatters.py,sha256=yn14AsrGAUw8rShOnYJvoMbzdWpfTeSs0P0ZPNTwhLU,3142
7
+ twitwi/exceptions.py,sha256=xUikeIRmFcptQFlKGKXkbH9vbcQlQL3sviknhvSTcmw,696
8
+ twitwi/formatters.py,sha256=pwI4UYPDFUzjRPE9B36k8tK-Va-k0HFLwvmc8aIc8P0,3681
9
9
  twitwi/normalizers.py,sha256=CWUK-XwhcEjLDjWH_qb6E03WZKsbIcwiRAVUjwXKQho,28438
10
- twitwi/utils.py,sha256=ruyqTx9JELRiE4-Svhaeo02KrsdHrrHJNqbGRWMmuAs,4421
10
+ twitwi/utils.py,sha256=PPmbeMlKbHMTg07PgI4A0HRZw2QGuvCOGcP_FtqMyHQ,4774
11
11
  twitwi/bluesky/__init__.py,sha256=SqeHZUzL2U9UpL3EB33vaowQWaKXSPkvsAkasRqmFpY,694
12
12
  twitwi/bluesky/constants.py,sha256=CPkTIrDwyRWpkFTbaee1oFm_LWGj2WIC7A6xEGqDGB4,573
13
13
  twitwi/bluesky/formatters.py,sha256=L_yROAPcBECifCGiFAGYFJwLq6re8UlJNoZ7R2DXm5g,1025
14
- twitwi/bluesky/normalizers.py,sha256=AsOX3d4FsMn-GPvo-0oA7cZQwqAxQNbLq1ajbnXe7bk,33976
14
+ twitwi/bluesky/normalizers.py,sha256=m4oNWJt8eZK2iVREPIKC42yw3YNpZo3pf4OQGZz_1i8,48611
15
15
  twitwi/bluesky/types.py,sha256=INe6R8eOqrOooWn25dtk61-Wqd_pUDwb737R7jY_vkc,13915
16
- twitwi/bluesky/utils.py,sha256=mFL1h_Mqay66UGEUlzweO_0TzbqS51oNE2TKoT2xf-4,3969
17
- twitwi-0.23.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
18
- twitwi-0.23.0.dist-info/METADATA,sha256=05Mq7RsXYLpVK4aTX3zAUMcPYdpd8UBPOc81Z9_FYQw,21365
19
- twitwi-0.23.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- twitwi-0.23.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
21
- twitwi-0.23.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
22
- twitwi-0.23.0.dist-info/RECORD,,
16
+ twitwi/bluesky/utils.py,sha256=zIofl7UHmIr0JgjoXRK3ekovkri3CVOvQvo8PmFrWGg,4895
17
+ twitwi-0.24.0.dist-info/licenses/LICENSE.txt,sha256=Ddg_PcGnl0qd2167o2dheCjE_rCZJOoBxjJnJhhOpX4,1099
18
+ twitwi-0.24.0.dist-info/METADATA,sha256=4cGwKAsqA9kXkG713fx0lLfoCb2znbLiTsqm-n_wI4g,21365
19
+ twitwi-0.24.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
20
+ twitwi-0.24.0.dist-info/top_level.txt,sha256=TaKyGU7j_EVbP5KI0UD6qjbaKv2Qn0OrkfUQ29a04kg,12
21
+ twitwi-0.24.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
22
+ twitwi-0.24.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5