twitwi 0.21.0__tar.gz → 0.21.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {twitwi-0.21.0/twitwi.egg-info → twitwi-0.21.2}/PKG-INFO +3 -3
  2. {twitwi-0.21.0 → twitwi-0.21.2}/README.md +1 -2
  3. {twitwi-0.21.0 → twitwi-0.21.2}/setup.py +2 -2
  4. {twitwi-0.21.0 → twitwi-0.21.2}/test/bluesky/normalizers_test.py +12 -2
  5. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/normalizers.py +86 -22
  6. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/types.py +16 -16
  7. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/utils.py +8 -1
  8. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/constants.py +0 -1
  9. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/utils.py +6 -5
  10. {twitwi-0.21.0 → twitwi-0.21.2/twitwi.egg-info}/PKG-INFO +3 -3
  11. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/requires.txt +1 -0
  12. {twitwi-0.21.0 → twitwi-0.21.2}/LICENSE.txt +0 -0
  13. {twitwi-0.21.0 → twitwi-0.21.2}/setup.cfg +0 -0
  14. {twitwi-0.21.0 → twitwi-0.21.2}/test/bluesky/__init__.py +0 -0
  15. {twitwi-0.21.0 → twitwi-0.21.2}/test/bluesky/formatters_test.py +0 -0
  16. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/__init__.py +0 -0
  17. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/anonymizers.py +0 -0
  18. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/__init__.py +0 -0
  19. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/constants.py +0 -0
  20. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/bluesky/formatters.py +0 -0
  21. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/exceptions.py +0 -0
  22. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/formatters.py +0 -0
  23. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi/normalizers.py +0 -0
  24. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/SOURCES.txt +0 -0
  25. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/dependency_links.txt +0 -0
  26. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/top_level.txt +0 -0
  27. {twitwi-0.21.0 → twitwi-0.21.2}/twitwi.egg-info/zip-safe +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: twitwi
3
- Version: 0.21.0
3
+ Version: 0.21.2
4
4
  Summary: A collection of Twitter-related helper functions for python.
5
5
  Home-page: http://github.com/medialab/twitwi
6
6
  Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
@@ -12,6 +12,7 @@ Description-Content-Type: text/markdown
12
12
  License-File: LICENSE.txt
13
13
  Requires-Dist: pytz>=2019.3
14
14
  Requires-Dist: ural>=0.31.1
15
+ Requires-Dist: python-dateutil>=2.9.0
15
16
  Dynamic: author
16
17
  Dynamic: author-email
17
18
  Dynamic: description
@@ -94,7 +95,7 @@ for post_data in posts_payload_from_API:
94
95
 
95
96
  # Then, saving normalized posts into a CSV using DictWriter:
96
97
 
97
- from csv import DictWriter
98
+ import csv
98
99
  from twitwi.bluesky.constants import POST_FIELDS
99
100
  from twitwi.bluesky import transform_post_into_csv_dict
100
101
 
@@ -107,7 +108,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
107
108
 
108
109
  # Or using the basic CSV writer:
109
110
 
110
- from csv import writer
111
111
  from twitwi.bluesky import format_post_as_csv_row
112
112
 
113
113
  with open("normalized_bluesky_posts.csv", "w") as f:
@@ -68,7 +68,7 @@ for post_data in posts_payload_from_API:
68
68
 
69
69
  # Then, saving normalized posts into a CSV using DictWriter:
70
70
 
71
- from csv import DictWriter
71
+ import csv
72
72
  from twitwi.bluesky.constants import POST_FIELDS
73
73
  from twitwi.bluesky import transform_post_into_csv_dict
74
74
 
@@ -81,7 +81,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
81
81
 
82
82
  # Or using the basic CSV writer:
83
83
 
84
- from csv import writer
85
84
  from twitwi.bluesky import format_post_as_csv_row
86
85
 
87
86
  with open("normalized_bluesky_posts.csv", "w") as f:
@@ -5,7 +5,7 @@ with open("./README.md", "r") as f:
5
5
 
6
6
  setup(
7
7
  name="twitwi",
8
- version="0.21.0",
8
+ version="0.21.2",
9
9
  description="A collection of Twitter-related helper functions for python.",
10
10
  long_description=long_description,
11
11
  long_description_content_type="text/markdown",
@@ -17,6 +17,6 @@ setup(
17
17
  python_requires=">=3.8",
18
18
  packages=find_packages(exclude=["scripts", "test"]),
19
19
  package_data={"docs": ["README.md"]},
20
- install_requires=["pytz>=2019.3", "ural>=0.31.1"],
20
+ install_requires=["pytz>=2019.3", "ural>=0.31.1", "python-dateutil>=2.9.0"],
21
21
  zip_safe=True,
22
22
  )
@@ -15,6 +15,8 @@ OVERWRITE_TESTS = False
15
15
 
16
16
 
17
17
  FAKE_COLLECTION_TIME = "2025-01-01T00:00:00.000000"
18
+
19
+
18
20
  def set_fake_collection_time(dico):
19
21
  if "collection_time" in dico:
20
22
  dico["collection_time"] = FAKE_COLLECTION_TIME
@@ -47,7 +49,9 @@ class TestNormalizers:
47
49
  if OVERWRITE_TESTS:
48
50
  from test.utils import dump_json_resource
49
51
 
50
- normalized_profiles = [set_fake_collection_time(fn(profile)) for profile in profiles]
52
+ normalized_profiles = [
53
+ set_fake_collection_time(fn(profile)) for profile in profiles
54
+ ]
51
55
  dump_json_resource(normalized_profiles, "bluesky-normalized-profiles.json")
52
56
 
53
57
  expected = get_json_resource("bluesky-normalized-profiles.json")
@@ -79,7 +83,13 @@ class TestNormalizers:
79
83
  if OVERWRITE_TESTS:
80
84
  from test.utils import dump_json_resource
81
85
 
82
- normalized_posts = [[set_fake_collection_time(p) for p in fn(post, extract_referenced_posts=True)] for post in posts]
86
+ normalized_posts = [
87
+ [
88
+ set_fake_collection_time(p)
89
+ for p in fn(post, extract_referenced_posts=True)
90
+ ]
91
+ for post in posts
92
+ ]
83
93
  dump_json_resource(normalized_posts, "bluesky-normalized-posts.json")
84
94
 
85
95
  expected = get_json_resource("bluesky-normalized-posts.json")
@@ -15,6 +15,7 @@ from twitwi.bluesky.utils import (
15
15
  format_post_url,
16
16
  parse_post_url,
17
17
  parse_post_uri,
18
+ format_starterpack_url,
18
19
  format_media_url,
19
20
  )
20
21
  from twitwi.bluesky.types import BlueskyProfile, BlueskyPost
@@ -37,11 +38,11 @@ def normalize_profile(data: Dict, locale: Optional[str] = None) -> BlueskyProfil
37
38
  "did": data["did"],
38
39
  "url": format_profile_url(data["handle"]),
39
40
  "handle": data["handle"],
40
- "display_name": data["displayName"],
41
+ "display_name": data.get("displayName", ""),
41
42
  "created_at": created_at,
42
43
  "timestamp_utc": timestamp_utc,
43
44
  "description": data["description"],
44
- "avatar": data["avatar"],
45
+ "avatar": data.get("avatar", ""),
45
46
  "posts": data["postsCount"],
46
47
  "followers": data["followersCount"],
47
48
  "follows": data["followsCount"],
@@ -55,8 +56,13 @@ def normalize_profile(data: Dict, locale: Optional[str] = None) -> BlueskyProfil
55
56
 
56
57
 
57
58
  def prepare_native_gif_as_media(gif_data, user_did, source):
58
- media_cid = gif_data["thumb"]["ref"]["$link"]
59
- _, thumbnail = format_media_url(user_did, media_cid, "image/jpeg", source)
59
+ if "thumb" in gif_data:
60
+ media_cid = gif_data["thumb"]["ref"]["$link"]
61
+ _, thumbnail = format_media_url(user_did, media_cid, "image/jpeg", source)
62
+ else:
63
+ media_cid = ""
64
+ thumbnail = ""
65
+
60
66
  return {
61
67
  "id": media_cid,
62
68
  "type": "video/gif",
@@ -67,8 +73,12 @@ def prepare_native_gif_as_media(gif_data, user_did, source):
67
73
 
68
74
 
69
75
  def prepare_image_as_media(image_data):
76
+ if "ref" not in image_data["image"] or "$link" not in image_data["image"]["ref"]:
77
+ image_id = image_data["image"]["cid"]
78
+ else:
79
+ image_id = image_data["image"]["ref"]["$link"]
70
80
  return {
71
- "id": image_data["image"]["ref"]["$link"],
81
+ "id": image_id,
72
82
  "type": image_data["image"]["mimeType"],
73
83
  "alt": image_data["alt"],
74
84
  }
@@ -81,6 +91,20 @@ def prepare_video_as_media(video_data):
81
91
  }
82
92
 
83
93
 
94
+ def process_starterpack_card(embed_data, post):
95
+ # Warning: mutates post
96
+
97
+ card = embed_data.get("record", {})
98
+ creator_did, pack_did = parse_post_uri(embed_data["uri"])
99
+ post["card_link"] = format_starterpack_url(
100
+ embed_data.get("creator", {}).get("handle") or creator_did, pack_did
101
+ )
102
+ post["card_title"] = card.get("name", "")
103
+ post["card_description"] = card.get("description", "")
104
+ post["card_thumbnail"] = card.get("thumb", "")
105
+ return post
106
+
107
+
84
108
  def process_card_data(embed_data, post):
85
109
  # Warning: mutates post
86
110
 
@@ -122,10 +146,14 @@ def prepare_quote_data(embed_quote, card_data, post, links):
122
146
  break
123
147
 
124
148
  # Remove quoted link from post links
125
- links.remove(post["quoted_url"])
149
+ if post["quoted_url"] in links:
150
+ links.remove(post["quoted_url"])
126
151
 
127
152
  # Extract user handle from url
128
- post["quoted_user_handle"], _ = parse_post_url(post["quoted_url"], post["url"])
153
+ if "did:plc:" not in post["quoted_url"]:
154
+ post["quoted_user_handle"], _ = parse_post_url(
155
+ post["quoted_url"], post["url"]
156
+ )
129
157
 
130
158
  return (post, quoted_data, links)
131
159
 
@@ -156,7 +184,7 @@ def merge_nested_posts(referenced_posts, nested, source):
156
184
 
157
185
  @overload
158
186
  def normalize_post(
159
- data: Dict,
187
+ payload: Dict,
160
188
  locale: Optional[str] = ...,
161
189
  extract_referenced_posts: Literal[True] = ...,
162
190
  collection_source: Optional[str] = ...,
@@ -165,7 +193,7 @@ def normalize_post(
165
193
 
166
194
  @overload
167
195
  def normalize_post(
168
- data: Dict,
196
+ payload: Dict,
169
197
  locale: Optional[str] = ...,
170
198
  extract_referenced_posts: Literal[False] = ...,
171
199
  collection_source: Optional[str] = ...,
@@ -255,8 +283,8 @@ def normalize_post(
255
283
  )
256
284
 
257
285
  # Handle user metadata
258
- post["user_diplay_name"] = data["author"]["displayName"]
259
- post["user_avatar"] = data["author"]["avatar"]
286
+ post["user_diplay_name"] = data["author"].get("displayName", "")
287
+ post["user_avatar"] = data["author"].get("avatar", "")
260
288
  post["user_timestamp_utc"], post["user_created_at"] = get_dates(
261
289
  data["author"]["createdAt"], locale=locale, source="bluesky"
262
290
  )
@@ -288,7 +316,7 @@ def normalize_post(
288
316
  feat = facet["features"][0]
289
317
 
290
318
  # Hashtags
291
- if feat["$type"].endswith("#tag"):
319
+ if feat["$type"].endswith("#tag") or feat["$type"].endswith("#hashtag"):
292
320
  hashtags.add(feat["tag"].strip().lower())
293
321
 
294
322
  # Mentions
@@ -303,7 +331,11 @@ def normalize_post(
303
331
  byteStart = text.find(b"@", byteStart)
304
332
 
305
333
  handle = (
306
- text[byteStart + 1 : facet["index"]["byteEnd"] + byteStart - facet["index"]["byteStart"]]
334
+ text[
335
+ byteStart + 1 : facet["index"]["byteEnd"]
336
+ + byteStart
337
+ - facet["index"]["byteStart"]
338
+ ]
307
339
  .strip()
308
340
  .lower()
309
341
  .decode("utf-8")
@@ -312,15 +344,34 @@ def normalize_post(
312
344
 
313
345
  # Links
314
346
  elif feat["$type"].endswith("#link"):
347
+ # Handle native polls
348
+ if "https://poll.blue/" in feat["uri"]:
349
+ if feat["uri"].endswith("/0"):
350
+ links.add(custom_normalize_url(feat["uri"]))
351
+ text += b" %s" % feat["uri"].encode("utf-8")
352
+ continue
353
+
315
354
  links.add(custom_normalize_url(feat["uri"]))
355
+ # Check & fix occasional errored link positioning
356
+ # example: https://bsky.app/profile/ecrime.ch/post/3lqotmopayr23
357
+ byteStart = facet["index"]["byteStart"]
358
+ if b" " in text[byteStart : facet["index"]["byteEnd"]]:
359
+ byteStart = text.find(b"http", byteStart)
360
+
316
361
  links_to_replace.append(
317
362
  {
318
363
  "uri": feat["uri"].encode("utf-8"),
319
- "start": facet["index"]["byteStart"],
320
- "end": facet["index"]["byteEnd"],
364
+ "start": byteStart,
365
+ "end": byteStart
366
+ - facet["index"]["byteStart"]
367
+ + facet["index"]["byteEnd"],
321
368
  }
322
369
  )
323
370
 
371
+ elif feat["$type"].endswith("#bold"):
372
+ pass
373
+ elif feat["$type"].endswith("#option"):
374
+ pass
324
375
  else:
325
376
  raise BlueskyPayloadError(
326
377
  post["url"], "unusual record facet feature $type: %s" % feat
@@ -329,7 +380,10 @@ def normalize_post(
329
380
 
330
381
  # Rewrite full links within post's text
331
382
  for link in sorted(links_to_replace, key=lambda x: x["start"], reverse=True):
332
- text = text[: link["start"]] + link["uri"] + text[link["end"] :]
383
+ if link["start"] < 0:
384
+ text = text + b" " + link["uri"]
385
+ else:
386
+ text = text[: link["start"]] + link["uri"] + text[link["end"] :]
333
387
 
334
388
  # Handle thread info when applicable
335
389
  # Unfortunately posts' payload only provide at uris for these so we do not have the handles
@@ -399,11 +453,18 @@ def normalize_post(
399
453
  if embed["$type"].endswith(".video"):
400
454
  media_data.append(prepare_video_as_media(embed["video"]))
401
455
 
402
- # Quote
456
+ # Quote & Starter-packs
403
457
  if embed["$type"].endswith(".record"):
404
- post, quoted_data, links = prepare_quote_data(
405
- embed["record"], data.get("embed", {}).get("record"), post, links
406
- )
458
+ if "app.bsky.graph.starterpack" in embed["record"]["uri"]:
459
+ post = process_starterpack_card(
460
+ data.get("embed", {}).get("record"), post
461
+ )
462
+ if post["card_link"]:
463
+ extra_links.append(post["card_link"])
464
+ else:
465
+ post, quoted_data, links = prepare_quote_data(
466
+ embed["record"], data.get("embed", {}).get("record"), post, links
467
+ )
407
468
 
408
469
  # Quote with medias
409
470
  if embed["$type"].endswith(".recordWithMedia"):
@@ -478,11 +539,14 @@ def normalize_post(
478
539
 
479
540
  # Rewrite post's text to include links to medias within
480
541
  text += b" " + (
481
- media_thumb if media_type.startswith("video") and not media_type.endswith("/gif") else media_url
542
+ media_thumb
543
+ if media_type.startswith("video")
544
+ and not media_type.endswith("/gif")
545
+ else media_url
482
546
  ).encode("utf-8")
483
547
 
484
548
  # Process quotes
485
- if quoted_data:
549
+ if quoted_data and "value" in quoted_data:
486
550
  if quoted_data["cid"] != post["quoted_cid"]:
487
551
  raise BlueskyPayloadError(
488
552
  post["url"],
@@ -9,7 +9,7 @@ class BlueskyProfile(TypedDict):
9
9
  did: str # persistent long-term identifier of the account
10
10
  url: str # URL of the profile accessible on the web
11
11
  handle: str # updatable human-readable username of the account (usually like username.bsky.social or username.com)
12
- display_name: str # updatable human-readable name of the account
12
+ display_name: Optional[str] # updatable human-readable name of the account
13
13
  description: str # profile short description written by the user
14
14
  posts: int # total number of posts submitted by the user (at collection time)
15
15
  followers: int # total number of followers of the user (at collection time)
@@ -17,7 +17,7 @@ class BlueskyProfile(TypedDict):
17
17
  lists: int # total number of lists created by the user (at collection time)
18
18
  feedgens: int # total number of custom feeds created by the user (at collection time)
19
19
  starter_packs: int # total number of starter packs created by the user (at collection time)
20
- avatar: str # URL to the image serving as avatar to the user
20
+ avatar: Optional[str] # URL to the image serving as avatar to the user
21
21
  banner: str # URL to the image serving as profile banner to the user
22
22
  pinned_post_uri: Optional[str] # ATProto's internal URI to the post potentially pinned by the user to appear at the top of his posts on his profile
23
23
  created_at: str # datetime (potentially timezoned) of when the user created the account
@@ -63,8 +63,8 @@ class BlueskyPost(TypedDict):
63
63
  # user_follows: int # not available from posts payloads
64
64
  # user_lists: int # not available from posts payloads
65
65
  user_langs: List[str] # languages in which the author of the posts usually writes posts (declarative)
66
- user_avatar: str # URL to the image serving as avatar to the user who authored the post
67
- user_created_at: str # datetime (potentially timezoned) ofwhen the user who authored the post created the account
66
+ user_avatar: Optional[str] # URL to the image serving as avatar to the user who authored the post
67
+ user_created_at: str # datetime (potentially timezoned) of when the user who authored the post created the account
68
68
  user_timestamp_utc: int # Unix UTC timestamp of when the user who authored the post created the account
69
69
 
70
70
  # Parent post identifying fields
@@ -102,27 +102,27 @@ class BlueskyPost(TypedDict):
102
102
  quoted_user_handle: Optional[str] # updatable human-readable username of the account who authored the quoted post
103
103
  quoted_created_at: Optional[int] # datetime (potentially timezoned) of when the quoted post was submitted
104
104
  quoted_timestamp_utc: Optional[int] # Unix UTC timestamp of when the quoted post was submitted
105
- quoted_status: Optional[str] # empty or "detached" when the author of the quoted post intentionnally required the quoting post not to be accessible from their own
105
+ quoted_status: Optional[str] # empty or "detached" when the author of the quoted post intentionally required the quoting post not to appear in the list of this post's quotes
106
106
 
107
107
  # Embedded elements metadata fields
108
108
  links: List[str] # list of URLs of all links shared within the post (including potentially the embedded card detailed below, but not the link to a potential quoted post)
109
- domains: List[str] # list of domains of the links shared within the post (here a domain refer to a full hostname, including subdomains, for instance bluesky.com or medialab.sciencespo.fr)
109
+ domains: List[str] # list of domains of the links shared within the post (here a domain refers to a full hostname, including subdomains, for instance bluesky.com or medialab.sciencespo.fr)
110
110
  card_link: Optional[str] # URL of the link displayed as a card within the post if any
111
111
  card_title: Optional[str] # title of the webpage corresponding to the link displayed as a card within the post if any
112
112
  card_description: Optional[str] # description of the webpage corresponding to the link displayed as a card within the post if any
113
- card_thumbnail: Optional[str] # image displayed as an illustration of the webpage corresponding to the linkg diplayed as a card within the post if any
114
- media_urls: List[str] # list of URLs to all medias (images, videos, gifs) embedded in the post
115
- media_thumbnails: List[str] # list of URLs to small thumbnail version of all medias (images, videos, gifs) embedded in the post
116
- media_types: List[str] # MIME types (such as image/jpeg, image/gif, video/mp4, etc.) of all medias (images, videos, gifs) embedded in the post
117
- media_alt_texts: List[str] # description texts of all medias (images, videos, gifs) embedded in the post
118
- mentioned_user_dids: List[str] # list of all persistent long-term identifier of the accounts adressed within the post (does not include users to which the post replied)
119
- mentioned_user_handles: List[str] # list of all updatable human-readable username of the accounts adressed within the post (does not include users to which the post replied)
113
+ card_thumbnail: Optional[str] # image displayed as an illustration of the webpage corresponding to the link displayed as a card within the post if any
114
+ media_urls: List[str] # list of URLs to all media (images, videos, gifs) embedded in the post
115
+ media_thumbnails: List[str] # list of URLs to small thumbnail version of all media (images, videos, gifs) embedded in the post
116
+ media_types: List[str] # MIME types (such as image/jpeg, image/gif, video/mp4, etc.) of all media (images, videos, gifs) embedded in the post
117
+ media_alt_texts: List[str] # description texts of all media (images, videos, gifs) embedded in the post
118
+ mentioned_user_dids: List[str] # list of all persistent long-term identifiers of the accounts addressed within the post (does not include users to which the post replied)
119
+ mentioned_user_handles: List[str] # list of all updatable human-readable usernames of the accounts addressed within the post (does not include users to which the post replied)
120
120
  hashtags: List[str] # list of all unique lowercased hashtags found within the post's text
121
121
 
122
122
  # Conversation rules fields
123
123
  replies_rules: Optional[List[str]] # list of specific conversation rules set by the author for the current post (can be one or a combination of: disallow, allow_from_follower, allow_from_following, allow_from_mention, or allow_from_list: followed by a list of user DIDs)
124
124
  replies_rules_created_at: Optional[str] # datetime (potentially timezoned) of when the user set the replies_rules
125
- replies_rules_timestamp_utc: Optional[int] # Unix UTC timestamp of when the userset the replies_rules
125
+ replies_rules_timestamp_utc: Optional[int] # Unix UTC timestamp of when the user set the replies_rules
126
126
  hidden_replies_uris: Optional[List[str]] # list of ATProto's internal URIs to posts that replied to the post, but were intentionally marked as hidden by the current post's author
127
127
  # quotes_rule: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
128
128
  # quotes_rules_created_at: Optional[str] # not available from posts payloads, cf https://github.com/bluesky-social/atproto/issues/3712
@@ -131,5 +131,5 @@ class BlueskyPost(TypedDict):
131
131
 
132
132
  # Extra fields linked to the data collection and processing
133
133
  collection_time: Optional[str] # datetime (potentially timezoned) of when the data was normalized
134
- collected_via: Optional[List[str]] # extra field added by the normalization process to express how the data collection was ran, will be "quote" or "thread" when a post was grabbed as a referenced post within a really collected post using the "extract_referenced_posts" option of "normalize_post"
135
- match_query: Optional[bool] # extra field added by the normalization process to express whether the post was an intentionnally collected one or only came as a referenced post within a really collected post using the "extract_referenced_posts" option of "normalize_post"
134
+ collected_via: Optional[List[str]] # extra field added by the normalization process to express how the data collection was run, will be "quote" or "thread" when a post was grabbed as a referenced post within the originally collected post using the "extract_referenced_posts" option of "normalize_post"
135
+ match_query: Optional[bool] # extra field added by the normalization process to express whether the post was an intentionally collected one or only came as a referenced post within the originally collected post using the "extract_referenced_posts" option of "normalize_post"
@@ -17,7 +17,7 @@ valid_post_keys = [
17
17
  valid_record_keys = ["$type", "createdAt", "text"]
18
18
 
19
19
 
20
- valid_author_keys = ["did", "handle", "displayName", "avatar", "createdAt"]
20
+ valid_author_keys = ["did", "handle", "createdAt"]
21
21
 
22
22
 
23
23
  def validate_post_payload(data):
@@ -81,6 +81,9 @@ def parse_post_url(url, source):
81
81
  def parse_post_uri(uri, source=None):
82
82
  """Returns a tuple of (author_did, post_did) from an at:// post URI"""
83
83
 
84
+ if uri.startswith("at://") and "/app.bsky.graph.starterpack/" in uri:
85
+ return uri[5:].split("/app.bsky.graph.starterpack/")
86
+
84
87
  if not uri.startswith("at://") and "/app.bsky.feed.post/" not in uri:
85
88
  raise BlueskyPayloadError(
86
89
  source or uri, f"{uri} is not a usual Bluesky post uri"
@@ -88,6 +91,10 @@ def parse_post_uri(uri, source=None):
88
91
  return uri[5:].split("/app.bsky.feed.post/")
89
92
 
90
93
 
94
+ def format_starterpack_url(user_handle_or_did, record_did):
95
+ return f"https://bsky.app/starter-pack/{user_handle_or_did}/{record_did}"
96
+
97
+
91
98
  def format_media_url(user_did, media_cid, mime_type, source):
92
99
  media_type = mime_type.split("/")[1]
93
100
  if mime_type.startswith("image"):
@@ -6,7 +6,6 @@
6
6
  #
7
7
  SOURCE_DATETIME_FORMAT = "%a %b %d %H:%M:%S +0000 %Y"
8
8
  SOURCE_DATETIME_FORMAT_V2 = "%Y-%m-%dT%H:%M:%S.%fZ"
9
- SOURCE_DATETIME_FORMAT_V3 = "%Y-%m-%dT%H:%M:%SZ"
10
9
  FORMATTED_TWEET_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
11
10
 
12
11
  FORMATTED_FULL_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"
@@ -5,6 +5,7 @@
5
5
  # Miscellaneous utility functions.
6
6
  #
7
7
  from pytz import timezone
8
+ from dateutil.parser import parse as parse_date
8
9
  from ural import normalize_url, get_normalized_hostname
9
10
  from functools import partial
10
11
  from datetime import datetime
@@ -12,7 +13,6 @@ from datetime import datetime
12
13
  from twitwi.constants import (
13
14
  SOURCE_DATETIME_FORMAT,
14
15
  SOURCE_DATETIME_FORMAT_V2,
15
- SOURCE_DATETIME_FORMAT_V3,
16
16
  FORMATTED_TWEET_DATETIME_FORMAT,
17
17
  FORMATTED_FULL_DATETIME_FORMAT,
18
18
  CANONICAL_URL_KWARGS,
@@ -47,12 +47,13 @@ def get_dates(date_str, locale=None, source="v1"):
47
47
  SOURCE_DATETIME_FORMAT if source == "v1" else SOURCE_DATETIME_FORMAT_V2,
48
48
  )
49
49
  except ValueError as e:
50
- if source == "bluesky":
51
- parsed_datetime = datetime.strptime(date_str, SOURCE_DATETIME_FORMAT_V3)
52
- else:
50
+ if source != "bluesky":
53
51
  raise e
52
+ parsed_datetime = parse_date(date_str)
54
53
 
55
- utc_datetime = UTC_TIMEZONE.localize(parsed_datetime)
54
+ utc_datetime = parsed_datetime
55
+ if not parsed_datetime.tzinfo:
56
+ utc_datetime = UTC_TIMEZONE.localize(parsed_datetime)
56
57
  locale_datetime = utc_datetime.astimezone(locale)
57
58
 
58
59
  return (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: twitwi
3
- Version: 0.21.0
3
+ Version: 0.21.2
4
4
  Summary: A collection of Twitter-related helper functions for python.
5
5
  Home-page: http://github.com/medialab/twitwi
6
6
  Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
@@ -12,6 +12,7 @@ Description-Content-Type: text/markdown
12
12
  License-File: LICENSE.txt
13
13
  Requires-Dist: pytz>=2019.3
14
14
  Requires-Dist: ural>=0.31.1
15
+ Requires-Dist: python-dateutil>=2.9.0
15
16
  Dynamic: author
16
17
  Dynamic: author-email
17
18
  Dynamic: description
@@ -94,7 +95,7 @@ for post_data in posts_payload_from_API:
94
95
 
95
96
  # Then, saving normalized posts into a CSV using DictWriter:
96
97
 
97
- from csv import DictWriter
98
+ import csv
98
99
  from twitwi.bluesky.constants import POST_FIELDS
99
100
  from twitwi.bluesky import transform_post_into_csv_dict
100
101
 
@@ -107,7 +108,6 @@ with open("normalized_bluesky_posts.csv", "w") as f:
107
108
 
108
109
  # Or using the basic CSV writer:
109
110
 
110
- from csv import writer
111
111
  from twitwi.bluesky import format_post_as_csv_row
112
112
 
113
113
  with open("normalized_bluesky_posts.csv", "w") as f:
@@ -1,2 +1,3 @@
1
1
  pytz>=2019.3
2
2
  ural>=0.31.1
3
+ python-dateutil>=2.9.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes