twitwi 0.20.0__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,101 @@
1
+ import csv
2
+ from io import StringIO
3
+ from twitwi.bluesky import (
4
+ format_profile_as_csv_row,
5
+ format_post_as_csv_row,
6
+ transform_profile_into_csv_dict,
7
+ transform_post_into_csv_dict,
8
+ )
9
+ from twitwi.bluesky.constants import PROFILE_FIELDS, POST_FIELDS
10
+ from test.utils import get_json_resource, open_resource
11
+
12
+
13
# Set to True to regenerate test results (reset to False before committing,
# otherwise the comparisons below trivially pass against freshly written files)
OVERWRITE_TESTS = False
15
+
16
+
17
class TestFormatters:
    # CSV formatting tests for the Bluesky profile & post helpers: serialize
    # the normalized JSON fixtures and compare against the committed CSV
    # export resources.

    def test_format_profile_as_csv_row(self):
        profiles = get_json_resource("bluesky-normalized-profiles.json")

        output = StringIO(newline=None)
        rows = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
        rows.writerow(PROFILE_FIELDS)
        rows.writerows(format_profile_as_csv_row(profile) for profile in profiles)

        if OVERWRITE_TESTS:
            with open("test/resources/bluesky-profiles-export.csv", "w") as f:
                f.write(output.getvalue())

        output.seek(0)
        with open_resource("bluesky-profiles-export.csv") as f:
            assert list(csv.reader(output)) == list(csv.reader(f))

    def test_transform_profile_into_csv_dict(self):
        profiles = get_json_resource("bluesky-normalized-profiles.json")

        output = StringIO(newline=None)
        rows = csv.DictWriter(
            output,
            fieldnames=PROFILE_FIELDS,
            extrasaction="ignore",
            restval="",
            quoting=csv.QUOTE_MINIMAL,
        )
        rows.writeheader()

        for profile in profiles:
            # NOTE: the transform mutates the dict in place before writing
            transform_profile_into_csv_dict(profile)
            rows.writerow(profile)

        output.seek(0)
        with open_resource("bluesky-profiles-export.csv") as f:
            assert list(csv.DictReader(output)) == list(csv.DictReader(f))

    def test_format_post_as_csv_row(self):
        post_batches = get_json_resource("bluesky-normalized-posts.json")

        output = StringIO(newline=None)
        rows = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
        rows.writerow(POST_FIELDS)
        for batch in post_batches:
            rows.writerows(format_post_as_csv_row(post) for post in batch)

        if OVERWRITE_TESTS:
            with open("test/resources/bluesky-posts-export.csv", "w") as f:
                f.write(output.getvalue())

        output.seek(0)
        with open_resource("bluesky-posts-export.csv") as f:
            assert list(csv.reader(output)) == list(csv.reader(f))

    def test_transform_post_into_csv_dict(self):
        post_batches = get_json_resource("bluesky-normalized-posts.json")

        output = StringIO(newline=None)
        rows = csv.DictWriter(
            output,
            fieldnames=POST_FIELDS,
            extrasaction="ignore",
            restval="",
            quoting=csv.QUOTE_MINIMAL,
        )
        rows.writeheader()

        for batch in post_batches:
            for post in batch:
                # NOTE: the transform mutates the dict in place before writing
                transform_post_into_csv_dict(post)
                rows.writerow(post)

        output.seek(0)
        with open_resource("bluesky-posts-export.csv") as f:
            assert list(csv.DictReader(output)) == list(csv.DictReader(f))
@@ -0,0 +1,130 @@
1
+ # =============================================================================
2
+ # Twitwi Bluesky Normalizers Unit Tests
3
+ # =============================================================================
4
+ from functools import partial
5
+ from pytz import timezone
6
+ from copy import deepcopy
7
+
8
+ from twitwi.bluesky import normalize_profile, normalize_post
9
+
10
+ from test.utils import get_json_resource
11
+
12
+
13
# Set to True to regenerate test results (reset to False before committing,
# otherwise the comparisons below trivially pass against freshly written files)
OVERWRITE_TESTS = False
15
+
16
+
17
# Stable timestamp used to replace the run-dependent collection_time when
# regenerating fixtures
FAKE_COLLECTION_TIME = "2025-01-01T00:00:00.000000"


def set_fake_collection_time(dico):
    # Overwrite the dict's "collection_time" entry in place (when present)
    # with a stable fake value, and return the same dict for chaining.
    if "collection_time" not in dico:
        return dico

    dico["collection_time"] = FAKE_COLLECTION_TIME

    return dico
22
+
23
+
24
def compare_dicts(_id, d1, d2, ignore_fields=None):
    """Assert that the produced dict ``d1`` matches the expected dict ``d2``.

    Every key of ``d2`` — except ``collection_time``, which is never
    compared — must hold the same value in ``d1``, and every key of ``d1``
    must exist in ``d2``. Keys listed in ``ignore_fields`` are skipped on
    both sides. ``_id`` is only used to label assertion messages.
    """
    # FIX: use a None sentinel instead of a mutable default argument
    # (``ignore_fields=[]``), the classic shared-default pitfall.
    ignored = list(ignore_fields) if ignore_fields else []

    for k in d2.keys():
        if k not in ignored + ["collection_time"]:
            assert d1[k] == d2[k], (
                'Different value for key "%s" with payload data for "%s"'
                % (
                    k,
                    _id,
                )
            )

    for k in d1.keys():
        if k not in ignored:
            assert k in d2, 'Missing key "%s" with payload data for "%s"' % (k, _id)
38
+
39
+
40
class TestNormalizers:
    # Unit tests for twitwi.bluesky normalize_profile / normalize_post,
    # comparing results against the committed normalized JSON fixtures.

    def test_normalize_profile(self):
        tz = timezone("Europe/Paris")

        profiles = get_json_resource("bluesky-profiles.json")
        fn = partial(normalize_profile, locale=tz)

        if OVERWRITE_TESTS:
            from test.utils import dump_json_resource

            # Pin collection_time to a stable value before dumping fixtures
            normalized_profiles = [
                set_fake_collection_time(fn(profile)) for profile in profiles
            ]
            dump_json_resource(normalized_profiles, "bluesky-normalized-profiles.json")

        expected = get_json_resource("bluesky-normalized-profiles.json")

        for idx, profile in enumerate(profiles):
            result = fn(profile)
            assert isinstance(result, dict)
            assert "collection_time" in result and isinstance(
                result["collection_time"], str
            )

            compare_dicts(profile["handle"], result, expected[idx])

    def test_normalize_profile_should_not_mutate(self):
        profile = get_json_resource("bluesky-profiles.json")[0]

        original_arg = deepcopy(profile)

        normalize_profile(profile)

        assert profile == original_arg

    def test_normalize_post(self):
        tz = timezone("Europe/Paris")

        posts = get_json_resource("bluesky-posts.json")
        fn = partial(normalize_post, locale=tz)

        if OVERWRITE_TESTS:
            from test.utils import dump_json_resource

            # Pin collection_time to a stable value before dumping fixtures
            normalized_posts = [
                [
                    set_fake_collection_time(p)
                    for p in fn(post, extract_referenced_posts=True)
                ]
                for post in posts
            ]
            dump_json_resource(normalized_posts, "bluesky-normalized-posts.json")

        expected = get_json_resource("bluesky-normalized-posts.json")

        # With referenced tweets
        for idx, post in enumerate(posts):
            result = fn(post, extract_referenced_posts=True)
            assert isinstance(result, list)
            assert set(p["uri"] for p in result) == set(p["uri"] for p in expected[idx])
            for idx2, p in enumerate(result):
                assert "collection_time" in p and isinstance(p["collection_time"], str)

                # The raw payload either wraps the post under a "post" key or
                # is the post itself, depending on its source
                if "post" in post:
                    uri = post["post"]["uri"]
                else:
                    uri = post["uri"]
                compare_dicts(uri, p, expected[idx][idx2])

        # With single output
        for idx, post in enumerate(posts):
            result = fn(post)

            assert isinstance(result, dict)

            # FIX: this used to read ``p["uri"]`` — a stale loop variable
            # leaking from the "With referenced tweets" loop above — so the
            # assertion label pointed at the wrong post. Use the normalized
            # result's own uri instead.
            _id = result["uri"]
            compare_dicts(_id, result, expected[idx][-1])

        # With custom collection_source
        for post in posts:
            result = fn(post, collection_source="unit_test")

            assert result["collected_via"] == ["unit_test"]

    def test_normalize_post_should_not_mutate(self):
        post = get_json_resource("bluesky-posts.json")[0]

        original_arg = deepcopy(post)

        normalize_post(post)

        assert post == original_arg

    def test_normalize_post_should_be_normalized_across_sources(self):
        # TODO: handle same post from different sources (search, get_post and user_feed)
        pass

    def test_badly_formatted_posts_payload(self):
        # TODO: exercise malformed post payloads
        pass
twitwi/__init__.py CHANGED
@@ -8,8 +8,9 @@ from twitwi.formatters import (
8
8
  format_tweet_as_csv_row,
9
9
  transform_user_into_csv_dict,
10
10
  format_user_as_csv_row,
11
- apply_tcat_format
11
+ apply_tcat_format,
12
12
  )
13
+
13
14
  # NOTE: should we drop this from public exports?
14
15
  from twitwi.utils import (
15
16
  get_dates,
@@ -20,5 +21,21 @@ from twitwi.utils import (
20
21
  from twitwi.normalizers import (
21
22
  normalize_tweet,
22
23
  normalize_user,
23
- normalize_tweets_payload_v2
24
+ normalize_tweets_payload_v2,
24
25
  )
26
+
27
+ __all__ = [
28
+ "anonymize_normalized_tweet",
29
+ "transform_tweet_into_csv_dict",
30
+ "format_tweet_as_csv_row",
31
+ "transform_user_into_csv_dict",
32
+ "format_user_as_csv_row",
33
+ "apply_tcat_format",
34
+ "get_dates",
35
+ "custom_normalize_url",
36
+ "get_timestamp_from_id",
37
+ "get_dates_from_id",
38
+ "normalize_tweet",
39
+ "normalize_user",
40
+ "normalize_tweets_payload_v2",
41
+ ]
twitwi/anonymizers.py CHANGED
@@ -8,13 +8,12 @@ def redact_quoted_text(text: str) -> str:
8
8
 
9
9
 
10
10
  def redact_rt_text(text: str) -> str:
11
- return 'RT: ' + text.split(': ', 1)[1]
11
+ return "RT: " + text.split(": ", 1)[1]
12
12
 
13
13
 
14
14
  FIELDS_TO_DELETE = [
15
15
  # The tweet's url leaks the user
16
16
  "url",
17
-
18
17
  # User's place
19
18
  "lat",
20
19
  "lng",
@@ -23,7 +22,6 @@ FIELDS_TO_DELETE = [
23
22
  "place_name",
24
23
  "place_type",
25
24
  "user_location",
26
-
27
25
  # User info
28
26
  "user_created_at",
29
27
  "user_description",
@@ -34,16 +32,13 @@ FIELDS_TO_DELETE = [
34
32
  "user_timestamp_utc",
35
33
  "user_url",
36
34
  "user_verified",
37
-
38
35
  # Retweeted user info
39
36
  "retweeted_timestamp_utc",
40
37
  "retweeted_user",
41
38
  "retweeted_user_id",
42
-
43
39
  # Replied user info
44
40
  "to_userid",
45
41
  "to_username",
46
-
47
42
  # Quoted user info
48
43
  "quoted_user",
49
44
  "quoted_user_id",
@@ -56,14 +51,13 @@ FIELDS_TO_DELETE = [
56
51
  # NOTE: we do not redact mentions either.
57
52
  # NOTE: we also don't redact replies.
58
53
  def anonymize_normalized_tweet(normalized_tweet) -> None:
59
-
60
54
  # Text mangling
61
55
  text = normalized_tweet["text"]
62
56
 
63
- if normalized_tweet.get('retweeted_id', None) is not None:
57
+ if normalized_tweet.get("retweeted_id", None) is not None:
64
58
  normalized_tweet["text"] = redact_rt_text(text)
65
59
 
66
- elif normalized_tweet.get('quoted_id', None) is not None:
60
+ elif normalized_tweet.get("quoted_id", None) is not None:
67
61
  normalized_tweet["text"] = redact_quoted_text(text)
68
62
 
69
63
  for field in FIELDS_TO_DELETE:
@@ -0,0 +1,16 @@
1
# Public API of the twitwi.bluesky subpackage: re-export the normalizers and
# the CSV formatters so callers can import them from ``twitwi.bluesky``
# directly.
from twitwi.bluesky.normalizers import normalize_profile, normalize_post
from twitwi.bluesky.formatters import (
    transform_profile_into_csv_dict,
    format_profile_as_csv_row,
    transform_post_into_csv_dict,
    format_post_as_csv_row,
)

# Explicit export list declaring the subpackage's public names
__all__ = [
    "transform_profile_into_csv_dict",
    "format_profile_as_csv_row",
    "transform_post_into_csv_dict",
    "format_post_as_csv_row",
    "normalize_profile",
    "normalize_post",
]
@@ -0,0 +1,19 @@
1
+ from typing import List, Optional
2
+
3
+ from twitwi.bluesky.types import BlueskyProfile, BlueskyPost
4
+
5
# Column orders for CSV exports, derived from the TypedDict-style annotations
# of the Bluesky profile and post types.
PROFILE_FIELDS = [field for field in BlueskyProfile.__annotations__]

POST_FIELDS = [field for field in BlueskyPost.__annotations__]

# Post fields annotated as lists of strings
POST_PLURAL_FIELDS = [
    field
    for field, annotation in BlueskyPost.__annotations__.items()
    if annotation in (List[str], Optional[List[str]])
]

# Post fields annotated as booleans
POST_BOOLEAN_FIELDS = [
    field
    for field, annotation in BlueskyPost.__annotations__.items()
    if annotation in (bool, Optional[bool])
]
@@ -0,0 +1,29 @@
1
+ from twitwi.formatters import make_transform_into_csv_dict, make_format_as_csv_row
2
+ from twitwi.bluesky.constants import (
3
+ PROFILE_FIELDS,
4
+ POST_FIELDS,
5
+ POST_PLURAL_FIELDS,
6
+ POST_BOOLEAN_FIELDS,
7
+ )
8
+
9
+
10
# Post CSV helpers, built from the generic twitwi formatter factories with
# the post's list-valued and boolean-valued field names.
transform_post_into_csv_dict = make_transform_into_csv_dict(
    POST_PLURAL_FIELDS, POST_BOOLEAN_FIELDS
)

format_post_as_csv_row = make_format_as_csv_row(
    POST_FIELDS, POST_PLURAL_FIELDS, POST_BOOLEAN_FIELDS
)


# Profile CSV helpers: no plural or boolean field names are passed for
# profiles (empty lists).
transform_profile_into_csv_dict = make_transform_into_csv_dict([], [])

format_profile_as_csv_row = make_format_as_csv_row(PROFILE_FIELDS, [], [])


# Explicit export list declaring the module's public names
__all__ = [
    "transform_post_into_csv_dict",
    "format_post_as_csv_row",
    "transform_profile_into_csv_dict",
    "format_profile_as_csv_row",
]