twitwi 0.20.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/bluesky/__init__.py +0 -0
- test/bluesky/formatters_test.py +101 -0
- test/bluesky/normalizers_test.py +130 -0
- twitwi/__init__.py +19 -2
- twitwi/anonymizers.py +3 -9
- twitwi/bluesky/__init__.py +16 -0
- twitwi/bluesky/constants.py +19 -0
- twitwi/bluesky/formatters.py +29 -0
- twitwi/bluesky/normalizers.py +641 -0
- twitwi/bluesky/types.py +135 -0
- twitwi/bluesky/utils.py +103 -0
- twitwi/constants.py +324 -349
- twitwi/exceptions.py +8 -1
- twitwi/formatters.py +35 -37
- twitwi/normalizers.py +403 -339
- twitwi/utils.py +44 -17
- twitwi-0.21.0.dist-info/METADATA +435 -0
- twitwi-0.21.0.dist-info/RECORD +22 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/WHEEL +1 -1
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/top_level.txt +1 -0
- twitwi-0.20.0.dist-info/METADATA +0 -156
- twitwi-0.20.0.dist-info/RECORD +0 -13
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
- {twitwi-0.20.0.dist-info → twitwi-0.21.0.dist-info}/zip-safe +0 -0
test/bluesky/__init__.py
ADDED
File without changes
test/bluesky/formatters_test.py
ADDED
@@ -0,0 +1,101 @@
+import csv
+from io import StringIO
+from twitwi.bluesky import (
+    format_profile_as_csv_row,
+    format_post_as_csv_row,
+    transform_profile_into_csv_dict,
+    transform_post_into_csv_dict,
+)
+from twitwi.bluesky.constants import PROFILE_FIELDS, POST_FIELDS
+from test.utils import get_json_resource, open_resource
+
+
+# Set to True to regenerate test results
+OVERWRITE_TESTS = False
+
+
+class TestFormatters:
+    def test_format_profile_as_csv_row(self):
+        normalized_profiles = get_json_resource("bluesky-normalized-profiles.json")
+
+        buffer = StringIO(newline=None)
+        writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL)
+        writer.writerow(PROFILE_FIELDS)
+
+        for profile in normalized_profiles:
+            writer.writerow(format_profile_as_csv_row(profile))
+
+        if OVERWRITE_TESTS:
+            written = buffer.getvalue()
+
+            with open("test/resources/bluesky-profiles-export.csv", "w") as f:
+                f.write(written)
+
+        with open_resource("bluesky-profiles-export.csv") as f:
+            buffer.seek(0)
+            assert list(csv.reader(buffer)) == list(csv.reader(f))
+
+    def test_transform_profile_into_csv_dict(self):
+        normalized_profiles = get_json_resource("bluesky-normalized-profiles.json")
+
+        buffer = StringIO(newline=None)
+        writer = csv.DictWriter(
+            buffer,
+            fieldnames=PROFILE_FIELDS,
+            extrasaction="ignore",
+            restval="",
+            quoting=csv.QUOTE_MINIMAL,
+        )
+        writer.writeheader()
+
+        for profile in normalized_profiles:
+            transform_profile_into_csv_dict(profile)
+            writer.writerow(profile)
+
+        with open_resource("bluesky-profiles-export.csv") as f:
+            buffer.seek(0)
+            assert list(csv.DictReader(buffer)) == list(csv.DictReader(f))
+
+    def test_format_post_as_csv_row(self):
+        normalized_posts = get_json_resource("bluesky-normalized-posts.json")
+
+        buffer = StringIO(newline=None)
+        writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL)
+        writer.writerow(POST_FIELDS)
+
+        for source in normalized_posts:
+            for post in source:
+                writer.writerow(format_post_as_csv_row(post))
+
+        if OVERWRITE_TESTS:
+            written = buffer.getvalue()
+
+            with open("test/resources/bluesky-posts-export.csv", "w") as f:
+                f.write(written)
+
+        with open_resource("bluesky-posts-export.csv") as f:
+            buffer.seek(0)
+
+            assert list(csv.reader(buffer)) == list(csv.reader(f))
+
+    def test_transform_post_into_csv_dict(self):
+        normalized_posts = get_json_resource("bluesky-normalized-posts.json")
+
+        buffer = StringIO(newline=None)
+        writer = csv.DictWriter(
+            buffer,
+            fieldnames=POST_FIELDS,
+            extrasaction="ignore",
+            restval="",
+            quoting=csv.QUOTE_MINIMAL,
+        )
+        writer.writeheader()
+
+        for source in normalized_posts:
+            for post in source:
+                transform_post_into_csv_dict(post)
+                writer.writerow(post)
+
+        with open_resource("bluesky-posts-export.csv") as f:
+            buffer.seek(0)
+            assert list(csv.DictReader(buffer)) == list(csv.DictReader(f))
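This module (and the normalizers test below) regenerates its CSV fixtures under test/resources/ when OVERWRITE_TESTS is flipped to True, instead of only comparing against them. The package does not spell out a regeneration workflow; a plausible one would be:

# Hypothetical fixture-regeneration workflow: flip OVERWRITE_TESTS to True in the
# test modules, rerun them once so they rewrite test/resources/*, then flip it back.
import pytest

pytest.main(["test/bluesky/formatters_test.py", "test/bluesky/normalizers_test.py"])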
test/bluesky/normalizers_test.py
ADDED
@@ -0,0 +1,130 @@
+# =============================================================================
+# Twitwi Bluesky Normalizers Unit Tests
+# =============================================================================
+from functools import partial
+from pytz import timezone
+from copy import deepcopy
+
+from twitwi.bluesky import normalize_profile, normalize_post
+
+from test.utils import get_json_resource
+
+
+# Set to True to regenerate test results
+OVERWRITE_TESTS = False
+
+
+FAKE_COLLECTION_TIME = "2025-01-01T00:00:00.000000"
+def set_fake_collection_time(dico):
+    if "collection_time" in dico:
+        dico["collection_time"] = FAKE_COLLECTION_TIME
+    return dico
+
+
+def compare_dicts(_id, d1, d2, ignore_fields=[]):
+    for k in d2.keys():
+        if k not in ignore_fields + ["collection_time"]:
+            assert d1[k] == d2[k], (
+                'Different value for key "%s" with payload data for "%s"'
+                % (
+                    k,
+                    _id,
+                )
+            )
+
+    for k in d1.keys():
+        if k not in ignore_fields:
+            assert k in d2, 'Missing key "%s" with payload data for "%s"' % (k, _id)
+
+
+class TestNormalizers:
+    def test_normalize_profile(self):
+        tz = timezone("Europe/Paris")
+
+        profiles = get_json_resource("bluesky-profiles.json")
+        fn = partial(normalize_profile, locale=tz)
+
+        if OVERWRITE_TESTS:
+            from test.utils import dump_json_resource
+
+            normalized_profiles = [set_fake_collection_time(fn(profile)) for profile in profiles]
+            dump_json_resource(normalized_profiles, "bluesky-normalized-profiles.json")
+
+        expected = get_json_resource("bluesky-normalized-profiles.json")
+
+        for idx, profile in enumerate(profiles):
+            result = fn(profile)
+            assert isinstance(result, dict)
+            assert "collection_time" in result and isinstance(
+                result["collection_time"], str
+            )
+
+            compare_dicts(profile["handle"], result, expected[idx])
+
+    def test_normalize_profile_should_not_mutate(self):
+        profile = get_json_resource("bluesky-profiles.json")[0]
+
+        original_arg = deepcopy(profile)
+
+        normalize_profile(profile)
+
+        assert profile == original_arg
+
+    def test_normalize_post(self):
+        tz = timezone("Europe/Paris")
+
+        posts = get_json_resource("bluesky-posts.json")
+        fn = partial(normalize_post, locale=tz)
+
+        if OVERWRITE_TESTS:
+            from test.utils import dump_json_resource
+
+            normalized_posts = [[set_fake_collection_time(p) for p in fn(post, extract_referenced_posts=True)] for post in posts]
+            dump_json_resource(normalized_posts, "bluesky-normalized-posts.json")
+
+        expected = get_json_resource("bluesky-normalized-posts.json")
+
+        # With referenced tweets
+        for idx, post in enumerate(posts):
+            result = fn(post, extract_referenced_posts=True)
+            assert isinstance(result, list)
+            assert set(p["uri"] for p in result) == set(p["uri"] for p in expected[idx])
+            for idx2, p in enumerate(result):
+                assert "collection_time" in p and isinstance(p["collection_time"], str)
+
+                if "post" in post:
+                    uri = post["post"]["uri"]
+                else:
+                    uri = post["uri"]
+                compare_dicts(uri, p, expected[idx][idx2])
+
+        # With single output
+        for idx, post in enumerate(posts):
+            result = fn(post)
+
+            assert isinstance(result, dict)
+
+            _id = p["uri"]
+            compare_dicts(_id, result, expected[idx][-1])
+
+        # With custom collection_source
+        for post in posts:
+            result = fn(post, collection_source="unit_test")
+
+            assert result["collected_via"] == ["unit_test"]
+
+    def test_normalize_post_should_not_mutate(self):
+        post = get_json_resource("bluesky-posts.json")[0]
+
+        original_arg = deepcopy(post)
+
+        normalize_post(post)
+
+        assert post == original_arg
+
+    def test_normalize_post_should_be_normalized_across_sources(self):
+        # handle same post from different sources (search, get_post and user_feed)
+        pass
+
+    def test_badly_formatted_posts_payload(self):
+        pass
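As these tests exercise it, normalize_post(post, locale=...) returns a single dict by default, a list of dicts (the post plus any referenced posts) when extract_referenced_posts=True, and records collection_source in the collected_via field. A minimal sketch, where raw-post.json is only an illustrative file holding one raw Bluesky API payload:

import json
from pytz import timezone
from twitwi.bluesky import normalize_post

tz = timezone("Europe/Paris")

# hypothetical input: a single raw post payload saved from the Bluesky API
with open("raw-post.json") as f:
    raw_post = json.load(f)

single = normalize_post(raw_post, locale=tz)  # one normalized dict
bundle = normalize_post(raw_post, locale=tz, extract_referenced_posts=True)  # list of dicts
tagged = normalize_post(raw_post, locale=tz, collection_source="my_script")

assert isinstance(single, dict)
assert isinstance(bundle, list)
assert tagged["collected_via"] == ["my_script"]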
twitwi/__init__.py
CHANGED
@@ -8,8 +8,9 @@ from twitwi.formatters import (
     format_tweet_as_csv_row,
     transform_user_into_csv_dict,
     format_user_as_csv_row,
-    apply_tcat_format
+    apply_tcat_format,
 )
+
 # NOTE: should we drop this from public exports?
 from twitwi.utils import (
     get_dates,
@@ -20,5 +21,21 @@ from twitwi.utils import (
 from twitwi.normalizers import (
     normalize_tweet,
     normalize_user,
-    normalize_tweets_payload_v2
+    normalize_tweets_payload_v2,
 )
+
+__all__ = [
+    "anonymize_normalized_tweet",
+    "transform_tweet_into_csv_dict",
+    "format_tweet_as_csv_row",
+    "transform_user_into_csv_dict",
+    "format_user_as_csv_row",
+    "apply_tcat_format",
+    "get_dates",
+    "custom_normalize_url",
+    "get_timestamp_from_id",
+    "get_dates_from_id",
+    "normalize_tweet",
+    "normalize_user",
+    "normalize_tweets_payload_v2",
+]
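Pinning __all__ makes the package's wildcard-import surface explicit, so star imports only pick up the listed names:

from twitwi import *

normalize_tweet  # exported via __all__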
twitwi/anonymizers.py
CHANGED
@@ -8,13 +8,12 @@ def redact_quoted_text(text: str) -> str:


 def redact_rt_text(text: str) -> str:
-    return
+    return "RT: " + text.split(": ", 1)[1]


 FIELDS_TO_DELETE = [
     # The tweet's url leaks the user
     "url",
-
     # User's place
     "lat",
     "lng",
@@ -23,7 +22,6 @@ FIELDS_TO_DELETE = [
     "place_name",
     "place_type",
     "user_location",
-
     # User info
     "user_created_at",
     "user_description",
@@ -34,16 +32,13 @@ FIELDS_TO_DELETE = [
     "user_timestamp_utc",
     "user_url",
     "user_verified",
-
     # Retweeted user info
     "retweeted_timestamp_utc",
     "retweeted_user",
     "retweeted_user_id",
-
     # Replied user info
     "to_userid",
     "to_username",
-
     # Quoted user info
     "quoted_user",
     "quoted_user_id",
@@ -56,14 +51,13 @@ FIELDS_TO_DELETE = [
 # NOTE: we do not redact mentions either.
 # NOTE: we also don't redact replies.
 def anonymize_normalized_tweet(normalized_tweet) -> None:
-
     # Text mangling
     text = normalized_tweet["text"]

-    if normalized_tweet.get(
+    if normalized_tweet.get("retweeted_id", None) is not None:
         normalized_tweet["text"] = redact_rt_text(text)

-    elif normalized_tweet.get(
+    elif normalized_tweet.get("quoted_id", None) is not None:
         normalized_tweet["text"] = redact_quoted_text(text)

     for field in FIELDS_TO_DELETE:
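The previously stubbed redact_rt_text now keeps the retweeted text but drops the retweeted user's handle. A quick illustration of what the new one-liner does, assuming the normalized retweet text follows the usual "RT @user: ..." shape:

def redact_rt_text(text: str) -> str:
    return "RT: " + text.split(": ", 1)[1]

print(redact_rt_text("RT @someuser: the original message"))
# RT: the original message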
twitwi/bluesky/__init__.py
ADDED
@@ -0,0 +1,16 @@
+from twitwi.bluesky.normalizers import normalize_profile, normalize_post
+from twitwi.bluesky.formatters import (
+    transform_profile_into_csv_dict,
+    format_profile_as_csv_row,
+    transform_post_into_csv_dict,
+    format_post_as_csv_row,
+)
+
+__all__ = [
+    "transform_profile_into_csv_dict",
+    "format_profile_as_csv_row",
+    "transform_post_into_csv_dict",
+    "format_post_as_csv_row",
+    "normalize_profile",
+    "normalize_post",
+]
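The subpackage's __init__ exposes the whole Bluesky pipeline, from raw payload to CSV. A minimal end-to-end sketch mirroring the tests above (the JSON and CSV file names are placeholders, not part of the package):

import csv
import json
from pytz import timezone

from twitwi.bluesky import normalize_post, format_post_as_csv_row
from twitwi.bluesky.constants import POST_FIELDS

tz = timezone("Europe/Paris")

with open("raw-posts.json") as f:  # a hypothetical dump of raw Bluesky API payloads
    raw_posts = json.load(f)

with open("posts.csv", "w", newline="") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(POST_FIELDS)
    for raw in raw_posts:
        # extract_referenced_posts=True also yields any referenced posts as separate rows
        for post in normalize_post(raw, locale=tz, extract_referenced_posts=True):
            writer.writerow(format_post_as_csv_row(post))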
twitwi/bluesky/constants.py
ADDED
@@ -0,0 +1,19 @@
+from typing import List, Optional
+
+from twitwi.bluesky.types import BlueskyProfile, BlueskyPost
+
+PROFILE_FIELDS = list(BlueskyProfile.__annotations__.keys())
+
+POST_FIELDS = list(BlueskyPost.__annotations__.keys())
+
+POST_PLURAL_FIELDS = [
+    k
+    for k, v in BlueskyPost.__annotations__.items()
+    if v == List[str] or v == Optional[List[str]]
+]
+
+POST_BOOLEAN_FIELDS = [
+    k
+    for k, v in BlueskyPost.__annotations__.items()
+    if v is bool or v == Optional[bool]
+]
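PROFILE_FIELDS, POST_FIELDS and the plural/boolean subsets are all derived from the TypedDict annotations in twitwi.bluesky.types, whose contents are not shown in this diff. The toy TypedDict below only illustrates the derivation pattern; its fields are invented and are not the real BlueskyPost:

from typing import List, Optional, TypedDict


class ToyPost(TypedDict):
    # hypothetical fields, for illustration only
    uri: str
    text: str
    hashtags: List[str]
    media_urls: Optional[List[str]]
    is_reply: bool


FIELDS = list(ToyPost.__annotations__.keys())
# -> ['uri', 'text', 'hashtags', 'media_urls', 'is_reply']

PLURAL = [
    k
    for k, v in ToyPost.__annotations__.items()
    if v == List[str] or v == Optional[List[str]]
]
# -> ['hashtags', 'media_urls']

BOOLEAN = [k for k, v in ToyPost.__annotations__.items() if v is bool or v == Optional[bool]]
# -> ['is_reply']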
twitwi/bluesky/formatters.py
ADDED
@@ -0,0 +1,29 @@
+from twitwi.formatters import make_transform_into_csv_dict, make_format_as_csv_row
+from twitwi.bluesky.constants import (
+    PROFILE_FIELDS,
+    POST_FIELDS,
+    POST_PLURAL_FIELDS,
+    POST_BOOLEAN_FIELDS,
+)
+
+
+transform_post_into_csv_dict = make_transform_into_csv_dict(
+    POST_PLURAL_FIELDS, POST_BOOLEAN_FIELDS
+)
+
+format_post_as_csv_row = make_format_as_csv_row(
+    POST_FIELDS, POST_PLURAL_FIELDS, POST_BOOLEAN_FIELDS
+)
+
+
+transform_profile_into_csv_dict = make_transform_into_csv_dict([], [])
+
+format_profile_as_csv_row = make_format_as_csv_row(PROFILE_FIELDS, [], [])
+
+
+__all__ = [
+    "transform_post_into_csv_dict",
+    "format_post_as_csv_row",
+    "transform_profile_into_csv_dict",
+    "format_profile_as_csv_row",
+]
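The two produced callables cover both CSV writing styles used by the tests: format_post_as_csv_row returns a plain row for csv.writer, while transform_post_into_csv_dict rewrites the normalized dict in place so it can be handed to csv.DictWriter. A short sketch of the DictWriter path, assuming the input is a list of already-normalized post dicts:

import csv

from twitwi.bluesky import transform_post_into_csv_dict
from twitwi.bluesky.constants import POST_FIELDS


def write_posts_csv(posts, path):
    # posts: already-normalized post dicts, e.g. produced by normalize_post
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=POST_FIELDS,
            extrasaction="ignore",
            restval="",
            quoting=csv.QUOTE_MINIMAL,
        )
        writer.writeheader()
        for post in posts:
            transform_post_into_csv_dict(post)  # in-place conversion to CSV-friendly values
            writer.writerow(post)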