twitwi 0.18.2__tar.gz → 0.19.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {twitwi-0.18.2/twitwi.egg-info → twitwi-0.19.2}/PKG-INFO +1 -1
- {twitwi-0.18.2 → twitwi-0.19.2}/setup.py +1 -1
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi/__init__.py +1 -0
- twitwi-0.19.2/twitwi/anonymizers.py +71 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi/normalizers.py +7 -3
- {twitwi-0.18.2 → twitwi-0.19.2/twitwi.egg-info}/PKG-INFO +1 -1
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi.egg-info/SOURCES.txt +1 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/LICENSE.txt +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/README.md +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/setup.cfg +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi/client_wrapper.py +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi/constants.py +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi/exceptions.py +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi/formatters.py +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi/utils.py +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi.egg-info/dependency_links.txt +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi.egg-info/requires.txt +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi.egg-info/top_level.txt +0 -0
- {twitwi-0.18.2 → twitwi-0.19.2}/twitwi.egg-info/zip-safe +0 -0
|
@@ -4,7 +4,7 @@ with open('./README.md', 'r') as f:
|
|
|
4
4
|
long_description = f.read()
|
|
5
5
|
|
|
6
6
|
setup(name='twitwi',
|
|
7
|
-
version='0.18.2',
|
|
7
|
+
version='0.19.2',
|
|
8
8
|
description='A collection of Twitter-related helper functions for python.',
|
|
9
9
|
long_description=long_description,
|
|
10
10
|
long_description_content_type='text/markdown',
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# Twitwi Library Endpoint
|
|
3
3
|
# =============================================================================
|
|
4
4
|
#
|
|
5
|
+
from twitwi.anonymizers import anonymize_normalized_tweet
|
|
5
6
|
from twitwi.client_wrapper import TwitterWrapper
|
|
6
7
|
from twitwi.formatters import (
|
|
7
8
|
transform_tweet_into_csv_dict,
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# Matches a span of the form "« <author>: <text> »" and captures only <text>.
# The author part is matched lazily so it stops at the FIRST colon followed by
# whitespace. A greedy match would run to the LAST such colon inside the span
# and silently drop the beginning of any quoted text that itself contains ": ".
QUOTED_REDACT_RE = re.compile(r"«\s+[^»]+?:\s+([^»]+)\s+»")


def redact_quoted_text(text: str) -> str:
    """Remove the author prefix from every "« author: text »" span in *text*.

    Each matching span is rewritten as "« text »"; everything outside the
    guillemets is left untouched. Text with no matching span is returned
    unchanged.

    Args:
        text: The tweet text, possibly embedding one or more quoted spans.

    Returns:
        The text with the "author: " prefix stripped from each quoted span.
    """
    return QUOTED_REDACT_RE.sub(r"« \1 »", text)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def redact_rt_text(text: str) -> str:
    """Replace the leading "<prefix>: " of a retweet's text with "RT: ".

    Everything up to and including the first ": " is dropped and the
    remainder is prefixed with "RT: ". Only the first ": " is considered, so
    colons inside the retweeted text itself are preserved.

    Args:
        text: The retweet text, expected to look like "<prefix>: <body>".

    Returns:
        "RT: <body>" when a ": " separator is present; otherwise *text*
        unchanged, since there is no prefix to strip.
    """
    # `partition` never raises: when the separator is missing it returns
    # (text, '', ''), which lets us fall back gracefully instead of hitting
    # the IndexError that `text.split(': ', 1)[1]` would produce.
    _prefix, separator, body = text.partition(': ')

    if not separator:
        return text

    return 'RT: ' + body
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Keys stripped from a normalized tweet by `anonymize_normalized_tweet`.
FIELDS_TO_DELETE = [
    # The tweet's url leaks the user
    "url",

    # User's place
    "lat",
    "lng",
    "place_coordinates",
    "place_country_code",
    "place_name",
    "place_type",
    "user_location",

    # User info
    "user_created_at",
    "user_description",
    "user_id",
    "user_image",
    "user_name",
    "user_screen_name",
    "user_timestamp_utc",
    "user_url",
    "user_verified",

    # Retweeted user info
    "retweeted_timestamp_utc",
    "retweeted_user",
    "retweeted_user_id",

    # Replied user info
    "to_userid",
    "to_username",

    # Quoted user info
    "quoted_user",
    "quoted_user_id",
    "quoted_timestamp_utc",
]


# NOTE: currently we still keep the id, but we should drop it
# to really call this an anonymized tweet.
# NOTE: we do not redact mentions either.
# NOTE: we also don't redact replies.
def anonymize_normalized_tweet(normalized_tweet) -> None:
    """Anonymize a normalized tweet in place.

    The tweet's "text" is rewritten to drop the author prefix of a retweet
    or, failing that, of a quoted tweet. Every key listed in
    `FIELDS_TO_DELETE` is then removed if present.

    Args:
        normalized_tweet: A mutable mapping holding a normalized tweet. It
            must contain a "text" key; "retweeted_id" and "quoted_id" are
            optional.

    Returns:
        None. The mapping is mutated in place.

    Raises:
        KeyError: If the mapping has no "text" key.
    """

    # Text mangling
    text = normalized_tweet["text"]

    if normalized_tweet.get('retweeted_id') is not None:
        normalized_tweet["text"] = redact_rt_text(text)

    # NOTE(review): because of the `elif`, a tweet carrying both a
    # retweeted_id and a quoted_id only gets the retweet redaction; confirm
    # whether quoted spans inside retweets should be redacted too.
    elif normalized_tweet.get('quoted_id') is not None:
        normalized_tweet["text"] = redact_quoted_text(text)

    # `pop` with a default removes the key when present and is a no-op
    # otherwise, avoiding a separate membership test before deleting.
    for field in FIELDS_TO_DELETE:
        normalized_tweet.pop(field, None)
|
|
@@ -330,7 +330,11 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
|
|
|
330
330
|
media_urls.append(med_url.split('?tag=')[0])
|
|
331
331
|
media_files.append('%s_%s' % (source_id, med_name))
|
|
332
332
|
media_alt_texts.append(entity.get("ext_alt_text") or '')
|
|
333
|
-
|
|
333
|
+
|
|
334
|
+
# NOTE: fun fact, Twitter is starting to break down and we cannot guarantee
|
|
335
|
+
# expanded_url exists anymore. It even crashes the website itself lol:
|
|
336
|
+
# https://x.com/lmerzeau/status/426318495450943488
|
|
337
|
+
elif "expanded_url" in entity:
|
|
334
338
|
normalized = custom_normalize_url(entity['expanded_url'])
|
|
335
339
|
links.add(normalized)
|
|
336
340
|
|
|
@@ -459,8 +463,8 @@ def normalize_user(user, locale=None, pure=True, v2=False):
|
|
|
459
463
|
'timestamp_utc': timestamp_utc,
|
|
460
464
|
'local_time': local_time,
|
|
461
465
|
'location': user.get('location'),
|
|
462
|
-
'verified': user.get('verified'),
|
|
463
|
-
'protected': user.get('protected'),
|
|
466
|
+
'verified': user.get('verified', False),
|
|
467
|
+
'protected': user.get('protected', False),
|
|
464
468
|
'tweets': user['statuses_count'] if not v2 else user['public_metrics']['tweet_count'],
|
|
465
469
|
'followers': user['followers_count'] if not v2 else user['public_metrics']['followers_count'],
|
|
466
470
|
'friends': user['friends_count'] if not v2 else user['public_metrics']['following_count'],
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|