twitwi 0.18.2__tar.gz → 0.19.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: twitwi
3
- Version: 0.18.2
3
+ Version: 0.19.2
4
4
  Summary: A collection of Twitter-related helper functions for python.
5
5
  Home-page: http://github.com/medialab/twitwi
6
6
  Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
@@ -4,7 +4,7 @@ with open('./README.md', 'r') as f:
4
4
  long_description = f.read()
5
5
 
6
6
  setup(name='twitwi',
7
- version='0.18.2',
7
+ version='0.19.2',
8
8
  description='A collection of Twitter-related helper functions for python.',
9
9
  long_description=long_description,
10
10
  long_description_content_type='text/markdown',
@@ -2,6 +2,7 @@
2
2
  # Twitwi Library Endpoint
3
3
  # =============================================================================
4
4
  #
5
+ from twitwi.anonymizers import anonymize_normalized_tweet
5
6
  from twitwi.client_wrapper import TwitterWrapper
6
7
  from twitwi.formatters import (
7
8
  transform_tweet_into_csv_dict,
@@ -0,0 +1,71 @@
1
+ import re
2
+
3
# Matches a quotation span shaped like "« <label>: <body> »" and captures
# <body>. The label is matched lazily so the split happens at the FIRST
# ": " inside the span; a greedy label would swallow any ": " occurring in
# the body and silently truncate the quoted text.
QUOTED_REDACT_RE = re.compile(r"«\s+[^»]+?:\s+([^»]+)\s+»")


def redact_quoted_text(text: str) -> str:
    """Drop the "<label>: " prefix from every "« <label>: <body> »" span.

    The label is presumably the quoted user's handle (the pattern itself
    only requires a non-empty run of characters followed by a colon).

    Args:
        text: Tweet text possibly containing one or more quotation spans.

    Returns:
        The text with each matching span rewritten as "« <body> »". Text
        without a matching span is returned unchanged.
    """
    return QUOTED_REDACT_RE.sub(r"« \1 »", text)
8
+
9
+
10
def redact_rt_text(text: str) -> str:
    """Replace everything up to the first ": " with a bare "RT: " marker.

    Args:
        text: Retweet text, expected to start with a "<prefix>: " header
            (presumably "RT @handle: " -- not enforced here).

    Returns:
        "RT: " followed by whatever comes after the first ": ". If the text
        contains no ": " there is no header to strip, so it is returned
        unchanged instead of raising IndexError.
    """
    _, separator, remainder = text.partition(': ')

    # No separator means no identifiable header to redact.
    if not separator:
        return text

    return 'RT: ' + remainder
12
+
13
+
14
# Keys removed from a normalized tweet when anonymizing it, grouped by the
# kind of information they carry.
FIELDS_TO_DELETE = [
    # Tweet url (it leaks the user)
    "url",

    # Place / location of the user
    "lat", "lng",
    "place_coordinates", "place_country_code", "place_name", "place_type",
    "user_location",

    # Author profile
    "user_created_at", "user_description", "user_id", "user_image",
    "user_name", "user_screen_name", "user_timestamp_utc", "user_url",
    "user_verified",

    # Retweeted user
    "retweeted_timestamp_utc", "retweeted_user", "retweeted_user_id",

    # Replied-to user
    "to_userid", "to_username",

    # Quoted user
    "quoted_user", "quoted_user_id", "quoted_timestamp_utc",
]
52
+
53
+
54
# NOTE: the tweet id is still kept for now; it should be dropped before this
# can truly be called an anonymized tweet.
# NOTE: mentions are not redacted.
# NOTE: replies are not redacted either.
def anonymize_normalized_tweet(normalized_tweet) -> None:
    """Anonymize a normalized tweet in place.

    The "text" entry is rewritten with `redact_rt_text` when "retweeted_id"
    is set, otherwise with `redact_quoted_text` when "quoted_id" is set.
    Every key listed in `FIELDS_TO_DELETE` is then removed if present.

    Args:
        normalized_tweet: Mutable mapping holding the tweet; it must have a
            "text" key (a missing key raises KeyError).

    Returns:
        None. The mapping is modified in place.
    """
    original_text = normalized_tweet["text"]

    # Pick the redaction matching the tweet kind; retweets take precedence
    # over quotes, and plain tweets keep their text as is.
    if normalized_tweet.get('retweeted_id') is not None:
        normalized_tweet["text"] = redact_rt_text(original_text)
    elif normalized_tweet.get('quoted_id') is not None:
        normalized_tweet["text"] = redact_quoted_text(original_text)

    # Drop identifying fields, ignoring the ones that are absent.
    for key in FIELDS_TO_DELETE:
        normalized_tweet.pop(key, None)
@@ -330,7 +330,11 @@ def normalize_tweet(tweet, locale=None, extract_referenced_tweets=False,
330
330
  media_urls.append(med_url.split('?tag=')[0])
331
331
  media_files.append('%s_%s' % (source_id, med_name))
332
332
  media_alt_texts.append(entity.get("ext_alt_text") or '')
333
- else:
333
+
334
+ # NOTE: fun fact, Twitter is starting to break down and we cannot guarantee
335
+ # expanded_url exists anymore. It even crashes the website itself lol:
336
+ # https://x.com/lmerzeau/status/426318495450943488
337
+ elif "expanded_url" in entity:
334
338
  normalized = custom_normalize_url(entity['expanded_url'])
335
339
  links.add(normalized)
336
340
 
@@ -459,8 +463,8 @@ def normalize_user(user, locale=None, pure=True, v2=False):
459
463
  'timestamp_utc': timestamp_utc,
460
464
  'local_time': local_time,
461
465
  'location': user.get('location'),
462
- 'verified': user.get('verified'),
463
- 'protected': user.get('protected'),
466
+ 'verified': user.get('verified', False),
467
+ 'protected': user.get('protected', False),
464
468
  'tweets': user['statuses_count'] if not v2 else user['public_metrics']['tweet_count'],
465
469
  'followers': user['followers_count'] if not v2 else user['public_metrics']['followers_count'],
466
470
  'friends': user['friends_count'] if not v2 else user['public_metrics']['following_count'],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: twitwi
3
- Version: 0.18.2
3
+ Version: 0.19.2
4
4
  Summary: A collection of Twitter-related helper functions for python.
5
5
  Home-page: http://github.com/medialab/twitwi
6
6
  Author: Béatrice Mazoyer, Guillaume Plique, Benjamin Ooghe-Tabanou
@@ -2,6 +2,7 @@ LICENSE.txt
2
2
  README.md
3
3
  setup.py
4
4
  twitwi/__init__.py
5
+ twitwi/anonymizers.py
5
6
  twitwi/client_wrapper.py
6
7
  twitwi/constants.py
7
8
  twitwi/exceptions.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes