opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -2
- opsci_toolbox/apis/reddit.py +342 -334
- opsci_toolbox/apis/telegram.py +471 -41
- opsci_toolbox/helpers/common.py +3 -1
- opsci_toolbox/helpers/dates.py +1 -1
- opsci_toolbox/helpers/nlp.py +178 -33
- opsci_toolbox/helpers/nlp_cuml.py +47 -2
- opsci_toolbox/helpers/sna.py +34 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/METADATA +2 -2
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/RECORD +13 -12
- opsci_toolbox-0.0.15.dist-info/dependency_links.txt +1 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/reddit.py
CHANGED
@@ -2,214 +2,53 @@ import praw
|
|
2
2
|
import datetime
|
3
3
|
import pandas as pd
|
4
4
|
from tqdm import tqdm
|
5
|
+
import time
|
5
6
|
|
6
|
-
def
|
7
|
-
|
8
|
-
if
|
9
|
-
if author.comment_karma:
|
10
|
-
author_comment_karma=author.comment_karma
|
11
|
-
else:
|
12
|
-
author_comment_karma = None
|
13
|
-
if author.created_utc:
|
14
|
-
author_created_utc=datetime.datetime.fromtimestamp(int(author.created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
|
15
|
-
else:
|
16
|
-
author_created_utc = datetime.datetime(1970,1,1,0,0,0)
|
17
|
-
if author.icon_img:
|
18
|
-
author_icon_img=author.icon_img
|
19
|
-
else:
|
20
|
-
author_icon_img = None
|
21
|
-
if author.id:
|
22
|
-
author_id=author.id
|
23
|
-
else:
|
24
|
-
author_id = None
|
25
|
-
if author.is_employee:
|
26
|
-
author_is_employee=author.is_employee
|
27
|
-
else:
|
28
|
-
author_is_employee = None
|
29
|
-
if author.is_mod:
|
30
|
-
author_is_mod=author.is_mod
|
31
|
-
else:
|
32
|
-
author_is_mod = None
|
33
|
-
|
34
|
-
if author.is_gold:
|
35
|
-
author_is_gold=author.is_gold
|
36
|
-
else:
|
37
|
-
author_is_gold = None
|
38
|
-
|
39
|
-
if author.link_karma:
|
40
|
-
author_link_karma=author.link_karma
|
41
|
-
else:
|
42
|
-
author_link_karma = None
|
43
|
-
|
44
|
-
if author.name:
|
45
|
-
author_name=author.name
|
46
|
-
else:
|
47
|
-
author_name = None
|
48
|
-
record = (author_id, author_name, author_link_karma, author_comment_karma, author_created_utc, author_icon_img, author_is_employee, author_is_mod, author_is_gold)
|
49
|
-
else:
|
50
|
-
record = (None, None, None, None, None, None, None, None, None)
|
51
|
-
return record
|
52
|
-
|
53
|
-
|
54
|
-
def getSubmissions(reddit_client, lst_ids, subreddit_filter, subreddit_items, time_filter):
|
55
|
-
|
56
|
-
all_records = []
|
57
|
-
for url in tqdm(lst_ids, total=len(lst_ids), desc="Récupération des soumissions"):
|
58
|
-
subreddit = reddit_client.subreddit(str(url))
|
59
|
-
|
60
|
-
sub_record = parse_subreddit(subreddit)
|
61
|
-
|
62
|
-
if subreddit_filter == "top":
|
63
|
-
subreddit_selection = subreddit.top(limit=subreddit_items, time_filter= time_filter)
|
64
|
-
elif subreddit_filter == "hot":
|
65
|
-
subreddit_selection = subreddit.hot(limit=subreddit_items, time_filter= time_filter)
|
66
|
-
elif subreddit_filter == "controversial":
|
67
|
-
subreddit_selection = subreddit.controversial(limit=subreddit_items, time_filter= time_filter)
|
68
|
-
else:
|
69
|
-
subreddit_selection = subreddit.new(limit=subreddit_items, time_filter= time_filter)
|
70
|
-
|
71
|
-
for submission in subreddit_selection:
|
72
|
-
author = submission.author
|
73
|
-
author_record = parse_author(author)
|
74
|
-
submission_record = parse_submission(submission)
|
75
|
-
|
76
|
-
record = sub_record + author_record + submission_record
|
77
|
-
all_records.append(record)
|
78
|
-
|
79
|
-
df = pd.DataFrame.from_records(all_records,
|
80
|
-
columns = ["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
|
81
|
-
"subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
|
82
|
-
"author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
|
83
|
-
"submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
|
84
|
-
"submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_score", "submission_selftext", "submission_spoiler",
|
85
|
-
"submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
|
86
|
-
"submission_total_awards_received", "submission_view_count"]
|
87
|
-
)
|
88
|
-
|
89
|
-
return df
|
90
|
-
|
91
|
-
|
92
|
-
def parse_submission(submission):
|
93
|
-
if submission.id:
|
94
|
-
submission_id=submission.id
|
95
|
-
else:
|
96
|
-
submission_id = None
|
97
|
-
if submission.title:
|
98
|
-
submission_title=submission.title
|
99
|
-
else:
|
100
|
-
submission_title = None
|
101
|
-
if submission.name:
|
102
|
-
submission_name=submission.name
|
103
|
-
else:
|
104
|
-
submission_name = None
|
105
|
-
if submission.created_utc:
|
106
|
-
submission_created_utc=datetime.datetime.fromtimestamp(int(submission.created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
|
107
|
-
else:
|
108
|
-
submission_created_utc = datetime.datetime(1970,1,1,0,0,0)
|
109
|
-
if submission.distinguished:
|
110
|
-
submission_distinguished=submission.distinguished
|
111
|
-
else:
|
112
|
-
submission_distinguished = None
|
113
|
-
if submission.edited:
|
114
|
-
submission_edited=submission.edited
|
115
|
-
else:
|
116
|
-
submission_edited = None
|
117
|
-
if submission.is_self:
|
118
|
-
submission_is_self=submission.is_self
|
119
|
-
else:
|
120
|
-
submission_is_self = None
|
121
|
-
if submission.link_flair_template_id:
|
122
|
-
submission_link_flair_template_id=submission.link_flair_template_id
|
123
|
-
else:
|
124
|
-
submission_link_flair_template_id = None
|
125
|
-
if submission.link_flair_text:
|
126
|
-
submission_link_flair_text=submission.link_flair_text
|
127
|
-
else:
|
128
|
-
submission_link_flair_text = None
|
129
|
-
if submission.locked:
|
130
|
-
submission_locked=submission.locked
|
131
|
-
else:
|
132
|
-
submission_locked = None
|
133
|
-
if submission.num_comments:
|
134
|
-
submission_num_comments=submission.num_comments
|
135
|
-
else:
|
136
|
-
submission_num_comments = None
|
137
|
-
if submission.over_18:
|
138
|
-
submission_over_18=submission.over_18
|
139
|
-
else:
|
140
|
-
submission_over_18 = None
|
141
|
-
if submission.permalink:
|
142
|
-
submission_permalink=submission.permalink
|
143
|
-
else:
|
144
|
-
submission_permalink = None
|
145
|
-
if submission.score:
|
146
|
-
submission_score=submission.score
|
147
|
-
else:
|
148
|
-
submission_score = None
|
149
|
-
if submission.selftext:
|
150
|
-
submission_selftext=submission.selftext
|
151
|
-
else:
|
152
|
-
submission_selftext = None
|
153
|
-
if submission.spoiler:
|
154
|
-
submission_spoiler=submission.spoiler
|
155
|
-
else:
|
156
|
-
submission_spoiler = None
|
157
|
-
if submission.stickied:
|
158
|
-
submission_stickied=submission.stickied
|
159
|
-
else:
|
160
|
-
submission_stickied = None
|
161
|
-
if submission.upvote_ratio:
|
162
|
-
submission_upvote_ratio=submission.upvote_ratio
|
163
|
-
else:
|
164
|
-
submission_upvote_ratio = None
|
165
|
-
if submission.url:
|
166
|
-
submission_url=submission.url
|
167
|
-
else:
|
168
|
-
submission_url = None
|
169
|
-
if submission.downs:
|
170
|
-
submission_downs=submission.downs
|
171
|
-
else:
|
172
|
-
submission_downs = None
|
173
|
-
if submission.num_crossposts:
|
174
|
-
submission_num_crossposts=submission.num_crossposts
|
175
|
-
else:
|
176
|
-
submission_num_crossposts = None
|
177
|
-
if submission.num_reports:
|
178
|
-
submission_num_reports=submission.num_reports
|
179
|
-
else:
|
180
|
-
submission_num_reports = None
|
181
|
-
|
182
|
-
if submission.score:
|
183
|
-
submission_score=submission.score
|
184
|
-
else:
|
185
|
-
submission_score = None
|
186
|
-
|
187
|
-
if submission.ups:
|
188
|
-
submission_ups=submission.ups
|
189
|
-
else:
|
190
|
-
submission_ups = None
|
7
|
+
def check_limit(reddit_client : praw.Reddit) -> tuple:
    """
    Check the Reddit client rate limit and sleep until the reset when the quota is nearly exhausted.

    The limits are read from ``reddit_client.auth.limits``. When 10 or fewer
    requests remain and the reset timestamp lies in the future, the call
    blocks (``time.sleep``) until that timestamp.

    Args:
        reddit_client (praw.Reddit): current reddit client

    Returns:
        tuple containing the following information:
            - remaining: remaining queries.
            - reset_timestamp: timestamp of the next reset (compared against ``time.time()``).
            - used: number of sent queries.
    """
    limits = reddit_client.auth.limits
    remaining = limits.get('remaining')
    reset_timestamp = limits.get('reset_timestamp')
    used = limits.get('used')

    # Compare against None explicitly: a truthiness test would treat
    # ``remaining == 0`` (quota fully used, the case where waiting matters
    # most) as if the information were missing.
    if remaining is None or reset_timestamp is None:
        print("Missing required header information. Cannot determine wait time.")
        return remaining, reset_timestamp, used

    if remaining <= 10:
        wait_time = reset_timestamp - time.time()
        if wait_time > 0:
            print(f"Waiting for {wait_time:.2f} seconds until the next reset...")
            time.sleep(wait_time)
        else:
            print("Reset time is in the past. No need to wait.")

    return remaining, reset_timestamp, used
|
202
44
|
|
203
|
-
record = (submission_id, submission_title, submission_name, submission_created_utc, submission_distinguished, submission_edited, submission_is_self, submission_link_flair_template_id, submission_link_flair_text, submission_locked, submission_num_comments, submission_over_18, submission_permalink, submission_score, submission_selftext, submission_spoiler, submission_stickied, submission_url, submission_upvote_ratio, submission_downs, submission_ups, submission_num_crossposts, submission_num_reports, submission_score, submission_total_awards_received, submission_view_count)
|
204
|
-
return record
|
205
|
-
|
206
|
-
|
207
45
|
|
208
46
|
def get_subreddit_info(reddit_client : praw.Reddit, lst_ids: list) -> pd.DataFrame:
|
209
47
|
"""
|
210
48
|
Retrieves information about subreddits based on a list of subreddit IDs.
|
211
49
|
|
212
50
|
Args:
|
51
|
+
reddit_client (praw.Reddit): current reddit client
|
213
52
|
lst_ids (list): A list of subreddit IDs.
|
214
53
|
|
215
54
|
Returns:
|
@@ -228,172 +67,341 @@ def get_subreddit_info(reddit_client : praw.Reddit, lst_ids: list) -> pd.DataFra
|
|
228
67
|
"""
|
229
68
|
all_records = []
|
230
69
|
for reddit_id in lst_ids:
|
70
|
+
remaining, reset_timestamp, used = check_limit(reddit_client)
|
231
71
|
subreddit = reddit_client.subreddit(str(reddit_id))
|
232
|
-
record = parse_subreddit(subreddit)
|
72
|
+
record = parse_subreddit(reddit_client, subreddit)
|
233
73
|
|
234
74
|
all_records.append(record)
|
235
75
|
|
236
76
|
df = pd.DataFrame.from_records(all_records, columns=["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18", "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count"])
|
237
77
|
return df
|
238
78
|
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
79
|
+
|
80
|
+
def getSubmissions(reddit_client : praw.Reddit, sub_id : str, subreddit_filter : str, subreddit_items : int, time_filter : str) -> pd.DataFrame:
    """
    Retrieves submissions from a subreddit ID.

    Args:
        reddit_client (praw.Reddit): current reddit client
        sub_id (str): a subreddit ID.
        subreddit_filter (str): the listing to read from the subreddit (top, hot, new, controversial, gilded, rising).
        subreddit_items (int): the number of items to retrieve. None to retrieve all items.
        time_filter (str): the time filter to apply (hour, day, week, month, year, all). Only passed to the "top" and "controversial" listings.

    Returns:
        pd.DataFrame: A DataFrame containing submissions metadata (subreddit, author and submission columns).
        An empty DataFrame without columns is returned when ``subreddit_filter`` is not supported.
    """
    # Reject unsupported listings before any request is sent, so that an
    # invalid argument does not cost API calls.
    supported_filters = ("top", "hot", "controversial", "new", "gilded", "rising")
    if subreddit_filter not in supported_filters:
        return pd.DataFrame()

    check_limit(reddit_client)
    subreddit = reddit_client.subreddit(str(sub_id))
    if not vars(subreddit).get('_fetched'):
        check_limit(reddit_client)
        subreddit._fetch()

    sub_record = parse_subreddit(reddit_client, subreddit)

    # The filter name was validated above, so it is safe to use it as the
    # name of the listing method.
    listing_method = getattr(subreddit, subreddit_filter)
    if subreddit_filter in ("top", "controversial"):
        subreddit_selection = listing_method(limit=subreddit_items, time_filter=time_filter)
    else:
        subreddit_selection = listing_method(limit=subreddit_items)

    all_records = []
    check_limit(reddit_client)
    for submission in subreddit_selection:
        try:
            check_limit(reddit_client)
            if not vars(submission).get('_fetched'):
                submission._fetch()
                check_limit(reddit_client)

            author_record = parse_author(reddit_client, submission.author)
            submission_record = parse_submission(reddit_client, submission)
            all_records.append(sub_record + author_record + submission_record)
        except Exception as e:
            # Best effort: a submission that cannot be parsed is reported
            # and skipped instead of aborting the whole listing.
            print(e)

    df = pd.DataFrame.from_records(
        all_records,
        columns=[
            "subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
            "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
            "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
            "submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
            "submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_selftext", "submission_spoiler",
            "submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
            "submission_total_awards_received", "submission_view_count",
        ],
    )

    return df
|
284
152
|
|
285
|
-
if subreddit.spoilers_enabled:
|
286
|
-
spoilers_enabled = subreddit.spoilers_enabled
|
287
|
-
else:
|
288
|
-
spoilers_enabled = None
|
289
153
|
|
290
|
-
if subreddit.can_assign_user_flair:
|
291
|
-
can_assign_user_flair = subreddit.can_assign_user_flair
|
292
|
-
else:
|
293
|
-
can_assign_user_flair = None
|
294
154
|
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
can_assign_link_flair = None
|
155
|
+
def getComments(reddit_client : praw.Reddit, submission_id : str) -> pd.DataFrame:
    """
    Retrieves all comments from a submission ID.

    Args:
        reddit_client (praw.Reddit): current reddit client
        submission_id (str): a submission ID.

    Returns:
        pd.DataFrame: A DataFrame containing comments metadata, one row per
        comment, with the submission ID, the comment fields and the fields of
        the comment's author.
    """
    check_limit(reddit_client)
    submission = reddit_client.submission(str(submission_id))
    if not vars(submission).get('_fetched'):
        submission._fetch()

    # limit=None: no cap on the number of "more comments" placeholders replaced.
    submission.comments.replace_more(limit=None)
    check_limit(reddit_client)

    # Flatten the comment forest once; the same list feeds both the progress
    # bar total and the iteration.
    comments = submission.comments.list()

    all_records = []
    for comment in tqdm(comments, total=len(comments), desc="Récupération des commentaires"):
        check_limit(reddit_client)
        record = (submission_id,) + parse_comments(reddit_client, comment)
        all_records.append(record)

    df = pd.DataFrame.from_records(
        all_records,
        columns=[
            "submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
            "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
            "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
        ],
    )

    return df
|
319
188
|
|
320
|
-
def
|
321
|
-
|
322
|
-
|
323
|
-
else:
|
324
|
-
comment_id = None
|
325
|
-
if comment.body:
|
326
|
-
comment_body=comment.body
|
327
|
-
else:
|
328
|
-
comment_body = None
|
329
|
-
if comment.created_utc:
|
330
|
-
comment_date=datetime.datetime.fromtimestamp(int(comment.created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
|
331
|
-
else:
|
332
|
-
comment_date = None
|
333
|
-
if comment.distinguished:
|
334
|
-
comment_distinguished=comment.distinguished
|
335
|
-
else:
|
336
|
-
comment_distinguished = None
|
337
|
-
if comment.edited:
|
338
|
-
comment_edited=comment.edited
|
339
|
-
else:
|
340
|
-
comment_edited = None
|
341
|
-
if comment.is_submitter:
|
342
|
-
comment_is_submitter=comment.is_submitter
|
343
|
-
else:
|
344
|
-
comment_is_submitter = None
|
189
|
+
def get_top_level_comments(reddit_client : praw.Reddit, submission_id : str) -> pd.DataFrame:
    """
    Retrieves top level comments from a submission ID.

    Args:
        reddit_client (praw.Reddit): current reddit client
        submission_id (str): a submission ID.

    Returns:
        pd.DataFrame: A DataFrame containing comments metadata, one row per
        top level comment (submission ID, comment fields, author fields).
    """
    comment_columns = [
        "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
        "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
    ]
    author_columns = [
        "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
    ]

    check_limit(reddit_client)
    post = reddit_client.submission(str(submission_id))
    if not vars(post).get('_fetched'):
        post._fetch()

    post.comments.replace_more(limit=None)
    check_limit(reddit_client)

    rows = []
    progress = tqdm(post.comments, total=len(post.comments), desc="Récupération des commentaires")
    for top_comment in progress:
        # Re-check the quota before each comment is parsed.
        check_limit(reddit_client)
        rows.append((submission_id,) + parse_comments(reddit_client, top_comment))

    return pd.DataFrame.from_records(rows, columns=["submission_id"] + comment_columns + author_columns)
|
222
|
+
|
223
|
+
def parse_author(reddit_client : praw.Reddit, author : praw.models.Redditor) -> tuple:
    """
    Parses a Reddit author object and extracts relevant information.

    The author is fetched first when its ``_fetched`` flag is not set; the
    fields are then read from the object's instance dictionary, so a missing
    attribute yields None instead of raising.

    Args:
        reddit_client (praw.Reddit): current reddit client
        author (praw.models.Redditor): The Reddit author object. A falsy value (e.g. None) yields a tuple of 9 None.

    Returns:
        tuple: A tuple containing the following information about the author:
            - author_id: The ID of the author.
            - author_name: The name of the author.
            - author_link_karma: The link karma of the author.
            - author_comment_karma: The comment karma of the author.
            - author_created_utc: The creation date of the author, formatted as "%d/%m/%Y %H:%M:%S" in UTC.
            - author_icon_img: The icon image of the author.
            - author_is_employee: Indicates if the author is an employee.
            - author_is_mod: Indicates if the author is a moderator.
            - author_is_gold: Indicates if the author has Reddit Gold.
    """
    if not author:
        return (None, None, None, None, None, None, None, None, None)

    if not vars(author).get('_fetched'):
        check_limit(reddit_client)
        author._fetch()

    attributes = vars(author)

    author_created_utc = attributes.get("created_utc", None)
    if author_created_utc:
        # Convert directly in UTC. fromtimestamp() without tz returns local
        # wall-clock time, and replace(tzinfo=...) only relabels it.
        author_created_utc = datetime.datetime.fromtimestamp(int(author_created_utc), tz=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
    else:
        # NOTE: the fallback is a naive datetime object, not a formatted
        # string; kept as-is for backward compatibility.
        author_created_utc = datetime.datetime(1970, 1, 1, 0, 0, 0)

    return (
        attributes.get("id", None),
        attributes.get("name", None),
        attributes.get("link_karma", None),
        attributes.get("comment_karma", None),
        author_created_utc,
        attributes.get("icon_img", None),
        attributes.get("is_employee", None),
        attributes.get("is_mod", None),
        attributes.get("is_gold", None),
    )
|
268
|
+
|
269
|
+
def parse_submission(reddit_client : praw.Reddit, submission : praw.models.Submission) -> tuple:
    """
    Parses a Reddit submission object and extracts relevant information.

    The submission is fetched first when its ``_fetched`` flag is not set;
    the fields are then read from the object's instance dictionary, so a
    missing attribute yields None instead of raising.

    Args:
        reddit_client (praw.Reddit): current reddit client
        submission (praw.models.Submission): The Reddit submission object. A falsy value (e.g. None) yields a tuple of 25 None.

    Returns:
        tuple: A 25-item tuple containing, in order: id, title, name,
        created_utc (formatted as "%d/%m/%Y %H:%M:%S" in UTC), distinguished,
        edited, is_self, link_flair_template_id, link_flair_text, locked,
        num_comments, over_18, permalink, selftext, spoiler, stickied, url,
        upvote_ratio, downs, ups, num_crossposts, num_reports, score,
        total_awards_received, view_count.
    """
    if not submission:
        return (None,) * 25

    if not vars(submission).get('_fetched'):
        check_limit(reddit_client)
        submission._fetch()

    attributes = vars(submission)

    submission_created_utc = attributes.get("created_utc", None)
    if submission_created_utc:
        # Convert directly in UTC. fromtimestamp() without tz returns local
        # wall-clock time, and replace(tzinfo=...) only relabels it.
        submission_created_utc = datetime.datetime.fromtimestamp(int(submission_created_utc), tz=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
    else:
        # NOTE: the fallback is a naive datetime object, not a formatted
        # string; kept as-is for backward compatibility.
        submission_created_utc = datetime.datetime(1970, 1, 1, 0, 0, 0)

    return (
        attributes.get("id", None),
        attributes.get("title", None),
        attributes.get("name", None),
        submission_created_utc,
        attributes.get("distinguished", None),
        attributes.get("edited", None),
        attributes.get("is_self", None),
        attributes.get("link_flair_template_id", None),
        attributes.get("link_flair_text", None),
        attributes.get("locked", None),
        attributes.get("num_comments", None),
        attributes.get("over_18", None),
        attributes.get("permalink", None),
        attributes.get("selftext", None),
        attributes.get("spoiler", None),
        attributes.get("stickied", None),
        attributes.get("url", None),
        attributes.get("upvote_ratio", None),
        attributes.get("downs", None),
        attributes.get("ups", None),
        attributes.get("num_crossposts", None),
        attributes.get("num_reports", None),
        attributes.get("score", None),
        attributes.get("total_awards_received", None),
        attributes.get("view_count", None),
    )
|
321
|
+
|
322
|
+
|
323
|
+
def parse_subreddit(reddit_client : praw.Reddit, subreddit : praw.models.Subreddit) -> tuple:
    """
    Parses a Reddit subreddit object and extracts relevant information.

    The subreddit is fetched first when its ``_fetched`` flag is not set;
    the fields are then read from the object's instance dictionary, so a
    missing attribute yields None instead of raising.

    Args:
        reddit_client (praw.Reddit): current reddit client
        subreddit (praw.models.Subreddit): The Reddit subreddit object. A falsy value (e.g. None) yields a tuple of 13 None.

    Returns:
        tuple: A 13-item tuple containing, in order: id, name, display_name,
        subscribers, creation date (formatted as "%d/%m/%Y %H:%M:%S" in UTC),
        description, public_description, over18, spoilers_enabled,
        can_assign_user_flair, can_assign_link_flair, lang, active_user_count.
    """
    if not subreddit:
        return (None,) * 13

    if not vars(subreddit).get('_fetched'):
        check_limit(reddit_client)
        subreddit._fetch()

    attributes = vars(subreddit)

    date = attributes.get("created_utc", None)
    if date:
        # Convert directly in UTC. fromtimestamp() without tz returns local
        # wall-clock time, and replace(tzinfo=...) only relabels it.
        date = datetime.datetime.fromtimestamp(int(date), tz=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
    else:
        # NOTE: the fallback is a naive datetime object, not a formatted
        # string; kept as-is for backward compatibility.
        date = datetime.datetime(1970, 1, 1, 0, 0, 0)

    return (
        attributes.get("id", None),
        attributes.get("name", None),
        attributes.get("display_name", None),
        attributes.get("subscribers", None),
        date,
        attributes.get("description", None),
        attributes.get("public_description", None),
        attributes.get("over18", None),
        attributes.get("spoilers_enabled", None),
        attributes.get("can_assign_user_flair", None),
        attributes.get("can_assign_link_flair", None),
        attributes.get("lang", None),
        attributes.get("active_user_count", None),
    )
|
395
363
|
|
396
|
-
|
364
|
+
def parse_comments(reddit_client : praw.Reddit, comment : praw.models.Comment) -> tuple:
    """
    Parses a Reddit comment object and extracts relevant information.

    The comment is fetched first when its ``_fetched`` flag is not set; the
    comment fields are then read from the object's instance dictionary, so a
    missing attribute yields None instead of raising. The comment's author is
    parsed with ``parse_author`` and appended to the record.

    Args:
        reddit_client (praw.Reddit): current reddit client
        comment (praw.models.Comment): The Reddit comment object. A falsy value (e.g. None) yields a tuple of 22 None.

    Returns:
        tuple: A 22-item tuple: 13 comment fields (id, body, date formatted as
        "%d/%m/%Y %H:%M:%S" in UTC, distinguished, is_submitter, link_id,
        parent_id, permalink, controversiality, depth, score,
        total_awards_received, ups) followed by the 9 author fields.
    """
    if not comment:
        return (None,) * 22

    if not vars(comment).get('_fetched'):
        check_limit(reddit_client)
        comment._fetch()

    attributes = vars(comment)

    comment_date = attributes.get("created_utc", None)
    if comment_date:
        # Convert directly in UTC. fromtimestamp() without tz returns local
        # wall-clock time, and replace(tzinfo=...) only relabels it.
        comment_date = datetime.datetime.fromtimestamp(int(comment_date), tz=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
    else:
        # NOTE: the fallback is a naive datetime object, not a formatted
        # string; kept as-is for backward compatibility.
        comment_date = datetime.datetime(1970, 1, 1, 0, 0, 0)

    comment_record = (
        attributes.get("id", None),
        attributes.get("body", None),
        comment_date,
        attributes.get("distinguished", None),
        attributes.get("is_submitter", None),
        attributes.get("link_id", None),
        attributes.get("parent_id", None),
        attributes.get("permalink", None),
        attributes.get("controversiality", None),
        attributes.get("depth", None),
        attributes.get("score", None),
        attributes.get("total_awards_received", None),
        attributes.get("ups", None),
    )

    # The author is read through attribute access (not the instance dict).
    author_record = parse_author(reddit_client, comment.author)
    return comment_record + author_record
|
399
407
|
|