opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,399 @@
1
+ import praw
2
+ import datetime
3
+ import pandas as pd
4
+ from tqdm import tqdm
5
+
6
def parse_author(author):
    """
    Flatten a Reddit author object into a fixed-order tuple.

    Args:
        author: Object exposing the author attributes read below (presumably a
            ``praw.models.Redditor`` -- confirm against callers), or a falsy
            value such as ``None`` when no author is available.

    Returns:
        tuple: ``(id, name, link_karma, comment_karma, created_utc, icon_img,
        is_employee, is_mod, is_gold)``.

        * Attributes that are missing or falsy (``0``, ``False``, ``""``,
          ``None``) are reported as ``None``.
        * ``created_utc`` is always a ``"%d/%m/%Y %H:%M:%S"`` string expressed
          in UTC; when the timestamp is missing the Unix epoch
          (``"01/01/1970 00:00:00"``) is used as a sentinel.
        * A falsy ``author`` yields a tuple of nine ``None`` values.
    """
    if not author:
        return (None,) * 9

    def field(name):
        # getattr with a default keeps one absent attribute from aborting the
        # whole record with AttributeError.
        value = getattr(author, name, None)
        # Historical contract of this module: falsy values collapse to None.
        return value if value else None

    date_format = "%d/%m/%Y %H:%M:%S"
    utc = datetime.timezone.utc
    created = field("created_utc")
    if created:
        # Convert directly in UTC. Building a naive local datetime and then
        # relabelling it as UTC shifts the value by the host's UTC offset.
        author_created_utc = datetime.datetime.fromtimestamp(int(created), tz=utc).strftime(date_format)
    else:
        # Epoch sentinel, formatted like real dates so the field has one type.
        author_created_utc = datetime.datetime(1970, 1, 1, tzinfo=utc).strftime(date_format)

    return (
        field("id"),
        field("name"),
        field("link_karma"),
        field("comment_karma"),
        author_created_utc,
        field("icon_img"),
        field("is_employee"),
        field("is_mod"),
        field("is_gold"),
    )
52
+
53
+
54
def getSubmissions(reddit_client, lst_ids, subreddit_filter, subreddit_items, time_filter):
    """
    Collect submissions from several subreddits into a single DataFrame.

    Args:
        reddit_client: Client object whose ``subreddit(name)`` method returns a
            subreddit (presumably a ``praw.Reddit`` instance -- confirm).
        lst_ids (list): Subreddit identifiers; each one is converted with
            ``str()`` before being passed to ``reddit_client.subreddit``.
        subreddit_filter (str): Listing to read: ``"top"``, ``"hot"`` or
            ``"controversial"``. Any other value selects the ``new`` listing.
        subreddit_items: Value forwarded as ``limit`` to the listing.
        time_filter: Value forwarded as ``time_filter`` to the ``top`` and
            ``controversial`` listings. It is ignored for ``hot`` and ``new``.

    Returns:
        pd.DataFrame: One row per submission, made of the subreddit fields,
        the author fields and the submission fields, in that order.
    """
    all_records = []
    for url in tqdm(lst_ids, total=len(lst_ids), desc="Récupération des soumissions"):
        subreddit = reddit_client.subreddit(str(url))

        # Subreddit-level fields are identical for every submission of the
        # subreddit, so they are parsed once per subreddit.
        sub_record = parse_subreddit(subreddit)

        if subreddit_filter == "top":
            subreddit_selection = subreddit.top(limit=subreddit_items, time_filter=time_filter)
        elif subreddit_filter == "controversial":
            subreddit_selection = subreddit.controversial(limit=subreddit_items, time_filter=time_filter)
        elif subreddit_filter == "hot":
            # Per the PRAW documentation only top() and controversial() take a
            # time_filter; forwarding it to hot()/new() is rejected as an
            # unexpected keyword argument.
            subreddit_selection = subreddit.hot(limit=subreddit_items)
        else:
            subreddit_selection = subreddit.new(limit=subreddit_items)

        for submission in subreddit_selection:
            author_record = parse_author(submission.author)
            submission_record = parse_submission(submission)
            all_records.append(sub_record + author_record + submission_record)

    # NOTE: "submission_score" is listed twice on purpose -- the tuple returned
    # by parse_submission carries the score at two positions, and the column
    # count has to match the record length.
    columns = [
        "subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
        "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
        "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
        "submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
        "submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_score", "submission_selftext", "submission_spoiler",
        "submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
        "submission_total_awards_received", "submission_view_count",
    ]
    df = pd.DataFrame.from_records(all_records, columns=columns)

    return df
90
+
91
+
92
def parse_submission(submission):
    """
    Flatten a Reddit submission object into a fixed-order tuple.

    Args:
        submission: Object exposing the submission attributes read below
            (presumably a ``praw.models.Submission`` -- confirm against
            callers).

    Returns:
        tuple: 26 values in this order: ``id, title, name, created_utc,
        distinguished, edited, is_self, link_flair_template_id,
        link_flair_text, locked, num_comments, over_18, permalink, score,
        selftext, spoiler, stickied, url, upvote_ratio, downs, ups,
        num_crossposts, num_reports, score, total_awards_received,
        view_count``.

        * ``score`` appears twice; the tuple shape is kept as is because
          callers pair it with a fixed column list.
        * Attributes that are missing or falsy (``0``, ``False``, ``""``,
          ``None``) are reported as ``None``.
        * ``created_utc`` is always a ``"%d/%m/%Y %H:%M:%S"`` string expressed
          in UTC; when the timestamp is missing the Unix epoch
          (``"01/01/1970 00:00:00"``) is used as a sentinel.
    """
    def field(name):
        # getattr with a default keeps one absent attribute (for instance a
        # flair template id on a submission without flair) from aborting the
        # whole record with AttributeError.
        value = getattr(submission, name, None)
        # Historical contract of this module: falsy values collapse to None.
        return value if value else None

    date_format = "%d/%m/%Y %H:%M:%S"
    utc = datetime.timezone.utc
    created = field("created_utc")
    if created:
        # Convert directly in UTC. Building a naive local datetime and then
        # relabelling it as UTC shifts the value by the host's UTC offset.
        submission_created_utc = datetime.datetime.fromtimestamp(int(created), tz=utc).strftime(date_format)
    else:
        # Epoch sentinel, formatted like real dates so the field has one type.
        submission_created_utc = datetime.datetime(1970, 1, 1, tzinfo=utc).strftime(date_format)

    # Attribute names in output order; created_utc is substituted with the
    # formatted string computed above.
    ordered_names = (
        "id", "title", "name", "created_utc", "distinguished", "edited",
        "is_self", "link_flair_template_id", "link_flair_text", "locked",
        "num_comments", "over_18", "permalink", "score", "selftext",
        "spoiler", "stickied", "url", "upvote_ratio", "downs", "ups",
        "num_crossposts", "num_reports", "score", "total_awards_received",
        "view_count",
    )
    record = tuple(
        submission_created_utc if name == "created_utc" else field(name)
        for name in ordered_names
    )
    return record
205
+
206
+
207
+
208
def get_subreddit_info(reddit_client : praw.Reddit, lst_ids: list) -> pd.DataFrame:
    """
    Build a DataFrame describing each subreddit of a list.

    Args:
        reddit_client (praw.Reddit): Client used to look up each subreddit
            through ``reddit_client.subreddit(...)``.
        lst_ids (list): Subreddit identifiers; each one is converted with
            ``str()`` before the lookup.

    Returns:
        pd.DataFrame: One row per entry of ``lst_ids``, with the columns:
            - subreddit_id
            - subreddit_name
            - subreddit_display_name
            - subreddit_subscribers
            - subreddit_date
            - subreddit_description
            - subreddit_public_description
            - subreddit_over18
            - subreddit_spoilers_enabled
            - subreddit_can_assign_user_flair
            - subreddit_can_assign_link_flair
            - subreddit_lang
            - subreddit_active_user_count
        The values are the fields of the tuple returned by
        ``parse_subreddit``, in the same order.
    """
    column_names = [
        "subreddit_id",
        "subreddit_name",
        "subreddit_display_name",
        "subreddit_subscribers",
        "subreddit_date",
        "subreddit_description",
        "subreddit_public_description",
        "subreddit_over18",
        "subreddit_spoilers_enabled",
        "subreddit_can_assign_user_flair",
        "subreddit_can_assign_link_flair",
        "subreddit_lang",
        "subreddit_active_user_count",
    ]
    # One parsed tuple per requested subreddit, in input order.
    rows = [
        parse_subreddit(reddit_client.subreddit(str(reddit_id)))
        for reddit_id in lst_ids
    ]
    return pd.DataFrame.from_records(rows, columns=column_names)
238
+
239
def parse_subreddit(subreddit):
    """
    Flatten a subreddit object into a fixed-order tuple.

    Args:
        subreddit: Object exposing the subreddit attributes read below
            (presumably a ``praw.models.Subreddit`` -- confirm against
            callers).

    Returns:
        tuple: ``(id, name, display_name, subscribers, date, description,
        public_description, over18, spoilers_enabled, can_assign_user_flair,
        can_assign_link_flair, lang, active_user_count)``.

        * Attributes that are missing or falsy (``0``, ``False``, ``""``,
          ``None``) are reported as ``None``.
        * ``date`` is derived from ``created_utc`` and is always a
          ``"%d/%m/%Y %H:%M:%S"`` string expressed in UTC; when the timestamp
          is missing the Unix epoch (``"01/01/1970 00:00:00"``) is used as a
          sentinel.
    """
    def field(name):
        # getattr with a default keeps one absent attribute from aborting the
        # whole record with AttributeError.
        value = getattr(subreddit, name, None)
        # Historical contract of this module: falsy values collapse to None.
        return value if value else None

    date_format = "%d/%m/%Y %H:%M:%S"
    utc = datetime.timezone.utc
    created = field("created_utc")
    if created:
        # Convert directly in UTC. Building a naive local datetime and then
        # relabelling it as UTC shifts the value by the host's UTC offset.
        date = datetime.datetime.fromtimestamp(int(created), tz=utc).strftime(date_format)
    else:
        # Epoch sentinel, formatted like real dates so the field has one type.
        date = datetime.datetime(1970, 1, 1, tzinfo=utc).strftime(date_format)

    record = (
        field("id"),
        field("name"),
        field("display_name"),
        field("subscribers"),
        date,
        field("description"),
        field("public_description"),
        field("over18"),
        field("spoilers_enabled"),
        field("can_assign_user_flair"),
        field("can_assign_link_flair"),
        field("lang"),
        field("active_user_count"),
    )
    return record
302
+
303
+
304
def getComments(reddit, lst_ids, replace_more_limit=0):
    """
    Collect the comments of several submissions into a single DataFrame.

    Args:
        reddit: Client object whose ``submission(id)`` method returns a
            submission (presumably a ``praw.Reddit`` instance -- confirm).
        lst_ids (list): Submission identifiers; each one is converted with
            ``str()`` before the lookup.
        replace_more_limit: Value forwarded as ``limit`` to
            ``submission.comments.replace_more``. With the default ``0`` the
            "load more comments" placeholders are dropped without extra
            requests; per the PRAW documentation ``None`` expands all of them,
            at the cost of additional requests.

    Returns:
        pd.DataFrame: One row per comment: the submission id as given in
        ``lst_ids``, followed by the comment fields and the author fields
        returned by ``parse_comments``.
    """
    all_records = []
    for submission_id in tqdm(lst_ids, total=len(lst_ids), desc="Récupération des commentaires"):
        submission = reddit.submission(str(submission_id))

        # Without this call the flattened list can contain MoreComments
        # placeholders, which have no body/score/etc. and break the parser.
        submission.comments.replace_more(limit=replace_more_limit)

        for comment in submission.comments.list():
            all_records.append((submission_id,) + parse_comments(comment))

    columns = [
        "submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_edited", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
        "comment_controversiality", "comment_depth", "comment_downs", "comment_likes", "comment_num_reports", "comment_score", "comment_total_awards_received", "comment_ups",
        "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
    ]
    df = pd.DataFrame.from_records(all_records, columns=columns)

    return df
319
+
320
def parse_comments(comment):
    """
    Flatten a Reddit comment object into a fixed-order tuple.

    Args:
        comment: Object exposing the comment attributes read below (presumably
            a ``praw.models.Comment`` -- confirm against callers).

    Returns:
        tuple: 26 values: the 17 comment fields ``(id, body, date,
        distinguished, edited, is_submitter, link_id, parent_id, permalink,
        controversiality, depth, downs, likes, num_reports, score,
        total_awards_received, ups)`` followed by the 9 author fields produced
        by ``parse_author``.

        * Attributes that are missing or falsy (``0``, ``False``, ``""``,
          ``None``) are reported as ``None``.
        * ``date`` is derived from ``created_utc`` and is a
          ``"%d/%m/%Y %H:%M:%S"`` string expressed in UTC, or ``None`` when
          the timestamp is missing.
        * When the comment has no author, the author part is nine ``None``
          values.
    """
    def field(name):
        # getattr with a default keeps one absent attribute from aborting the
        # whole record with AttributeError.
        value = getattr(comment, name, None)
        # Historical contract of this module: falsy values collapse to None.
        return value if value else None

    created = field("created_utc")
    if created:
        # Convert directly in UTC. Building a naive local datetime and then
        # relabelling it as UTC shifts the value by the host's UTC offset.
        comment_date = datetime.datetime.fromtimestamp(
            int(created), tz=datetime.timezone.utc
        ).strftime("%d/%m/%Y %H:%M:%S")
    else:
        comment_date = None

    author = field("author")
    if author:
        author_record = parse_author(author)
    else:
        author_record = (None,) * 9

    record = (
        field("id"),
        field("body"),
        comment_date,
        field("distinguished"),
        field("edited"),
        field("is_submitter"),
        field("link_id"),
        field("parent_id"),
        field("permalink"),
        field("controversiality"),
        field("depth"),
        field("downs"),
        field("likes"),
        field("num_reports"),
        field("score"),
        field("total_awards_received"),
        field("ups"),
    ) + author_record

    return record
399
+