opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/reddit.py +399 -0
- opsci_toolbox/apis/telegram.py +1035 -0
- opsci_toolbox/helpers/common.py +176 -4
- opsci_toolbox/helpers/dataviz.py +184 -26
- opsci_toolbox/helpers/dates.py +46 -0
- opsci_toolbox/helpers/gliner.py +88 -0
- opsci_toolbox/helpers/nlp.py +188 -7
- opsci_toolbox/helpers/nlp_cuml.py +3 -3
- opsci_toolbox/helpers/sna.py +1 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.13.dist-info}/METADATA +4 -1
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.13.dist-info}/RECORD +13 -10
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.13.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,399 @@
|
|
1
|
+
import praw
|
2
|
+
import datetime
|
3
|
+
import pandas as pd
|
4
|
+
from tqdm import tqdm
|
5
|
+
|
6
|
+
def parse_author(author):
    """
    Flatten a Reddit author object into a fixed-order tuple.

    Args:
        author: Object exposing the author attributes read below (presumably a
            ``praw.models.Redditor`` - confirm against callers), or a falsy
            value when the author is unknown.

    Returns:
        tuple: ``(id, name, link_karma, comment_karma, created_utc, icon_img,
        is_employee, is_mod, is_gold)``. ``created_utc`` is a
        ``"%d/%m/%Y %H:%M:%S"`` string expressed in UTC; when the creation
        timestamp is unavailable the Unix epoch is used. Attributes that are
        missing, ``None`` or an empty string are reported as ``None``, while
        legitimate ``0`` / ``False`` values are kept. A falsy ``author``
        yields a tuple of nine ``None``.
    """
    if not author:
        return (None,) * 9

    def _field(name):
        # getattr with a default tolerates authors that do not expose every
        # attribute instead of raising AttributeError.
        value = getattr(author, name, None)
        if isinstance(value, str) and not value:
            return None
        return value

    # Convert straight to UTC: fromtimestamp() without tz yields local time,
    # and stamping tzinfo on it afterwards would not shift the clock.
    created = _field("created_utc")
    author_created_utc = datetime.datetime.fromtimestamp(
        int(created or 0), tz=datetime.timezone.utc
    ).strftime("%d/%m/%Y %H:%M:%S")

    return (
        _field("id"),
        _field("name"),
        _field("link_karma"),
        _field("comment_karma"),
        author_created_utc,
        _field("icon_img"),
        _field("is_employee"),
        _field("is_mod"),
        _field("is_gold"),
    )
|
52
|
+
|
53
|
+
|
54
|
+
def getSubmissions(reddit_client, lst_ids, subreddit_filter, subreddit_items, time_filter):
    """
    Collect submissions from several subreddits into a single DataFrame.

    Args:
        reddit_client: Client exposing ``subreddit(name)`` (presumably a
            ``praw.Reddit`` instance - confirm against callers).
        lst_ids (list): Subreddit identifiers; each one is converted to ``str``
            and passed to ``reddit_client.subreddit``.
        subreddit_filter (str): ``"top"``, ``"hot"`` or ``"controversial"``;
            any other value selects the ``new`` listing.
        subreddit_items: Value forwarded as ``limit`` to the listing call.
        time_filter: Forwarded as ``time_filter`` to the ``top`` and
            ``controversial`` listings only. It is ignored for ``hot`` and
            ``new``, whose listing calls do not take that argument.

    Returns:
        pd.DataFrame: One row per submission, built by concatenating the
        tuples returned by ``parse_subreddit``, ``parse_author`` and
        ``parse_submission``.
    """
    # NOTE: "submission_score" appears twice because parse_submission emits the
    # score twice; the layout is kept as-is so existing consumers keep working.
    columns = [
        "subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
        "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
        "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
        "submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
        "submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_score", "submission_selftext", "submission_spoiler",
        "submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
        "submission_total_awards_received", "submission_view_count",
    ]

    all_records = []
    for subreddit_name in tqdm(lst_ids, total=len(lst_ids), desc="Récupération des soumissions"):
        subreddit = reddit_client.subreddit(str(subreddit_name))
        sub_record = parse_subreddit(subreddit)

        if subreddit_filter == "top":
            subreddit_selection = subreddit.top(limit=subreddit_items, time_filter=time_filter)
        elif subreddit_filter == "controversial":
            subreddit_selection = subreddit.controversial(limit=subreddit_items, time_filter=time_filter)
        elif subreddit_filter == "hot":
            # hot/new listings are not time-windowed; passing time_filter to
            # them fails with an unexpected-keyword error.
            subreddit_selection = subreddit.hot(limit=subreddit_items)
        else:
            subreddit_selection = subreddit.new(limit=subreddit_items)

        for submission in subreddit_selection:
            author_record = parse_author(submission.author)
            submission_record = parse_submission(submission)
            all_records.append(sub_record + author_record + submission_record)

    return pd.DataFrame.from_records(all_records, columns=columns)
|
90
|
+
|
91
|
+
|
92
|
+
def parse_submission(submission):
    """
    Flatten a Reddit submission object into a fixed-order tuple of 26 values.

    Args:
        submission: Object exposing the submission attributes read below
            (presumably a ``praw.models.Submission`` - confirm against callers).

    Returns:
        tuple: ``(id, title, name, created_utc, distinguished, edited, is_self,
        link_flair_template_id, link_flair_text, locked, num_comments, over_18,
        permalink, score, selftext, spoiler, stickied, url, upvote_ratio,
        downs, ups, num_crossposts, num_reports, score,
        total_awards_received, view_count)``.

        ``score`` is intentionally emitted twice so the tuple keeps the layout
        callers in this module map to column names. ``created_utc`` is a
        ``"%d/%m/%Y %H:%M:%S"`` string expressed in UTC (Unix epoch when the
        timestamp is unavailable). Attributes that are missing, ``None`` or an
        empty string are reported as ``None``; legitimate ``0`` / ``False``
        values are kept.
    """
    def _field(name):
        # Optional attributes (e.g. flair template id) may be absent on the
        # object; fall back to None rather than raising AttributeError.
        value = getattr(submission, name, None)
        if isinstance(value, str) and not value:
            return None
        return value

    # Convert straight to UTC: fromtimestamp() without tz yields local time,
    # and stamping tzinfo on it afterwards would not shift the clock.
    created = _field("created_utc")
    submission_created_utc = datetime.datetime.fromtimestamp(
        int(created or 0), tz=datetime.timezone.utc
    ).strftime("%d/%m/%Y %H:%M:%S")

    before_date = ("id", "title", "name")
    after_date = (
        "distinguished", "edited", "is_self", "link_flair_template_id",
        "link_flair_text", "locked", "num_comments", "over_18", "permalink",
        "score", "selftext", "spoiler", "stickied", "url", "upvote_ratio",
        "downs", "ups", "num_crossposts", "num_reports", "score",
        "total_awards_received", "view_count",
    )

    record = (
        tuple(_field(name) for name in before_date)
        + (submission_created_utc,)
        + tuple(_field(name) for name in after_date)
    )
    return record
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
def get_subreddit_info(reddit_client : praw.Reddit, lst_ids: list) -> pd.DataFrame:
    """
    Build a DataFrame describing each subreddit listed in ``lst_ids``.

    Args:
        reddit_client (praw.Reddit): Client used to look up each subreddit.
        lst_ids (list): Subreddit identifiers; each one is converted to ``str``
            and passed to ``reddit_client.subreddit``.

    Returns:
        pd.DataFrame: One row per entry of ``lst_ids``, holding the tuple
        returned by ``parse_subreddit`` under these columns:
            - subreddit_id
            - subreddit_name
            - subreddit_display_name
            - subreddit_subscribers
            - subreddit_date
            - subreddit_description
            - subreddit_public_description
            - subreddit_over18
            - subreddit_spoilers_enabled
            - subreddit_can_assign_user_flair
            - subreddit_can_assign_link_flair
            - subreddit_lang
            - subreddit_active_user_count
    """
    column_names = [
        "subreddit_id",
        "subreddit_name",
        "subreddit_display_name",
        "subreddit_subscribers",
        "subreddit_date",
        "subreddit_description",
        "subreddit_public_description",
        "subreddit_over18",
        "subreddit_spoilers_enabled",
        "subreddit_can_assign_user_flair",
        "subreddit_can_assign_link_flair",
        "subreddit_lang",
        "subreddit_active_user_count",
    ]
    rows = [
        parse_subreddit(reddit_client.subreddit(str(identifier)))
        for identifier in lst_ids
    ]
    return pd.DataFrame.from_records(rows, columns=column_names)
|
238
|
+
|
239
|
+
def parse_subreddit(subreddit):
    """
    Flatten a subreddit object into a fixed-order tuple of 13 values.

    Args:
        subreddit: Object exposing the subreddit attributes read below
            (presumably a ``praw.models.Subreddit`` - confirm against callers).

    Returns:
        tuple: ``(id, name, display_name, subscribers, date, description,
        public_description, over18, spoilers_enabled, can_assign_user_flair,
        can_assign_link_flair, lang, active_user_count)``.

        ``date`` is the creation time as a ``"%d/%m/%Y %H:%M:%S"`` string
        expressed in UTC (Unix epoch when the timestamp is unavailable).
        Attributes that are missing, ``None`` or an empty string are reported
        as ``None``; legitimate ``0`` / ``False`` values are kept.
    """
    def _field(name):
        # Tolerate objects that do not expose every attribute.
        value = getattr(subreddit, name, None)
        if isinstance(value, str) and not value:
            return None
        return value

    # Convert straight to UTC: fromtimestamp() without tz yields local time,
    # and stamping tzinfo on it afterwards would not shift the clock.
    created = _field("created_utc")
    date = datetime.datetime.fromtimestamp(
        int(created or 0), tz=datetime.timezone.utc
    ).strftime("%d/%m/%Y %H:%M:%S")

    record = (
        _field("id"),
        _field("name"),
        _field("display_name"),
        _field("subscribers"),
        date,
        _field("description"),
        _field("public_description"),
        _field("over18"),
        _field("spoilers_enabled"),
        _field("can_assign_user_flair"),
        _field("can_assign_link_flair"),
        _field("lang"),
        _field("active_user_count"),
    )
    return record
|
302
|
+
|
303
|
+
|
304
|
+
def getComments(reddit, lst_ids):
    """
    Collect the comments of several submissions into a single DataFrame.

    Args:
        reddit: Client exposing ``submission(id)`` (presumably a
            ``praw.Reddit`` instance - confirm against callers).
        lst_ids (list): Submission identifiers; each one is converted to
            ``str`` and passed to ``reddit.submission``.

    Returns:
        pd.DataFrame: One row per comment. The first column is the submission
        identifier as given in ``lst_ids``; the remaining columns hold the
        tuple returned by ``parse_comments`` (comment fields followed by
        author fields).
    """
    columns = [
        "submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_edited", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
        "comment_controversiality", "comment_depth", "comment_downs", "comment_likes", "comment_num_reports", "comment_score", "comment_total_awards_received", "comment_ups",
        "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
    ]

    all_records = []
    for submission_id in tqdm(lst_ids, total=len(lst_ids), desc="Récupération des commentaires"):
        submission = reddit.submission(str(submission_id))
        # Drop "load more comments" placeholders before flattening: they are
        # not real comments (no body) and would break parse_comments.
        # limit=0 removes them without issuing extra requests.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            all_records.append((submission_id,) + parse_comments(comment))

    return pd.DataFrame.from_records(all_records, columns=columns)
|
319
|
+
|
320
|
+
def parse_comments(comment):
    """
    Flatten a Reddit comment object into a fixed-order tuple of 26 values.

    Args:
        comment: Object exposing the comment attributes read below
            (presumably a ``praw.models.Comment`` - confirm against callers).

    Returns:
        tuple: 17 comment fields ``(id, body, date, distinguished, edited,
        is_submitter, link_id, parent_id, permalink, controversiality, depth,
        downs, likes, num_reports, score, total_awards_received, ups)``
        followed by the 9 author fields produced by ``parse_author`` (nine
        ``None`` when the comment has no author).

        ``date`` is a ``"%d/%m/%Y %H:%M:%S"`` string expressed in UTC, or
        ``None`` when no creation timestamp is available. Attributes that are
        missing, ``None`` or an empty string are reported as ``None``;
        legitimate ``0`` / ``False`` values (e.g. ``depth`` of a top-level
        comment) are kept.
    """
    def _field(name):
        # Tolerate objects that do not expose every attribute.
        value = getattr(comment, name, None)
        if isinstance(value, str) and not value:
            return None
        return value

    created = _field("created_utc")
    if created:
        # Convert straight to UTC: fromtimestamp() without tz yields local
        # time, and stamping tzinfo on it afterwards would not shift the clock.
        comment_date = datetime.datetime.fromtimestamp(
            int(created), tz=datetime.timezone.utc
        ).strftime("%d/%m/%Y %H:%M:%S")
    else:
        comment_date = None

    author = getattr(comment, "author", None)
    if author:
        author_record = parse_author(author)
    else:
        author_record = (None,) * 9

    after_date = (
        "distinguished", "edited", "is_submitter", "link_id", "parent_id",
        "permalink", "controversiality", "depth", "downs", "likes",
        "num_reports", "score", "total_awards_received", "ups",
    )

    record = (
        (_field("id"), _field("body"), comment_date)
        + tuple(_field(name) for name in after_date)
        + author_record
    )
    return record
|
399
|
+
|