opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,214 +2,53 @@ import praw
2
2
  import datetime
3
3
  import pandas as pd
4
4
  from tqdm import tqdm
5
+ import time
5
6
 
6
- def parse_author(author):
7
- print(author)
8
- if author:
9
- if author.comment_karma:
10
- author_comment_karma=author.comment_karma
11
- else:
12
- author_comment_karma = None
13
- if author.created_utc:
14
- author_created_utc=datetime.datetime.fromtimestamp(int(author.created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
15
- else:
16
- author_created_utc = datetime.datetime(1970,1,1,0,0,0)
17
- if author.icon_img:
18
- author_icon_img=author.icon_img
19
- else:
20
- author_icon_img = None
21
- if author.id:
22
- author_id=author.id
23
- else:
24
- author_id = None
25
- if author.is_employee:
26
- author_is_employee=author.is_employee
27
- else:
28
- author_is_employee = None
29
- if author.is_mod:
30
- author_is_mod=author.is_mod
31
- else:
32
- author_is_mod = None
33
-
34
- if author.is_gold:
35
- author_is_gold=author.is_gold
36
- else:
37
- author_is_gold = None
38
-
39
- if author.link_karma:
40
- author_link_karma=author.link_karma
41
- else:
42
- author_link_karma = None
43
-
44
- if author.name:
45
- author_name=author.name
46
- else:
47
- author_name = None
48
- record = (author_id, author_name, author_link_karma, author_comment_karma, author_created_utc, author_icon_img, author_is_employee, author_is_mod, author_is_gold)
49
- else:
50
- record = (None, None, None, None, None, None, None, None, None)
51
- return record
52
-
53
-
54
- def getSubmissions(reddit_client, lst_ids, subreddit_filter, subreddit_items, time_filter):
55
-
56
- all_records = []
57
- for url in tqdm(lst_ids, total=len(lst_ids), desc="Récupération des soumissions"):
58
- subreddit = reddit_client.subreddit(str(url))
59
-
60
- sub_record = parse_subreddit(subreddit)
61
-
62
- if subreddit_filter == "top":
63
- subreddit_selection = subreddit.top(limit=subreddit_items, time_filter= time_filter)
64
- elif subreddit_filter == "hot":
65
- subreddit_selection = subreddit.hot(limit=subreddit_items, time_filter= time_filter)
66
- elif subreddit_filter == "controversial":
67
- subreddit_selection = subreddit.controversial(limit=subreddit_items, time_filter= time_filter)
68
- else:
69
- subreddit_selection = subreddit.new(limit=subreddit_items, time_filter= time_filter)
70
-
71
- for submission in subreddit_selection:
72
- author = submission.author
73
- author_record = parse_author(author)
74
- submission_record = parse_submission(submission)
75
-
76
- record = sub_record + author_record + submission_record
77
- all_records.append(record)
78
-
79
- df = pd.DataFrame.from_records(all_records,
80
- columns = ["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
81
- "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
82
- "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
83
- "submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
84
- "submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_score", "submission_selftext", "submission_spoiler",
85
- "submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
86
- "submission_total_awards_received", "submission_view_count"]
87
- )
88
-
89
- return df
90
-
91
-
92
- def parse_submission(submission):
93
- if submission.id:
94
- submission_id=submission.id
95
- else:
96
- submission_id = None
97
- if submission.title:
98
- submission_title=submission.title
99
- else:
100
- submission_title = None
101
- if submission.name:
102
- submission_name=submission.name
103
- else:
104
- submission_name = None
105
- if submission.created_utc:
106
- submission_created_utc=datetime.datetime.fromtimestamp(int(submission.created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
107
- else:
108
- submission_created_utc = datetime.datetime(1970,1,1,0,0,0)
109
- if submission.distinguished:
110
- submission_distinguished=submission.distinguished
111
- else:
112
- submission_distinguished = None
113
- if submission.edited:
114
- submission_edited=submission.edited
115
- else:
116
- submission_edited = None
117
- if submission.is_self:
118
- submission_is_self=submission.is_self
119
- else:
120
- submission_is_self = None
121
- if submission.link_flair_template_id:
122
- submission_link_flair_template_id=submission.link_flair_template_id
123
- else:
124
- submission_link_flair_template_id = None
125
- if submission.link_flair_text:
126
- submission_link_flair_text=submission.link_flair_text
127
- else:
128
- submission_link_flair_text = None
129
- if submission.locked:
130
- submission_locked=submission.locked
131
- else:
132
- submission_locked = None
133
- if submission.num_comments:
134
- submission_num_comments=submission.num_comments
135
- else:
136
- submission_num_comments = None
137
- if submission.over_18:
138
- submission_over_18=submission.over_18
139
- else:
140
- submission_over_18 = None
141
- if submission.permalink:
142
- submission_permalink=submission.permalink
143
- else:
144
- submission_permalink = None
145
- if submission.score:
146
- submission_score=submission.score
147
- else:
148
- submission_score = None
149
- if submission.selftext:
150
- submission_selftext=submission.selftext
151
- else:
152
- submission_selftext = None
153
- if submission.spoiler:
154
- submission_spoiler=submission.spoiler
155
- else:
156
- submission_spoiler = None
157
- if submission.stickied:
158
- submission_stickied=submission.stickied
159
- else:
160
- submission_stickied = None
161
- if submission.upvote_ratio:
162
- submission_upvote_ratio=submission.upvote_ratio
163
- else:
164
- submission_upvote_ratio = None
165
- if submission.url:
166
- submission_url=submission.url
167
- else:
168
- submission_url = None
169
- if submission.downs:
170
- submission_downs=submission.downs
171
- else:
172
- submission_downs = None
173
- if submission.num_crossposts:
174
- submission_num_crossposts=submission.num_crossposts
175
- else:
176
- submission_num_crossposts = None
177
- if submission.num_reports:
178
- submission_num_reports=submission.num_reports
179
- else:
180
- submission_num_reports = None
181
-
182
- if submission.score:
183
- submission_score=submission.score
184
- else:
185
- submission_score = None
186
-
187
- if submission.ups:
188
- submission_ups=submission.ups
189
- else:
190
- submission_ups = None
7
def check_limit(reddit_client: "praw.Reddit") -> tuple:
    """
    Check the Reddit client rate limit and sleep until the reset if the quota is nearly exhausted.

    The limits are read from ``reddit_client.auth.limits``. When 10 or fewer
    requests remain and the reset time is still in the future, the call blocks
    (``time.sleep``) until that reset time.

    Args:
        reddit_client (praw.Reddit): current reddit client

    Returns:
        tuple containing the following information:
            - remaining: remaining queries.
            - reset_timestamp: time of the next reset (compared against ``time.time()``).
            - used: number of sent queries.
        Any element is None when the client does not expose the value.
    """
    limits = reddit_client.auth.limits
    remaining = limits.get('remaining')
    reset_timestamp = limits.get('reset_timestamp')
    used = limits.get('used')

    # Compare against None explicitly: ``remaining == 0`` is a valid value and is
    # precisely the situation in which waiting matters most.
    if remaining is None or reset_timestamp is None:
        print("Missing required header information. Cannot determine wait time.")
        return remaining, reset_timestamp, used

    if remaining <= 10:
        # Time left (in seconds) before the quota is reset
        wait_time = reset_timestamp - time.time()
        if wait_time > 0:
            print(f"Waiting for {wait_time:.2f} seconds until the next reset...")
            time.sleep(wait_time)
        else:
            print("Reset time is in the past. No need to wait.")

    return remaining, reset_timestamp, used
202
44
 
203
- record = (submission_id, submission_title, submission_name, submission_created_utc, submission_distinguished, submission_edited, submission_is_self, submission_link_flair_template_id, submission_link_flair_text, submission_locked, submission_num_comments, submission_over_18, submission_permalink, submission_score, submission_selftext, submission_spoiler, submission_stickied, submission_url, submission_upvote_ratio, submission_downs, submission_ups, submission_num_crossposts, submission_num_reports, submission_score, submission_total_awards_received, submission_view_count)
204
- return record
205
-
206
-
207
45
 
208
46
  def get_subreddit_info(reddit_client : praw.Reddit, lst_ids: list) -> pd.DataFrame:
209
47
  """
210
48
  Retrieves information about subreddits based on a list of subreddit IDs.
211
49
 
212
50
  Args:
51
+ reddit_client (praw.Reddit): current reddit client
213
52
  lst_ids (list): A list of subreddit IDs.
214
53
 
215
54
  Returns:
@@ -228,172 +67,341 @@ def get_subreddit_info(reddit_client : praw.Reddit, lst_ids: list) -> pd.DataFra
228
67
  """
229
68
  all_records = []
230
69
  for reddit_id in lst_ids:
70
+ remaining, reset_timestamp, used = check_limit(reddit_client)
231
71
  subreddit = reddit_client.subreddit(str(reddit_id))
232
- record = parse_subreddit(subreddit)
72
+ record = parse_subreddit(reddit_client, subreddit)
233
73
 
234
74
  all_records.append(record)
235
75
 
236
76
  df = pd.DataFrame.from_records(all_records, columns=["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18", "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count"])
237
77
  return df
238
78
 
239
- def parse_subreddit(subreddit):
240
- if subreddit.id:
241
- subreddit_id = subreddit.id
242
- else:
243
- subreddit_id = None
79
+
80
def getSubmissions(reddit_client : praw.Reddit, sub_id : str, subreddit_filter : str, subreddit_items : int, time_filter : str) -> pd.DataFrame:
    """
    Retrieves submissions from a subreddit ID.

    Each returned row is the concatenation of the subreddit record, the
    author record and the submission record. A submission whose processing
    raises is reported with ``print`` and skipped; the remaining ones are
    still collected.

    Args:
        reddit_client (praw.Reddit): current reddit client
        sub_id (str): a subreddit ID.
        subreddit_filter (str): the listing to read (top, hot, new, controversial, gilded, rising).
        subreddit_items (int): the number of items to retrieve. None to retrieve all items.
        time_filter (str): the time filter (hour, day, week, month, year, all). Only used by the "top" and "controversial" listings.

    Returns:
        pd.DataFrame: A DataFrame containing submissions metadata. An empty, column-less DataFrame is returned when ``subreddit_filter`` is not a supported listing.
    """
    columns = [
        "subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
        "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
        "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
        "submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
        "submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_selftext", "submission_spoiler",
        "submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
        "submission_total_awards_received", "submission_view_count",
    ]

    check_limit(reddit_client)
    subreddit = reddit_client.subreddit(str(sub_id))
    if not vars(subreddit).get('_fetched'):
        check_limit(reddit_client)
        subreddit._fetch()

    sub_record = parse_subreddit(reddit_client, subreddit)

    # Only "top" and "controversial" are called with a time filter; the other
    # listings are called with the item limit alone.
    if subreddit_filter in ("top", "controversial"):
        subreddit_selection = getattr(subreddit, subreddit_filter)(limit=subreddit_items, time_filter=time_filter)
    elif subreddit_filter in ("hot", "new", "gilded", "rising"):
        subreddit_selection = getattr(subreddit, subreddit_filter)(limit=subreddit_items)
    else:
        return pd.DataFrame()

    check_limit(reddit_client)
    all_records = []
    for submission in subreddit_selection:
        try:
            check_limit(reddit_client)
            if not vars(submission).get('_fetched'):
                submission._fetch()

            author_record = parse_author(reddit_client, submission.author)
            submission_record = parse_submission(reddit_client, submission)
            all_records.append(sub_record + author_record + submission_record)
        except Exception as e:
            # Deliberate best-effort: one failing submission must not abort the
            # whole collection, so the error is reported and the item skipped.
            print(e)

    return pd.DataFrame.from_records(all_records, columns=columns)
284
152
 
285
- if subreddit.spoilers_enabled:
286
- spoilers_enabled = subreddit.spoilers_enabled
287
- else:
288
- spoilers_enabled = None
289
153
 
290
- if subreddit.can_assign_user_flair:
291
- can_assign_user_flair = subreddit.can_assign_user_flair
292
- else:
293
- can_assign_user_flair = None
294
154
 
295
- if subreddit.can_assign_link_flair:
296
- can_assign_link_flair = subreddit.can_assign_link_flair
297
- else:
298
- can_assign_link_flair = None
155
def getComments(reddit_client : praw.Reddit, submission_id : str) -> pd.DataFrame:
    """
    Retrieves all comments from a submission ID.

    Every "more comments" placeholder is resolved first
    (``replace_more(limit=None)``), then the comment tree is flattened and each
    comment is parsed together with its author.

    Args:
        reddit_client (praw.Reddit): current reddit client
        submission_id (str): a submission ID.

    Returns:
        pd.DataFrame: A DataFrame containing comments metadata, one row per comment.
    """
    check_limit(reddit_client)
    submission = reddit_client.submission(str(submission_id))
    if not vars(submission).get('_fetched'):
        submission._fetch()

    submission.comments.replace_more(limit=None)
    check_limit(reddit_client)

    # Flatten the tree once and reuse the list for both the iteration and the
    # progress-bar total instead of walking the whole forest twice.
    comments = submission.comments.list()

    all_records = []
    for comment in tqdm(comments, total=len(comments), desc="Récupération des commentaires"):
        check_limit(reddit_client)
        all_records.append((submission_id,) + parse_comments(reddit_client, comment))

    df = pd.DataFrame.from_records(all_records, columns=["submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
                                                         "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
                                                         "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold"
                                                         ])

    return df
319
188
 
320
- def parse_comments(comment):
321
- if comment.id:
322
- comment_id=comment.id
323
- else:
324
- comment_id = None
325
- if comment.body:
326
- comment_body=comment.body
327
- else:
328
- comment_body = None
329
- if comment.created_utc:
330
- comment_date=datetime.datetime.fromtimestamp(int(comment.created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
331
- else:
332
- comment_date = None
333
- if comment.distinguished:
334
- comment_distinguished=comment.distinguished
335
- else:
336
- comment_distinguished = None
337
- if comment.edited:
338
- comment_edited=comment.edited
339
- else:
340
- comment_edited = None
341
- if comment.is_submitter:
342
- comment_is_submitter=comment.is_submitter
343
- else:
344
- comment_is_submitter = None
189
def get_top_level_comments(reddit_client : praw.Reddit, submission_id : str) -> pd.DataFrame:
    """
    Retrieves the top level comments of a submission.

    "More comments" placeholders are resolved first, then only the comments
    found by iterating ``submission.comments`` directly are parsed.

    Args:
        reddit_client (praw.Reddit): current reddit client
        submission_id (str): a submission ID.

    Returns:
        pd.DataFrame: A DataFrame containing comments metadata.
    """
    column_names = [
        "submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
        "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
        "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
    ]

    check_limit(reddit_client)
    post = reddit_client.submission(str(submission_id))
    already_fetched = vars(post).get('_fetched')
    if not already_fetched:
        post._fetch()

    post.comments.replace_more(limit=None)
    check_limit(reddit_client)

    rows = []
    progress = tqdm(post.comments, total=len(post.comments), desc="Récupération des commentaires")
    for top_comment in progress:
        # Rate limit is checked before each comment is parsed
        check_limit(reddit_client)
        parsed = parse_comments(reddit_client, top_comment)
        rows.append((submission_id,) + parsed)

    return pd.DataFrame.from_records(rows, columns=column_names)
222
+
223
def parse_author(reddit_client: "praw.Reddit", author: "praw.models.Redditor") -> tuple:
    """
    Parses a Reddit author object and extracts relevant information.

    The author is fetched first when it has not been fetched yet (after a
    rate-limit check). Attributes that are absent from the object are
    reported as None.

    Args:
        reddit_client (praw.Reddit): current reddit client
        author (praw.models.Redditor): The Reddit author object. May be None or falsy.

    Returns:
        tuple: A tuple containing the following information about the author:
            - author_id: The ID of the author.
            - author_name: The name of the author.
            - author_link_karma: The link karma of the author.
            - author_comment_karma: The comment karma of the author.
            - author_created_utc: The creation date of the author, in UTC, formatted as "%d/%m/%Y %H:%M:%S" (the epoch when the date is missing).
            - author_icon_img: The icon image of the author.
            - author_is_employee: Indicates if the author is an employee.
            - author_is_mod: Indicates if the author is a moderator.
            - author_is_gold: Indicates if the author has Reddit Gold.
        A tuple of nine None values is returned when ``author`` is falsy.
    """
    if not author:
        return (None,) * 9

    if not vars(author).get('_fetched'):
        check_limit(reddit_client)
        author._fetch()

    # Read the attribute dict once, after the fetch, so freshly loaded values are seen
    attrs = vars(author)

    created = attrs.get("created_utc")
    # Convert directly in UTC: fromtimestamp() without tz yields *local* time, and
    # relabelling it with replace(tzinfo=utc) does not shift it. A missing date
    # falls back to the epoch, formatted like every other value of the column.
    author_created_utc = datetime.datetime.fromtimestamp(
        int(created) if created else 0, tz=datetime.timezone.utc
    ).strftime("%d/%m/%Y %H:%M:%S")

    return (
        attrs.get("id"),
        attrs.get("name"),
        attrs.get("link_karma"),
        attrs.get("comment_karma"),
        author_created_utc,
        attrs.get("icon_img"),
        attrs.get("is_employee"),
        attrs.get("is_mod"),
        attrs.get("is_gold"),
    )
268
+
269
def parse_submission(reddit_client: "praw.Reddit", submission: "praw.models.Submission") -> tuple:
    """
    Parses a Reddit submission object and extracts relevant information.

    The submission is fetched first when it has not been fetched yet (after a
    rate-limit check). Attributes that are absent from the object are
    reported as None.

    Args:
        reddit_client (praw.Reddit): current reddit client
        submission (praw.models.Submission): The Reddit submission object. May be None or falsy.

    Returns:
        tuple: A 25-element tuple: id, title, name, created_utc (UTC, formatted as
        "%d/%m/%Y %H:%M:%S", the epoch when missing), distinguished, edited, is_self,
        link_flair_template_id, link_flair_text, locked, num_comments, over_18,
        permalink, selftext, spoiler, stickied, url, upvote_ratio, downs, ups,
        num_crossposts, num_reports, score, total_awards_received, view_count.
        All elements are None when ``submission`` is falsy.
    """
    # Attribute names, in the order of the returned record
    fields = (
        "id", "title", "name", "created_utc", "distinguished", "edited", "is_self", "link_flair_template_id",
        "link_flair_text", "locked", "num_comments", "over_18", "permalink", "selftext", "spoiler",
        "stickied", "url", "upvote_ratio", "downs", "ups", "num_crossposts", "num_reports", "score",
        "total_awards_received", "view_count",
    )

    if not submission:
        return (None,) * len(fields)

    if not vars(submission).get('_fetched'):
        check_limit(reddit_client)
        submission._fetch()

    # Read the attribute dict once, after the fetch, so freshly loaded values are seen
    attrs = vars(submission)

    created = attrs.get("created_utc")
    # Convert directly in UTC: fromtimestamp() without tz yields *local* time, and
    # relabelling it with replace(tzinfo=utc) does not shift it. A missing date
    # falls back to the epoch, formatted like every other value of the column.
    submission_created_utc = datetime.datetime.fromtimestamp(
        int(created) if created else 0, tz=datetime.timezone.utc
    ).strftime("%d/%m/%Y %H:%M:%S")

    return tuple(
        submission_created_utc if field == "created_utc" else attrs.get(field)
        for field in fields
    )
321
+
322
+
323
def parse_subreddit(reddit_client: "praw.Reddit", subreddit: "praw.models.Subreddit") -> tuple:
    """
    Parses a Reddit subreddit object and extracts relevant information.

    The subreddit is fetched first when it has not been fetched yet (after a
    rate-limit check). Attributes that are absent from the object are
    reported as None.

    Args:
        reddit_client (praw.Reddit): current reddit client
        subreddit (praw.models.Subreddit): The Reddit subreddit object. May be None or falsy.

    Returns:
        tuple: A 13-element tuple: id, name, display_name, subscribers, creation date
        (UTC, formatted as "%d/%m/%Y %H:%M:%S", the epoch when missing), description,
        public_description, over18, spoilers_enabled, can_assign_user_flair,
        can_assign_link_flair, lang, active_user_count.
        All elements are None when ``subreddit`` is falsy.
    """
    if not subreddit:
        return (None,) * 13

    if not vars(subreddit).get('_fetched'):
        check_limit(reddit_client)
        subreddit._fetch()

    # Read the attribute dict once, after the fetch, so freshly loaded values are seen
    attrs = vars(subreddit)

    created = attrs.get("created_utc")
    # Convert directly in UTC: fromtimestamp() without tz yields *local* time, and
    # relabelling it with replace(tzinfo=utc) does not shift it. A missing date
    # falls back to the epoch, formatted like every other value of the column.
    date = datetime.datetime.fromtimestamp(
        int(created) if created else 0, tz=datetime.timezone.utc
    ).strftime("%d/%m/%Y %H:%M:%S")

    return (
        attrs.get("id"),
        attrs.get("name"),
        attrs.get("display_name"),
        attrs.get("subscribers"),
        date,
        attrs.get("description"),
        attrs.get("public_description"),
        attrs.get("over18"),
        attrs.get("spoilers_enabled"),
        attrs.get("can_assign_user_flair"),
        attrs.get("can_assign_link_flair"),
        attrs.get("lang"),
        attrs.get("active_user_count"),
    )
395
363
 
396
- record = (comment_id, comment_body, comment_date, comment_distinguished, comment_edited, comment_is_submitter, comment_link_id, comment_parent_id, comment_permalink, comment_controversiality, comment_depth, comment_downs, comment_likes, comment_num_reports, comment_score, comment_total_awards_received, comment_ups) + author_record
364
def parse_comments(reddit_client: "praw.Reddit", comment: "praw.models.Comment") -> tuple:
    """
    Parses a Reddit comment object and extracts relevant information.

    The comment is fetched first when it has not been fetched yet (after a
    rate-limit check). Attributes that are absent from the object are
    reported as None. The comment's author is parsed with ``parse_author`` and
    appended to the comment fields.

    Args:
        reddit_client (praw.Reddit): current reddit client
        comment (praw.models.Comment): The Reddit comment object. May be None or falsy.

    Returns:
        tuple: A 22-element tuple: 13 comment fields (id, body, date in UTC formatted
        as "%d/%m/%Y %H:%M:%S" — the epoch when missing —, distinguished, is_submitter,
        link_id, parent_id, permalink, controversiality, depth, score,
        total_awards_received, ups) followed by the 9 author fields.
        All elements are None when ``comment`` is falsy.
    """
    if not comment:
        # 13 comment fields + 9 author fields
        return (None,) * 22

    if not vars(comment).get('_fetched'):
        check_limit(reddit_client)
        comment._fetch()

    # Read the attribute dict once, after the fetch, so freshly loaded values are seen
    attrs = vars(comment)

    created = attrs.get("created_utc")
    # Convert directly in UTC: fromtimestamp() without tz yields *local* time, and
    # relabelling it with replace(tzinfo=utc) does not shift it. A missing date
    # falls back to the epoch, formatted like every other value of the column.
    comment_date = datetime.datetime.fromtimestamp(
        int(created) if created else 0, tz=datetime.timezone.utc
    ).strftime("%d/%m/%Y %H:%M:%S")

    comment_record = (
        attrs.get("id"),
        attrs.get("body"),
        comment_date,
        attrs.get("distinguished"),
        attrs.get("is_submitter"),
        attrs.get("link_id"),
        attrs.get("parent_id"),
        attrs.get("permalink"),
        attrs.get("controversiality"),
        attrs.get("depth"),
        attrs.get("score"),
        attrs.get("total_awards_received"),
        attrs.get("ups"),
    )

    author_record = parse_author(reddit_client, comment.author)
    return comment_record + author_record
399
407