opsci-toolbox 0.0.13__tar.gz → 0.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/PKG-INFO +1 -1
  2. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/rapidapi_helpers.py +1 -2
  3. opsci_toolbox-0.0.14/opsci_toolbox/apis/reddit.py +407 -0
  4. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/telegram.py +106 -16
  5. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/common.py +1 -1
  6. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/dates.py +1 -1
  7. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/nlp.py +85 -8
  8. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/nlp_cuml.py +41 -0
  9. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/PKG-INFO +1 -1
  10. opsci_toolbox-0.0.14/opsci_toolbox.egg-info/dependency_links.txt +1 -0
  11. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/requires.txt +1 -1
  12. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/setup.py +5 -2
  13. opsci_toolbox-0.0.13/opsci_toolbox/apis/reddit.py +0 -399
  14. opsci_toolbox-0.0.13/opsci_toolbox.egg-info/dependency_links.txt +0 -1
  15. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/MANIFEST.in +0 -0
  16. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/README.md +0 -0
  17. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/__init__.py +0 -0
  18. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/__init__.py +0 -0
  19. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/webscraping.py +0 -0
  20. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/youtube_helpers.py +0 -0
  21. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/__init__.py +0 -0
  22. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/cv.py +0 -0
  23. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/dataviz.py +0 -0
  24. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/gliner.py +0 -0
  25. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/sna.py +0 -0
  26. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/sql.py +0 -0
  27. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/surreaction.py +0 -0
  28. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/lexicons/__init__.py +0 -0
  29. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/lexicons/stop_words_en.csv +0 -0
  30. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/lexicons/stop_words_fr.csv +0 -0
  31. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/SOURCES.txt +0 -0
  32. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/top_level.txt +0 -0
  33. {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci_toolbox
-Version: 0.0.13
+Version: 0.0.14
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -1,5 +1,4 @@
 import requests
-from datetime import datetime
 import pandas as pd
 from tqdm import tqdm
 import re
@@ -495,7 +494,7 @@ def parse_list_entries(jsonl_data: list)-> pd.DataFrame:
         protected = data.get("protected", False)
         verified = data.get("verified", False)
         verified_type = data.get("verified_type", "")
-        entities = data.get("entities")
+        entities = data.get("entities", {})
         urls = [url.get("expanded_url","") for url in entities.get('url', {}).get("urls",[])]
         user_mentions = [um.get("screen_name","") for um in entities.get('description', {}).get('user_mentions', [])]
         user_mentions_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('user_mentions', [])]
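
The `entities` change above is a defensive default: a profile payload without an `entities` key previously yielded `None`, which crashed the chained `.get` calls that follow. A minimal standalone sketch of the failure mode (the `data` dict here is hypothetical):

```python
data = {"protected": False}  # profile payload with no "entities" key

entities = data.get("entities")      # None when the key is missing
# entities.get('url', {})            # would raise AttributeError: 'NoneType' object has no attribute 'get'

entities = data.get("entities", {})  # empty dict instead, so chained lookups degrade gracefully
urls = [u.get("expanded_url", "") for u in entities.get('url', {}).get("urls", [])]
print(urls)  # -> []
```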
@@ -0,0 +1,407 @@
+import praw
+import datetime
+import pandas as pd
+from tqdm import tqdm
+import time
+
+def check_limit(reddit_client: praw.Reddit) -> tuple:
+    """
+    Check the Reddit client rate limit and wait if necessary.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+
+    Returns:
+        tuple containing the following information:
+            - remaining: remaining queries.
+            - reset_timestamp: timestamp of the next rate-limit reset.
+            - used: number of sent queries.
+
+    """
+    headers = reddit_client.auth.limits
+    remaining = headers.get('remaining')
+    reset_timestamp = headers.get('reset_timestamp')
+    used = headers.get('used')
+
+    if remaining and reset_timestamp:
+        if remaining <= 10:
+            # Calculate the time to wait until reset
+            current_time = time.time()
+            wait_time = reset_timestamp - current_time
+
+            if wait_time > 0:
+                # Sleep until the rate-limit window resets
+                print(f"Waiting for {wait_time:.2f} seconds until the next reset...")
+                time.sleep(wait_time)
+            else:
+                print("Reset time is in the past. No need to wait.")
+        # else:
+        #     print(f"{remaining} requests remaining. No need to wait.")
+    else:
+        print("Missing required header information. Cannot determine wait time.")
+
+    return remaining, reset_timestamp, used
+
+
+def get_subreddit_info(reddit_client: praw.Reddit, lst_ids: list) -> pd.DataFrame:
+    """
+    Retrieves information about subreddits based on a list of subreddit IDs.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        lst_ids (list): A list of subreddit IDs.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing the following information for each subreddit:
+            - subreddit_id: The ID of the subreddit.
+            - name: The name of the subreddit.
+            - display_name: The display name of the subreddit.
+            - subscribers: The number of subscribers to the subreddit.
+            - date: The creation date of the subreddit.
+            - description: The description of the subreddit.
+            - public_description: The public description of the subreddit.
+            - over18: Indicates if the subreddit is restricted to users over 18.
+            - spoilers_enabled: Indicates if spoilers are enabled in the subreddit.
+            - can_assign_user_flair: Indicates if users can assign their own flair in the subreddit.
+            - can_assign_link_flair: Indicates if users can assign flair to links in the subreddit.
+            - lang: The language of the subreddit.
+            - active_user_count: The number of currently active users.
+    """
+    all_records = []
+    for reddit_id in lst_ids:
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+        subreddit = reddit_client.subreddit(str(reddit_id))
+        record = parse_subreddit(reddit_client, subreddit)
+
+        all_records.append(record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18", "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count"])
+    return df
+
+
+def getSubmissions(reddit_client: praw.Reddit, sub_id: str, subreddit_filter: str, subreddit_items: int, time_filter: str) -> pd.DataFrame:
+    """
+    Retrieves submissions from a subreddit ID.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        sub_id (str): a subreddit ID.
+        subreddit_filter (str): the filter to apply to the subreddit (top, hot, new, controversial, gilded, rising).
+        subreddit_items (int): the number of items to retrieve. None to retrieve all items.
+        time_filter (str): the time filter to apply to the subreddit (hour, day, week, month, year, all).
+
+    Returns:
+        pd.DataFrame: A DataFrame containing submissions metadata.
+    """
+
+    all_records = []
+    # for sub_id in tqdm(lst_ids, total=len(lst_ids), desc="Retrieving submissions"):
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    subreddit = reddit_client.subreddit(str(sub_id))
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    if not vars(subreddit).get('_fetched'):
+        subreddit._fetch()
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+
+    sub_record = parse_subreddit(reddit_client, subreddit)
+
+    if subreddit_filter == "top":
+        subreddit_selection = subreddit.top(limit=subreddit_items, time_filter=time_filter)
+    elif subreddit_filter == "hot":
+        subreddit_selection = subreddit.hot(limit=subreddit_items)
+    elif subreddit_filter == "controversial":
+        subreddit_selection = subreddit.controversial(limit=subreddit_items, time_filter=time_filter)
+    elif subreddit_filter == "new":
+        subreddit_selection = subreddit.new(limit=subreddit_items)
+    elif subreddit_filter == "gilded":
+        subreddit_selection = subreddit.gilded(limit=subreddit_items)
+    elif subreddit_filter == "rising":
+        subreddit_selection = subreddit.rising(limit=subreddit_items)
+    else:
+        return pd.DataFrame()
+
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    for i, submission in enumerate(subreddit_selection):
+        try:
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            if not vars(submission).get('_fetched'):
+                submission._fetch()
+                remaining, reset_timestamp, used = check_limit(reddit_client)
+
+            author = submission.author
+            author_record = parse_author(reddit_client, author)
+            submission_record = parse_submission(reddit_client, submission)
+
+            record = sub_record + author_record + submission_record
+            all_records.append(record)
+
+        except Exception as e:
+            print(e)
+
+    df = pd.DataFrame.from_records(all_records,
+                                   columns=["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
+                                            "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
+                                            "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
+                                            "submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
+                                            "submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_selftext", "submission_spoiler",
+                                            "submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
+                                            "submission_total_awards_received", "submission_view_count"]
+                                   )
+
+    return df
+
+
+def getComments(reddit_client: praw.Reddit, submission_id: str) -> pd.DataFrame:
+    """
+    Retrieves all comments from a submission ID.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        submission_id (str): a submission ID.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing comments metadata.
+
+    """
+
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    submission = reddit_client.submission(str(submission_id))
+    if not vars(submission).get('_fetched'):
+        submission._fetch()
+
+    submission.comments.replace_more(limit=None)
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+
+    all_records = []
+    for comment in tqdm(submission.comments.list(), total=len(submission.comments.list()), desc="Retrieving comments"):
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+        record = (submission_id,) + parse_comments(reddit_client, comment)
+        all_records.append(record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
+                                                         "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
+                                                         "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold"
+                                                         ])
+
+    return df
+
+def get_top_level_comments(reddit_client: praw.Reddit, submission_id: str) -> pd.DataFrame:
+    """
+    Retrieves top-level comments from a submission ID.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        submission_id (str): a submission ID.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing comments metadata.
+
+    """
+
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    submission = reddit_client.submission(str(submission_id))
+    if not vars(submission).get('_fetched'):
+        submission._fetch()
+
+    submission.comments.replace_more(limit=None)
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+
+    all_records = []
+    for comment in tqdm(submission.comments, total=len(submission.comments), desc="Retrieving comments"):
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+        record = (submission_id,) + parse_comments(reddit_client, comment)
+        all_records.append(record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
+                                                         "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
+                                                         "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold"
+                                                         ])
+
+    return df
+
+def parse_author(reddit_client: praw.Reddit, author: praw.models.Redditor) -> tuple:
+    """
+    Parses a Reddit author object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        author (praw.models.Redditor): The Reddit author object.
+
+    Returns:
+        tuple: A tuple containing the following information about the author:
+            - author_id: The ID of the author.
+            - author_name: The name of the author.
+            - author_link_karma: The link karma of the author.
+            - author_comment_karma: The comment karma of the author.
+            - author_created_utc: The creation date of the author.
+            - author_icon_img: The icon image of the author.
+            - author_is_employee: Indicates if the author is an employee.
+            - author_is_mod: Indicates if the author is a moderator.
+            - author_is_gold: Indicates if the author has Reddit Gold.
+    """
+
+    if author:
+        if not vars(author).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            author._fetch()
+        author_comment_karma = vars(author).get("comment_karma", None)
+
+        author_created_utc = vars(author).get("created_utc", None)
+        if author_created_utc:
+            author_created_utc = datetime.datetime.fromtimestamp(int(author_created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            author_created_utc = datetime.datetime(1970, 1, 1, 0, 0, 0)
+
+        author_icon_img = vars(author).get("icon_img", None)
+        author_is_employee = vars(author).get("is_employee", None)
+        author_is_mod = vars(author).get("is_mod", None)
+        author_is_gold = vars(author).get("is_gold", None)
+        author_link_karma = vars(author).get("link_karma", None)
+        author_name = vars(author).get("name", None)
+        author_id = vars(author).get("id", None)
+
+        record = (author_id, author_name, author_link_karma, author_comment_karma, author_created_utc, author_icon_img, author_is_employee, author_is_mod, author_is_gold)
+    else:
+        record = (None,) * 9
+    return record
+
+def parse_submission(reddit_client: praw.Reddit, submission: praw.models.Submission) -> tuple:
+    """
+    Parses a Reddit submission object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        submission (praw.models.Submission): The Reddit submission object.
+
+    Returns:
+        tuple: A tuple containing information about the submission.
+    """
+
+    if submission:
+        if not vars(submission).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            submission._fetch()
+        submission_id = vars(submission).get("id", None)
+        submission_title = vars(submission).get("title", None)
+        submission_name = vars(submission).get("name", None)
+        submission_created_utc = vars(submission).get("created_utc", None)
+        if submission_created_utc:
+            submission_created_utc = datetime.datetime.fromtimestamp(int(submission_created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            submission_created_utc = datetime.datetime(1970, 1, 1, 0, 0, 0)
+        submission_distinguished = vars(submission).get("distinguished", None)
+        submission_edited = vars(submission).get("edited", None)
+        submission_is_self = vars(submission).get("is_self", None)
+        submission_link_flair_template_id = vars(submission).get("link_flair_template_id", None)
+        submission_link_flair_text = vars(submission).get("link_flair_text", None)
+        submission_locked = vars(submission).get("locked", None)
+        submission_num_comments = vars(submission).get("num_comments", None)
+        submission_over_18 = vars(submission).get("over_18", None)
+        submission_permalink = vars(submission).get("permalink", None)
+        submission_selftext = vars(submission).get("selftext", None)
+        submission_spoiler = vars(submission).get("spoiler", None)
+        submission_stickied = vars(submission).get("stickied", None)
+        submission_upvote_ratio = vars(submission).get("upvote_ratio", None)
+        submission_url = vars(submission).get("url", None)
+        submission_downs = vars(submission).get("downs", None)
+        submission_num_crossposts = vars(submission).get("num_crossposts", None)
+        submission_num_reports = vars(submission).get("num_reports", None)
+        submission_score = vars(submission).get("score", None)
+        submission_total_awards_received = vars(submission).get("total_awards_received", None)
+        submission_view_count = vars(submission).get("view_count", None)
+        submission_ups = vars(submission).get("ups", None)
+        record = (submission_id, submission_title, submission_name, submission_created_utc, submission_distinguished, submission_edited, submission_is_self, submission_link_flair_template_id,
+                  submission_link_flair_text, submission_locked, submission_num_comments, submission_over_18, submission_permalink, submission_selftext, submission_spoiler,
+                  submission_stickied, submission_url, submission_upvote_ratio, submission_downs, submission_ups, submission_num_crossposts, submission_num_reports, submission_score,
+                  submission_total_awards_received, submission_view_count)
+    else:
+        record = (None,) * 25
+    return record
+
+
+def parse_subreddit(reddit_client: praw.Reddit, subreddit: praw.models.Subreddit) -> tuple:
+    """
+    Parses a Reddit subreddit object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        subreddit (praw.models.Subreddit): The Reddit subreddit object.
+
+    Returns:
+        tuple: A tuple containing information about the subreddit.
+    """
+
+    if subreddit:
+        if not vars(subreddit).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            subreddit._fetch()
+        subreddit_id = vars(subreddit).get("id", None)
+        name = vars(subreddit).get("name", None)
+        display_name = vars(subreddit).get("display_name", None)
+        subscribers = vars(subreddit).get("subscribers", None)
+        date = vars(subreddit).get("created_utc", None)
+        if date:
+            date = datetime.datetime.fromtimestamp(int(date)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            date = datetime.datetime(1970, 1, 1, 0, 0, 0)
+        description = vars(subreddit).get("description", None)
+        public_description = vars(subreddit).get("public_description", None)
+        over18 = vars(subreddit).get("over18", None)
+        spoilers_enabled = vars(subreddit).get("spoilers_enabled", None)
+        can_assign_user_flair = vars(subreddit).get("can_assign_user_flair", None)
+        can_assign_link_flair = vars(subreddit).get("can_assign_link_flair", None)
+        lang = vars(subreddit).get("lang", None)
+        active_user_count = vars(subreddit).get("active_user_count", None)
+
+        record = (subreddit_id, name, display_name, subscribers, date, description, public_description, over18, spoilers_enabled, can_assign_user_flair, can_assign_link_flair, lang, active_user_count)
+
+    else:
+        record = (None,) * 13
+    return record
+
+def parse_comments(reddit_client: praw.Reddit, comment: praw.models.Comment) -> tuple:
+    """
+    Parses a Reddit comment object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current Reddit client
+        comment (praw.models.Comment): The Reddit comment object.
+
+    Returns:
+        tuple: A tuple containing information about the comment.
+    """
+
+    if comment:
+        if not vars(comment).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            comment._fetch()
+        comment_id = vars(comment).get("id", None)
+        comment_body = vars(comment).get("body", None)
+        comment_date = vars(comment).get("created_utc", None)
+        if comment_date:
+            comment_date = datetime.datetime.fromtimestamp(int(comment_date)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            comment_date = datetime.datetime(1970, 1, 1, 0, 0, 0)
+        comment_distinguished = vars(comment).get("distinguished", None)
+        # comment_edited = vars(comment).get("edited", None)
+        comment_is_submitter = vars(comment).get("is_submitter", None)
+        comment_link_id = vars(comment).get("link_id", None)
+        comment_parent_id = vars(comment).get("parent_id", None)
+        comment_permalink = vars(comment).get("permalink", None)
+        comment_controversiality = vars(comment).get("controversiality", None)
+        comment_depth = vars(comment).get("depth", None)
+        # comment_downs = vars(comment).get("downs", None)
+        # comment_likes = vars(comment).get("likes", None)
+        # comment_num_reports = vars(comment).get("num_reports", None)
+        comment_score = vars(comment).get("score", None)
+        comment_total_awards_received = vars(comment).get("total_awards_received", None)
+        comment_ups = vars(comment).get("ups", None)
+        author = comment.author
+        author_record = parse_author(reddit_client, author)
+        record = (comment_id, comment_body, comment_date, comment_distinguished, comment_is_submitter, comment_link_id, comment_parent_id, comment_permalink, comment_controversiality, comment_depth, comment_score, comment_total_awards_received, comment_ups) + author_record
+    else:
+        record = (None,) * 22
+    return record
+
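
A usage sketch for the new `reddit.py` module, assuming the package is installed and you have credentials for a Reddit script app (all credential values below are placeholders):

```python
import praw
from opsci_toolbox.apis import reddit

# Placeholder credentials for a Reddit "script" app
reddit_client = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="opsci-toolbox-demo",
)

# Subreddit metadata for a list of subreddit names
df_subs = reddit.get_subreddit_info(reddit_client, ["python", "datascience"])

# Top submissions of the week for one subreddit
df_posts = reddit.getSubmissions(reddit_client, "python", subreddit_filter="top",
                                 subreddit_items=50, time_filter="week")

# Full comment tree for the first submission returned
df_comments = reddit.getComments(reddit_client, df_posts.loc[0, "submission_id"])
```

Note that `check_limit` is called throughout these helpers, so long collections pause automatically when fewer than 10 requests remain in the rate-limit window.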
@@ -44,6 +44,59 @@ def parse_mediafileid(message: Message) -> str:
     return None
 
 
+def parse_message_entities(messages : list) -> pd.DataFrame:
+    """
+    Parse Telegram message entities.
+
+    Args:
+        messages : a list of Telegram messages.
+
+    Returns:
+        pd.DataFrame : a DataFrame containing the parsed entities.
+    """
+    all_records = []
+    for message in messages:
+        raw_text = message.raw_text
+
+        data = message.to_dict()
+
+        message_id = data.get("id")
+
+        peer_id = data.get("peer_id", {})
+        if peer_id is None:
+            peer_id = {}
+        channel_id = parse_from(peer_id)
+
+        from_id = data.get("from_id", {})
+        if from_id is None:
+            from_id = {}
+        from_id = parse_from(from_id)
+        if from_id is None:
+            from_id = channel_id
+
+        grouped_id = data.get("grouped_id")
+        if not grouped_id:
+            grouped_id = message_id
+
+        message = data.get("message")
+
+        entities = data.get("entities", [])
+        for entity in entities:
+            entity_type = entity.get("_")
+            offset = entity.get("offset")
+            length = entity.get("length")
+            url = entity.get("url")
+            document_id = entity.get("document_id")
+
+            entity_record = (message_id, channel_id, from_id, grouped_id, message, raw_text, entity_type, offset, length, url, document_id)
+            all_records.append(entity_record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "raw_text", "entity_type", "offset", "length", "url", "document_id"])
+    return df
+
+
 def parse_messages(messages : list) -> pd.DataFrame:
     """
     Parse Telegram messages.
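
`parse_message_entities` flattens Telethon's entity annotations into one row per entity, keyed by `message_id`. A short sketch of filtering the resulting DataFrame, assuming `messages` is a list of messages already fetched with Telethon:

```python
df_entities = parse_message_entities(messages)

# entity_type comes from Telethon's to_dict() "_" field,
# e.g. "MessageEntityUrl", "MessageEntityTextUrl", "MessageEntityMention"
links = df_entities[df_entities["entity_type"].isin(["MessageEntityUrl", "MessageEntityTextUrl"])]

# "url" is only populated for MessageEntityTextUrl; plain MessageEntityUrl
# entities must be sliced out of the text. Telegram offsets/lengths are
# UTF-16 code units, so the slice is approximate for text containing emoji.
for row in links.itertuples():
    url = row.url or row.raw_text[row.offset:row.offset + row.length]
    print(row.message_id, url)
```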
@@ -57,6 +110,7 @@ def parse_messages(messages : list) -> pd.DataFrame:
 
     all_records = []
     for message in messages:
+        raw_text = message.raw_text
 
         data = message.to_dict()
 
@@ -112,10 +166,10 @@ def parse_messages(messages : list) -> pd.DataFrame:
         engagements = forwards + replies + total_reactions
 
 
-        post_record = (message_id, channel_id, from_id, grouped_id, date, message, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
+        post_record = (message_id, channel_id, from_id, grouped_id, date, message, raw_text, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
         all_records.append(post_record)
 
-    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "raw_text", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                          "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                          "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                          "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])
@@ -338,7 +392,7 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
     return df
 
-async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files") -> list:
+async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs: bool = False) -> list:
     """
     Retrieves messages from a Telegram channel by date.
 
@@ -362,9 +416,14 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
     try:
         await client.start(phone_number)
 
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
+
         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
+        if dl_thumbs:
+            current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
 
         # Get the message history
         messages = []
@@ -381,9 +440,21 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
 
+            if dl_thumbs:
+                media_fileid = parse_mediafileid(message)
+                if media_fileid:
+                    try:
+                        await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
+                    except Exception as e:
+                        print(e)
+
         df_exploded = parse_messages(messages)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
-        write_pickle(df_exploded, current_path_file, str(channel_username))
+        write_pickle(df_exploded, path_messages, str(channel_username))
+
+        df_entities = parse_message_entities(messages)
+        write_pickle(df_entities, path_entities, str(channel_username))
 
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
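
For context on the `thumb=-1` argument above: Telethon's `download_media` accepts a `thumb` index to download a thumbnail instead of the full media, and `-1` selects the last (largest) available size. A standalone sketch under that assumption, with a placeholder channel name:

```python
# Download the largest thumbnail of the most recent photo/video post;
# "some_channel" is a placeholder username and client is an authorized TelegramClient.
async def save_latest_thumb(client, channel="some_channel"):
    async for message in client.iter_messages(channel, limit=20):
        if message.photo or message.video:
            # thumb=-1 -> largest thumbnail size; returns the saved file path
            return await message.download_media(file="thumbs/", thumb=-1)
    return None
```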
@@ -396,7 +467,7 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
 
 async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids:list=[], path_file:str="files")-> list:
     """
-    Retrieves messages from a Telegram channel by date.
+    Retrieves messages from a Telegram channel by IDs.
 
     Args:
         client (TelegramClient): The Telegram client instance.
@@ -416,7 +487,8 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
     try:
         await client.start(phone_number)
 
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
 
         # Get the message history
         messages = []
@@ -433,7 +505,12 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
 
         df_exploded = parse_messages(messages)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
-        write_pickle(df_exploded, current_path_file, str(channel_username))
+        write_pickle(df_exploded, path_messages, str(channel_username))
+
+        df_entities = parse_message_entities(messages)
+        write_pickle(df_entities, path_entities, str(channel_username))
+
+
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))
@@ -466,7 +543,10 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
     try:
         await client.start(phone_number)
 
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
+
         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
 
@@ -489,10 +569,14 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
+        df_entities = parse_message_entities(messages)
+
         if channel_username:
-            write_pickle(df_exploded, current_path_file, str(search)+'_'+str(channel_username))
+            write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
+            write_pickle(df_entities, path_entities, str(search)+'_'+str(channel_username))
         else:
-            write_pickle(df_exploded, current_path_file, str(search))
+            write_pickle(df_exploded, path_messages, str(search))
+            write_pickle(df_entities, path_entities, str(search))
 
         return messages
     finally:
@@ -504,7 +588,9 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
     # Connect the client
     await client.start(phone_number)
 
-    current_path_file = create_dir(os.path.join(path_file, "messages"))
+    # current_path_file = create_dir(os.path.join(path_file, "messages"))
+    path_messages = create_dir(os.path.join(path_file, "messages"))
+    path_entities = create_dir(os.path.join(path_file, "entities"))
 
     comments = []
 
@@ -519,7 +605,10 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
 
     df_comments = parse_messages(comments)
     df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
-    write_pickle(df_comments, current_path_file, str(channel_entity)+"_"+str(message_id))
+    write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))
+
+    df_entities = parse_message_entities(comments)
+    write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))
 
     return comments
 
@@ -954,7 +1043,7 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
     # # Disconnect the client
     # await client.disconnect()
 
-async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str) -> dict:
+async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img : str) -> dict:
     """
     Retrieves information about a Telegram channel.
 
@@ -974,12 +1063,13 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
     try:
         await client.start(phone_number)
         channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
-        channel_full_info = channel_full_info.to_dict()
+        channel_full_info_json = channel_full_info.to_dict()
+        img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
     finally:
         # Disconnect the client
         await client.disconnect()
 
-    return channel_full_info
+    return channel_full_info_json
 
 
 def parse_channel(channel : dict) -> pd.DataFrame:
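
A usage sketch for the updated `get_channel_info`, which now also saves the channel's profile photo (the small version, since `download_big=False`) to `path_img`. The credentials below are placeholders; the returned dict mirrors Telethon's `ChatFull.to_dict()`:

```python
import asyncio
from opsci_toolbox.apis.telegram import get_channel_info

api_id = 12345                                 # placeholder, from https://my.telegram.org
api_hash = "0123456789abcdef0123456789abcdef"  # placeholder
phone_number = "+33600000000"                  # placeholder

info = asyncio.run(get_channel_info(api_id, api_hash, phone_number,
                                    "telegram", path_img="files/img"))
print(info["full_chat"]["about"])  # channel description
```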