opsci-toolbox 0.0.13__tar.gz → 0.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/PKG-INFO +1 -1
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/rapidapi_helpers.py +1 -2
- opsci_toolbox-0.0.14/opsci_toolbox/apis/reddit.py +407 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/telegram.py +106 -16
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/common.py +1 -1
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/dates.py +1 -1
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/nlp.py +85 -8
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/nlp_cuml.py +41 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/PKG-INFO +1 -1
- opsci_toolbox-0.0.14/opsci_toolbox.egg-info/dependency_links.txt +1 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/requires.txt +1 -1
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/setup.py +5 -2
- opsci_toolbox-0.0.13/opsci_toolbox/apis/reddit.py +0 -399
- opsci_toolbox-0.0.13/opsci_toolbox.egg-info/dependency_links.txt +0 -1
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/MANIFEST.in +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/README.md +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/__init__.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/__init__.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/webscraping.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/youtube_helpers.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/__init__.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/cv.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/dataviz.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/gliner.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/sna.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/sql.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/helpers/surreaction.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/lexicons/__init__.py +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/lexicons/stop_words_en.csv +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/lexicons/stop_words_fr.csv +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/SOURCES.txt +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox.egg-info/top_level.txt +0 -0
- {opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/setup.cfg +0 -0
{opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/rapidapi_helpers.py

@@ -1,5 +1,4 @@
 import requests
-from datetime import datetime
 import pandas as pd
 from tqdm import tqdm
 import re
@@ -495,7 +494,7 @@ def parse_list_entries(jsonl_data: list)-> pd.DataFrame:
         protected = data.get("protected", False)
         verified = data.get("verified", False)
         verified_type = data.get("verified_type", "")
-        entities = data.get("entities")
+        entities = data.get("entities", {})
         urls = [url.get("expanded_url","") for url in entities.get('url', {}).get("urls",[])]
         user_mentions = [um.get("screen_name","") for um in entities.get('description', {}).get('user_mentions', [])]
         user_mentions_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('user_mentions', [])]
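This one-line change is defensive: when a profile payload has no "entities" key, the old code bound entities to None and the chained .get() calls below raised AttributeError. A minimal sketch of the failure mode, using a hypothetical payload:

data = {"protected": False, "verified": True}   # hypothetical profile with no "entities" key

entities = data.get("entities")        # old behaviour: entities is None
# entities.get('url', {})              # AttributeError: 'NoneType' object has no attribute 'get'

entities = data.get("entities", {})    # new behaviour: empty dict falls through safely
urls = [u.get("expanded_url", "") for u in entities.get('url', {}).get("urls", [])]
print(urls)                            # [] instead of a crash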
opsci_toolbox-0.0.14/opsci_toolbox/apis/reddit.py (new file)

@@ -0,0 +1,407 @@
+import praw
+import datetime
+import pandas as pd
+from tqdm import tqdm
+import time
+
+def check_limit(reddit_client : praw.Reddit) -> tuple:
+    """
+    Check Reddit Client rate limit and wait if necessary.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+
+    Returns:
+        tuple containing the following information:
+            - remaining: remaining queries.
+            - reset_timestamp: time before reset.
+            - used: number of sent queries.
+
+    """
+    headers = reddit_client.auth.limits
+    remaining = headers.get('remaining')
+    reset_timestamp = headers.get('reset_timestamp')
+    used = headers.get('used')
+
+    if remaining and reset_timestamp :
+        if remaining <= 10:
+            # Calculate the time to wait until reset
+            current_time = time.time()
+            wait_time = reset_timestamp - current_time
+
+            if wait_time > 0:
+                # Convert wait_time to seconds and wait
+                print(f"Waiting for {wait_time:.2f} seconds until the next reset...")
+                time.sleep(wait_time)
+            else:
+                print("Reset time is in the past. No need to wait.")
+        # else:
+        #     print(f"{remaining} requests remaining. No need to wait.")
+    else :
+        print("Missing required header information. Cannot determine wait time.")
+
+    return remaining, reset_timestamp, used
+
+
+def get_subreddit_info(reddit_client : praw.Reddit, lst_ids: list) -> pd.DataFrame:
+    """
+    Retrieves information about subreddits based on a list of subreddit IDs.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        lst_ids (list): A list of subreddit IDs.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing the following information for each subreddit:
+            - subreddit_id: The ID of the subreddit.
+            - name: The name of the subreddit.
+            - display_name: The display name of the subreddit.
+            - subscribers: The number of subscribers to the subreddit.
+            - date: The creation date of the subreddit.
+            - description: The description of the subreddit.
+            - public_description: The public description of the subreddit.
+            - over18: Indicates if the subreddit is for users over 18 years old.
+            - spoilers_enabled: Indicates if spoilers are enabled in the subreddit.
+            - can_assign_user_flair: Indicates if users can assign their own flair in the subreddit.
+            - can_assign_link_flair: Indicates if users can assign flair to links in the subreddit.
+    """
+    all_records = []
+    for reddit_id in lst_ids:
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+        subreddit = reddit_client.subreddit(str(reddit_id))
+        record = parse_subreddit(reddit_client, subreddit)
+
+        all_records.append(record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18", "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count"])
+    return df
+
+
+def getSubmissions(reddit_client : praw.Reddit, sub_id : str, subreddit_filter : str, subreddit_items : int, time_filter : str) -> pd.DataFrame:
+    """
+    Retrieves submission from a subreddit ID.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        sub_id (str): a subreddit ID.
+        subreddit_filter (str): the filter to apply to the subreddit (top, hot, new, controversial).
+        subreddit_items (int): the number of items to retrieve. None to retrieve all items.
+        time_filter (str): the time filter to apply to the subreddit (hour, day, week, month, year, all).
+
+    Returns:
+        pd.DataFrame: A DataFrame containing submissions metadata.
+    """
+
+    all_records = []
+    # for sub_id in tqdm(lst_ids, total=len(lst_ids), desc="Récupération des soumissions"):
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    subreddit = reddit_client.subreddit(str(sub_id))
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    if not vars(subreddit).get('_fetched'):
+        subreddit._fetch()
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+
+    sub_record = parse_subreddit(reddit_client, subreddit)
+
+    if subreddit_filter == "top":
+        subreddit_selection = subreddit.top(limit=subreddit_items, time_filter=time_filter)
+    elif subreddit_filter == "hot":
+        subreddit_selection = subreddit.hot(limit=subreddit_items)
+    elif subreddit_filter == "controversial":
+        subreddit_selection = subreddit.controversial(limit=subreddit_items, time_filter=time_filter)
+    elif subreddit_filter == "new":
+        subreddit_selection = subreddit.new(limit=subreddit_items)
+    elif subreddit_filter == "gilded":
+        subreddit_selection = subreddit.gilded(limit=subreddit_items)
+    elif subreddit_filter == "rising":
+        subreddit_selection = subreddit.rising(limit=subreddit_items)
+    else:
+        return pd.DataFrame()
+
+
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    for i, submission in enumerate(subreddit_selection):
+        try:
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            if not vars(submission).get('_fetched'):
+                submission._fetch()
+                remaining, reset_timestamp, used = check_limit(reddit_client)
+
+            author = submission.author
+            author_record = parse_author(reddit_client, author)
+            submission_record = parse_submission(reddit_client, submission)
+
+            record = sub_record + author_record + submission_record
+            all_records.append(record)
+
+        except Exception as e:
+            pass
+            print(e)
+
+    df = pd.DataFrame.from_records(all_records,
+                columns = ["subreddit_id", "subreddit_name", "subreddit_display_name", "subreddit_subscribers", "subreddit_date", "subreddit_description", "subreddit_public_description", "subreddit_over18",
+                "subreddit_spoilers_enabled", "subreddit_can_assign_user_flair", "subreddit_can_assign_link_flair", "subreddit_lang", "subreddit_active_user_count",
+                "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold",
+                "submission_id", "submission_title", "submission_name", "submission_created_utc", "submission_distinguished", "submission_edited", "submission_is_self", "submission_link_flair_template_id",
+                "submission_link_flair_text", "submission_locked", "submission_num_comments", "submission_over_18", "submission_permalink", "submission_selftext", "submission_spoiler",
+                "submission_stickied", "submission_url", "submission_upvote_ratio", "submission_downs", "submission_ups", "submission_num_crossposts", "submission_num_reports", "submission_score",
+                "submission_total_awards_received", "submission_view_count"]
+                )
+
+    return df
+
+
+
+def getComments(reddit_client : praw.Reddit, submission_id : str) -> pd.DataFrame:
+    """
+    Retrieves all comments from a submission ID.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        submission_id (str): a submission ID.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing comments metadata.
+
+    """
+
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    submission = reddit_client.submission(str(submission_id))
+    if not vars(submission).get('_fetched'):
+        submission._fetch()
+
+    submission.comments.replace_more(limit=None)
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+
+    all_records = []
+    for comment in tqdm(submission.comments.list(), total=len(submission.comments.list()), desc="Récupération des commentaires"):
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+        record = (submission_id,) + parse_comments(reddit_client, comment)
+        all_records.append(record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
+                                                         "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
+                                                         "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold"
+                                                         ])
+
+    return df
+
+def get_top_level_comments(reddit_client : praw.Reddit, submission_id : str) -> pd.DataFrame:
+    """
+    Retrieves top level comments from a submission ID.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        submission_id (str): a submission ID.
+
+    Returns:
+        pd.DataFrame: A DataFrame containing comments metadata.
+
+    """
+
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+    submission = reddit_client.submission(str(submission_id))
+    if not vars(submission).get('_fetched'):
+        submission._fetch()
+
+    submission.comments.replace_more(limit=None)
+    remaining, reset_timestamp, used = check_limit(reddit_client)
+
+    all_records = []
+    for comment in tqdm(submission.comments, total=len(submission.comments), desc="Récupération des commentaires"):
+        remaining, reset_timestamp, used = check_limit(reddit_client)
+        record = (submission_id,) + parse_comments(reddit_client, comment)
+        all_records.append(record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["submission_id", "comment_id", "comment_body", "comment_date", "comment_distinguished", "comment_is_submitter", "comment_link_id", "comment_parent_id", "comment_permalink",
+                                                         "comment_controversiality", "comment_depth", "comment_score", "comment_total_awards_received", "comment_ups",
+                                                         "author_id", "author_name", "author_link_karma", "author_comment_karma", "author_created_utc", "author_icon_img", "author_is_employee", "author_is_mod", "author_is_gold"
+                                                         ])
+
+    return df
+
+def parse_author(reddit_client : praw.Reddit, author : praw.models.Redditor) -> tuple:
+    """
+    Parses a Reddit author object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        author (praw.models.Redditor): The Reddit author object.
+
+    Returns:
+        tuple: A tuple containing the following information about the author:
+            - author_id: The ID of the author.
+            - author_name: The name of the author.
+            - author_link_karma: The link karma of the author.
+            - author_comment_karma: The comment karma of the author.
+            - author_created_utc: The creation date of the author.
+            - author_icon_img: The icon image of the author.
+            - author_is_employee: Indicates if the author is an employee.
+            - author_is_mod: Indicates if the author is a moderator.
+            - author_is_gold: Indicates if the author has Reddit Gold.
+    """
+
+    if author:
+        if not vars(author).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            author._fetch()
+        author_comment_karma = vars(author).get("comment_karma", None)
+
+        author_created_utc = vars(author).get("created_utc", None)
+        if author_created_utc:
+            author_created_utc = datetime.datetime.fromtimestamp(int(author_created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            author_created_utc = datetime.datetime(1970,1,1,0,0,0)
+
+        author_icon_img = vars(author).get("icon_img", None)
+        author_is_employee = vars(author).get("is_employee", None)
+        author_is_mod = vars(author).get("is_mod", None)
+        author_is_gold = vars(author).get("is_gold", None)
+        author_link_karma = vars(author).get("link_karma", None)
+        author_name = vars(author).get("name", None)
+        author_id = vars(author).get("id", None)
+
+        record = (author_id, author_name, author_link_karma, author_comment_karma, author_created_utc, author_icon_img, author_is_employee, author_is_mod, author_is_gold)
+    else:
+        record = (None, None, None, None, None, None, None, None, None)
+    return record
+
+def parse_submission(reddit_client : praw.Reddit, submission : praw.models.Submission) -> tuple:
+    """
+    Parses a Reddit submission object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        submission (praw.models.Submission): The Reddit submission object.
+
+    Returns:
+        tuple: A tuple containing information about the submission.
+    """
+
+    if submission :
+        if not vars(submission).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            submission._fetch()
+        submission_id = vars(submission).get("id", None)
+        submission_title = vars(submission).get("title", None)
+        submission_name = vars(submission).get("name", None)
+        submission_created_utc = vars(submission).get("created_utc", None)
+        if submission_created_utc:
+            submission_created_utc = datetime.datetime.fromtimestamp(int(submission_created_utc)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            submission_created_utc = datetime.datetime(1970,1,1,0,0,0)
+        submission_distinguished = vars(submission).get("distinguished", None)
+        submission_edited = vars(submission).get("edited", None)
+        submission_is_self = vars(submission).get("is_self", None)
+        submission_link_flair_template_id = vars(submission).get("link_flair_template_id", None)
+        submission_link_flair_text = vars(submission).get("link_flair_text", None)
+        submission_locked = vars(submission).get("locked", None)
+        submission_num_comments = vars(submission).get("num_comments", None)
+        submission_over_18 = vars(submission).get("over_18", None)
+        submission_permalink = vars(submission).get("permalink", None)
+        submission_selftext = vars(submission).get("selftext", None)
+        submission_spoiler = vars(submission).get("spoiler", None)
+        submission_stickied = vars(submission).get("stickied", None)
+        submission_upvote_ratio = vars(submission).get("upvote_ratio", None)
+        submission_url = vars(submission).get("url", None)
+        submission_downs = vars(submission).get("downs", None)
+        submission_num_crossposts = vars(submission).get("num_crossposts", None)
+        submission_num_reports = vars(submission).get("num_reports", None)
+        submission_score = vars(submission).get("score", None)
+        submission_total_awards_received = vars(submission).get("total_awards_received", None)
+        submission_view_count = vars(submission).get("view_count", None)
+        submission_ups = vars(submission).get("ups", None)
+        record = (submission_id, submission_title, submission_name, submission_created_utc, submission_distinguished, submission_edited, submission_is_self, submission_link_flair_template_id,
+                  submission_link_flair_text, submission_locked, submission_num_comments, submission_over_18, submission_permalink, submission_selftext, submission_spoiler,
+                  submission_stickied, submission_url, submission_upvote_ratio, submission_downs, submission_ups, submission_num_crossposts, submission_num_reports, submission_score,
+                  submission_total_awards_received, submission_view_count)
+    else:
+        record = (None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
+    return record
+
+
+def parse_subreddit(reddit_client : praw.Reddit, subreddit : praw.models.Subreddit) -> tuple:
+    """
+    Parses a Reddit subreddit object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        subreddit (praw.models.Subreddit): The Reddit subreddit object.
+
+    Returns:
+        tuple: A tuple containing information about the subreddit.
+    """
+
+    if subreddit:
+        if not vars(subreddit).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            subreddit._fetch()
+        subreddit_id = vars(subreddit).get("id", None)
+        name = vars(subreddit).get("name", None)
+        display_name = vars(subreddit).get("display_name", None)
+        subscribers = vars(subreddit).get("subscribers", None)
+        subscribers = vars(subreddit).get("subscribers", None)
+        date = vars(subreddit).get("created_utc", None)
+        if date:
+            date = datetime.datetime.fromtimestamp(int(date)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            date = datetime.datetime(1970,1,1,0,0,0)
+        description = vars(subreddit).get("description", None)
+        public_description = vars(subreddit).get("public_description", None)
+        over18 = vars(subreddit).get("over18", None)
+        spoilers_enabled = vars(subreddit).get("spoilers_enabled", None)
+        can_assign_user_flair = vars(subreddit).get("can_assign_user_flair", None)
+        can_assign_link_flair = vars(subreddit).get("can_assign_link_flair", None)
+        lang = vars(subreddit).get("lang", None)
+        active_user_count = vars(subreddit).get("active_user_count", None)
+
+        record = (subreddit_id, name, display_name, subscribers, date, description, public_description, over18, spoilers_enabled, can_assign_user_flair, can_assign_link_flair, lang, active_user_count)
+
+    else:
+        record = (None, None, None, None, None, None, None, None, None, None, None, None, None)
+    return record
+
+def parse_comments(reddit_client : praw.Reddit, comment : praw.models.Comment) -> tuple:
+    """
+    Parses a Reddit comment object and extracts relevant information.
+
+    Args:
+        reddit_client (praw.Reddit): current reddit client
+        comment (praw.models.Comment): The Reddit comment object.
+
+    Returns:
+        tuple: A tuple containing information about the comment.
+    """
+
+    if comment:
+        if not vars(comment).get('_fetched'):
+            remaining, reset_timestamp, used = check_limit(reddit_client)
+            comment._fetch()
+        comment_id = vars(comment).get("id", None)
+        comment_body = vars(comment).get("body", None)
+        comment_date = vars(comment).get("created_utc", None)
+        if comment_date:
+            comment_date = datetime.datetime.fromtimestamp(int(comment_date)).replace(tzinfo=datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
+        else:
+            comment_date = datetime.datetime(1970,1,1,0,0,0)
+        comment_distinguished = vars(comment).get("distinguished", None)
+        # comment_edited = vars(comment).get("edited", None)
+        comment_is_submitter = vars(comment).get("is_submitter", None)
+        comment_link_id = vars(comment).get("link_id", None)
+        comment_parent_id = vars(comment).get("parent_id", None)
+        comment_permalink = vars(comment).get("permalink", None)
+        comment_controversiality = vars(comment).get("controversiality", None)
+        comment_depth = vars(comment).get("depth", None)
+        # comment_downs = vars(comment).get("downs", None)
+        # comment_likes = vars(comment).get("likes", None)
+        # comment_num_reports = vars(comment).get("num_reports", None)
+        comment_score = vars(comment).get("score", None)
+        comment_total_awards_received = vars(comment).get("total_awards_received", None)
+        comment_ups = vars(comment).get("ups", None)
+        author = comment.author
+        author_record = parse_author(reddit_client, author)
+        record = (comment_id, comment_body, comment_date, comment_distinguished, comment_is_submitter, comment_link_id, comment_parent_id, comment_permalink, comment_controversiality, comment_depth, comment_score, comment_total_awards_received, comment_ups) + author_record
+    else:
+        record = (None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
+    return record
+
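The new reddit.py module wraps PRAW with rate-limit-aware collectors: each function calls check_limit() before hitting the API and sleeps when fewer than ten requests remain in the window. A minimal usage sketch, assuming valid Reddit API credentials (the client_id, client_secret and user_agent values are placeholders):

import praw
from opsci_toolbox.apis import reddit

# Placeholder credentials: substitute your own Reddit app values.
client = praw.Reddit(client_id="CLIENT_ID",
                     client_secret="CLIENT_SECRET",
                     user_agent="opsci-toolbox-example")

# Metadata for a list of subreddits.
df_subs = reddit.get_subreddit_info(client, ["python"])

# Top submissions of the week for one subreddit.
df_posts = reddit.getSubmissions(client, "python", subreddit_filter="top",
                                 subreddit_items=10, time_filter="week")

# Full comment tree for the first collected submission.
df_comments = reddit.getComments(client, df_posts.loc[0, "submission_id"])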
{opsci_toolbox-0.0.13 → opsci_toolbox-0.0.14}/opsci_toolbox/apis/telegram.py

@@ -44,6 +44,59 @@ def parse_mediafileid(message: Message) -> str:
     return None
 
 
+def parse_message_entities(messages : list) -> pd.DataFrame:
+    """
+    Parse Telegram messages entities.
+
+    Args:
+        messages : a list of Telegram messages.
+
+    Returns:
+        pd.DataFrame : a DataFrame containing the parsed entities.
+    """
+    all_records = []
+    for message in messages:
+        raw_text = message.raw_text
+
+        data = message.to_dict()
+
+        message_id = data.get("id")
+
+        peer_id = data.get("peer_id", {})
+        if peer_id is None:
+            peer_id = {}
+        channel_id = parse_from(peer_id)
+
+        from_id = data.get("from_id", {})
+        if from_id is None :
+            from_id = {}
+        from_id = parse_from(from_id)
+        if from_id is None:
+            from_id = channel_id
+
+        grouped_id = data.get("grouped_id")
+        if grouped_id:
+            grouped_id = grouped_id
+        else:
+            grouped_id = message_id
+
+        message = data.get("message")
+
+        entities = data.get("entities", [])
+        for entity in entities:
+            entity_type = entity.get("_")
+            offset = entity.get("offset")
+            length = entity.get("length")
+            url = entity.get("url")
+            document_id = entity.get("document_id")
+
+            entity_record = (message_id, channel_id, from_id, grouped_id, message, raw_text, entity_type, offset, length, url, document_id)
+            all_records.append(entity_record)
+
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "raw_text", "entity_type", "offset", "length", "url", "document_id"])
+    return df
+
+
 def parse_messages(messages : list) -> pd.DataFrame:
     """
     Parse Telegram messages.
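The new parse_message_entities helper flattens each message's entities list into one row per entity (entity type, offset, length, plus url and document_id when present), alongside the message text and raw_text. A minimal driving sketch, assuming an authenticated Telethon session (api_id and api_hash are placeholders):

from telethon import TelegramClient
from opsci_toolbox.apis.telegram import parse_message_entities

async def dump_entities(channel: str):
    # Placeholder credentials: substitute your own Telegram API values.
    async with TelegramClient("session", api_id=12345, api_hash="API_HASH") as client:
        messages = [m async for m in client.iter_messages(channel, limit=50)]
    return parse_message_entities(messages)  # one row per entity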
@@ -57,6 +110,7 @@ def parse_messages(messages : list) -> pd.DataFrame:
 
     all_records = []
     for message in messages:
+        raw_text = message.raw_text
 
         data = message.to_dict()
 
@@ -112,10 +166,10 @@ def parse_messages(messages : list) -> pd.DataFrame:
         engagements = forwards + replies + total_reactions
 
 
-        post_record = (message_id, channel_id, from_id, grouped_id, date, message, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
+        post_record = (message_id, channel_id, from_id, grouped_id, date, message, raw_text, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
         all_records.append(post_record)
 
-    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "raw_text", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                          "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                          "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                          "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])
@@ -338,7 +392,7 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
     return df
 
-async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files") -> list:
+async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
     """
     Retrieves messages from a Telegram channel by date.
 
@@ -362,9 +416,14 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
     try:
         await client.start(phone_number)
 
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
+
         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
+        if dl_thumbs:
+            current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
 
         # Get the message history
         messages = []
@@ -381,9 +440,21 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
 
+            if dl_thumbs:
+                media_fileid = parse_mediafileid(message)
+                if media_fileid:
+                    try:
+                        await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
+                    except Exception as e:
+                        pass
+                        print(e)
+
         df_exploded = parse_messages(messages)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
-        write_pickle(df_exploded, current_path_file, str(channel_username))
+        write_pickle(df_exploded, path_messages, str(channel_username))
+
+        df_entities = parse_message_entities(messages)
+        write_pickle(df_entities, path_entities, str(channel_username))
 
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
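With the new dl_thumbs flag, the date-based collector also saves message thumbnails (in Telethon, thumb=-1 requests the largest available thumbnail) and now writes an entities pickle next to the messages pickle. A minimal call sketch (credentials and channel are placeholders):

import asyncio
from datetime import datetime
from telethon import TelegramClient
from opsci_toolbox.apis.telegram import get_messages_by_date

async def main():
    # Placeholder credentials: substitute your own Telegram API values.
    client = TelegramClient("session", api_id=12345, api_hash="API_HASH")
    await get_messages_by_date(client, "+33600000000", "some_channel",
                               offset_date=datetime(2024, 1, 1),
                               dl_thumbs=True, path_file="files")
    # Pickles land under files/messages/ and files/entities/;
    # thumbnails land under files/thumbs/some_channel/.

asyncio.run(main())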
@@ -396,7 +467,7 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
 
 async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids:list=[], path_file:str="files")-> list:
     """
-    Retrieves messages from a Telegram channel by
+    Retrieves messages from a Telegram channel by IDS.
 
     Args:
         client (TelegramClient): The Telegram client instance.
@@ -416,7 +487,8 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
     try:
         await client.start(phone_number)
 
-
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
 
         # Get the message history
         messages = []
@@ -433,7 +505,12 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
 
         df_exploded = parse_messages(messages)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
-        write_pickle(df_exploded, current_path_file, str(channel_username))
+        write_pickle(df_exploded, path_messages, str(channel_username))
+
+        df_entities = parse_message_entities(messages)
+        write_pickle(df_entities, path_entities, str(channel_username))
+
+
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))
@@ -466,7 +543,10 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
     try:
         await client.start(phone_number)
 
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
+
         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
 
@@ -489,10 +569,14 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
+        df_entities = parse_message_entities(messages)
+
         if channel_username:
-            write_pickle(df_exploded, current_path_file, str(search)+'_'+str(channel_username))
+            write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
+            write_pickle(df_entities, path_entities, str(search)+'_'+str(channel_username))
         else:
-            write_pickle(df_exploded, current_path_file, str(search))
+            write_pickle(df_exploded, path_messages, str(search))
+            write_pickle(df_entities, path_entities, str(search))
 
         return messages
     finally:
@@ -504,7 +588,9 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
     # Connect the client
     await client.start(phone_number)
 
-    current_path_file = create_dir(os.path.join(path_file, "messages"))
+    # current_path_file = create_dir(os.path.join(path_file, "messages"))
+    path_messages = create_dir(os.path.join(path_file, "messages"))
+    path_entities = create_dir(os.path.join(path_file, "entities"))
 
     comments = []
 
@@ -519,7 +605,10 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
 
     df_comments = parse_messages(comments)
     df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
-    write_pickle(df_comments, current_path_file, str(channel_entity)+"_"+str(message_id))
+    write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))
+
+    df_entities = parse_message_entities(comments)
+    write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))
 
     return comments
 
@@ -954,7 +1043,7 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
     # # Disconnect the client
     # await client.disconnect()
 
-async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str) -> dict:
+async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
     """
     Retrieves information about a Telegram channel.
 
@@ -974,12 +1063,13 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
     try:
         await client.start(phone_number)
         channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
-
+        channel_full_info_json = channel_full_info.to_dict()
+        img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
     finally:
         # Disconnect the client
         await client.disconnect()
 
-    return
+    return channel_full_info_json
 
 
 def parse_channel(channel : dict) -> pd.DataFrame:
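get_channel_info now takes a path_img argument, downloads the channel's profile photo there, and returns the full channel payload as a dict instead of None. A minimal call sketch (credentials and channel are placeholders; the "full_chat"/"about" keys follow Telethon's ChatFull.to_dict() layout):

import asyncio
from opsci_toolbox.apis.telegram import get_channel_info

async def main():
    # Placeholder credentials: substitute your own Telegram API values.
    info = await get_channel_info(api_id=12345, api_hash="API_HASH",
                                  phone_number="+33600000000",
                                  channel_username="some_channel",
                                  path_img="files/img")
    print(info.get("full_chat", {}).get("about"))  # channel description, if present

asyncio.run(main())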
|