opsci-toolbox 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/reddit.py +399 -0
- opsci_toolbox/apis/telegram.py +1035 -0
- opsci_toolbox/apis/webscraping.py +75 -0
- opsci_toolbox/helpers/common.py +176 -4
- opsci_toolbox/helpers/dataviz.py +184 -26
- opsci_toolbox/helpers/dates.py +46 -0
- opsci_toolbox/helpers/gliner.py +88 -0
- opsci_toolbox/helpers/nlp.py +256 -8
- opsci_toolbox/helpers/nlp_cuml.py +3 -3
- opsci_toolbox/helpers/sna.py +1 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/METADATA +4 -1
- opsci_toolbox-0.0.13.dist-info/RECORD +25 -0
- opsci_toolbox-0.0.11.dist-info/RECORD +0 -22
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1035 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from telethon.sync import TelegramClient
|
3
|
+
from telethon.tl.functions.channels import GetFullChannelRequest
|
4
|
+
import pandas as pd
|
5
|
+
# from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
|
6
|
+
from opsci_toolbox.helpers.common import create_dir, write_pickle
|
7
|
+
import os
|
8
|
+
import nest_asyncio
|
9
|
+
from telethon.tl.types import Message
|
10
|
+
from typing import Optional
|
11
|
+
|
12
|
+
nest_asyncio.apply()
|
13
|
+
|
14
|
+
|
15
|
+
def parse_mediafileid(message: Message) -> Optional[str]:
    """
    Build the media file ID of a Telegram message.

    The ID is assembled as ``"<channel_id>_<grouped_id>_<media_id>"`` and is
    used as the filename when media attachments are downloaded.

    Args:
        message (telethon.tl.types.Message): The Telegram message.

    Returns:
        Optional[str]: The media file ID if the message carries a supported
        media (photo or document), None otherwise.
    """
    data = message.to_dict()
    media_id = parse_media_id(data.get("media", {}))
    if not media_id:
        # Webpage previews and unsupported media have no downloadable file.
        return None

    message_id = data.get("id")
    # peer_id can be present in the dict but explicitly None.
    peer_id = data.get("peer_id") or {}
    channel_id = parse_from(peer_id)
    # Album items share a grouped_id; single-media posts fall back to
    # their own message id so the file id stays unique.
    grouped_id = data.get("grouped_id") or message_id

    return str(channel_id) + '_' + str(grouped_id) + '_' + str(media_id)
|
45
|
+
|
46
|
+
|
47
|
+
def parse_messages(messages : list) -> pd.DataFrame:
    """
    Parse Telegram messages into a flat DataFrame (one row per message).

    Args:
        messages : a list of Telegram messages (telethon Message objects).

    Returns:
        pd.DataFrame : a DataFrame containing the parsed information from the
        Telegram messages (ids, text, counters, reactions, reply/forward
        metadata and media metadata).
    """
    all_records = []
    for message in messages:

        data = message.to_dict()

        message_id = data.get("id")

        # peer_id / from_id may be present in the dict but explicitly None.
        channel_id = parse_from(data.get("peer_id") or {})

        from_id = parse_from(data.get("from_id") or {})
        if from_id is None:
            # Channel posts carry no author: attribute them to the channel.
            from_id = channel_id

        date = data.get("date")
        text = data.get("message")
        # Telethon may return the key with an explicit None value, so the
        # dict-get default alone is not enough (same guard for both counters).
        views = data.get("views", 0) or 0
        forwards = data.get("forwards", 0) or 0

        # replies is a nested object; default to 0 when absent or empty.
        replies = (data.get("replies") or {}).get("replies") or 0

        # Album items share a grouped_id; standalone posts fall back to
        # their own message id.
        grouped_id = data.get("grouped_id") or message_id

        reactions = data.get("reactions") or {}
        if reactions:
            total_reactions, reactions_details = parse_reactions(reactions)
        else:
            total_reactions, reactions_details = 0, None

        reply_to_message_id, reply_to_channel_id = parse_reply(data.get("reply_to", {}))

        media_record = parse_media(data.get("media", {}))
        fwd_record = parse_fwd(data.get("fwd_from") or {})

        # Simple engagement proxy: shares + comments + reactions.
        engagements = forwards + replies + total_reactions

        post_record = (message_id, channel_id, from_id, grouped_id, date, text, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
        all_records.append(post_record)

    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                         "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                         "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                         "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])

    return df
|
124
|
+
|
125
|
+
def parse_reply(reply: dict) -> tuple:
    """
    Extract reply metadata from the ``reply_to`` object of a Telegram message.

    Args:
        reply : a dict corresponding to the reply object (may be empty/None).

    Returns:
        Tuple of (reply_to_message_id, reply_to_channel_id); both are None
        when the message is not a reply.
    """
    if not reply:
        return None, None

    msg_id = reply.get("reply_to_msg_id")
    peer = reply.get("reply_to_peer_id")
    if peer is None:
        peer = {}
    return msg_id, parse_from(peer)
|
144
|
+
|
145
|
+
|
146
|
+
|
147
|
+
def parse_from(data : dict) -> Optional[int]:
    """
    Extract the numeric ID from a Telegram peer object.

    Args:
        data : a dict corresponding to the peer object (PeerChannel / PeerUser).

    Returns:
        Optional[int] : the channel_id for PeerChannel, the user_id for
        PeerUser, None for missing or unrecognised peer types.
    """
    peer_type = data.get("_")
    if peer_type == "PeerChannel":
        return data.get('channel_id')
    if peer_type == "PeerUser":
        return data.get('user_id')
    if peer_type:
        # Unknown peer variant (e.g. PeerChat): trace it for debugging.
        print("PEER not referenced", peer_type)
    return None
|
168
|
+
|
169
|
+
|
170
|
+
def parse_fwd(forward : dict) -> tuple:
    """
    Extract forward metadata from the ``fwd_from`` object of a Telegram message.

    Args:
        forward : a dict corresponding to the forward object (may be empty).

    Returns:
        tuple of (date, origin peer id, origin channel post id, origin name);
        all None when the message is not a forward.
    """
    if not forward:
        return None, None, None, None

    origin_peer = forward.get("from_id")
    if origin_peer is None:
        origin_peer = {}

    return (
        forward.get("date"),
        parse_from(origin_peer),
        forward.get("channel_post"),
        forward.get("from_name"),
    )
|
192
|
+
|
193
|
+
def parse_reactions(reactions: dict) -> tuple:
    """
    Aggregate reactions of a Telegram message.

    Args:
        reactions : a dict corresponding to the reactions object.

    Returns:
        tuple: total number of reactions, plus a list of
        (emoticon_or_document_id, count) pairs, one per reaction type.
    """
    total = 0
    details = []

    for entry in reactions.get("results", []):
        count = entry.get("count", 0)
        total += count
        reaction = entry.get("reaction", {})
        # Standard emoji reactions expose an emoticon; custom emoji only a
        # document_id, which is stringified so both share one column type.
        if reaction.get("_") == "ReactionEmoji":
            key = reaction.get("emoticon", "")
        else:
            key = str(reaction.get("document_id", ""))
        details.append((key, count))

    return total, details
|
220
|
+
|
221
|
+
def parse_media(media : dict) -> tuple:
    """
    Extract media metadata from a Telegram message.

    Supported media types: MessageMediaPhoto, MessageMediaDocument and
    MessageMediaWebPage.

    Args:
        media : a dict corresponding to the media object (may be empty).

    Returns:
        tuple: (media_id, media_date, media_mime_type, media_size,
        media_filename, duration, width, height, webpage_id, webpage_url,
        webpage_type, webpage_site_name, webpage_title, webpage_description).
    """
    # Defaults: identifiers/strings are None, numeric sizes default to 0.
    media_id = None
    media_date = None
    media_mime_type = None
    media_size = 0
    media_filename = None
    duration = 0
    width = 0
    height = 0
    webpage_id = None
    webpage_url = None
    webpage_type = None
    webpage_site_name = None
    webpage_title = None
    webpage_description = None

    media_type = media.get("_") if media else None

    if media_type == "MessageMediaPhoto":
        photo = media.get("photo", {})
        media_id = photo.get("id")
        media_date = photo.get("date")
        media_mime_type = "photo"

    elif media_type == "MessageMediaDocument":
        document = media.get("document", {})
        media_id = document.get("id")
        media_date = document.get("date")
        media_mime_type = document.get("mime_type")
        media_size = document.get("size")
        for attribute in document.get("attributes", []):
            attr_type = attribute.get("_")
            if attr_type == "DocumentAttributeFilename":
                media_filename = str(attribute.get("file_name", ""))
            elif attr_type == "DocumentAttributeVideo":
                duration = attribute.get("duration")
                width = attribute.get("w")
                height = attribute.get("h")

    elif media_type == "MessageMediaWebPage":
        webpage = media.get("webpage", {})
        webpage_id = webpage.get("id")
        webpage_url = webpage.get("url")
        webpage_type = webpage.get("type")
        webpage_site_name = webpage.get("site_name")
        webpage_title = webpage.get("title")
        webpage_description = webpage.get("description")

    elif media:
        # Unsupported media variant: trace it for debugging.
        print("MEDIA not referenced", media.get('_'))

    return (media_id, media_date, media_mime_type, media_size, media_filename,
            duration, width, height, webpage_id, webpage_url, webpage_type,
            webpage_site_name, webpage_title, webpage_description)
|
268
|
+
|
269
|
+
def parse_media_id(media : dict) -> Optional[int]:
    """
    Extract the media ID from a Telegram media object.

    Only photo and document media carry a downloadable file id; webpage
    previews and other media types yield None.

    Args:
        media : a dict corresponding to the media object (may be empty/None).

    Returns:
        Optional[int] : the media_id, or None when unavailable.
    """
    if not media:
        return None

    media_type = media.get("_")
    if media_type == "MessageMediaPhoto":
        return media.get("photo", {}).get("id")
    if media_type == "MessageMediaDocument":
        return media.get("document", {}).get("id")
    return None
|
292
|
+
|
293
|
+
def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
    """
    Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.

    Grouping key is (channel_id, grouped_id): album items share a grouped_id,
    while standalone posts use their own message_id (see parse_messages).
    Scalar columns keep their max; media-related columns are collected into
    lists because one post can carry several media; id-like columns are cast
    int -> str to avoid float artifacts introduced by NaN.

    Args:
        df : dataframe containing messages.

    Returns:
        pd.DataFrame : a DataFrame containing the grouped messages.
    """
    # pandas "named aggregation" mapping: output_column -> (input_column, func)
    aggregations = {
        # all member message ids joined into one string key for the post
        'concatenated_message_id': ("message_id", lambda x: '-'.join(x.dropna().astype(str))),
        'message_id': ("message_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        # max() over a single-author group yields that author; str(int(...)) normalizes floats
        'from_id': ("from_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "date" : ("date", "max"),
        "message": ("message", "max"),
        "views": ("views", "max"),
        "forwards": ("forwards", "max"),
        "replies": ("replies", "max"),
        "reactions": ("reactions", "max"),
        "details_reactions": ("details_reactions", lambda x: x[x.notna()]),
        "engagements": ("engagements", "max"),
        "reply_to_message_id": ("reply_to_message_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "reply_to_channel_id": ("reply_to_channel_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        # media columns: one entry per media attached to the post
        "media_id" : ("media_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "media_date": ("media_date", lambda x: list(x[x.notna()])),
        "media_mime_type": ("media_mime_type", lambda x: list(x[x.notna()])),
        "media_size": ("media_size", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "media_filename": ("media_filename", lambda x: list(x[x.notna()])),
        "media_fileid": ("media_fileid", lambda x: list(x[x.notna()].astype(str))),
        "duration": ("duration", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "width": ("width", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "height": ("height", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "webpage_id": ("webpage_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "webpage_url": ("webpage_url", lambda x: list(x[x.notna()])),
        "webpage_type": ("webpage_type", lambda x: list(x[x.notna()])),
        "webpage_site_name": ("webpage_site_name", lambda x: list(x[x.notna()])),
        "webpage_title": ("webpage_title", lambda x: list(x[x.notna()])),
        "webpage_description": ("webpage_description", lambda x: list(x[x.notna()])),
        "fwd_date": ("fwd_date", "max"),
        "fwd_from_id": ("fwd_from_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "fwd_from_post_id": ("fwd_from_post_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "fwd_from_from_name":("fwd_from_from_name", "max")

    }
    df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
    return df
|
340
|
+
|
341
|
+
async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files") -> list:
    """
    Retrieves messages from a Telegram channel by date.

    Parsed messages are also pickled to ``<path_file>/messages/<channel>``
    as a side effect; downloaded media go to ``<path_file>/img/<channel>/``.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
        limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
        offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
        ids (list, optional): NOTE(review): accepted but never used in this
            function (see get_messages_by_ids); mutable default — callers
            should not rely on it.
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel.

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        # Authenticate the session (may prompt for a login code on first run).
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))
        if dl_files:
            current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                limit=limit,
                                                offset_date=offset_date,
                                                reverse=reverse):
            messages.append(message)

            if dl_files:

                # Media are saved under a deterministic composite file id.
                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        # Recompute the composite media file id for rows that carry media.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        write_pickle(df_exploded, current_path_file, str(channel_username))

        # df_messages = group_by_post(df_exploded)
        # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
        # write_pickle(df_messages, current_path_file, str(channel_username))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
|
396
|
+
|
397
|
+
async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids:list=[], path_file:str="files")-> list:
    """
    Retrieves specific messages from a Telegram channel by their message IDs.

    Parsed messages are also pickled to ``<path_file>/messages/<channel>``
    as a side effect; downloaded media go to ``<path_file>/img/<channel>/``.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        ids (list) : list of message ids to retrieve. NOTE(review): mutable
            default — callers should always pass their own list.
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel.

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        # Authenticate the session (may prompt for a login code on first run).
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                ids = ids):
            messages.append(message)

            if dl_files:
                # NOTE(review): directory creation is repeated for every
                # message; create_dir is presumably idempotent — confirm.
                current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        # Recompute the composite media file id for rows that carry media.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        write_pickle(df_exploded, current_path_file, str(channel_username))
        # df_messages = group_by_post(df_exploded)
        # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
        # write_pickle(df_messages, current_path_file, str(channel_username))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
|
445
|
+
|
446
|
+
async def get_messages_by_search(client: TelegramClient, phone_number: str, search:str= "SNCF", channel_username:int = None, dl_files:bool=False, limit:int=None, path_file:str="files") -> list:
    """
    Retrieves messages matching a search term, from one channel or globally.

    When ``channel_username`` is None the search runs across all dialogs the
    account can access. Parsed messages are pickled to
    ``<path_file>/messages/`` as a side effect.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        search (str): The search term to look for in the messages.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel.

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        # Authenticate the session (may prompt for a login code on first run).
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))
        if dl_files:
            current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                search=search,
                                                limit=limit):
            messages.append(message)

            if dl_files:

                # Media are saved under a deterministic composite file id.
                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        df_exploded['search']=search
        # Recompute the composite media file id for rows that carry media.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        # df_messages = group_by_post(df_exploded)
        # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
        # Pickle name embeds the channel only when the search was scoped.
        if channel_username:
            write_pickle(df_exploded, current_path_file, str(search)+'_'+str(channel_username))
        else:
            write_pickle(df_exploded, current_path_file, str(search))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
|
501
|
+
|
502
|
+
async def download_comments(client: TelegramClient, phone_number: str,channel_entity : int, message_id: int, dl_files:bool = False, limit:int = None, reverse:bool=True, path_file:str ="files")->list:
    """
    Retrieves the comment thread (replies) of a given channel post.

    Parsed comments are pickled to
    ``<path_file>/messages/<channel>_<message_id>`` as a side effect;
    downloaded media go to ``<path_file>/img/<channel>/``.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_entity (int): ID of the channel hosting the post.
        message_id (int): ID of the post whose comments are retrieved.
        dl_files (bool, optional): Whether to download media files attached to the comments. Defaults to False.
        limit (int, optional): The maximum number of comments to retrieve. Defaults to None (retrieve all).
        reverse (bool, optional): Whether to retrieve comments in chronological order. Defaults to True.
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of comment messages retrieved from the post.
    """
    try:
        # Connect the client
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))

        comments = []

        # reply_to scopes the iteration to the given post's discussion thread.
        async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
            comments.append(comment)

            if dl_files:
                # NOTE(review): directory creation is repeated for every
                # comment; create_dir is presumably idempotent — confirm.
                current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
                media_fileid = parse_mediafileid(comment)
                if media_fileid:
                    await comment.download_media(file=os.path.join(current_path_img, media_fileid))

        df_comments = parse_messages(comments)
        # Recompute the composite media file id for rows that carry media.
        df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
        write_pickle(df_comments, current_path_file, str(channel_entity)+"_"+str(message_id))

        return comments

    finally:
        # Disconnect the client
        await client.disconnect()
|
529
|
+
|
530
|
+
|
531
|
+
|
532
|
+
# def parse_telegram_messages(messages : list) -> pd.DataFrame:
|
533
|
+
# """
|
534
|
+
# Parses the given list of Telegram messages and returns a DataFrame with the extracted information.
|
535
|
+
|
536
|
+
# Args:
|
537
|
+
# messages (list): A list of Telegram messages.
|
538
|
+
|
539
|
+
# Returns:
|
540
|
+
# pandas.DataFrame: A DataFrame containing the parsed information from the Telegram messages.
|
541
|
+
|
542
|
+
# """
|
543
|
+
|
544
|
+
# all_records = []
|
545
|
+
# for message in messages:
|
546
|
+
|
547
|
+
# peer_id = message.peer_id
|
548
|
+
|
549
|
+
# if peer_id:
|
550
|
+
# channel_id = str(peer_id.channel_id)
|
551
|
+
# else:
|
552
|
+
# channel_id=''
|
553
|
+
# if message.id:
|
554
|
+
# message_id = str(message.id)
|
555
|
+
# else:
|
556
|
+
# message_id=''
|
557
|
+
|
558
|
+
# uniq_id = str(channel_id) + "_" + str(message_id)
|
559
|
+
# if message.date:
|
560
|
+
# message_date = message.date
|
561
|
+
# else:
|
562
|
+
# message_date=datetime(1970,1,1)
|
563
|
+
# if message.text:
|
564
|
+
# text = message.text
|
565
|
+
# else:
|
566
|
+
# text = ''
|
567
|
+
# if message.is_reply:
|
568
|
+
# is_reply = message.is_reply
|
569
|
+
# else:
|
570
|
+
# is_reply = False
|
571
|
+
|
572
|
+
# if message.views:
|
573
|
+
# views = int(message.views)
|
574
|
+
# else:
|
575
|
+
# views = 0
|
576
|
+
# if message.forwards:
|
577
|
+
# forwards = int(message.forwards)
|
578
|
+
# else:
|
579
|
+
# forwards = 0
|
580
|
+
|
581
|
+
# ##########################################
|
582
|
+
# # REPLIES
|
583
|
+
# ##########################################
|
584
|
+
# if message.replies :
|
585
|
+
# replies = message.replies
|
586
|
+
# if replies.replies:
|
587
|
+
# replies_count = int(replies.replies)
|
588
|
+
# else:
|
589
|
+
# replies_count = 0
|
590
|
+
|
591
|
+
# if replies.channel_id:
|
592
|
+
# replies_channel_id = replies.channel_id
|
593
|
+
# else:
|
594
|
+
# replies_channel_id = ''
|
595
|
+
# else :
|
596
|
+
# replies_count, replies_channel_id= 0, ''
|
597
|
+
|
598
|
+
# ##########################################
|
599
|
+
# # REACTIONS
|
600
|
+
# ##########################################
|
601
|
+
|
602
|
+
# total_reactions = 0
|
603
|
+
# details_reactions=[]
|
604
|
+
|
605
|
+
# if message.reactions:
|
606
|
+
# reactions = message.reactions
|
607
|
+
# reactions_lst = reactions.results
|
608
|
+
# for reaction in reactions_lst:
|
609
|
+
# if reaction.count:
|
610
|
+
# count = int(reaction.count)
|
611
|
+
# else:
|
612
|
+
# count = 0
|
613
|
+
# total_reactions += count
|
614
|
+
# r = reaction.reaction
|
615
|
+
|
616
|
+
# if isinstance(r, ReactionEmoji):
|
617
|
+
# emoticon = r.emoticon
|
618
|
+
# elif isinstance(r, ReactionCustomEmoji):
|
619
|
+
# emoticon = r.document_id
|
620
|
+
# else:
|
621
|
+
# emoticon = None
|
622
|
+
# details_reactions.append((emoticon, count))
|
623
|
+
# else :
|
624
|
+
# count = 0
|
625
|
+
|
626
|
+
# ##########################################
|
627
|
+
# # FORWARDS
|
628
|
+
# ##########################################
|
629
|
+
|
630
|
+
# if message.fwd_from :
|
631
|
+
# fwd_from = message.fwd_from
|
632
|
+
# if fwd_from.date:
|
633
|
+
# fwd_from_date = fwd_from.date
|
634
|
+
# else :
|
635
|
+
# fwd_from_date = datetime(1970,1,1)
|
636
|
+
# if fwd_from.from_id:
|
637
|
+
# fwd_from_id = fwd_from.from_id
|
638
|
+
# if isinstance(fwd_from_id, PeerUser):
|
639
|
+
# fwd_from_channel_id = fwd_from_id.user_id
|
640
|
+
# elif isinstance(fwd_from_id, PeerChannel):
|
641
|
+
# fwd_from_channel_id = fwd_from_id.channel_id
|
642
|
+
# else:
|
643
|
+
# fwd_from_channel_id = None
|
644
|
+
# print(fwd_from_id, "type not implemented")
|
645
|
+
# else :
|
646
|
+
# fwd_from_channel_id = None
|
647
|
+
# if fwd_from.from_name:
|
648
|
+
# fwd_from_name = fwd_from.from_name
|
649
|
+
# else:
|
650
|
+
# fwd_from_name = ''
|
651
|
+
# if fwd_from.channel_post:
|
652
|
+
# fwd_from_channel_post = str(fwd_from.channel_post)
|
653
|
+
# else:
|
654
|
+
# fwd_from_channel_post = ''
|
655
|
+
# if fwd_from.post_author:
|
656
|
+
# fwd_from_post_author = fwd_from.post_author
|
657
|
+
# else:
|
658
|
+
# fwd_from_post_author=''
|
659
|
+
# else :
|
660
|
+
# fwd_from_date, fwd_from_id, fwd_from_channel_id, fwd_from_name, fwd_from_channel_post, fwd_from_post_author = datetime(1970,1,1), '', '', '', '', ''
|
661
|
+
|
662
|
+
# ##########################################
|
663
|
+
# # REPLIES
|
664
|
+
# ##########################################
|
665
|
+
|
666
|
+
# if message.reply_to:
|
667
|
+
# reply_to = message.reply_to
|
668
|
+
# if reply_to.quote:
|
669
|
+
# reply_to_quote = reply_to.quote
|
670
|
+
# else:
|
671
|
+
# reply_to_quote=False
|
672
|
+
# if reply_to.reply_to_msg_id:
|
673
|
+
# reply_to_msg_id = str(reply_to.reply_to_msg_id)
|
674
|
+
# else:
|
675
|
+
# reply_to_msg_id = ''
|
676
|
+
# if reply_to.reply_to_peer_id:
|
677
|
+
# reply_to_peer_id = str(reply_to.reply_to_peer_id)
|
678
|
+
# else:
|
679
|
+
# reply_to_peer_id = ''
|
680
|
+
# # reply_from = reply_to.reply_from
|
681
|
+
# # reply_media = reply_to.reply_media
|
682
|
+
# if reply_to.reply_to_top_id:
|
683
|
+
# reply_to_top_id = str(reply_to.reply_to_top_id)
|
684
|
+
# else:
|
685
|
+
# reply_to_top_id = ''
|
686
|
+
# if reply_to.quote_text:
|
687
|
+
# reply_to_quote_text = reply_to.quote_text
|
688
|
+
# else:
|
689
|
+
# reply_to_quote_text = ''
|
690
|
+
# else:
|
691
|
+
# reply_to_quote, reply_to_msg_id, reply_to_peer_id, reply_to_top_id, reply_to_quote_text = False, '', '', '', ''
|
692
|
+
|
693
|
+
# ##########################################
|
694
|
+
# # FILE
|
695
|
+
# ##########################################
|
696
|
+
# if message.file:
|
697
|
+
# file = message.file
|
698
|
+
# if file.id:
|
699
|
+
# file_id = file.id
|
700
|
+
# else:
|
701
|
+
# file_id = ''
|
702
|
+
# if file.duration:
|
703
|
+
# file_duration = file.duration
|
704
|
+
# else:
|
705
|
+
# file_duration = 0
|
706
|
+
# if file.emoji:
|
707
|
+
# file_emoji = file.emoji
|
708
|
+
# else:
|
709
|
+
# file_emoji = ''
|
710
|
+
# if file.ext:
|
711
|
+
# file_ext = file.ext
|
712
|
+
# else:
|
713
|
+
# file_ext = ''
|
714
|
+
# if file.height:
|
715
|
+
# file_height = int(file.height)
|
716
|
+
# else:
|
717
|
+
# file_height = 0
|
718
|
+
# if file.mime_type:
|
719
|
+
# file_mime_type = file.mime_type
|
720
|
+
# else:
|
721
|
+
# file_mime_type = ''
|
722
|
+
# if file.name:
|
723
|
+
# file_name = file.name
|
724
|
+
# else:
|
725
|
+
# file_name = ''
|
726
|
+
# if file.performer:
|
727
|
+
# file_performer = file.performer
|
728
|
+
# else:
|
729
|
+
# file_performer = ''
|
730
|
+
# if file.size:
|
731
|
+
# file_size = file.size
|
732
|
+
# else:
|
733
|
+
# file_size = 0
|
734
|
+
# if file.sticker_set:
|
735
|
+
# file_sticker_set = file.sticker_set
|
736
|
+
# else:
|
737
|
+
# file_sticker_set = ''
|
738
|
+
# if file.title:
|
739
|
+
# file_title = file.title
|
740
|
+
# else :
|
741
|
+
# file_title = ''
|
742
|
+
# if file.width:
|
743
|
+
# file_width = int(file.width)
|
744
|
+
# else:
|
745
|
+
# file_width = 0
|
746
|
+
# else :
|
747
|
+
# file_id, file_duration, file_emoji, file_ext, file_height, file_mime_type, file_name, file_performer, file_size, file_sticker_set, file_title, file_width = "", 0, '', '', 0, '', '', '', 0, '', '', 0
|
748
|
+
|
749
|
+
|
750
|
+
|
751
|
+
# webpage_record = parse_webpage(message.web_preview)
|
752
|
+
|
753
|
+
# current_record = (uniq_id, channel_id, message_id, message_date, text, is_reply, views, forwards, replies_count, replies_channel_id, total_reactions, details_reactions,
|
754
|
+
# fwd_from_date, fwd_from_channel_id,fwd_from_name, fwd_from_channel_post,fwd_from_post_author,
|
755
|
+
# reply_to_quote, reply_to_msg_id, reply_to_peer_id, reply_to_top_id, reply_to_quote_text,
|
756
|
+
# file_id, file_duration, file_ext, file_height, file_mime_type, file_name, file_size, file_title, file_width)
|
757
|
+
# current_record = current_record + webpage_record
|
758
|
+
|
759
|
+
# all_records.append(current_record)
|
760
|
+
# df = pd.DataFrame.from_records(all_records, columns = ['uniq_id', 'channel_id', "message_id", "message_date", "text", "is_reply", "views", "forwards", "replies_count", "replies_channel_id", "total_reactions", "details_reactions",
|
761
|
+
# "fwd_from_date", "fwd_from_channel_id","fwd_from_name", "fwd_from_channel_post","fwd_from_post_author",
|
762
|
+
# "reply_to_quote", "reply_to_msg_id", "reply_to_peer_id", "reply_to_top_id", "reply_to_quote_text",
|
763
|
+
# "file_id", "file_duration", "file_ext", "file_height", "file_mime_type", "file_name", "file_size", "file_title", "file_width",
|
764
|
+
# "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description", "webpage_embed_url", "webpage_embed_type", "webpage_embed_width", "webpage_embed_height",
|
765
|
+
# "webpage_duration", "webpage_author", "webpage_photo_id", "webpage_photo_date"
|
766
|
+
|
767
|
+
# ])
|
768
|
+
# return df
|
769
|
+
|
770
|
+
|
771
|
+
# def parse_webpage(webpage):
|
772
|
+
# """
|
773
|
+
# Parse the given webpage object and extract relevant information.
|
774
|
+
|
775
|
+
# Args:
|
776
|
+
# webpage (Webpage): The webpage object to be parsed.
|
777
|
+
|
778
|
+
# Returns:
|
779
|
+
# tuple: A tuple containing the parsed information from the webpage.
|
780
|
+
# The tuple contains the following elements:
|
781
|
+
# - webpage_id (str): The ID of the webpage.
|
782
|
+
# - webpage_url (str): The URL of the webpage.
|
783
|
+
# - webpage_type (str): The type of the webpage.
|
784
|
+
# - webpage_site_name (str): The name of the site.
|
785
|
+
# - webpage_title (str): The title of the webpage.
|
786
|
+
# - webpage_description (str): The description of the webpage.
|
787
|
+
# - webpage_embed_url (str): The embed URL of the webpage.
|
788
|
+
# - webpage_embed_type (str): The embed type of the webpage.
|
789
|
+
# - webpage_embed_width (int): The embed width of the webpage.
|
790
|
+
# - webpage_embed_height (int): The embed height of the webpage.
|
791
|
+
# - webpage_duration (int): The duration of the webpage.
|
792
|
+
# - webpage_author (str): The author of the webpage.
|
793
|
+
# - webpage_photo_record (tuple): A tuple containing the parsed photo information from the webpage.
|
794
|
+
# """
|
795
|
+
|
796
|
+
# if webpage :
|
797
|
+
# if webpage.id:
|
798
|
+
# webpage_id = str(webpage.id)
|
799
|
+
# else:
|
800
|
+
# webpage_id = ''
|
801
|
+
# if webpage.url:
|
802
|
+
# webpage_url = webpage.url
|
803
|
+
# else:
|
804
|
+
# webpage_url = ''
|
805
|
+
# # if webpage.display_url:
|
806
|
+
# # webpage_display_url = webpage.display_url
|
807
|
+
# # else:
|
808
|
+
# # webpage_display_url = ''
|
809
|
+
# # # webpage_hash = webpage.hash
|
810
|
+
# # if webpage.has_large_media:
|
811
|
+
# # webpage_has_large_media = webpage.has_large_media
|
812
|
+
# # else:
|
813
|
+
# # webpage_has_large_media = False
|
814
|
+
# if webpage.type:
|
815
|
+
# webpage_type = webpage.type
|
816
|
+
# else:
|
817
|
+
# webpage_type = ''
|
818
|
+
# if webpage.site_name:
|
819
|
+
# webpage_site_name = webpage.site_name
|
820
|
+
# else:
|
821
|
+
# webpage_site_name = ''
|
822
|
+
# if webpage.title:
|
823
|
+
# webpage_title = webpage.title
|
824
|
+
# else:
|
825
|
+
# webpage_title = ''
|
826
|
+
# if webpage.description:
|
827
|
+
# webpage_description = webpage.description
|
828
|
+
# else:
|
829
|
+
# webpage_description = ''
|
830
|
+
# if webpage.embed_url:
|
831
|
+
# webpage_embed_url = webpage.embed_url
|
832
|
+
# else:
|
833
|
+
# webpage_embed_url = ''
|
834
|
+
# if webpage.embed_type:
|
835
|
+
# webpage_embed_type = webpage.embed_type
|
836
|
+
# else:
|
837
|
+
# webpage_embed_type = ''
|
838
|
+
# if webpage.embed_width:
|
839
|
+
# webpage_embed_width = int(webpage.embed_width)
|
840
|
+
# else:
|
841
|
+
# webpage_embed_width = 0
|
842
|
+
# if webpage.embed_height:
|
843
|
+
# webpage_embed_height = int(webpage.embed_height)
|
844
|
+
# else:
|
845
|
+
# webpage_embed_height = 0
|
846
|
+
# if webpage.duration:
|
847
|
+
# webpage_duration = int(webpage.duration)
|
848
|
+
# else:
|
849
|
+
# webpage_duration = 0
|
850
|
+
# if webpage.author :
|
851
|
+
# webpage_author = webpage.author
|
852
|
+
# else :
|
853
|
+
# webpage_author = ''
|
854
|
+
|
855
|
+
# webpage_photo_record = parse_photo(webpage.photo)
|
856
|
+
# # webpage_document = webpage.document
|
857
|
+
# # webpage_cached_page = webpage.cached_page
|
858
|
+
# # webpage_attributes = webpage.attributes
|
859
|
+
# else :
|
860
|
+
# webpage_id, webpage_url, webpage_type, webpage_site_name, webpage_title, webpage_description, webpage_embed_url, webpage_embed_type, webpage_embed_width, webpage_embed_height, webpage_duration, webpage_author, webpage_photo_record = "", "", "", "", "", "", "", "", 0, 0, 0, "", ('', datetime(1970,1,1))
|
861
|
+
# record = (webpage_id, webpage_url, webpage_type, webpage_site_name, webpage_title, webpage_description, webpage_embed_url, webpage_embed_type, webpage_embed_width, webpage_embed_height, webpage_duration, webpage_author) + webpage_photo_record
|
862
|
+
# return record
|
863
|
+
|
864
|
+
# def parse_photo(photo):
|
865
|
+
# """
|
866
|
+
# Parses the given photo object and returns a tuple containing the photo ID and date.
|
867
|
+
|
868
|
+
# Args:
|
869
|
+
# photo: The photo object to be parsed.
|
870
|
+
|
871
|
+
# Returns:
|
872
|
+
# A tuple containing the photo ID and date.
|
873
|
+
# """
|
874
|
+
|
875
|
+
# if photo:
|
876
|
+
# if photo.id:
|
877
|
+
# photo_id = str(photo.id)
|
878
|
+
# else:
|
879
|
+
# photo.id = ''
|
880
|
+
# if photo.date:
|
881
|
+
# photo_date = photo.date
|
882
|
+
# else :
|
883
|
+
# photo_date = datetime(1970,1,1)
|
884
|
+
|
885
|
+
# # photo_access_hash = photo.access_hash
|
886
|
+
# # photo_file_reference = photo.file_reference
|
887
|
+
# # photo_dc_id = photo.dc_id
|
888
|
+
# # photo_sizes = photo.sizes #### A PARSER
|
889
|
+
# else :
|
890
|
+
# photo_id, photo_date = '', datetime(1970,1,1)
|
891
|
+
|
892
|
+
# record = (photo_id, photo_date)
|
893
|
+
# return record
|
894
|
+
|
895
|
+
|
896
|
+
# async def get_messages_by_date(client, phone_number, channel_username, dl_files=False, reverse=True, limit=None, offset_date=datetime(1970,1,1), path_file="files"):
|
897
|
+
# """
|
898
|
+
# Retrieves messages from a Telegram channel by date.
|
899
|
+
|
900
|
+
# Args:
|
901
|
+
# client (TelegramClient): The Telegram client instance.
|
902
|
+
# phone_number (str): The phone number associated with the Telegram account.
|
903
|
+
# channel_username (str): The username of the channel to retrieve messages from.
|
904
|
+
# dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
|
905
|
+
# reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
|
906
|
+
# limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
|
907
|
+
# offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
|
908
|
+
# path_file (str, optional): The path to save the downloaded files. Defaults to "files".
|
909
|
+
|
910
|
+
# Returns:
|
911
|
+
# list: A list of messages retrieved from the channel.
|
912
|
+
|
913
|
+
# Raises:
|
914
|
+
# Exception: If there is an error during the retrieval process.
|
915
|
+
|
916
|
+
# """
|
917
|
+
# try:
|
918
|
+
# await client.start(phone_number)
|
919
|
+
|
920
|
+
# current_path_file = create_dir(os.path.join(path_file, "messages"))
|
921
|
+
|
922
|
+
# if dl_files:
|
923
|
+
# current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
|
924
|
+
|
925
|
+
# # Get the message history
|
926
|
+
# messages = []
|
927
|
+
|
928
|
+
# async for message in client.iter_messages(channel_username,
|
929
|
+
# limit=limit,
|
930
|
+
# offset_date=offset_date,
|
931
|
+
# reverse=reverse):
|
932
|
+
# messages.append(message)
|
933
|
+
|
934
|
+
# if dl_files:
|
935
|
+
# if message.peer_id:
|
936
|
+
# channel_id = str(message.peer_id.channel_id)
|
937
|
+
# else:
|
938
|
+
# channel_id=''
|
939
|
+
# if message.id:
|
940
|
+
# message_id = str(message.id)
|
941
|
+
# else:
|
942
|
+
# message_id=''
|
943
|
+
# if message.file:
|
944
|
+
# file_id = str(message.file.id)
|
945
|
+
# else:
|
946
|
+
# file_id=''
|
947
|
+
# await message.download_media(file=os.path.join(current_path_img, channel_id+"_"+message_id+"_"+file_id))
|
948
|
+
|
949
|
+
# df_messages = parse_telegram_messages(messages)
|
950
|
+
# write_pickle(df_messages, current_path_file, str(channel_username))
|
951
|
+
|
952
|
+
# return messages
|
953
|
+
# finally:
|
954
|
+
# # Disconnect the client
|
955
|
+
# await client.disconnect()
|
956
|
+
|
957
|
+
async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, session_name: str = 'session_name') -> dict:
    """
    Retrieves information about a Telegram channel.

    Args:
        api_id (int): The API ID of the Telegram application.
        api_hash (str): The API hash of the Telegram application.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel.
        session_name (str, optional): Name of the Telethon session file to use.
            Defaults to 'session_name' (the previously hard-coded value, kept
            for backward compatibility).

    Returns:
        dict: A dictionary containing the full information of the channel.

    Raises:
        Exception: If there is an error during the retrieval of channel information.
    """
    client = TelegramClient(session_name, api_id, api_hash)
    try:
        await client.start(phone_number)
        channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
        channel_full_info = channel_full_info.to_dict()
    finally:
        # Always disconnect, even if authentication or the request failed.
        await client.disconnect()

    return channel_full_info
|
983
|
+
|
984
|
+
|
985
|
+
def parse_channel(channel: dict) -> pd.DataFrame:
    """
    Parses the given channel data and returns a DataFrame with the parsed information.

    Args:
        channel (dict): The channel data to be parsed (expected shape: the dict
            produced by a Telethon `GetFullChannelRequest(...).to_dict()` call,
            with "full_chat" and "chats" entries).

    Returns:
        pandas.DataFrame: A single-row DataFrame with columns:
            channel_id, channel_title, channel_about, channel_participants,
            linked_chat_id, reactions, creation_date.
    """
    reactions = []
    channel_title = ''
    # Epoch is used throughout this module as the "unknown date" sentinel.
    creation_date = datetime(1970, 1, 1)

    fc = channel.get("full_chat", {})
    channel_id = fc.get("id", "")
    channel_about = fc.get("about", "")
    channel_participants = int(fc.get("participants_count", 0))
    linked_chat_id = fc.get("linked_chat_id", "")

    # Collect the reactions enabled on the channel: standard emojis are stored
    # by their emoticon, custom emojis by their document id.
    ar = fc.get("available_reactions", {})
    if ar:
        for r in ar.get('reactions', []):
            if r.get('_') == "ReactionEmoji":
                reactions.append(r.get("emoticon"))
            elif r.get('_') == "ReactionCustomEmoji":
                reactions.append(r.get("document_id"))
            else:
                print("Not implemented type", r)

    # Look up the chat entry describing the channel itself to recover its
    # title and creation date. (Removed a leftover debug print of every chat.)
    for chat in channel.get("chats", []):
        if chat.get("_") == "Channel":
            if chat.get("id") == channel_id:
                creation_date = chat.get("date", datetime(1970, 1, 1))
                channel_title = chat.get("title", "")
                break
        else:
            print("Not implemented type", chat.get("_"))

    fc_record = (str(channel_id), channel_title, channel_about, channel_participants, linked_chat_id, reactions, creation_date)

    df = pd.DataFrame.from_records([fc_record], columns=['channel_id', 'channel_title', 'channel_about', 'channel_participants', "linked_chat_id", 'reactions', 'creation_date'])
    return df
|