opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1035 @@
1
+ from datetime import datetime
2
+ from telethon.sync import TelegramClient
3
+ from telethon.tl.functions.channels import GetFullChannelRequest
4
+ import pandas as pd
5
+ # from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
6
+ from opsci_toolbox.helpers.common import create_dir, write_pickle
7
+ import os
8
+ import nest_asyncio
9
+ from telethon.tl.types import Message
10
+ from typing import Optional
11
+
12
+ nest_asyncio.apply()
13
+
14
+
15
def parse_mediafileid(message: Message) -> Optional[str]:
    """
    Parse the media file ID from a Telegram message.

    The ID is built as "<channel_id>_<grouped_id>_<media_id>", where
    grouped_id falls back to the message's own id when the message is not
    part of an album.

    Args:
        message (telethon.tl.types.Message): The Telegram message.

    Returns:
        Optional[str]: The media file ID if the message carries a photo or
        document media, None otherwise.
    """
    data = message.to_dict()
    media = data.get("media", {})
    media_id = parse_media_id(media)

    # No downloadable media (e.g. webpage previews) -> no file id.
    if not media_id:
        return None

    message_id = data.get("id")
    peer_id = data.get("peer_id", {}) or {}
    channel_id = parse_from(peer_id)
    # Album items share a grouped_id; standalone messages use their own id.
    grouped_id = data.get("grouped_id") or message_id

    return str(channel_id) + '_' + str(grouped_id) + '_' + str(media_id)
45
+
46
+
47
def parse_messages(messages: list) -> pd.DataFrame:
    """
    Parse Telegram messages into a flat DataFrame (one row per message).

    Args:
        messages : a list of Telegram messages.

    Returns:
        pd.DataFrame : a DataFrame containing the parsed information from the Telegram messages.
    """

    all_records = []
    for message in messages:

        data = message.to_dict()

        message_id = data.get("id")

        peer_id = data.get("peer_id", {}) or {}
        channel_id = parse_from(peer_id)

        # Channel posts may have no explicit author: fall back to the channel id.
        from_peer = data.get("from_id", {}) or {}
        from_id = parse_from(from_peer)
        if from_id is None:
            from_id = channel_id

        date = data.get("date")
        # Renamed from `message` to avoid shadowing the loop variable.
        text = data.get("message")
        views = data.get("views", 0)
        forwards = data.get("forwards", 0) or 0

        replies_data = data.get("replies", {})
        replies = replies_data.get("replies") if replies_data else 0

        # Album items share a grouped_id; standalone messages use their own id.
        grouped_id = data.get("grouped_id") or message_id

        reactions = data.get("reactions", {})
        if reactions:
            total_reactions, reactions_details = parse_reactions(reactions)
        else:
            total_reactions, reactions_details = 0, None

        reply_to = data.get("reply_to", {})
        reply_to_message_id, reply_to_channel_id = parse_reply(reply_to)

        media = data.get("media", {})
        media_record = parse_media(media)
        fwd_from = data.get("fwd_from", {}) or {}
        fwd_record = parse_fwd(fwd_from)
        # Note: views are not counted in engagements.
        engagements = forwards + replies + total_reactions

        post_record = (message_id, channel_id, from_id, grouped_id, date, text, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
        all_records.append(post_record)

    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                         "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                         "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                         "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])

    return df
124
+
125
def parse_reply(reply: dict) -> tuple:
    """
    Parse reply object from a Telegram message.

    Args:
        reply : a dict corresponding to the reply object.

    Returns:
        Tuple of reply_to_message_id, reply_to_channel_id (both None when the
        message is not a reply).
    """
    # Guard clause: non-replies carry an empty/None reply object.
    if not reply:
        return None, None

    target_msg_id = reply.get("reply_to_msg_id")
    target_peer = reply.get("reply_to_peer_id", {}) or {}
    return target_msg_id, parse_from(target_peer)
144
+
145
+
146
+
147
def parse_from(data: dict) -> int:
    """
    Parse a peer object from Telegram message.

    Args:
        data : a dict corresponding to the peer object.

    Returns:
        int : the channel_id or user_id (None when the peer type is missing
        or not recognized).
    """
    peer_type = data.get("_")
    if peer_type == "PeerChannel":
        return data.get('channel_id')
    if peer_type == "PeerUser":
        return data.get('user_id')
    if peer_type:
        # Unrecognized peer type: keep a trace on stdout for debugging.
        print("PEER not referenced", peer_type)
    return None
168
+
169
+
170
def parse_fwd(forward: dict) -> tuple:
    """
    Parse a forward object from Telegram message.

    Args:
        forward : a dict corresponding to the forward object.

    Returns:
        tuple of (forward date, origin peer id, origin channel post id,
        origin display name) — all None when the message is not a forward.
    """
    # Guard clause: non-forwarded messages carry an empty forward object.
    if not forward:
        return None, None, None, None

    origin_peer = forward.get("from_id", {}) or {}
    return (
        forward.get("date"),
        parse_from(origin_peer),
        forward.get("channel_post"),
        forward.get("from_name"),
    )
192
+
193
def parse_reactions(reactions: dict) -> tuple:
    """
    Parse reactions from Telegram message.

    Args:
        reactions : a dict corresponding to the reactions object.

    Returns:
        tuple containing the total number of reactions and the details of
        each reaction as (emoticon, count) pairs.
    """
    details = []
    total = 0
    for entry in reactions.get("results", []):
        n = entry.get("count", 0)
        total += n
        reaction = entry.get("reaction", {})
        # Standard emoji expose an emoticon; custom emoji only a document_id.
        if reaction.get("_") == "ReactionEmoji":
            label = reaction.get("emoticon", "")
        else:
            label = str(reaction.get("document_id", ""))
        details.append((label, n))

    return total, details
220
+
221
def parse_media(media: dict) -> tuple:
    """
    Parse medias from Telegram message. Currently it supports photo, document and webpage.

    Args:
        media : a dict corresponding to the media object.

    Returns:
        tuple containing media metadata, in the order:
        (media_id, media_date, media_mime_type, media_size, media_filename,
        duration, width, height, webpage_id, webpage_url, webpage_type,
        webpage_site_name, webpage_title, webpage_description).
    """
    # Defaults: identifiers/strings -> None, numeric fields -> 0.
    media_id = media_date = media_mime_type = media_filename = None
    media_size = duration = width = height = 0
    webpage_id = webpage_url = webpage_type = None
    webpage_site_name = webpage_title = webpage_description = None

    if media:
        media_type = media.get("_")

        if media_type == "MessageMediaPhoto":
            photo = media.get("photo", {})
            media_id = photo.get("id")
            media_date = photo.get("date")
            media_mime_type = "photo"

        elif media_type == "MessageMediaDocument":
            document = media.get("document", {})
            media_id = document.get("id")
            media_date = document.get("date")
            media_mime_type = document.get("mime_type")
            media_size = document.get("size")
            for attr in document.get("attributes", []):
                attr_type = attr.get("_")
                if attr_type == "DocumentAttributeFilename":
                    media_filename = str(attr.get("file_name", ""))
                elif attr_type == "DocumentAttributeVideo":
                    duration = attr.get("duration")
                    width = attr.get("w")
                    height = attr.get("h")

        elif media_type == "MessageMediaWebPage":
            webpage = media.get("webpage", {})
            webpage_id = webpage.get("id")
            webpage_url = webpage.get("url")
            webpage_type = webpage.get("type")
            webpage_site_name = webpage.get("site_name")
            webpage_title = webpage.get("title")
            webpage_description = webpage.get("description")

        else:
            # Unsupported media type: keep a trace on stdout for debugging.
            print("MEDIA not referenced", media_type)

    return (media_id, media_date, media_mime_type, media_size, media_filename,
            duration, width, height, webpage_id, webpage_url, webpage_type,
            webpage_site_name, webpage_title, webpage_description)
268
+
269
def parse_media_id(media: dict) -> int:
    """
    Parse media id from Telegram message.

    Args:
        media : a dict corresponding to the media object.

    Returns:
        int : the media_id (None when there is no media, or the media is not
        a photo/document).
    """
    if not media:
        return None

    media_type = media.get("_")
    if media_type == "MessageMediaPhoto":
        return media.get("photo", {}).get("id")
    if media_type == "MessageMediaDocument":
        return media.get("document", {}).get("id")
    # Webpages and other media types have no downloadable file id.
    return None
292
+
293
def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
    """
    Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.

    Args:
        df : dataframe containing messages, as produced by parse_messages (plus the "media_fileid" column added by the caller).

    Returns:
        pd.DataFrame : a DataFrame containing the grouped messages, one row per (channel_id, grouped_id) pair.
    """
    # Named aggregations applied per (channel_id, grouped_id) group:
    # - "max" keeps a single representative scalar for post-level fields,
    # - list(x[x.notna()]) collects the per-media values of an album,
    # - the str(int(x.dropna().max())) lambdas normalize numeric ids to
    #   strings, yielding None when the group has no value at all.
    aggregations = {
        'concatenated_message_id': ("message_id", lambda x: '-'.join(x.dropna().astype(str))),
        'message_id': ("message_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        'from_id': ("from_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "date" : ("date", "max"),
        "message": ("message", "max"),
        "views": ("views", "max"),
        "forwards": ("forwards", "max"),
        "replies": ("replies", "max"),
        "reactions": ("reactions", "max"),
        # NOTE(review): unlike the other collectors this returns a pandas
        # Series, not a list — confirm whether list(x[x.notna()]) was intended.
        "details_reactions": ("details_reactions", lambda x: x[x.notna()]),
        "engagements": ("engagements", "max"),
        "reply_to_message_id": ("reply_to_message_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "reply_to_channel_id": ("reply_to_channel_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "media_id" : ("media_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "media_date": ("media_date", lambda x: list(x[x.notna()])),
        "media_mime_type": ("media_mime_type", lambda x: list(x[x.notna()])),
        "media_size": ("media_size", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "media_filename": ("media_filename", lambda x: list(x[x.notna()])),
        "media_fileid": ("media_fileid", lambda x: list(x[x.notna()].astype(str))),
        "duration": ("duration", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "width": ("width", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "height": ("height", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "webpage_id": ("webpage_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "webpage_url": ("webpage_url", lambda x: list(x[x.notna()])),
        "webpage_type": ("webpage_type", lambda x: list(x[x.notna()])),
        "webpage_site_name": ("webpage_site_name", lambda x: list(x[x.notna()])),
        "webpage_title": ("webpage_title", lambda x: list(x[x.notna()])),
        "webpage_description": ("webpage_description", lambda x: list(x[x.notna()])),
        "fwd_date": ("fwd_date", "max"),
        "fwd_from_id": ("fwd_from_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "fwd_from_post_id": ("fwd_from_post_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "fwd_from_from_name":("fwd_from_from_name", "max")

    }
    df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
    return df
340
+
341
async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: Optional[list] = None, path_file: str = "files") -> list:
    """
    Retrieves messages from a Telegram channel by date.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
        limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
        offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
        ids (list, optional): Unused; kept for signature compatibility. Defaults to None
            (was a mutable default `[]`, which is unsafe as a default argument).
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel. A pickled DataFrame
        of the parsed messages is also written under "<path_file>/messages".

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))
        if dl_files:
            current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                  limit=limit,
                                                  offset_date=offset_date,
                                                  reverse=reverse):
            messages.append(message)

            if dl_files:
                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        # Build a unique media file id so rows can be matched with downloaded files.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        write_pickle(df_exploded, current_path_file, str(channel_username))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
396
+
397
async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids: Optional[list]=None, path_file:str="files")-> list:
    """
    Retrieves specific messages from a Telegram channel by their message ids.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        ids (list, optional): list of message ids to retrieve. Defaults to None, treated
            as an empty list (was a mutable default `[]`, which is unsafe as a default argument).
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel. A pickled DataFrame
        of the parsed messages is also written under "<path_file>/messages".

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    if ids is None:
        ids = []
    try:
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))
        if dl_files:
            # Created once up front (consistent with get_messages_by_date)
            # instead of once per downloaded message.
            current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                  ids=ids):
            messages.append(message)

            if dl_files:
                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        # Build a unique media file id so rows can be matched with downloaded files.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        write_pickle(df_exploded, current_path_file, str(channel_username))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
445
+
446
async def get_messages_by_search(client: TelegramClient, phone_number: str, search:str= "SNCF", channel_username:int = None, dl_files:bool=False, limit:int=None, path_file:str="files") -> list:
    """
    Retrieves messages matching a search term, optionally restricted to one channel.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        search (str): The search term to look for in the messages.
        channel_username (str): The username of the channel to retrieve messages from.
            When None, the search is not restricted to a single channel.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel. A pickled DataFrame
        of the parsed messages (with an extra "search" column) is also written
        under "<path_file>/messages".

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))
        if dl_files:
            current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                  search=search,
                                                  limit=limit):
            messages.append(message)

            if dl_files:

                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        # Keep the query alongside the results so merged exports stay traceable.
        df_exploded['search']=search
        # Build a unique media file id so rows can be matched with downloaded files.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        # df_messages = group_by_post(df_exploded)
        # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
        # Pickle name includes the channel only when the search was scoped to one.
        if channel_username:
            write_pickle(df_exploded, current_path_file, str(search)+'_'+str(channel_username))
        else:
            write_pickle(df_exploded, current_path_file, str(search))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
501
+
502
async def download_comments(client: TelegramClient, phone_number: str,channel_entity : int, message_id: int, dl_files:bool = False, limit:int = None, reverse:bool=True, path_file:str ="files")->list:
    """
    Retrieves the comments (replies) attached to a given channel post.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_entity (int): The id of the channel hosting the post.
        message_id (int): The id of the post whose comment thread is fetched.
        dl_files (bool, optional): Whether to download media files attached to the comments. Defaults to False.
        limit (int, optional): The maximum number of comments to retrieve. Defaults to None (retrieve all).
        reverse (bool, optional): Whether to retrieve comments in reverse order. Defaults to True.
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of comment messages. A pickled DataFrame of the parsed
        comments is also written under "<path_file>/messages" as
        "<channel_entity>_<message_id>".
    """
    try:
        # Connect the client
        await client.start(phone_number)

        current_path_file = create_dir(os.path.join(path_file, "messages"))

        comments = []

        async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
            comments.append(comment)

            if dl_files:
                # NOTE(review): the image directory is (re)created on every
                # iteration — hoisting this out of the loop would be cheaper.
                current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
                media_fileid = parse_mediafileid(comment)
                if media_fileid:
                    await comment.download_media(file=os.path.join(current_path_img, media_fileid))

        df_comments = parse_messages(comments)
        # Build a unique media file id so rows can be matched with downloaded files.
        df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
        write_pickle(df_comments, current_path_file, str(channel_entity)+"_"+str(message_id))

        return comments

    finally:
        # Disconnect the client
        await client.disconnect()
529
+
530
+
531
+
532
+ # def parse_telegram_messages(messages : list) -> pd.DataFrame:
533
+ # """
534
+ # Parses the given list of Telegram messages and returns a DataFrame with the extracted information.
535
+
536
+ # Args:
537
+ # messages (list): A list of Telegram messages.
538
+
539
+ # Returns:
540
+ # pandas.DataFrame: A DataFrame containing the parsed information from the Telegram messages.
541
+
542
+ # """
543
+
544
+ # all_records = []
545
+ # for message in messages:
546
+
547
+ # peer_id = message.peer_id
548
+
549
+ # if peer_id:
550
+ # channel_id = str(peer_id.channel_id)
551
+ # else:
552
+ # channel_id=''
553
+ # if message.id:
554
+ # message_id = str(message.id)
555
+ # else:
556
+ # message_id=''
557
+
558
+ # uniq_id = str(channel_id) + "_" + str(message_id)
559
+ # if message.date:
560
+ # message_date = message.date
561
+ # else:
562
+ # message_date=datetime(1970,1,1)
563
+ # if message.text:
564
+ # text = message.text
565
+ # else:
566
+ # text = ''
567
+ # if message.is_reply:
568
+ # is_reply = message.is_reply
569
+ # else:
570
+ # is_reply = False
571
+
572
+ # if message.views:
573
+ # views = int(message.views)
574
+ # else:
575
+ # views = 0
576
+ # if message.forwards:
577
+ # forwards = int(message.forwards)
578
+ # else:
579
+ # forwards = 0
580
+
581
+ # ##########################################
582
+ # # REPLIES
583
+ # ##########################################
584
+ # if message.replies :
585
+ # replies = message.replies
586
+ # if replies.replies:
587
+ # replies_count = int(replies.replies)
588
+ # else:
589
+ # replies_count = 0
590
+
591
+ # if replies.channel_id:
592
+ # replies_channel_id = replies.channel_id
593
+ # else:
594
+ # replies_channel_id = ''
595
+ # else :
596
+ # replies_count, replies_channel_id= 0, ''
597
+
598
+ # ##########################################
599
+ # # REACTIONS
600
+ # ##########################################
601
+
602
+ # total_reactions = 0
603
+ # details_reactions=[]
604
+
605
+ # if message.reactions:
606
+ # reactions = message.reactions
607
+ # reactions_lst = reactions.results
608
+ # for reaction in reactions_lst:
609
+ # if reaction.count:
610
+ # count = int(reaction.count)
611
+ # else:
612
+ # count = 0
613
+ # total_reactions += count
614
+ # r = reaction.reaction
615
+
616
+ # if isinstance(r, ReactionEmoji):
617
+ # emoticon = r.emoticon
618
+ # elif isinstance(r, ReactionCustomEmoji):
619
+ # emoticon = r.document_id
620
+ # else:
621
+ # emoticon = None
622
+ # details_reactions.append((emoticon, count))
623
+ # else :
624
+ # count = 0
625
+
626
+ # ##########################################
627
+ # # FORWARDS
628
+ # ##########################################
629
+
630
+ # if message.fwd_from :
631
+ # fwd_from = message.fwd_from
632
+ # if fwd_from.date:
633
+ # fwd_from_date = fwd_from.date
634
+ # else :
635
+ # fwd_from_date = datetime(1970,1,1)
636
+ # if fwd_from.from_id:
637
+ # fwd_from_id = fwd_from.from_id
638
+ # if isinstance(fwd_from_id, PeerUser):
639
+ # fwd_from_channel_id = fwd_from_id.user_id
640
+ # elif isinstance(fwd_from_id, PeerChannel):
641
+ # fwd_from_channel_id = fwd_from_id.channel_id
642
+ # else:
643
+ # fwd_from_channel_id = None
644
+ # print(fwd_from_id, "type not implemented")
645
+ # else :
646
+ # fwd_from_channel_id = None
647
+ # if fwd_from.from_name:
648
+ # fwd_from_name = fwd_from.from_name
649
+ # else:
650
+ # fwd_from_name = ''
651
+ # if fwd_from.channel_post:
652
+ # fwd_from_channel_post = str(fwd_from.channel_post)
653
+ # else:
654
+ # fwd_from_channel_post = ''
655
+ # if fwd_from.post_author:
656
+ # fwd_from_post_author = fwd_from.post_author
657
+ # else:
658
+ # fwd_from_post_author=''
659
+ # else :
660
+ # fwd_from_date, fwd_from_id, fwd_from_channel_id, fwd_from_name, fwd_from_channel_post, fwd_from_post_author = datetime(1970,1,1), '', '', '', '', ''
661
+
662
+ # ##########################################
663
+ # # REPLIES
664
+ # ##########################################
665
+
666
+ # if message.reply_to:
667
+ # reply_to = message.reply_to
668
+ # if reply_to.quote:
669
+ # reply_to_quote = reply_to.quote
670
+ # else:
671
+ # reply_to_quote=False
672
+ # if reply_to.reply_to_msg_id:
673
+ # reply_to_msg_id = str(reply_to.reply_to_msg_id)
674
+ # else:
675
+ # reply_to_msg_id = ''
676
+ # if reply_to.reply_to_peer_id:
677
+ # reply_to_peer_id = str(reply_to.reply_to_peer_id)
678
+ # else:
679
+ # reply_to_peer_id = ''
680
+ # # reply_from = reply_to.reply_from
681
+ # # reply_media = reply_to.reply_media
682
+ # if reply_to.reply_to_top_id:
683
+ # reply_to_top_id = str(reply_to.reply_to_top_id)
684
+ # else:
685
+ # reply_to_top_id = ''
686
+ # if reply_to.quote_text:
687
+ # reply_to_quote_text = reply_to.quote_text
688
+ # else:
689
+ # reply_to_quote_text = ''
690
+ # else:
691
+ # reply_to_quote, reply_to_msg_id, reply_to_peer_id, reply_to_top_id, reply_to_quote_text = False, '', '', '', ''
692
+
693
+ # ##########################################
694
+ # # FILE
695
+ # ##########################################
696
+ # if message.file:
697
+ # file = message.file
698
+ # if file.id:
699
+ # file_id = file.id
700
+ # else:
701
+ # file_id = ''
702
+ # if file.duration:
703
+ # file_duration = file.duration
704
+ # else:
705
+ # file_duration = 0
706
+ # if file.emoji:
707
+ # file_emoji = file.emoji
708
+ # else:
709
+ # file_emoji = ''
710
+ # if file.ext:
711
+ # file_ext = file.ext
712
+ # else:
713
+ # file_ext = ''
714
+ # if file.height:
715
+ # file_height = int(file.height)
716
+ # else:
717
+ # file_height = 0
718
+ # if file.mime_type:
719
+ # file_mime_type = file.mime_type
720
+ # else:
721
+ # file_mime_type = ''
722
+ # if file.name:
723
+ # file_name = file.name
724
+ # else:
725
+ # file_name = ''
726
+ # if file.performer:
727
+ # file_performer = file.performer
728
+ # else:
729
+ # file_performer = ''
730
+ # if file.size:
731
+ # file_size = file.size
732
+ # else:
733
+ # file_size = 0
734
+ # if file.sticker_set:
735
+ # file_sticker_set = file.sticker_set
736
+ # else:
737
+ # file_sticker_set = ''
738
+ # if file.title:
739
+ # file_title = file.title
740
+ # else :
741
+ # file_title = ''
742
+ # if file.width:
743
+ # file_width = int(file.width)
744
+ # else:
745
+ # file_width = 0
746
+ # else :
747
+ # file_id, file_duration, file_emoji, file_ext, file_height, file_mime_type, file_name, file_performer, file_size, file_sticker_set, file_title, file_width = "", 0, '', '', 0, '', '', '', 0, '', '', 0
748
+
749
+
750
+
751
+ # webpage_record = parse_webpage(message.web_preview)
752
+
753
+ # current_record = (uniq_id, channel_id, message_id, message_date, text, is_reply, views, forwards, replies_count, replies_channel_id, total_reactions, details_reactions,
754
+ # fwd_from_date, fwd_from_channel_id,fwd_from_name, fwd_from_channel_post,fwd_from_post_author,
755
+ # reply_to_quote, reply_to_msg_id, reply_to_peer_id, reply_to_top_id, reply_to_quote_text,
756
+ # file_id, file_duration, file_ext, file_height, file_mime_type, file_name, file_size, file_title, file_width)
757
+ # current_record = current_record + webpage_record
758
+
759
+ # all_records.append(current_record)
760
+ # df = pd.DataFrame.from_records(all_records, columns = ['uniq_id', 'channel_id', "message_id", "message_date", "text", "is_reply", "views", "forwards", "replies_count", "replies_channel_id", "total_reactions", "details_reactions",
761
+ # "fwd_from_date", "fwd_from_channel_id","fwd_from_name", "fwd_from_channel_post","fwd_from_post_author",
762
+ # "reply_to_quote", "reply_to_msg_id", "reply_to_peer_id", "reply_to_top_id", "reply_to_quote_text",
763
+ # "file_id", "file_duration", "file_ext", "file_height", "file_mime_type", "file_name", "file_size", "file_title", "file_width",
764
+ # "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description", "webpage_embed_url", "webpage_embed_type", "webpage_embed_width", "webpage_embed_height",
765
+ # "webpage_duration", "webpage_author", "webpage_photo_id", "webpage_photo_date"
766
+
767
+ # ])
768
+ # return df
769
+
770
+
771
+ # def parse_webpage(webpage):
772
+ # """
773
+ # Parse the given webpage object and extract relevant information.
774
+
775
+ # Args:
776
+ # webpage (Webpage): The webpage object to be parsed.
777
+
778
+ # Returns:
779
+ # tuple: A tuple containing the parsed information from the webpage.
780
+ # The tuple contains the following elements:
781
+ # - webpage_id (str): The ID of the webpage.
782
+ # - webpage_url (str): The URL of the webpage.
783
+ # - webpage_type (str): The type of the webpage.
784
+ # - webpage_site_name (str): The name of the site.
785
+ # - webpage_title (str): The title of the webpage.
786
+ # - webpage_description (str): The description of the webpage.
787
+ # - webpage_embed_url (str): The embed URL of the webpage.
788
+ # - webpage_embed_type (str): The embed type of the webpage.
789
+ # - webpage_embed_width (int): The embed width of the webpage.
790
+ # - webpage_embed_height (int): The embed height of the webpage.
791
+ # - webpage_duration (int): The duration of the webpage.
792
+ # - webpage_author (str): The author of the webpage.
793
+ # - webpage_photo_record (tuple): A tuple containing the parsed photo information from the webpage.
794
+ # """
795
+
796
+ # if webpage :
797
+ # if webpage.id:
798
+ # webpage_id = str(webpage.id)
799
+ # else:
800
+ # webpage_id = ''
801
+ # if webpage.url:
802
+ # webpage_url = webpage.url
803
+ # else:
804
+ # webpage_url = ''
805
+ # # if webpage.display_url:
806
+ # # webpage_display_url = webpage.display_url
807
+ # # else:
808
+ # # webpage_display_url = ''
809
+ # # # webpage_hash = webpage.hash
810
+ # # if webpage.has_large_media:
811
+ # # webpage_has_large_media = webpage.has_large_media
812
+ # # else:
813
+ # # webpage_has_large_media = False
814
+ # if webpage.type:
815
+ # webpage_type = webpage.type
816
+ # else:
817
+ # webpage_type = ''
818
+ # if webpage.site_name:
819
+ # webpage_site_name = webpage.site_name
820
+ # else:
821
+ # webpage_site_name = ''
822
+ # if webpage.title:
823
+ # webpage_title = webpage.title
824
+ # else:
825
+ # webpage_title = ''
826
+ # if webpage.description:
827
+ # webpage_description = webpage.description
828
+ # else:
829
+ # webpage_description = ''
830
+ # if webpage.embed_url:
831
+ # webpage_embed_url = webpage.embed_url
832
+ # else:
833
+ # webpage_embed_url = ''
834
+ # if webpage.embed_type:
835
+ # webpage_embed_type = webpage.embed_type
836
+ # else:
837
+ # webpage_embed_type = ''
838
+ # if webpage.embed_width:
839
+ # webpage_embed_width = int(webpage.embed_width)
840
+ # else:
841
+ # webpage_embed_width = 0
842
+ # if webpage.embed_height:
843
+ # webpage_embed_height = int(webpage.embed_height)
844
+ # else:
845
+ # webpage_embed_height = 0
846
+ # if webpage.duration:
847
+ # webpage_duration = int(webpage.duration)
848
+ # else:
849
+ # webpage_duration = 0
850
+ # if webpage.author :
851
+ # webpage_author = webpage.author
852
+ # else :
853
+ # webpage_author = ''
854
+
855
+ # webpage_photo_record = parse_photo(webpage.photo)
856
+ # # webpage_document = webpage.document
857
+ # # webpage_cached_page = webpage.cached_page
858
+ # # webpage_attributes = webpage.attributes
859
+ # else :
860
+ # webpage_id, webpage_url, webpage_type, webpage_site_name, webpage_title, webpage_description, webpage_embed_url, webpage_embed_type, webpage_embed_width, webpage_embed_height, webpage_duration, webpage_author, webpage_photo_record = "", "", "", "", "", "", "", "", 0, 0, 0, "", ('', datetime(1970,1,1))
861
+ # record = (webpage_id, webpage_url, webpage_type, webpage_site_name, webpage_title, webpage_description, webpage_embed_url, webpage_embed_type, webpage_embed_width, webpage_embed_height, webpage_duration, webpage_author) + webpage_photo_record
862
+ # return record
863
+
864
+ # def parse_photo(photo):
865
+ # """
866
+ # Parses the given photo object and returns a tuple containing the photo ID and date.
867
+
868
+ # Args:
869
+ # photo: The photo object to be parsed.
870
+
871
+ # Returns:
872
+ # A tuple containing the photo ID and date.
873
+ # """
874
+
875
+ # if photo:
876
+ # if photo.id:
877
+ # photo_id = str(photo.id)
878
+ # else:
879
+ # photo_id = ''
880
+ # if photo.date:
881
+ # photo_date = photo.date
882
+ # else :
883
+ # photo_date = datetime(1970,1,1)
884
+
885
+ # # photo_access_hash = photo.access_hash
886
+ # # photo_file_reference = photo.file_reference
887
+ # # photo_dc_id = photo.dc_id
888
+ # # photo_sizes = photo.sizes #### A PARSER
889
+ # else :
890
+ # photo_id, photo_date = '', datetime(1970,1,1)
891
+
892
+ # record = (photo_id, photo_date)
893
+ # return record
894
+
895
+
896
+ # async def get_messages_by_date(client, phone_number, channel_username, dl_files=False, reverse=True, limit=None, offset_date=datetime(1970,1,1), path_file="files"):
897
+ # """
898
+ # Retrieves messages from a Telegram channel by date.
899
+
900
+ # Args:
901
+ # client (TelegramClient): The Telegram client instance.
902
+ # phone_number (str): The phone number associated with the Telegram account.
903
+ # channel_username (str): The username of the channel to retrieve messages from.
904
+ # dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
905
+ # reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
906
+ # limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
907
+ # offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
908
+ # path_file (str, optional): The path to save the downloaded files. Defaults to "files".
909
+
910
+ # Returns:
911
+ # list: A list of messages retrieved from the channel.
912
+
913
+ # Raises:
914
+ # Exception: If there is an error during the retrieval process.
915
+
916
+ # """
917
+ # try:
918
+ # await client.start(phone_number)
919
+
920
+ # current_path_file = create_dir(os.path.join(path_file, "messages"))
921
+
922
+ # if dl_files:
923
+ # current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
924
+
925
+ # # Get the message history
926
+ # messages = []
927
+
928
+ # async for message in client.iter_messages(channel_username,
929
+ # limit=limit,
930
+ # offset_date=offset_date,
931
+ # reverse=reverse):
932
+ # messages.append(message)
933
+
934
+ # if dl_files:
935
+ # if message.peer_id:
936
+ # channel_id = str(message.peer_id.channel_id)
937
+ # else:
938
+ # channel_id=''
939
+ # if message.id:
940
+ # message_id = str(message.id)
941
+ # else:
942
+ # message_id=''
943
+ # if message.file:
944
+ # file_id = str(message.file.id)
945
+ # else:
946
+ # file_id=''
947
+ # await message.download_media(file=os.path.join(current_path_img, channel_id+"_"+message_id+"_"+file_id))
948
+
949
+ # df_messages = parse_messages(messages)  # NOTE(review): was parse_telegram_messages, which is not defined in this module
950
+ # write_pickle(df_messages, current_path_file, str(channel_username))
951
+
952
+ # return messages
953
+ # finally:
954
+ # # Disconnect the client
955
+ # await client.disconnect()
956
+
957
async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, session_name : str = 'session_name') -> dict:
    """
    Retrieve full information about a Telegram channel.

    Args:
        api_id (int): The API ID of the Telegram application.
        api_hash (str): The API hash of the Telegram application.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel.
        session_name (str, optional): Name of the local Telethon session file.
            Defaults to 'session_name' (the previously hard-coded value).

    Returns:
        dict: A dictionary containing the full information of the channel
        (result of GetFullChannelRequest converted with .to_dict()).

    Raises:
        Exception: If there is an error during the retrieval of channel information.
    """
    client = TelegramClient(session_name, api_id, api_hash)
    try:
        await client.start(phone_number)
        channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
        channel_full_info = channel_full_info.to_dict()
    finally:
        # Always release the connection, even if retrieval fails.
        await client.disconnect()

    return channel_full_info
983
+
984
+
985
def parse_channel(channel : dict) -> pd.DataFrame:
    """
    Parse a Telegram full-channel payload into a one-row DataFrame.

    Args:
        channel (dict): The channel data, as returned by
            ``GetFullChannelRequest(...).to_dict()`` (see ``get_channel_info``).

    Returns:
        pandas.DataFrame: A single-row DataFrame with columns
        ``channel_id``, ``channel_title``, ``channel_about``,
        ``channel_participants``, ``linked_chat_id``, ``reactions``,
        ``creation_date``.
    """
    reactions = []
    channel_title = ''
    # Epoch fallback used when no matching chat entry provides a real date.
    creation_date = datetime(1970, 1, 1)

    fc = channel.get("full_chat", {})
    channel_id = fc.get("id", "")
    channel_about = fc.get("about", "")
    channel_participants = int(fc.get("participants_count", 0))
    linked_chat_id = fc.get("linked_chat_id", "")

    # Collect allowed reactions: plain emojis keep their emoticon,
    # custom emojis are identified by their document id.
    ar = fc.get("available_reactions", {})
    if ar:
        for r in ar.get('reactions', []):
            if r.get('_') == "ReactionEmoji":
                reactions.append(r.get("emoticon"))
            elif r.get('_') == "ReactionCustomEmoji":
                reactions.append(r.get("document_id"))
            else:
                print("Not implemented type", r)

    # Recover title and creation date from the chat entry matching this channel.
    # (Leftover per-chat debug print removed.)
    for chat in channel.get("chats", []):
        if chat.get("_") == "Channel":
            if chat.get("id") == channel_id:
                creation_date = chat.get("date", datetime(1970, 1, 1))
                channel_title = chat.get("title", "")
                break
        else:
            print("Not implemented type", chat.get("_"))

    fc_record = (str(channel_id), channel_title, channel_about, channel_participants, linked_chat_id, reactions, creation_date)

    df = pd.DataFrame.from_records([fc_record], columns=['channel_id', 'channel_title', 'channel_about', 'channel_participants', 'linked_chat_id', 'reactions', 'creation_date'])
    return df