opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1125 @@
1
+ from datetime import datetime
2
+ from telethon.sync import TelegramClient
3
+ from telethon.tl.functions.channels import GetFullChannelRequest
4
+ import pandas as pd
5
+ # from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
6
+ from opsci_toolbox.helpers.common import create_dir, write_pickle
7
+ import os
8
+ import nest_asyncio
9
+ from telethon.tl.types import Message
10
+ from typing import Optional
11
+
12
+ nest_asyncio.apply()
13
+
14
+
15
def parse_mediafileid(message: Message) -> Optional[str]:
    """
    Parse the media file ID from a Telegram message.

    The ID is built as "<channel_id>_<grouped_id>_<media_id>" so that every
    media belonging to the same grouped post (album) shares a common prefix.

    Args:
        message (telethon.tl.types.Message): The Telegram message.

    Returns:
        Optional[str]: The media file ID if the message carries a supported
        media, None otherwise.
    """
    data = message.to_dict()
    media = data.get("media", {})
    media_id = parse_media_id(media)

    if not media_id:
        return None

    message_id = data.get("id")
    # peer_id may be present but None; normalize before parsing.
    peer_id = data.get("peer_id", {})
    if peer_id is None:
        peer_id = {}
    channel_id = parse_from(peer_id)
    # Albums share a grouped_id; single messages fall back to their own id.
    grouped_id = data.get("grouped_id") or message_id

    return str(channel_id) + '_' + str(grouped_id) + '_' + str(media_id)
45
+
46
+
47
def parse_message_entities(messages : list) -> pd.DataFrame:
    """
    Parse Telegram messages entities.

    Args:
        messages : a list of Telegram messages.

    Returns:
        pd.DataFrame : a DataFrame containing the parsed entities, one row per
        entity (a message without entities contributes no rows).
    """
    columns = ["message_id", "channel_id", "from_id", "grouped_id", "message",
               "raw_text", "entity_type", "offset", "length", "url", "document_id"]
    records = []
    for msg in messages:
        raw_text = msg.raw_text
        data = msg.to_dict()
        message_id = data.get("id")

        # peer_id may be present but None; normalize before parsing.
        peer_id = data.get("peer_id", {})
        if peer_id is None:
            peer_id = {}
        channel_id = parse_from(peer_id)

        # Fall back to the channel id when the author cannot be resolved.
        from_id = data.get("from_id", {})
        if from_id is None:
            from_id = {}
        from_id = parse_from(from_id)
        if from_id is None:
            from_id = channel_id

        # Albums share a grouped_id; single messages use their own id.
        grouped_id = data.get("grouped_id")
        if not grouped_id:
            grouped_id = message_id

        text = data.get("message")

        for entity in data.get("entities", []):
            records.append((
                message_id, channel_id, from_id, grouped_id, text, raw_text,
                entity.get("_"), entity.get("offset"), entity.get("length"),
                entity.get("url"), entity.get("document_id"),
            ))

    return pd.DataFrame.from_records(records, columns=columns)
98
+
99
+
100
def parse_messages(messages : list) -> pd.DataFrame:
    """
    Parse Telegram messages.

    Builds one row per Telethon message, combining the base fields with the
    media, forward and reply sub-records produced by the parse_* helpers.

    Args:
        messages : a list of Telegram messages.

    Returns:
        pd.DataFrame : a DataFrame containing the parsed information from the Telegram messages.
    """

    all_records = []
    for message in messages:
        raw_text = message.raw_text

        data = message.to_dict()

        message_id = data.get("id")

        # peer_id may be present but None; normalize before parsing.
        peer_id = data.get("peer_id", {})
        if peer_id is None:
            peer_id = {}
        channel_id = parse_from(peer_id)

        # Fall back to the channel id when the author cannot be resolved.
        from_id = data.get("from_id", {})
        if from_id is None :
            from_id = {}
        from_id = parse_from(from_id)
        if from_id is None:
            from_id = channel_id

        date = data.get("date")
        # NOTE: rebinds the loop variable `message` to the text of the post;
        # the Message object is no longer needed past this point.
        message = data.get("message")
        views = data.get("views", 0)
        forwards = data.get("forwards", 0)
        if forwards is None:
            forwards = 0

        # "replies" is a nested dict; extract the count or default to 0.
        replies = data.get("replies", {})
        if replies:
            replies = replies.get("replies")
        else:
            replies = 0

        # Albums share a grouped_id; single messages use their own id.
        grouped_id = data.get("grouped_id")
        if grouped_id:
            grouped_id = grouped_id
        else:
            grouped_id = message_id

        reactions = data.get("reactions", {})
        if reactions:
            total_reactions, reactions_details = parse_reactions(reactions)
        else:
            total_reactions, reactions_details = 0, None


        reply_to = data.get("reply_to", {})
        reply_to_message_id, reply_to_channel_id = parse_reply(reply_to)

        media = data.get("media", {})
        media_record = parse_media(media)
        fwd_from = data.get("fwd_from", {})
        if fwd_from is None:
            fwd_from = {}
        fwd_record = parse_fwd(fwd_from)
        # Engagement metric: forwards + replies + reactions (views excluded).
        engagements = forwards + replies + total_reactions


        post_record = (message_id, channel_id, from_id, grouped_id, date, message, raw_text, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
        all_records.append(post_record)

    # Column order must match post_record + media_record + fwd_record tuples.
    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "raw_text", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                         "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                         "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                         "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])

    return df
178
+
179
def parse_reply(reply:dict) -> tuple:
    """
    Parse reply object from a Telegram message.

    Args:
        reply : a dict corresponding to the reply object.

    Returns:
        Tuple of reply_to_message_id, reply_to_channel_id (both None when the
        message is not a reply).
    """
    if not reply:
        return None, None

    msg_id = reply.get("reply_to_msg_id")
    # The target peer may be present but None; normalize before parsing.
    target_peer = reply.get("reply_to_peer_id", {})
    if target_peer is None:
        target_peer = {}
    return msg_id, parse_from(target_peer)
198
+
199
+
200
+
201
def parse_from(data : dict) -> Optional[int]:
    """
    Parse a peer object from Telegram message.

    Supported peer types are "PeerChannel" and "PeerUser"; any other peer
    type is reported on stdout and yields None.

    Args:
        data : a dict corresponding to the peer object.

    Returns:
        Optional[int]: the channel_id or user_id, or None when the peer type
        is missing or not supported.
    """
    peer_type = data.get("_")
    if peer_type == "PeerChannel":
        return data.get('channel_id')
    if peer_type == "PeerUser":
        return data.get('user_id')
    if peer_type:
        # Unknown peer type: keep the trace so new types get noticed.
        print("PEER not referenced", peer_type)
    return None
222
+
223
+
224
def parse_fwd(forward : dict) -> tuple:
    """
    Parse a forward object from Telegram message.

    Args:
        forward : a dict corresponding to the forward object.

    Returns:
        tuple of (date, origin id, origin post id, origin name); all None when
        the message is not a forward.
    """
    fwd_date, origin_id, origin_post_id, origin_name = None, None, None, None

    if forward:
        fwd_date = forward.get("date")
        origin_post_id = forward.get("channel_post")
        origin_name = forward.get("from_name")
        # The origin peer may be present but None; normalize before parsing.
        origin_peer = forward.get("from_id", {})
        if origin_peer is None:
            origin_peer = {}
        origin_id = parse_from(origin_peer)

    return (fwd_date, origin_id, origin_post_id, origin_name)
246
+
247
def parse_reactions(reactions: dict) -> tuple:
    """
    Parse reactions from Telegram message.

    Args:
        reactions : a dict corresponding to the reactions object.

    Returns:
        tuple containing the total number of reactions and a list of
        (emoticon_or_document_id, count) pairs.
    """
    total = 0
    details = []

    for entry in reactions.get("results", []):
        count = entry.get("count", 0)
        total += count

        reaction = entry.get("reaction", {})
        if reaction.get("_") == "ReactionEmoji":
            label = reaction.get("emoticon", "")
        else:
            # Custom emoji reactions are identified by their document id.
            label = str(reaction.get("document_id", ""))

        details.append((label, count))

    return total, details
274
+
275
def parse_media(media : dict) -> tuple:
    """
    Parse medias from Telegram message. Currently it supports photo, document and webpage.

    Args:
        media : a dict corresponding to the media object.

    Returns:
        tuple of 14 fields: (media_id, media_date, media_mime_type, media_size,
        media_filename, duration, width, height) followed by (webpage_id,
        webpage_url, webpage_type, webpage_site_name, webpage_title,
        webpage_description). Unused fields keep their defaults.
    """
    media_id = media_date = media_mime_type = media_filename = None
    media_size, duration, width, height = 0, 0, 0, 0
    webpage_id = webpage_url = webpage_type = None
    webpage_site_name = webpage_title = webpage_description = None

    if media:
        kind = media.get("_")

        if kind == "MessageMediaPhoto":
            photo = media.get("photo", {})
            media_id = photo.get("id")
            media_date = photo.get("date")
            media_mime_type = "photo"

        elif kind == "MessageMediaDocument":
            document = media.get("document", {})
            media_id = document.get("id")
            media_date = document.get("date")
            media_mime_type = document.get("mime_type")
            media_size = document.get("size")
            for attr in document.get("attributes", []):
                attr_kind = attr.get("_")
                if attr_kind == "DocumentAttributeFilename":
                    media_filename = str(attr.get("file_name", ""))
                elif attr_kind == "DocumentAttributeVideo":
                    duration = attr.get("duration")
                    width = attr.get("w")
                    height = attr.get("h")

        elif kind == "MessageMediaWebPage":
            webpage = media.get("webpage", {})
            webpage_id = webpage.get("id")
            webpage_url = webpage.get("url")
            webpage_type = webpage.get("type")
            webpage_site_name = webpage.get("site_name")
            webpage_title = webpage.get("title")
            webpage_description = webpage.get("description")

        else:
            # Unknown media type: keep the trace so new types get noticed.
            print("MEDIA not referenced", kind)

    return (media_id, media_date, media_mime_type, media_size, media_filename,
            duration, width, height, webpage_id, webpage_url, webpage_type,
            webpage_site_name, webpage_title, webpage_description)
322
+
323
def parse_media_id(media : dict) -> Optional[int]:
    """
    Parse media id from Telegram message.

    Args:
        media : a dict corresponding to the media object.

    Returns:
        Optional[int]: the media id for photo or document media, None for any
        other (or missing) media type.
    """
    if not media:
        return None

    kind = media.get("_")
    if kind == "MessageMediaPhoto":
        return media.get("photo", {}).get("id")
    if kind == "MessageMediaDocument":
        return media.get("document", {}).get("id")
    # Webpages and other media types carry no downloadable media id here.
    return None
346
+
347
def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
    """
    Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.

    Groups on (channel_id, grouped_id). Scalar engagement fields take the max
    across the album; media/webpage fields are collected into lists of strings.

    NOTE(review): expects a "media_fileid" column — it is not produced by
    parse_messages itself but added by the get_messages_* callers before this
    function would run; calling this on a raw parse_messages frame raises KeyError.

    Args:
        df : dataframe containing messages.

    Returns:
        pd.DataFrame : a DataFrame containing the grouped messages.
    """
    # Named aggregations: output_column -> (input_column, aggregation).
    aggregations = {
        # All message ids of the album joined with '-' (stable readable key).
        'concatenated_message_id': ("message_id", lambda x: '-'.join(x.dropna().astype(str))),
        'message_id': ("message_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        # max over a dropna'd group, stringified; None when the group is all-NaN.
        'from_id': ("from_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "date" : ("date", "max"),
        "message": ("message", "max"),
        "views": ("views", "max"),
        "forwards": ("forwards", "max"),
        "replies": ("replies", "max"),
        "reactions": ("reactions", "max"),
        "details_reactions": ("details_reactions", lambda x: x[x.notna()]),
        "engagements": ("engagements", "max"),
        "reply_to_message_id": ("reply_to_message_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "reply_to_channel_id": ("reply_to_channel_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        # Media fields: one list entry per medium of the album, cast to str
        # via int first to avoid "123.0"-style float formatting.
        "media_id" : ("media_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "media_date": ("media_date", lambda x: list(x[x.notna()])),
        "media_mime_type": ("media_mime_type", lambda x: list(x[x.notna()])),
        "media_size": ("media_size", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "media_filename": ("media_filename", lambda x: list(x[x.notna()])),
        "media_fileid": ("media_fileid", lambda x: list(x[x.notna()].astype(str))),
        "duration": ("duration", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "width": ("width", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "height": ("height", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "webpage_id": ("webpage_id", lambda x: list(x[x.notna()].astype(int).astype(str))),
        "webpage_url": ("webpage_url", lambda x: list(x[x.notna()])),
        "webpage_type": ("webpage_type", lambda x: list(x[x.notna()])),
        "webpage_site_name": ("webpage_site_name", lambda x: list(x[x.notna()])),
        "webpage_title": ("webpage_title", lambda x: list(x[x.notna()])),
        "webpage_description": ("webpage_description", lambda x: list(x[x.notna()])),
        "fwd_date": ("fwd_date", "max"),
        "fwd_from_id": ("fwd_from_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "fwd_from_post_id": ("fwd_from_post_id", lambda x: str(int(x.dropna().max())) if pd.notna(x.dropna().max()) else None),
        "fwd_from_from_name":("fwd_from_from_name", "max")

    }
    df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
    return df
394
+
395
async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
    """
    Retrieves messages from a Telegram channel by date.

    Parsed messages and entities are pickled under
    <path_file>/messages/<channel> and <path_file>/entities/<channel>.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
        limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
        offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
        ids (list, optional): Unused in this function (kept for signature parity with get_messages_by_ids).
            NOTE(review): mutable default argument — harmless here since never mutated.
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".
        dl_thumbs (bool, optional): Whether to also download thumbnail previews. Defaults to False.

    Returns:
        list: A list of messages retrieved from the channel.

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        await client.start(phone_number)

        # current_path_file = create_dir(os.path.join(path_file, "messages"))
        path_messages = create_dir(os.path.join(path_file, "messages"))
        path_entities = create_dir(os.path.join(path_file, "entities"))

        # Per-channel output folders, created only when downloads are requested.
        if dl_files:
            current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
        if dl_thumbs:
            current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                  limit=limit,
                                                  offset_date=offset_date,
                                                  reverse=reverse):
            messages.append(message)

            if dl_files:

                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

            if dl_thumbs:
                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    try:
                        # thumb=-1 requests the largest available thumbnail.
                        await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
                    except Exception as e:
                        # Best-effort: thumbnail failures are logged, not fatal.
                        pass
                        print(e)

        df_exploded = parse_messages(messages)
        # Derive the same composite media file id used for the downloads.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        write_pickle(df_exploded, path_messages, str(channel_username))

        df_entities = parse_message_entities(messages)
        write_pickle(df_entities, path_entities, str(channel_username))

        # df_messages = group_by_post(df_exploded)
        # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
        # write_pickle(df_messages, current_path_file, str(channel_username))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
467
+
468
async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids:list=[], path_file:str="files")-> list:
    """
    Retrieves messages from a Telegram channel by IDS.

    Parsed messages and entities are pickled under
    <path_file>/messages/<channel> and <path_file>/entities/<channel>.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        ids (list) : list of message ids to retrieve.
            NOTE(review): mutable default argument — harmless here since never mutated.
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel.

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        await client.start(phone_number)

        path_messages = create_dir(os.path.join(path_file, "messages"))
        path_entities = create_dir(os.path.join(path_file, "entities"))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                  ids = ids):
            messages.append(message)

            if dl_files:
                # NOTE(review): create_dir is called once per message here
                # (idempotent, but could be hoisted out of the loop).
                current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        # Derive the same composite media file id used for the downloads.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        write_pickle(df_exploded, path_messages, str(channel_username))

        df_entities = parse_message_entities(messages)
        write_pickle(df_entities, path_entities, str(channel_username))


        # df_messages = group_by_post(df_exploded)
        # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
        # write_pickle(df_messages, current_path_file, str(channel_username))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
522
+
523
async def get_messages_by_search(client: TelegramClient, phone_number: str, search:str= "SNCF", channel_username:int = None, dl_files:bool=False, limit:int=None, path_file:str="files") -> list:
    """
    Retrieves messages matching a search term, optionally within one channel.

    When channel_username is None the search is global. Results are pickled
    under <path_file>/messages and <path_file>/entities; the pickle name is
    "<search>_<channel>" when a channel is given, else just "<search>".

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        search (str): The search term to look for in the messages.
        channel_username (str): The username of the channel to retrieve messages from.
        dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
        limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of messages retrieved from the channel.

    Raises:
        Exception: If there is an error during the retrieval process.

    """
    try:
        await client.start(phone_number)

        # current_path_file = create_dir(os.path.join(path_file, "messages"))
        path_messages = create_dir(os.path.join(path_file, "messages"))
        path_entities = create_dir(os.path.join(path_file, "entities"))

        if dl_files:
            # NOTE(review): with a global search (channel_username=None) all
            # media land in the "None" folder.
            current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

        # Get the message history
        messages = []

        async for message in client.iter_messages(channel_username,
                                                  search=search,
                                                  limit=limit):
            messages.append(message)

            if dl_files:

                media_fileid = parse_mediafileid(message)
                if media_fileid:
                    await message.download_media(file=os.path.join(current_path_img, media_fileid))

        df_exploded = parse_messages(messages)
        # Keep the query that produced each row for later filtering.
        df_exploded['search']=search
        # Derive the same composite media file id used for the downloads.
        df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
        # df_messages = group_by_post(df_exploded)
        # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
        df_entities = parse_message_entities(messages)

        if channel_username:
            write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
            write_pickle(df_entities, path_entities, str(search)+'_'+str(channel_username))
        else:
            write_pickle(df_exploded, path_messages, str(search))
            write_pickle(df_entities, path_entities, str(search))

        return messages
    finally:
        # Disconnect the client
        await client.disconnect()
585
+
586
async def download_comments(client: TelegramClient, phone_number: str,channel_entity : int, message_id: int, dl_files:bool = False, limit:int = None, reverse:bool=True, path_file:str ="files")->list:
    """
    Retrieves the comments (replies) of a given Telegram message.

    Parsed comments and entities are pickled under <path_file>/messages and
    <path_file>/entities as "<channel_entity>_<message_id>".

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        channel_entity (int): The id of the channel hosting the message.
        message_id (int): The id of the message whose comments are retrieved.
        dl_files (bool, optional): Whether to download media files attached to the comments. Defaults to False.
        limit (int, optional): The maximum number of comments to retrieve. Defaults to None (retrieve all).
        reverse (bool, optional): Whether to retrieve comments in reverse order. Defaults to True.
        path_file (str, optional): The path to save the downloaded files. Defaults to "files".

    Returns:
        list: A list of comment messages.
    """
    try:
        # Connect the client
        await client.start(phone_number)

        # current_path_file = create_dir(os.path.join(path_file, "messages"))
        path_messages = create_dir(os.path.join(path_file, "messages"))
        path_entities = create_dir(os.path.join(path_file, "entities"))

        comments = []

        async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
            comments.append(comment)

            if dl_files:
                # NOTE(review): create_dir is called once per comment here
                # (idempotent, but could be hoisted out of the loop).
                current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
                media_fileid = parse_mediafileid(comment)
                if media_fileid:
                    await comment.download_media(file=os.path.join(current_path_img, media_fileid))

        df_comments = parse_messages(comments)
        # Derive the same composite media file id used for the downloads.
        df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
        write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))

        df_entities = parse_message_entities(comments)
        write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))

        return comments

    finally:
        # Disconnect the client
        await client.disconnect()
618
+
619
+
620
+
621
+ # def parse_telegram_messages(messages : list) -> pd.DataFrame:
622
+ # """
623
+ # Parses the given list of Telegram messages and returns a DataFrame with the extracted information.
624
+
625
+ # Args:
626
+ # messages (list): A list of Telegram messages.
627
+
628
+ # Returns:
629
+ # pandas.DataFrame: A DataFrame containing the parsed information from the Telegram messages.
630
+
631
+ # """
632
+
633
+ # all_records = []
634
+ # for message in messages:
635
+
636
+ # peer_id = message.peer_id
637
+
638
+ # if peer_id:
639
+ # channel_id = str(peer_id.channel_id)
640
+ # else:
641
+ # channel_id=''
642
+ # if message.id:
643
+ # message_id = str(message.id)
644
+ # else:
645
+ # message_id=''
646
+
647
+ # uniq_id = str(channel_id) + "_" + str(message_id)
648
+ # if message.date:
649
+ # message_date = message.date
650
+ # else:
651
+ # message_date=datetime(1970,1,1)
652
+ # if message.text:
653
+ # text = message.text
654
+ # else:
655
+ # text = ''
656
+ # if message.is_reply:
657
+ # is_reply = message.is_reply
658
+ # else:
659
+ # is_reply = False
660
+
661
+ # if message.views:
662
+ # views = int(message.views)
663
+ # else:
664
+ # views = 0
665
+ # if message.forwards:
666
+ # forwards = int(message.forwards)
667
+ # else:
668
+ # forwards = 0
669
+
670
+ # ##########################################
671
+ # # REPLIES
672
+ # ##########################################
673
+ # if message.replies :
674
+ # replies = message.replies
675
+ # if replies.replies:
676
+ # replies_count = int(replies.replies)
677
+ # else:
678
+ # replies_count = 0
679
+
680
+ # if replies.channel_id:
681
+ # replies_channel_id = replies.channel_id
682
+ # else:
683
+ # replies_channel_id = ''
684
+ # else :
685
+ # replies_count, replies_channel_id= 0, ''
686
+
687
+ # ##########################################
688
+ # # REACTIONS
689
+ # ##########################################
690
+
691
+ # total_reactions = 0
692
+ # details_reactions=[]
693
+
694
+ # if message.reactions:
695
+ # reactions = message.reactions
696
+ # reactions_lst = reactions.results
697
+ # for reaction in reactions_lst:
698
+ # if reaction.count:
699
+ # count = int(reaction.count)
700
+ # else:
701
+ # count = 0
702
+ # total_reactions += count
703
+ # r = reaction.reaction
704
+
705
+ # if isinstance(r, ReactionEmoji):
706
+ # emoticon = r.emoticon
707
+ # elif isinstance(r, ReactionCustomEmoji):
708
+ # emoticon = r.document_id
709
+ # else:
710
+ # emoticon = None
711
+ # details_reactions.append((emoticon, count))
712
+ # else :
713
+ # count = 0
714
+
715
+ # ##########################################
716
+ # # FORWARDS
717
+ # ##########################################
718
+
719
+ # if message.fwd_from :
720
+ # fwd_from = message.fwd_from
721
+ # if fwd_from.date:
722
+ # fwd_from_date = fwd_from.date
723
+ # else :
724
+ # fwd_from_date = datetime(1970,1,1)
725
+ # if fwd_from.from_id:
726
+ # fwd_from_id = fwd_from.from_id
727
+ # if isinstance(fwd_from_id, PeerUser):
728
+ # fwd_from_channel_id = fwd_from_id.user_id
729
+ # elif isinstance(fwd_from_id, PeerChannel):
730
+ # fwd_from_channel_id = fwd_from_id.channel_id
731
+ # else:
732
+ # fwd_from_channel_id = None
733
+ # print(fwd_from_id, "type not implemented")
734
+ # else :
735
+ # fwd_from_channel_id = None
736
+ # if fwd_from.from_name:
737
+ # fwd_from_name = fwd_from.from_name
738
+ # else:
739
+ # fwd_from_name = ''
740
+ # if fwd_from.channel_post:
741
+ # fwd_from_channel_post = str(fwd_from.channel_post)
742
+ # else:
743
+ # fwd_from_channel_post = ''
744
+ # if fwd_from.post_author:
745
+ # fwd_from_post_author = fwd_from.post_author
746
+ # else:
747
+ # fwd_from_post_author=''
748
+ # else :
749
+ # fwd_from_date, fwd_from_id, fwd_from_channel_id, fwd_from_name, fwd_from_channel_post, fwd_from_post_author = datetime(1970,1,1), '', '', '', '', ''
750
+
751
+ # ##########################################
752
+ # # REPLIES
753
+ # ##########################################
754
+
755
+ # if message.reply_to:
756
+ # reply_to = message.reply_to
757
+ # if reply_to.quote:
758
+ # reply_to_quote = reply_to.quote
759
+ # else:
760
+ # reply_to_quote=False
761
+ # if reply_to.reply_to_msg_id:
762
+ # reply_to_msg_id = str(reply_to.reply_to_msg_id)
763
+ # else:
764
+ # reply_to_msg_id = ''
765
+ # if reply_to.reply_to_peer_id:
766
+ # reply_to_peer_id = str(reply_to.reply_to_peer_id)
767
+ # else:
768
+ # reply_to_peer_id = ''
769
+ # # reply_from = reply_to.reply_from
770
+ # # reply_media = reply_to.reply_media
771
+ # if reply_to.reply_to_top_id:
772
+ # reply_to_top_id = str(reply_to.reply_to_top_id)
773
+ # else:
774
+ # reply_to_top_id = ''
775
+ # if reply_to.quote_text:
776
+ # reply_to_quote_text = reply_to.quote_text
777
+ # else:
778
+ # reply_to_quote_text = ''
779
+ # else:
780
+ # reply_to_quote, reply_to_msg_id, reply_to_peer_id, reply_to_top_id, reply_to_quote_text = False, '', '', '', ''
781
+
782
+ # ##########################################
783
+ # # FILE
784
+ # ##########################################
785
+ # if message.file:
786
+ # file = message.file
787
+ # if file.id:
788
+ # file_id = file.id
789
+ # else:
790
+ # file_id = ''
791
+ # if file.duration:
792
+ # file_duration = file.duration
793
+ # else:
794
+ # file_duration = 0
795
+ # if file.emoji:
796
+ # file_emoji = file.emoji
797
+ # else:
798
+ # file_emoji = ''
799
+ # if file.ext:
800
+ # file_ext = file.ext
801
+ # else:
802
+ # file_ext = ''
803
+ # if file.height:
804
+ # file_height = int(file.height)
805
+ # else:
806
+ # file_height = 0
807
+ # if file.mime_type:
808
+ # file_mime_type = file.mime_type
809
+ # else:
810
+ # file_mime_type = ''
811
+ # if file.name:
812
+ # file_name = file.name
813
+ # else:
814
+ # file_name = ''
815
+ # if file.performer:
816
+ # file_performer = file.performer
817
+ # else:
818
+ # file_performer = ''
819
+ # if file.size:
820
+ # file_size = file.size
821
+ # else:
822
+ # file_size = 0
823
+ # if file.sticker_set:
824
+ # file_sticker_set = file.sticker_set
825
+ # else:
826
+ # file_sticker_set = ''
827
+ # if file.title:
828
+ # file_title = file.title
829
+ # else :
830
+ # file_title = ''
831
+ # if file.width:
832
+ # file_width = int(file.width)
833
+ # else:
834
+ # file_width = 0
835
+ # else :
836
+ # file_id, file_duration, file_emoji, file_ext, file_height, file_mime_type, file_name, file_performer, file_size, file_sticker_set, file_title, file_width = "", 0, '', '', 0, '', '', '', 0, '', '', 0
837
+
838
+
839
+
840
+ # webpage_record = parse_webpage(message.web_preview)
841
+
842
+ # current_record = (uniq_id, channel_id, message_id, message_date, text, is_reply, views, forwards, replies_count, replies_channel_id, total_reactions, details_reactions,
843
+ # fwd_from_date, fwd_from_channel_id,fwd_from_name, fwd_from_channel_post,fwd_from_post_author,
844
+ # reply_to_quote, reply_to_msg_id, reply_to_peer_id, reply_to_top_id, reply_to_quote_text,
845
+ # file_id, file_duration, file_ext, file_height, file_mime_type, file_name, file_size, file_title, file_width)
846
+ # current_record = current_record + webpage_record
847
+
848
+ # all_records.append(current_record)
849
+ # df = pd.DataFrame.from_records(all_records, columns = ['uniq_id', 'channel_id', "message_id", "message_date", "text", "is_reply", "views", "forwards", "replies_count", "replies_channel_id", "total_reactions", "details_reactions",
850
+ # "fwd_from_date", "fwd_from_channel_id","fwd_from_name", "fwd_from_channel_post","fwd_from_post_author",
851
+ # "reply_to_quote", "reply_to_msg_id", "reply_to_peer_id", "reply_to_top_id", "reply_to_quote_text",
852
+ # "file_id", "file_duration", "file_ext", "file_height", "file_mime_type", "file_name", "file_size", "file_title", "file_width",
853
+ # "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description", "webpage_embed_url", "webpage_embed_type", "webpage_embed_width", "webpage_embed_height",
854
+ # "webpage_duration", "webpage_author", "webpage_photo_id", "webpage_photo_date"
855
+
856
+ # ])
857
+ # return df
858
+
859
+
860
+ # def parse_webpage(webpage):
861
+ # """
862
+ # Parse the given webpage object and extract relevant information.
863
+
864
+ # Args:
865
+ # webpage (Webpage): The webpage object to be parsed.
866
+
867
+ # Returns:
868
+ # tuple: A tuple containing the parsed information from the webpage.
869
+ # The tuple contains the following elements:
870
+ # - webpage_id (str): The ID of the webpage.
871
+ # - webpage_url (str): The URL of the webpage.
872
+ # - webpage_type (str): The type of the webpage.
873
+ # - webpage_site_name (str): The name of the site.
874
+ # - webpage_title (str): The title of the webpage.
875
+ # - webpage_description (str): The description of the webpage.
876
+ # - webpage_embed_url (str): The embed URL of the webpage.
877
+ # - webpage_embed_type (str): The embed type of the webpage.
878
+ # - webpage_embed_width (int): The embed width of the webpage.
879
+ # - webpage_embed_height (int): The embed height of the webpage.
880
+ # - webpage_duration (int): The duration of the webpage.
881
+ # - webpage_author (str): The author of the webpage.
882
+ # - webpage_photo_record (tuple): A tuple containing the parsed photo information from the webpage.
883
+ # """
884
+
885
+ # if webpage :
886
+ # if webpage.id:
887
+ # webpage_id = str(webpage.id)
888
+ # else:
889
+ # webpage_id = ''
890
+ # if webpage.url:
891
+ # webpage_url = webpage.url
892
+ # else:
893
+ # webpage_url = ''
894
+ # # if webpage.display_url:
895
+ # # webpage_display_url = webpage.display_url
896
+ # # else:
897
+ # # webpage_display_url = ''
898
+ # # # webpage_hash = webpage.hash
899
+ # # if webpage.has_large_media:
900
+ # # webpage_has_large_media = webpage.has_large_media
901
+ # # else:
902
+ # # webpage_has_large_media = False
903
+ # if webpage.type:
904
+ # webpage_type = webpage.type
905
+ # else:
906
+ # webpage_type = ''
907
+ # if webpage.site_name:
908
+ # webpage_site_name = webpage.site_name
909
+ # else:
910
+ # webpage_site_name = ''
911
+ # if webpage.title:
912
+ # webpage_title = webpage.title
913
+ # else:
914
+ # webpage_title = ''
915
+ # if webpage.description:
916
+ # webpage_description = webpage.description
917
+ # else:
918
+ # webpage_description = ''
919
+ # if webpage.embed_url:
920
+ # webpage_embed_url = webpage.embed_url
921
+ # else:
922
+ # webpage_embed_url = ''
923
+ # if webpage.embed_type:
924
+ # webpage_embed_type = webpage.embed_type
925
+ # else:
926
+ # webpage_embed_type = ''
927
+ # if webpage.embed_width:
928
+ # webpage_embed_width = int(webpage.embed_width)
929
+ # else:
930
+ # webpage_embed_width = 0
931
+ # if webpage.embed_height:
932
+ # webpage_embed_height = int(webpage.embed_height)
933
+ # else:
934
+ # webpage_embed_height = 0
935
+ # if webpage.duration:
936
+ # webpage_duration = int(webpage.duration)
937
+ # else:
938
+ # webpage_duration = 0
939
+ # if webpage.author :
940
+ # webpage_author = webpage.author
941
+ # else :
942
+ # webpage_author = ''
943
+
944
+ # webpage_photo_record = parse_photo(webpage.photo)
945
+ # # webpage_document = webpage.document
946
+ # # webpage_cached_page = webpage.cached_page
947
+ # # webpage_attributes = webpage.attributes
948
+ # else :
949
+ # webpage_id, webpage_url, webpage_type, webpage_site_name, webpage_title, webpage_description, webpage_embed_url, webpage_embed_type, webpage_embed_width, webpage_embed_height, webpage_duration, webpage_author, webpage_photo_record = "", "", "", "", "", "", "", "", 0, 0, 0, "", ('', datetime(1970,1,1))
950
+ # record = (webpage_id, webpage_url, webpage_type, webpage_site_name, webpage_title, webpage_description, webpage_embed_url, webpage_embed_type, webpage_embed_width, webpage_embed_height, webpage_duration, webpage_author) + webpage_photo_record
951
+ # return record
952
+
953
+ # def parse_photo(photo):
954
+ # """
955
+ # Parses the given photo object and returns a tuple containing the photo ID and date.
956
+
957
+ # Args:
958
+ # photo: The photo object to be parsed.
959
+
960
+ # Returns:
961
+ # A tuple containing the photo ID and date.
962
+ # """
963
+
964
+ # if photo:
965
+ # if photo.id:
966
+ # photo_id = str(photo.id)
967
+ # else:
968
+ # photo.id = ''
969
+ # if photo.date:
970
+ # photo_date = photo.date
971
+ # else :
972
+ # photo_date = datetime(1970,1,1)
973
+
974
+ # # photo_access_hash = photo.access_hash
975
+ # # photo_file_reference = photo.file_reference
976
+ # # photo_dc_id = photo.dc_id
977
+ # # photo_sizes = photo.sizes #### A PARSER
978
+ # else :
979
+ # photo_id, photo_date = '', datetime(1970,1,1)
980
+
981
+ # record = (photo_id, photo_date)
982
+ # return record
983
+
984
+
985
+ # async def get_messages_by_date(client, phone_number, channel_username, dl_files=False, reverse=True, limit=None, offset_date=datetime(1970,1,1), path_file="files"):
986
+ # """
987
+ # Retrieves messages from a Telegram channel by date.
988
+
989
+ # Args:
990
+ # client (TelegramClient): The Telegram client instance.
991
+ # phone_number (str): The phone number associated with the Telegram account.
992
+ # channel_username (str): The username of the channel to retrieve messages from.
993
+ # dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
994
+ # reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
995
+ # limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
996
+ # offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
997
+ # path_file (str, optional): The path to save the downloaded files. Defaults to "files".
998
+
999
+ # Returns:
1000
+ # list: A list of messages retrieved from the channel.
1001
+
1002
+ # Raises:
1003
+ # Exception: If there is an error during the retrieval process.
1004
+
1005
+ # """
1006
+ # try:
1007
+ # await client.start(phone_number)
1008
+
1009
+ # current_path_file = create_dir(os.path.join(path_file, "messages"))
1010
+
1011
+ # if dl_files:
1012
+ # current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
1013
+
1014
+ # # Get the message history
1015
+ # messages = []
1016
+
1017
+ # async for message in client.iter_messages(channel_username,
1018
+ # limit=limit,
1019
+ # offset_date=offset_date,
1020
+ # reverse=reverse):
1021
+ # messages.append(message)
1022
+
1023
+ # if dl_files:
1024
+ # if message.peer_id:
1025
+ # channel_id = str(message.peer_id.channel_id)
1026
+ # else:
1027
+ # channel_id=''
1028
+ # if message.id:
1029
+ # message_id = str(message.id)
1030
+ # else:
1031
+ # message_id=''
1032
+ # if message.file:
1033
+ # file_id = str(message.file.id)
1034
+ # else:
1035
+ # file_id=''
1036
+ # await message.download_media(file=os.path.join(current_path_img, channel_id+"_"+message_id+"_"+file_id))
1037
+
1038
+ # df_messages = parse_telegram_messages(messages)
1039
+ # write_pickle(df_messages, current_path_file, str(channel_username))
1040
+
1041
+ # return messages
1042
+ # finally:
1043
+ # # Disconnect the client
1044
+ # await client.disconnect()
1045
+
1046
async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, path_img: str) -> dict:
    """
    Retrieve full information about a Telegram channel and download its profile photo.

    Args:
        api_id (int): The API ID of the Telegram application.
        api_hash (str): The API hash of the Telegram application.
        phone_number (str): The phone number associated with the Telegram account.
        channel_username (str): The username of the channel.
        path_img (str): Destination path where the channel profile photo is saved.

    Returns:
        dict: A dictionary containing the full information of the channel.

    Raises:
        Exception: If there is an error during the retrieval of channel information.
    """
    client = TelegramClient('session_name', api_id, api_hash)
    try:
        await client.start(phone_number)
        channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
        channel_full_info_json = channel_full_info.to_dict()
        # Download the small (non-big) profile photo to path_img; the returned
        # local file path is not needed by callers, so it is not kept.
        await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
    finally:
        # Always disconnect the client, even when retrieval fails.
        await client.disconnect()

    return channel_full_info_json
1073
+
1074
+
1075
def parse_channel(channel: dict) -> pd.DataFrame:
    """
    Parse the raw dict of a Telegram channel (as returned by GetFullChannelRequest
    and converted with ``to_dict()``) into a one-row DataFrame.

    Args:
        channel (dict): The channel data to be parsed. Expected keys are
            "full_chat" (id, about, participants_count, linked_chat_id,
            available_reactions) and "chats" (list of chat dicts).

    Returns:
        pandas.DataFrame: A single-row DataFrame with columns
            ['channel_id', 'channel_title', 'channel_about',
             'channel_participants', 'linked_chat_id', 'reactions',
             'creation_date'].
    """
    reactions = []
    # Defaults used when the matching chat entry is absent.
    channel_title = ''
    creation_date = datetime(1970, 1, 1)

    fc = channel.get("full_chat", {})
    channel_id = fc.get("id", "")
    channel_about = fc.get("about", "")
    channel_participants = int(fc.get("participants_count", 0))
    linked_chat_id = fc.get("linked_chat_id", "")

    # Available reactions may be standard emojis or custom emoji documents.
    ar = fc.get("available_reactions", {})
    if ar:
        for r in ar.get('reactions', []):
            if r.get('_') == "ReactionEmoji":
                reactions.append(r.get("emoticon"))
            elif r.get('_') == "ReactionCustomEmoji":
                reactions.append(r.get("document_id"))
            else:
                print("Not implemented type", r)

    # Find the chat entry describing this channel to get title / creation date.
    for chat in channel.get("chats", []):
        if chat.get("_") == "Channel":
            if chat.get("id") == channel_id:
                creation_date = chat.get("date", datetime(1970, 1, 1))
                channel_title = chat.get("title", "")
                break
        else:
            print("Not implemented type", chat.get("_"))

    fc_record = (str(channel_id), channel_title, channel_about, channel_participants, linked_chat_id, reactions, creation_date)

    df = pd.DataFrame.from_records(
        [fc_record],
        columns=['channel_id', 'channel_title', 'channel_about', 'channel_participants', "linked_chat_id", 'reactions', 'creation_date'],
    )
    return df