opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,47 @@
1
1
  from datetime import datetime
2
2
  from telethon.sync import TelegramClient
3
3
  from telethon.tl.functions.channels import GetFullChannelRequest
4
+ from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
4
5
  import pandas as pd
5
6
  # from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
6
- from opsci_toolbox.helpers.common import create_dir, write_pickle
7
+ from opsci_toolbox.helpers.common import create_dir, write_pickle, write_json
7
8
  import os
8
9
  import nest_asyncio
9
10
  from telethon.tl.types import Message
10
- from typing import Optional
11
-
11
+ import json
12
+ from tqdm import tqdm
12
13
  nest_asyncio.apply()
13
14
 
14
15
 
16
class JSONEncoder(json.JSONEncoder):
    """
    JSON encoder that serialises ``datetime`` and ``bytes`` values as strings.

    Both types are converted with ``str()``; any other type the standard
    encoder cannot handle is delegated to the base class.

    NOTE(review): an identical ``JSONEncoder`` class is defined a second time
    further down this module.
    """

    def default(self, o):
        # Anything that is neither a datetime nor bytes follows the stock behaviour.
        if not isinstance(o, (datetime, bytes)):
            return super().default(o)
        return str(o)
24
+
25
def dump_jsonl(data: list[dict], path: str, name: str) -> str:
    """
    Serialise a list of dictionaries to a JSON Lines file, one JSON object per line.

    Args:
        data (list[dict]): The records to write; each dictionary becomes one line.
        path (str): The directory in which the file is created.
        name (str): The file name, without the ``.jsonl`` extension.

    Returns:
        str: The full path of the file that was written.

    NOTE(review): an identical ``dump_jsonl`` function is defined a second time
    further down this module.
    """
    target = os.path.join(path, name + '.jsonl')
    with open(target, 'w') as handle:
        for record in data:
            # JSONEncoder turns datetime/bytes values into strings.
            json.dump(record, handle, cls=JSONEncoder)
            handle.write('\n')
    return target
43
+
44
+
15
45
  def parse_mediafileid(message: Message) -> str:
16
46
  """
17
47
  Parse the media file ID from a Telegram message.
@@ -38,12 +68,65 @@ def parse_mediafileid(message: Message) -> str:
38
68
  else:
39
69
  grouped_id = message_id
40
70
 
41
- media_fileid = str(channel_id)+'_'+str(grouped_id)+'_'+str(media_id)
71
+ media_fileid = str(int(channel_id))+'_'+str(int(grouped_id))+'_'+str(int(media_id))
42
72
  return media_fileid
43
73
  else:
44
74
  return None
45
75
 
46
76
 
77
def parse_message_entities(messages : list) -> pd.DataFrame:
    """
    Parse Telegram messages entities.

    Every entity found in a message yields one row; messages without entities
    yield no row at all.

    Args:
        messages : a list of Telegram messages, each given as a dictionary
            (the callers in this module pass the output of ``message.to_dict()``).

    Returns:
        pd.DataFrame : a DataFrame containing the parsed entities, with the columns
            message_id, channel_id, from_id, grouped_id, message, entity_type,
            offset, length, url and document_id.
    """
    columns = ["message_id", "channel_id", "from_id", "grouped_id", "message", "entity_type", "offset", "length", "url", "document_id"]

    all_records = []
    for data in messages:
        message_id = data.get("id")

        # A missing or null peer is replaced by an empty dict before being handed to parse_from.
        peer_id = data.get("peer_id")
        channel_id = parse_from({} if peer_id is None else peer_id)

        sender = data.get("from_id")
        from_id = parse_from({} if sender is None else sender)
        if from_id is None:
            # No explicit sender: fall back to the channel itself.
            from_id = channel_id

        # Messages that do not belong to a group use their own id as group id.
        grouped_id = data.get("grouped_id") or message_id

        message = data.get("message")

        # "entities" may be absent OR explicitly null (e.g. a dict reloaded from JSON);
        # both cases are treated as "no entities" instead of raising a TypeError.
        for entity in data.get("entities") or []:
            all_records.append((
                message_id,
                channel_id,
                from_id,
                grouped_id,
                message,
                entity.get("_"),
                entity.get("offset"),
                entity.get("length"),
                entity.get("url"),
                entity.get("document_id"),
            ))

    return pd.DataFrame.from_records(all_records, columns=columns)
128
+
129
+
47
130
  def parse_messages(messages : list) -> pd.DataFrame:
48
131
  """
49
132
  Parse Telegram messages.
@@ -57,8 +140,9 @@ def parse_messages(messages : list) -> pd.DataFrame:
57
140
 
58
141
  all_records = []
59
142
  for message in messages:
143
+ # raw_text = message.raw_text
60
144
 
61
- data = message.to_dict()
145
+ data = message
62
146
 
63
147
  message_id = data.get("id")
64
148
 
@@ -142,8 +226,6 @@ def parse_reply(reply:dict) -> tuple:
142
226
 
143
227
  return reply_to_message_id, reply_to_channel_id
144
228
 
145
-
146
-
147
229
  def parse_from(data : dict) -> int:
148
230
  """
149
231
  Parse a peer object from Telegram message.
@@ -234,12 +316,16 @@ def parse_media(media : dict) -> tuple:
234
316
  if media.get("_") == "MessageMediaPhoto":
235
317
  photo = media.get("photo", {})
236
318
  media_id = photo.get("id")
319
+ if media_id:
320
+ media_id = str(int(media_id))
237
321
  media_date = photo.get("date")
238
322
  media_mime_type = "photo"
239
323
 
240
324
  elif media.get("_") == "MessageMediaDocument":
241
325
  document = media.get("document", {})
242
326
  media_id = document.get("id")
327
+ if media_id:
328
+ media_id = str(int(media_id))
243
329
  media_date = document.get("date")
244
330
  media_mime_type = document.get("mime_type")
245
331
  media_size = document.get("size")
@@ -290,6 +376,107 @@ def parse_media_id(media : dict) -> int:
290
376
  media_id = None
291
377
  return media_id
292
378
 
379
+
380
class JSONEncoder(json.JSONEncoder):
    """
    JSON encoder that serialises ``datetime`` and ``bytes`` values as strings.

    Both types are converted with ``str()``; any other type the standard
    encoder cannot handle is delegated to the base class.

    NOTE(review): an identical ``JSONEncoder`` class is already defined earlier
    in this module; this later definition rebinds the name.
    """

    def default(self, o):
        # Anything that is neither a datetime nor bytes follows the stock behaviour.
        if not isinstance(o, (datetime, bytes)):
            return super().default(o)
        return str(o)
388
+
389
def dump_jsonl(data: list[dict], path: str, name: str) -> str:
    """
    Serialise a list of dictionaries to a JSON Lines file, one JSON object per line.

    Args:
        data (list[dict]): The records to write; each dictionary becomes one line.
        path (str): The directory in which the file is created.
        name (str): The file name, without the ``.jsonl`` extension.

    Returns:
        str: The full path of the file that was written.

    NOTE(review): an identical ``dump_jsonl`` function is already defined earlier
    in this module; this later definition rebinds the name.
    """
    target = os.path.join(path, name + '.jsonl')
    with open(target, 'w') as handle:
        for record in data:
            # JSONEncoder turns datetime/bytes values into strings.
            json.dump(record, handle, cls=JSONEncoder)
            handle.write('\n')
    return target
407
+
408
+
409
async def get_forwarded_messages(client: TelegramClient, phone_number: str, channel_username: int, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
    """
    Collect the forwarded messages of a channel and the usernames they were forwarded from.

    A message is kept when it is a forward and the chat it was forwarded from
    exposes a ``username`` attribute. When at least one such message is found,
    the collected dictionaries are written to ``<path_file>/JSON/<channel_username>.jsonl``.

    Args:
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number passed to ``client.start``.
        channel_username: The channel to read messages from.
        reverse (bool, optional): Forwarded to ``client.iter_messages``. Defaults to True.
        limit (int, optional): Forwarded to ``client.iter_messages``. Defaults to None.
        offset_date (datetime, optional): Forwarded to ``client.iter_messages``. Defaults to datetime(1970,1,1).
        path_file (str, optional): The base directory for the output. Defaults to "files".

    Returns:
        tuple: ``(forwarded_messages_dict, new_channels)`` - the list of forwarded
        messages as dictionaries and the set of non-empty usernames they were
        forwarded from. If the channel cannot be accessed, a message is printed
        and whatever was collected so far is returned (nothing is written to disk).
    """
    # Bound before the ``try`` so the final ``return`` is always valid, even when
    # one of the handled errors is raised before any message has been read.
    forwarded_messages_dict = []
    new_channels = set()

    try:
        await client.start(phone_number)

        path_json = create_dir(os.path.join(path_file, "JSON"))

        # Fetch the messages from the channel
        async for message in client.iter_messages(channel_username,
                                                  limit=limit,
                                                  offset_date=offset_date,
                                                  reverse=reverse):
            forward = message.forward
            # Keep only forwards whose source chat exposes a username attribute
            # (a missing chat, i.e. None, has no such attribute and is skipped).
            if not forward or not hasattr(forward.chat, 'username'):
                continue

            forwarded_messages_dict.append(message.to_dict())

            new_channel = forward.chat.username
            if new_channel:
                new_channels.add(new_channel)

        if forwarded_messages_dict:
            dump_jsonl(forwarded_messages_dict, path_json, str(channel_username))

    except (ChannelPrivateError, ChannelInvalidError):
        print(f"Cannot access channel: {channel_username}")

    return forwarded_messages_dict, new_channels
441
+
442
+
443
async def recursive_forward_scraper(seed_channels, depth, client: TelegramClient, phone_number: str, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
    """
    Recursively collects forwarded messages from channels, starting from the seed channels up to a specific depth.

    Each level processes the channels discovered at the previous level that have
    not been visited yet. The exploration stops after ``depth`` levels or as soon
    as a level discovers no new channel. The visited channels are then written
    to ``<path_file>/CHANNELS/visited_channels`` through ``write_json``.

    Args:
        seed_channels: An iterable of channels to start from.
        depth (int): The maximum number of levels to explore.
        client (TelegramClient): The Telegram client instance.
        phone_number (str): The phone number associated with the Telegram account.
        reverse (bool, optional): Forwarded to ``get_forwarded_messages``. Defaults to True.
        limit (int, optional): Forwarded to ``get_forwarded_messages``. Defaults to None.
        offset_date (datetime, optional): Forwarded to ``get_forwarded_messages``. Defaults to datetime(1970,1,1).
        path_file (str, optional): The base directory for the output. Defaults to "files".

    Returns:
        tuple: ``(all_forwarded_messages, visited_channels)`` - the list of every
        forwarded message collected and the set of all channels seen (seeds included).
    """
    all_forwarded_messages = []
    visited_channels = set(seed_channels)
    current_level_channels = set(seed_channels)

    path_json = create_dir(os.path.join(path_file, "CHANNELS"))

    for level in range(depth):
        print(f"Processing level {level + 1} with {len(current_level_channels)} channels...")
        next_level_channels = set()

        # Iterate through channels at the current level
        for channel in tqdm(current_level_channels, total=len(current_level_channels), desc="get messages"):
            forwarded_msgs, discovered_channels = await get_forwarded_messages(client, phone_number, channel, reverse, limit, offset_date, path_file)

            # Collect forwarded messages
            all_forwarded_messages.extend(forwarded_msgs)

            # Queue newly discovered channels for the next level, excluding already visited ones
            unseen = set(discovered_channels) - visited_channels
            next_level_channels |= unseen
            visited_channels |= unseen

        # Update the set of channels for the next level of recursion
        current_level_channels = next_level_channels

        if not current_level_channels:
            break

    # The standard json module cannot serialise a set, so the channels are handed
    # over as a list. NOTE(review): write_json is a project helper not visible
    # here - confirm it serialises with the json module.
    write_json(list(visited_channels), path_json, "visited_channels")
    return all_forwarded_messages, visited_channels
478
+
479
+
293
480
  def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
294
481
  """
295
482
  Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.
@@ -338,7 +525,80 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
338
525
  df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
339
526
  return df
340
527
 
341
- async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files") -> list:
528
+ # async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
529
+ # """
530
+ # Retrieves messages from a Telegram channel by date.
531
+
532
+ # Args:
533
+ # client (TelegramClient): The Telegram client instance.
534
+ # phone_number (str): The phone number associated with the Telegram account.
535
+ # channel_username (str): The username of the channel to retrieve messages from.
536
+ # dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
537
+ # reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
538
+ # limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
539
+ # offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
540
+ # path_file (str, optional): The path to save the downloaded files. Defaults to "files".
541
+
542
+ # Returns:
543
+ # list: A list of messages retrieved from the channel.
544
+
545
+ # Raises:
546
+ # Exception: If there is an error during the retrieval process.
547
+
548
+ # """
549
+ # try:
550
+ # await client.start(phone_number)
551
+
552
+ # # current_path_file = create_dir(os.path.join(path_file, "messages"))
553
+ # path_messages = create_dir(os.path.join(path_file, "messages"))
554
+ # path_entities = create_dir(os.path.join(path_file, "entities"))
555
+
556
+ # if dl_files:
557
+ # current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
558
+ # if dl_thumbs:
559
+ # current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
560
+
561
+ # # Get the message history
562
+ # messages = []
563
+
564
+ # async for message in client.iter_messages(channel_username,
565
+ # limit=limit,
566
+ # offset_date=offset_date,
567
+ # reverse=reverse):
568
+ # messages.append(message)
569
+
570
+ # if dl_files:
571
+
572
+ # media_fileid = parse_mediafileid(message)
573
+ # if media_fileid:
574
+ # await message.download_media(file=os.path.join(current_path_img, media_fileid))
575
+
576
+ # if dl_thumbs:
577
+ # media_fileid = parse_mediafileid(message)
578
+ # if media_fileid:
579
+ # try:
580
+ # await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
581
+ # except Exception as e:
582
+ # pass
583
+ # print(e)
584
+
585
+ # df_exploded = parse_messages(messages)
586
+ # df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
587
+ # write_pickle(df_exploded, path_messages, str(channel_username))
588
+
589
+ # df_entities = parse_message_entities(messages)
590
+ # write_pickle(df_entities, path_entities, str(channel_username))
591
+
592
+ # # df_messages = group_by_post(df_exploded)
593
+ # # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
594
+ # # write_pickle(df_messages, current_path_file, str(channel_username))
595
+
596
+ # return messages
597
+ # finally:
598
+ # # Disconnect the client
599
+ # await client.disconnect()
600
+
601
+ async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files", dl_thumbs : bool = False) -> list:
342
602
  """
343
603
  Retrieves messages from a Telegram channel by date.
344
604
 
@@ -362,18 +622,27 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
362
622
  try:
363
623
  await client.start(phone_number)
364
624
 
365
- current_path_file = create_dir(os.path.join(path_file, "messages"))
625
+ # current_path_file = create_dir(os.path.join(path_file, "messages"))
626
+ path_json = create_dir(os.path.join(path_file, "JSON"))
627
+ path_messages = create_dir(os.path.join(path_file, "messages"))
628
+ path_entities = create_dir(os.path.join(path_file, "entities"))
629
+
366
630
  if dl_files:
367
631
  current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
632
+ if dl_thumbs:
633
+ current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
368
634
 
369
635
  # Get the message history
370
- messages = []
636
+ # messages = []
637
+ messages_dict = []
371
638
 
372
639
  async for message in client.iter_messages(channel_username,
373
640
  limit=limit,
374
641
  offset_date=offset_date,
375
642
  reverse=reverse):
376
- messages.append(message)
643
+ # messages.append(message)
644
+ messages_dict.append(message.to_dict())
645
+
377
646
 
378
647
  if dl_files:
379
648
 
@@ -381,22 +650,35 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
381
650
  if media_fileid:
382
651
  await message.download_media(file=os.path.join(current_path_img, media_fileid))
383
652
 
384
- df_exploded = parse_messages(messages)
653
+ if dl_thumbs:
654
+ media_fileid = parse_mediafileid(message)
655
+ if media_fileid:
656
+ try:
657
+ await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
658
+ except Exception as e:
659
+ pass
660
+ print(e)
661
+
662
+ dump_jsonl(messages_dict, path_json, str(channel_username))
663
+ df_exploded = parse_messages(messages_dict)
385
664
  df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
386
- write_pickle(df_exploded, current_path_file, str(channel_username))
665
+ write_pickle(df_exploded, path_messages, str(channel_username))
666
+
667
+ df_entities = parse_message_entities(messages_dict)
668
+ write_pickle(df_entities, path_entities, str(channel_username))
387
669
 
388
670
  # df_messages = group_by_post(df_exploded)
389
671
  # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
390
672
  # write_pickle(df_messages, current_path_file, str(channel_username))
391
673
 
392
- return messages
674
+ return messages_dict
393
675
  finally:
394
676
  # Disconnect the client
395
677
  await client.disconnect()
396
678
 
397
679
  async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids:list=[], path_file:str="files")-> list:
398
680
  """
399
- Retrieves messages from a Telegram channel by date.
681
+ Retrieves messages from a Telegram channel by IDS.
400
682
 
401
683
  Args:
402
684
  client (TelegramClient): The Telegram client instance.
@@ -415,15 +697,18 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
415
697
  """
416
698
  try:
417
699
  await client.start(phone_number)
418
-
419
- current_path_file = create_dir(os.path.join(path_file, "messages"))
700
+ path_json = create_dir(os.path.join(path_file, "JSON"))
701
+ path_messages = create_dir(os.path.join(path_file, "messages"))
702
+ path_entities = create_dir(os.path.join(path_file, "entities"))
420
703
 
421
704
  # Get the message history
422
- messages = []
705
+ # messages = []
706
+ messages_dict = []
423
707
 
424
708
  async for message in client.iter_messages(channel_username,
425
709
  ids = ids):
426
- messages.append(message)
710
+ # messages.append(message)
711
+ messages_dict.append(message.to_dict())
427
712
 
428
713
  if dl_files:
429
714
  current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
@@ -431,14 +716,21 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
431
716
  if media_fileid:
432
717
  await message.download_media(file=os.path.join(current_path_img, media_fileid))
433
718
 
434
- df_exploded = parse_messages(messages)
719
+
720
+ dump_jsonl(messages_dict, path_json, str(channel_username))
721
+ df_exploded = parse_messages(messages_dict)
435
722
  df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
436
- write_pickle(df_exploded, current_path_file, str(channel_username))
723
+ write_pickle(df_exploded, path_messages, str(channel_username))
724
+
725
+ df_entities = parse_message_entities(messages_dict)
726
+ write_pickle(df_entities, path_entities, str(channel_username))
727
+
728
+
437
729
  # df_messages = group_by_post(df_exploded)
438
730
  # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
439
731
  # write_pickle(df_messages, current_path_file, str(channel_username))
440
732
 
441
- return messages
733
+ return messages_dict
442
734
  finally:
443
735
  # Disconnect the client
444
736
  await client.disconnect()
@@ -466,17 +758,23 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
466
758
  try:
467
759
  await client.start(phone_number)
468
760
 
469
- current_path_file = create_dir(os.path.join(path_file, "messages"))
761
+ # current_path_file = create_dir(os.path.join(path_file, "messages"))
762
+ path_json = create_dir(os.path.join(path_file, "JSON"))
763
+ path_messages = create_dir(os.path.join(path_file, "messages"))
764
+ path_entities = create_dir(os.path.join(path_file, "entities"))
765
+
470
766
  if dl_files:
471
767
  current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
472
768
 
473
769
  # Get the message history
474
- messages = []
770
+ # messages = []
771
+ messages_dict = []
475
772
 
476
773
  async for message in client.iter_messages(channel_username,
477
774
  search=search,
478
775
  limit=limit):
479
- messages.append(message)
776
+ # messages.append(message)
777
+ messages_dict.append(message.to_dict())
480
778
 
481
779
  if dl_files:
482
780
 
@@ -484,17 +782,22 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
484
782
  if media_fileid:
485
783
  await message.download_media(file=os.path.join(current_path_img, media_fileid))
486
784
 
487
- df_exploded = parse_messages(messages)
785
+
786
+ df_exploded = parse_messages(messages_dict)
488
787
  df_exploded['search']=search
489
788
  df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
490
789
  # df_messages = group_by_post(df_exploded)
491
790
  # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
791
+ df_entities = parse_message_entities(messages_dict)
792
+
492
793
  if channel_username:
493
- write_pickle(df_exploded, current_path_file, str(search)+'_'+str(channel_username))
794
+ write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
795
+ write_pickle(df_entities, path_entities, str(search)+'_'+str(channel_username))
494
796
  else:
495
- write_pickle(df_exploded, current_path_file, str(search))
797
+ write_pickle(df_exploded, path_messages, str(search))
798
+ write_pickle(df_entities, path_entities, str(search))
496
799
 
497
- return messages
800
+ return messages_dict
498
801
  finally:
499
802
  # Disconnect the client
500
803
  await client.disconnect()
@@ -504,12 +807,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
504
807
  # Connect the client
505
808
  await client.start(phone_number)
506
809
 
507
- current_path_file = create_dir(os.path.join(path_file, "messages"))
810
+ # current_path_file = create_dir(os.path.join(path_file, "messages"))
811
+ path_messages = create_dir(os.path.join(path_file, "messages"))
812
+ path_entities = create_dir(os.path.join(path_file, "entities"))
508
813
 
509
- comments = []
814
+ comments_dict = []
510
815
 
511
816
  async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
512
- comments.append(comment)
817
+ comments_dict.append(comment.to_dict())
513
818
 
514
819
  if dl_files:
515
820
  current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
@@ -517,11 +822,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
517
822
  if media_fileid:
518
823
  await comment.download_media(file=os.path.join(current_path_img, media_fileid))
519
824
 
520
- df_comments = parse_messages(comments)
825
+ df_comments = parse_messages(comments_dict)
521
826
  df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
522
- write_pickle(df_comments, current_path_file, str(channel_entity)+"_"+str(message_id))
827
+ write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))
828
+
829
+ df_entities = parse_message_entities(comments_dict)
830
+ write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))
523
831
 
524
- return comments
832
+ return comments_dict
525
833
 
526
834
  finally:
527
835
  # Disconnect the client
@@ -954,7 +1262,97 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
954
1262
  # # Disconnect the client
955
1263
  # await client.disconnect()
956
1264
 
957
- async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str) -> dict:
1265
+ # async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
1266
+ # """
1267
+ # Retrieves information about a Telegram channel.
1268
+
1269
+ # Args:
1270
+ # api_id (int): The API ID of the Telegram application.
1271
+ # api_hash (str): The API hash of the Telegram application.
1272
+ # phone_number (str): The phone number associated with the Telegram account.
1273
+ # channel_username (str): The username of the channel.
1274
+
1275
+ # Returns:
1276
+ # dict: A dictionary containing the full information of the channel.
1277
+
1278
+ # Raises:
1279
+ # Exception: If there is an error during the retrieval of channel information.
1280
+ # """
1281
+ # client = TelegramClient('session_name', api_id, api_hash)
1282
+ # try:
1283
+ # await client.start(phone_number)
1284
+ # channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
1285
+ # channel_full_info_json = channel_full_info.to_dict()
1286
+ # img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
1287
+ # finally:
1288
+ # # Disconnect the client
1289
+ # await client.disconnect()
1290
+
1291
+ # return channel_full_info_json
1292
+
1293
+
1294
+ # async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_project :str, DL_profile_pic : bool = False) -> dict:
1295
+ # """
1296
+ # Retrieves information about a Telegram channel.
1297
+
1298
+ # Args:
1299
+ # api_id (int): The API ID of the Telegram application.
1300
+ # api_hash (str): The API hash of the Telegram application.
1301
+ # phone_number (str): The phone number associated with the Telegram account.
1302
+ # channel_username (str): The username of the channel.
1303
+
1304
+ # Returns:
1305
+ # dict: A dictionary containing the full information of the channel.
1306
+
1307
+ # Raises:
1308
+ # Exception: If there is an error during the retrieval of channel information.
1309
+ # """
1310
+ # client = TelegramClient('session_name', api_id, api_hash)
1311
+ # path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
1312
+ # path_json = create_dir(os.path.join(path_project, "JSON"))
1313
+
1314
+ # try:
1315
+ # await client.start(phone_number)
1316
+ # try:
1317
+ # channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
1318
+
1319
+ # if channel_full_info:
1320
+ # channel_full_info_dict = channel_full_info.to_dict()
1321
+ # channel_full_info_json = JSONEncoder().encode(channel_full_info_dict)
1322
+ # else:
1323
+ # channel_full_info_dict = {'_': 'Channel', 'id': channel_username, 'title':'private_channel'}
1324
+ # write_json(channel_full_info_json, path_json, f"{str(channel_username)}")
1325
+
1326
+ # if DL_profile_pic:
1327
+ # img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
1328
+
1329
+ # except Exception as e:
1330
+ # pass
1331
+ # print(channel_username, e)
1332
+ # finally:
1333
+ # # Disconnect the client
1334
+ # await client.disconnect()
1335
+
1336
+ # return channel_full_info_dict
1337
+
1338
def dump_json(json_dict: dict, path: str, name: str) -> str:
    """
    Serialise a dictionary to ``<path>/<name>.json``.

    Args:
        json_dict (dict): The dictionary to serialise.
        path (str): The directory in which the file is created.
        name (str): The file name, without the ``.json`` extension.

    Returns:
        str: The full path of the file that was written.
    """
    target = os.path.join(path, name + '.json')
    with open(target, 'w') as handle:
        # JSONEncoder turns datetime/bytes values into strings.
        json.dump(json_dict, handle, cls=JSONEncoder)
    return target
1354
+
1355
+ async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, path_project: str, DL_profile_pic: bool = False) -> dict:
958
1356
  """
959
1357
  Retrieves information about a Telegram channel.
960
1358
 
@@ -971,16 +1369,49 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
971
1369
  Exception: If there is an error during the retrieval of channel information.
972
1370
  """
973
1371
  client = TelegramClient('session_name', api_id, api_hash)
1372
+ path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
1373
+ path_json = create_dir(os.path.join(path_project, "JSON"))
1374
+
974
1375
  try:
975
1376
  await client.start(phone_number)
976
- channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
977
- channel_full_info = channel_full_info.to_dict()
1377
+ try:
1378
+ # Fetch full channel info
1379
+ channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
1380
+
1381
+ # If channel info is retrieved
1382
+ if channel_full_info:
1383
+ channel_full_info_dict = channel_full_info.to_dict()
1384
+ else:
1385
+ channel_full_info_dict = {'_': 'ChatFull',
1386
+ 'full_chat': {'_': 'ChannelFull',
1387
+ 'id': channel_username,
1388
+ },
1389
+ 'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
1390
+ }
1391
+
1392
+ # Save the dictionary as JSON (no need to pre-encode it to a string)
1393
+ dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
1394
+
1395
+ # Optionally download profile picture
1396
+ if DL_profile_pic:
1397
+ img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
1398
+
1399
+ except Exception as e:
1400
+ print(channel_username, e)
1401
+ channel_full_info_dict = {'_': 'ChatFull',
1402
+ 'full_chat': {'_': 'ChannelFull',
1403
+ 'id': channel_username,
1404
+ },
1405
+ 'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
1406
+ }
1407
+ dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
1408
+ return {'_': 'Channel', 'id': channel_username, 'title': 'private_channel'}
1409
+
978
1410
  finally:
979
1411
  # Disconnect the client
980
1412
  await client.disconnect()
981
1413
 
982
- return channel_full_info
983
-
1414
+ return channel_full_info_dict
984
1415
 
985
1416
  def parse_channel(channel : dict) -> pd.DataFrame:
986
1417
  """
@@ -1016,7 +1447,6 @@ def parse_channel(channel : dict) -> pd.DataFrame:
1016
1447
  chats = channel.get("chats", [])
1017
1448
  if chats:
1018
1449
  for chat in chats:
1019
- print(chat)
1020
1450
  if chat.get("_") == "Channel":
1021
1451
  if chat.get("id") == channel_id:
1022
1452
  creation_date = chat.get("date", datetime(1970,1,1))