opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -2
- opsci_toolbox/apis/reddit.py +342 -334
- opsci_toolbox/apis/telegram.py +471 -41
- opsci_toolbox/helpers/common.py +3 -1
- opsci_toolbox/helpers/dates.py +1 -1
- opsci_toolbox/helpers/nlp.py +178 -33
- opsci_toolbox/helpers/nlp_cuml.py +47 -2
- opsci_toolbox/helpers/sna.py +34 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/METADATA +2 -2
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/RECORD +13 -12
- opsci_toolbox-0.0.15.dist-info/dependency_links.txt +1 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/telegram.py
CHANGED
@@ -1,17 +1,47 @@
|
|
1
1
|
from datetime import datetime
|
2
2
|
from telethon.sync import TelegramClient
|
3
3
|
from telethon.tl.functions.channels import GetFullChannelRequest
|
4
|
+
from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
|
4
5
|
import pandas as pd
|
5
6
|
# from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
|
6
|
-
from opsci_toolbox.helpers.common import create_dir, write_pickle
|
7
|
+
from opsci_toolbox.helpers.common import create_dir, write_pickle, write_json
|
7
8
|
import os
|
8
9
|
import nest_asyncio
|
9
10
|
from telethon.tl.types import Message
|
10
|
-
|
11
|
-
|
11
|
+
import json
|
12
|
+
from tqdm import tqdm
|
12
13
|
nest_asyncio.apply()
|
13
14
|
|
14
15
|
|
16
|
+
class JSONEncoder(json.JSONEncoder):
|
17
|
+
'''
|
18
|
+
JSONEncoder subclass that knows how to encode date/time and bytes.
|
19
|
+
'''
|
20
|
+
def default(self, o):
|
21
|
+
if isinstance(o, datetime) or isinstance(o, bytes):
|
22
|
+
return str(o)
|
23
|
+
return super().default(o)
|
24
|
+
|
25
|
+
def dump_jsonl(data: list[dict], path: str, name: str) -> str:
|
26
|
+
"""
|
27
|
+
Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
|
28
|
+
|
29
|
+
Args:
|
30
|
+
data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
|
31
|
+
path (str): The directory path where the JSON Lines file will be saved.
|
32
|
+
name (str): The name of the JSON Lines file (without the extension).
|
33
|
+
|
34
|
+
Returns:
|
35
|
+
str: The full path to the saved JSON Lines file.
|
36
|
+
"""
|
37
|
+
file_path = os.path.join(path, name + '.jsonl')
|
38
|
+
with open(file_path, 'w') as file:
|
39
|
+
for entry in data:
|
40
|
+
json.dump(entry, file, cls=JSONEncoder)
|
41
|
+
file.write('\n')
|
42
|
+
return file_path
|
43
|
+
|
44
|
+
|
15
45
|
def parse_mediafileid(message: Message) -> str:
|
16
46
|
"""
|
17
47
|
Parse the media file ID from a Telegram message.
|
@@ -38,12 +68,65 @@ def parse_mediafileid(message: Message) -> str:
|
|
38
68
|
else:
|
39
69
|
grouped_id = message_id
|
40
70
|
|
41
|
-
media_fileid = str(channel_id)+'_'+str(grouped_id)+'_'+str(media_id)
|
71
|
+
media_fileid = str(int(channel_id))+'_'+str(int(grouped_id))+'_'+str(int(media_id))
|
42
72
|
return media_fileid
|
43
73
|
else:
|
44
74
|
return None
|
45
75
|
|
46
76
|
|
77
|
+
def parse_message_entities(messages : list) -> pd.DataFrame:
|
78
|
+
"""
|
79
|
+
Parse Telegram messages entities.
|
80
|
+
|
81
|
+
Args:
|
82
|
+
messages : a list of Telegram messages.
|
83
|
+
|
84
|
+
Returns:
|
85
|
+
pd.DataFrame : a DataFrame containing the parsed entities.
|
86
|
+
"""
|
87
|
+
all_records = []
|
88
|
+
for data in messages:
|
89
|
+
# raw_text = message.raw_text
|
90
|
+
|
91
|
+
# data = message.to_dict()
|
92
|
+
|
93
|
+
message_id = data.get("id")
|
94
|
+
|
95
|
+
peer_id = data.get("peer_id", {})
|
96
|
+
if peer_id is None:
|
97
|
+
peer_id = {}
|
98
|
+
channel_id = parse_from(peer_id)
|
99
|
+
|
100
|
+
from_id = data.get("from_id", {})
|
101
|
+
if from_id is None :
|
102
|
+
from_id = {}
|
103
|
+
from_id = parse_from(from_id)
|
104
|
+
if from_id is None:
|
105
|
+
from_id = channel_id
|
106
|
+
|
107
|
+
grouped_id = data.get("grouped_id")
|
108
|
+
if grouped_id:
|
109
|
+
grouped_id = grouped_id
|
110
|
+
else:
|
111
|
+
grouped_id = message_id
|
112
|
+
|
113
|
+
message = data.get("message")
|
114
|
+
|
115
|
+
entities = data.get("entities", [])
|
116
|
+
for entity in entities:
|
117
|
+
entity_type = entity.get("_")
|
118
|
+
offset = entity.get("offset")
|
119
|
+
length = entity.get("length")
|
120
|
+
url = entity.get("url")
|
121
|
+
document_id = entity.get("document_id")
|
122
|
+
|
123
|
+
entity_record = (message_id, channel_id, from_id, grouped_id, message, entity_type, offset, length, url, document_id)
|
124
|
+
all_records.append(entity_record)
|
125
|
+
|
126
|
+
df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "entity_type", "offset", "length", "url", "document_id"])
|
127
|
+
return df
|
128
|
+
|
129
|
+
|
47
130
|
def parse_messages(messages : list) -> pd.DataFrame:
|
48
131
|
"""
|
49
132
|
Parse Telegram messages.
|
@@ -57,8 +140,9 @@ def parse_messages(messages : list) -> pd.DataFrame:
|
|
57
140
|
|
58
141
|
all_records = []
|
59
142
|
for message in messages:
|
143
|
+
# raw_text = message.raw_text
|
60
144
|
|
61
|
-
data = message
|
145
|
+
data = message
|
62
146
|
|
63
147
|
message_id = data.get("id")
|
64
148
|
|
@@ -142,8 +226,6 @@ def parse_reply(reply:dict) -> tuple:
|
|
142
226
|
|
143
227
|
return reply_to_message_id, reply_to_channel_id
|
144
228
|
|
145
|
-
|
146
|
-
|
147
229
|
def parse_from(data : dict) -> int:
|
148
230
|
"""
|
149
231
|
Parse a peer object from Telegram message.
|
@@ -234,12 +316,16 @@ def parse_media(media : dict) -> tuple:
|
|
234
316
|
if media.get("_") == "MessageMediaPhoto":
|
235
317
|
photo = media.get("photo", {})
|
236
318
|
media_id = photo.get("id")
|
319
|
+
if media_id:
|
320
|
+
media_id = str(int(media_id))
|
237
321
|
media_date = photo.get("date")
|
238
322
|
media_mime_type = "photo"
|
239
323
|
|
240
324
|
elif media.get("_") == "MessageMediaDocument":
|
241
325
|
document = media.get("document", {})
|
242
326
|
media_id = document.get("id")
|
327
|
+
if media_id:
|
328
|
+
media_id = str(int(media_id))
|
243
329
|
media_date = document.get("date")
|
244
330
|
media_mime_type = document.get("mime_type")
|
245
331
|
media_size = document.get("size")
|
@@ -290,6 +376,107 @@ def parse_media_id(media : dict) -> int:
|
|
290
376
|
media_id = None
|
291
377
|
return media_id
|
292
378
|
|
379
|
+
|
380
|
+
class JSONEncoder(json.JSONEncoder):
|
381
|
+
'''
|
382
|
+
JSONEncoder subclass that knows how to encode date/time and bytes.
|
383
|
+
'''
|
384
|
+
def default(self, o):
|
385
|
+
if isinstance(o, datetime) or isinstance(o, bytes):
|
386
|
+
return str(o)
|
387
|
+
return super().default(o)
|
388
|
+
|
389
|
+
def dump_jsonl(data: list[dict], path: str, name: str) -> str:
|
390
|
+
"""
|
391
|
+
Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
|
392
|
+
|
393
|
+
Args:
|
394
|
+
data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
|
395
|
+
path (str): The directory path where the JSON Lines file will be saved.
|
396
|
+
name (str): The name of the JSON Lines file (without the extension).
|
397
|
+
|
398
|
+
Returns:
|
399
|
+
str: The full path to the saved JSON Lines file.
|
400
|
+
"""
|
401
|
+
file_path = os.path.join(path, name + '.jsonl')
|
402
|
+
with open(file_path, 'w') as file:
|
403
|
+
for entry in data:
|
404
|
+
json.dump(entry, file, cls=JSONEncoder)
|
405
|
+
file.write('\n')
|
406
|
+
return file_path
|
407
|
+
|
408
|
+
|
409
|
+
async def get_forwarded_messages(client: TelegramClient, phone_number: str, channel_username: int, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
|
410
|
+
try:
|
411
|
+
await client.start(phone_number)
|
412
|
+
|
413
|
+
path_json = create_dir(os.path.join(path_file, "JSON"))
|
414
|
+
|
415
|
+
# Fetch the messages from the channel
|
416
|
+
# forwarded_messages = []
|
417
|
+
forwarded_messages_dict = []
|
418
|
+
new_channels = set()
|
419
|
+
|
420
|
+
async for message in client.iter_messages(channel_username,
|
421
|
+
limit=limit,
|
422
|
+
offset_date=offset_date,
|
423
|
+
reverse=reverse):
|
424
|
+
# Check if the message is a forward
|
425
|
+
if message.forward and hasattr(message.forward.chat, 'username'):
|
426
|
+
# forwarded_messages.append(message)
|
427
|
+
forwarded_messages_dict.append(message.to_dict())
|
428
|
+
|
429
|
+
if message.forward.chat:
|
430
|
+
new_channel = message.forward.chat.username
|
431
|
+
if new_channel:
|
432
|
+
new_channels.add(new_channel)
|
433
|
+
|
434
|
+
if forwarded_messages_dict:
|
435
|
+
dump_jsonl(forwarded_messages_dict, path_json, str(channel_username))
|
436
|
+
|
437
|
+
except (ChannelPrivateError, ChannelInvalidError):
|
438
|
+
print(f"Cannot access channel: {channel_username}")
|
439
|
+
|
440
|
+
return forwarded_messages_dict, new_channels
|
441
|
+
|
442
|
+
|
443
|
+
async def recursive_forward_scraper(seed_channels, depth, client: TelegramClient, phone_number: str, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
|
444
|
+
"""
|
445
|
+
Recursively collects forwarded messages from channels, starting from the seed channels up to a specific depth.
|
446
|
+
"""
|
447
|
+
all_forwarded_messages = []
|
448
|
+
visited_channels = set(seed_channels)
|
449
|
+
current_level_channels = set(seed_channels)
|
450
|
+
|
451
|
+
path_json = create_dir(os.path.join(path_file, "CHANNELS"))
|
452
|
+
|
453
|
+
for level in range(depth):
|
454
|
+
print(level)
|
455
|
+
print(f"Processing level {level + 1} with {len(current_level_channels)} channels...")
|
456
|
+
next_level_channels = set()
|
457
|
+
|
458
|
+
# Iterate through channels at the current level
|
459
|
+
for channel in tqdm(current_level_channels, total=len(current_level_channels), desc="get messages"):
|
460
|
+
forwarded_msgs, discovered_channels = await get_forwarded_messages(client, phone_number, channel, reverse, limit, offset_date, path_file)
|
461
|
+
|
462
|
+
# Collect forwarded messages
|
463
|
+
all_forwarded_messages.extend(forwarded_msgs)
|
464
|
+
|
465
|
+
# Add newly discovered channels to the next level, excluding already visited ones
|
466
|
+
for new_channel in discovered_channels:
|
467
|
+
if new_channel not in visited_channels:
|
468
|
+
next_level_channels.add(new_channel)
|
469
|
+
visited_channels.add(new_channel)
|
470
|
+
# Update the set of channels for the next level of recursion
|
471
|
+
current_level_channels = next_level_channels
|
472
|
+
|
473
|
+
if not current_level_channels:
|
474
|
+
break
|
475
|
+
|
476
|
+
write_json(visited_channels, path_json, "visited_channels")
|
477
|
+
return all_forwarded_messages, visited_channels
|
478
|
+
|
479
|
+
|
293
480
|
def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
|
294
481
|
"""
|
295
482
|
Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.
|
@@ -338,7 +525,80 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
|
|
338
525
|
df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
|
339
526
|
return df
|
340
527
|
|
341
|
-
async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files") -> list:
|
528
|
+
# async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
|
529
|
+
# """
|
530
|
+
# Retrieves messages from a Telegram channel by date.
|
531
|
+
|
532
|
+
# Args:
|
533
|
+
# client (TelegramClient): The Telegram client instance.
|
534
|
+
# phone_number (str): The phone number associated with the Telegram account.
|
535
|
+
# channel_username (str): The username of the channel to retrieve messages from.
|
536
|
+
# dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
|
537
|
+
# reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
|
538
|
+
# limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
|
539
|
+
# offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
|
540
|
+
# path_file (str, optional): The path to save the downloaded files. Defaults to "files".
|
541
|
+
|
542
|
+
# Returns:
|
543
|
+
# list: A list of messages retrieved from the channel.
|
544
|
+
|
545
|
+
# Raises:
|
546
|
+
# Exception: If there is an error during the retrieval process.
|
547
|
+
|
548
|
+
# """
|
549
|
+
# try:
|
550
|
+
# await client.start(phone_number)
|
551
|
+
|
552
|
+
# # current_path_file = create_dir(os.path.join(path_file, "messages"))
|
553
|
+
# path_messages = create_dir(os.path.join(path_file, "messages"))
|
554
|
+
# path_entities = create_dir(os.path.join(path_file, "entities"))
|
555
|
+
|
556
|
+
# if dl_files:
|
557
|
+
# current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
|
558
|
+
# if dl_thumbs:
|
559
|
+
# current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
|
560
|
+
|
561
|
+
# # Get the message history
|
562
|
+
# messages = []
|
563
|
+
|
564
|
+
# async for message in client.iter_messages(channel_username,
|
565
|
+
# limit=limit,
|
566
|
+
# offset_date=offset_date,
|
567
|
+
# reverse=reverse):
|
568
|
+
# messages.append(message)
|
569
|
+
|
570
|
+
# if dl_files:
|
571
|
+
|
572
|
+
# media_fileid = parse_mediafileid(message)
|
573
|
+
# if media_fileid:
|
574
|
+
# await message.download_media(file=os.path.join(current_path_img, media_fileid))
|
575
|
+
|
576
|
+
# if dl_thumbs:
|
577
|
+
# media_fileid = parse_mediafileid(message)
|
578
|
+
# if media_fileid:
|
579
|
+
# try:
|
580
|
+
# await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
|
581
|
+
# except Exception as e:
|
582
|
+
# pass
|
583
|
+
# print(e)
|
584
|
+
|
585
|
+
# df_exploded = parse_messages(messages)
|
586
|
+
# df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
|
587
|
+
# write_pickle(df_exploded, path_messages, str(channel_username))
|
588
|
+
|
589
|
+
# df_entities = parse_message_entities(messages)
|
590
|
+
# write_pickle(df_entities, path_entities, str(channel_username))
|
591
|
+
|
592
|
+
# # df_messages = group_by_post(df_exploded)
|
593
|
+
# # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
|
594
|
+
# # write_pickle(df_messages, current_path_file, str(channel_username))
|
595
|
+
|
596
|
+
# return messages
|
597
|
+
# finally:
|
598
|
+
# # Disconnect the client
|
599
|
+
# await client.disconnect()
|
600
|
+
|
601
|
+
async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files", dl_thumbs : bool = False) -> list:
|
342
602
|
"""
|
343
603
|
Retrieves messages from a Telegram channel by date.
|
344
604
|
|
@@ -362,18 +622,27 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
|
|
362
622
|
try:
|
363
623
|
await client.start(phone_number)
|
364
624
|
|
365
|
-
current_path_file = create_dir(os.path.join(path_file, "messages"))
|
625
|
+
# current_path_file = create_dir(os.path.join(path_file, "messages"))
|
626
|
+
path_json = create_dir(os.path.join(path_file, "JSON"))
|
627
|
+
path_messages = create_dir(os.path.join(path_file, "messages"))
|
628
|
+
path_entities = create_dir(os.path.join(path_file, "entities"))
|
629
|
+
|
366
630
|
if dl_files:
|
367
631
|
current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
|
632
|
+
if dl_thumbs:
|
633
|
+
current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
|
368
634
|
|
369
635
|
# Get the message history
|
370
|
-
messages = []
|
636
|
+
# messages = []
|
637
|
+
messages_dict = []
|
371
638
|
|
372
639
|
async for message in client.iter_messages(channel_username,
|
373
640
|
limit=limit,
|
374
641
|
offset_date=offset_date,
|
375
642
|
reverse=reverse):
|
376
|
-
messages.append(message)
|
643
|
+
# messages.append(message)
|
644
|
+
messages_dict.append(message.to_dict())
|
645
|
+
|
377
646
|
|
378
647
|
if dl_files:
|
379
648
|
|
@@ -381,22 +650,35 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
|
|
381
650
|
if media_fileid:
|
382
651
|
await message.download_media(file=os.path.join(current_path_img, media_fileid))
|
383
652
|
|
384
|
-
|
653
|
+
if dl_thumbs:
|
654
|
+
media_fileid = parse_mediafileid(message)
|
655
|
+
if media_fileid:
|
656
|
+
try:
|
657
|
+
await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
|
658
|
+
except Exception as e:
|
659
|
+
pass
|
660
|
+
print(e)
|
661
|
+
|
662
|
+
dump_jsonl(messages_dict, path_json, str(channel_username))
|
663
|
+
df_exploded = parse_messages(messages_dict)
|
385
664
|
df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
|
386
|
-
write_pickle(df_exploded,
|
665
|
+
write_pickle(df_exploded, path_messages, str(channel_username))
|
666
|
+
|
667
|
+
df_entities = parse_message_entities(messages_dict)
|
668
|
+
write_pickle(df_entities, path_entities, str(channel_username))
|
387
669
|
|
388
670
|
# df_messages = group_by_post(df_exploded)
|
389
671
|
# df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
|
390
672
|
# write_pickle(df_messages, current_path_file, str(channel_username))
|
391
673
|
|
392
|
-
return
|
674
|
+
return messages_dict
|
393
675
|
finally:
|
394
676
|
# Disconnect the client
|
395
677
|
await client.disconnect()
|
396
678
|
|
397
679
|
async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids:list=[], path_file:str="files")-> list:
|
398
680
|
"""
|
399
|
-
Retrieves messages from a Telegram channel by
|
681
|
+
Retrieves messages from a Telegram channel by IDS.
|
400
682
|
|
401
683
|
Args:
|
402
684
|
client (TelegramClient): The Telegram client instance.
|
@@ -415,15 +697,18 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
|
|
415
697
|
"""
|
416
698
|
try:
|
417
699
|
await client.start(phone_number)
|
418
|
-
|
419
|
-
|
700
|
+
path_json = create_dir(os.path.join(path_file, "JSON"))
|
701
|
+
path_messages = create_dir(os.path.join(path_file, "messages"))
|
702
|
+
path_entities = create_dir(os.path.join(path_file, "entities"))
|
420
703
|
|
421
704
|
# Get the message history
|
422
|
-
messages = []
|
705
|
+
# messages = []
|
706
|
+
messages_dict = []
|
423
707
|
|
424
708
|
async for message in client.iter_messages(channel_username,
|
425
709
|
ids = ids):
|
426
|
-
messages.append(message)
|
710
|
+
# messages.append(message)
|
711
|
+
messages_dict.append(message.to_dict())
|
427
712
|
|
428
713
|
if dl_files:
|
429
714
|
current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
|
@@ -431,14 +716,21 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
|
|
431
716
|
if media_fileid:
|
432
717
|
await message.download_media(file=os.path.join(current_path_img, media_fileid))
|
433
718
|
|
434
|
-
|
719
|
+
|
720
|
+
dump_jsonl(messages_dict, path_json, str(channel_username))
|
721
|
+
df_exploded = parse_messages(messages_dict)
|
435
722
|
df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
|
436
|
-
write_pickle(df_exploded,
|
723
|
+
write_pickle(df_exploded, path_messages, str(channel_username))
|
724
|
+
|
725
|
+
df_entities = parse_message_entities(messages_dict)
|
726
|
+
write_pickle(df_entities, path_entities, str(channel_username))
|
727
|
+
|
728
|
+
|
437
729
|
# df_messages = group_by_post(df_exploded)
|
438
730
|
# df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
|
439
731
|
# write_pickle(df_messages, current_path_file, str(channel_username))
|
440
732
|
|
441
|
-
return
|
733
|
+
return messages_dict
|
442
734
|
finally:
|
443
735
|
# Disconnect the client
|
444
736
|
await client.disconnect()
|
@@ -466,17 +758,23 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
|
|
466
758
|
try:
|
467
759
|
await client.start(phone_number)
|
468
760
|
|
469
|
-
current_path_file = create_dir(os.path.join(path_file, "messages"))
|
761
|
+
# current_path_file = create_dir(os.path.join(path_file, "messages"))
|
762
|
+
path_json = create_dir(os.path.join(path_file, "JSON"))
|
763
|
+
path_messages = create_dir(os.path.join(path_file, "messages"))
|
764
|
+
path_entities = create_dir(os.path.join(path_file, "entities"))
|
765
|
+
|
470
766
|
if dl_files:
|
471
767
|
current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
|
472
768
|
|
473
769
|
# Get the message history
|
474
|
-
messages = []
|
770
|
+
# messages = []
|
771
|
+
messages_dict = []
|
475
772
|
|
476
773
|
async for message in client.iter_messages(channel_username,
|
477
774
|
search=search,
|
478
775
|
limit=limit):
|
479
|
-
messages.append(message)
|
776
|
+
# messages.append(message)
|
777
|
+
messages_dict.append(message.to_dict())
|
480
778
|
|
481
779
|
if dl_files:
|
482
780
|
|
@@ -484,17 +782,22 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
|
|
484
782
|
if media_fileid:
|
485
783
|
await message.download_media(file=os.path.join(current_path_img, media_fileid))
|
486
784
|
|
487
|
-
|
785
|
+
|
786
|
+
df_exploded = parse_messages(messages_dict)
|
488
787
|
df_exploded['search']=search
|
489
788
|
df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
|
490
789
|
# df_messages = group_by_post(df_exploded)
|
491
790
|
# df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
|
791
|
+
df_entities = parse_message_entities(messages_dict)
|
792
|
+
|
492
793
|
if channel_username:
|
493
|
-
write_pickle(df_exploded,
|
794
|
+
write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
|
795
|
+
write_pickle(df_entities, path_entities, str(search)+'_'+str(channel_username))
|
494
796
|
else:
|
495
|
-
write_pickle(df_exploded,
|
797
|
+
write_pickle(df_exploded, path_messages, str(search))
|
798
|
+
write_pickle(df_entities, path_entities, str(search))
|
496
799
|
|
497
|
-
return
|
800
|
+
return messages_dict
|
498
801
|
finally:
|
499
802
|
# Disconnect the client
|
500
803
|
await client.disconnect()
|
@@ -504,12 +807,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
|
|
504
807
|
# Connect the client
|
505
808
|
await client.start(phone_number)
|
506
809
|
|
507
|
-
current_path_file = create_dir(os.path.join(path_file, "messages"))
|
810
|
+
# current_path_file = create_dir(os.path.join(path_file, "messages"))
|
811
|
+
path_messages = create_dir(os.path.join(path_file, "messages"))
|
812
|
+
path_entities = create_dir(os.path.join(path_file, "entities"))
|
508
813
|
|
509
|
-
|
814
|
+
comments_dict = []
|
510
815
|
|
511
816
|
async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
|
512
|
-
|
817
|
+
comments_dict.append(comment.to_dict())
|
513
818
|
|
514
819
|
if dl_files:
|
515
820
|
current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
|
@@ -517,11 +822,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
|
|
517
822
|
if media_fileid:
|
518
823
|
await comment.download_media(file=os.path.join(current_path_img, media_fileid))
|
519
824
|
|
520
|
-
df_comments = parse_messages(
|
825
|
+
df_comments = parse_messages(comments_dict)
|
521
826
|
df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
|
522
|
-
write_pickle(df_comments,
|
827
|
+
write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))
|
828
|
+
|
829
|
+
df_entities = parse_message_entities(comments_dict)
|
830
|
+
write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))
|
523
831
|
|
524
|
-
return
|
832
|
+
return comments_dict
|
525
833
|
|
526
834
|
finally:
|
527
835
|
# Disconnect the client
|
@@ -954,7 +1262,97 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
|
|
954
1262
|
# # Disconnect the client
|
955
1263
|
# await client.disconnect()
|
956
1264
|
|
957
|
-
async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str) -> dict:
|
1265
|
+
# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
|
1266
|
+
# """
|
1267
|
+
# Retrieves information about a Telegram channel.
|
1268
|
+
|
1269
|
+
# Args:
|
1270
|
+
# api_id (int): The API ID of the Telegram application.
|
1271
|
+
# api_hash (str): The API hash of the Telegram application.
|
1272
|
+
# phone_number (str): The phone number associated with the Telegram account.
|
1273
|
+
# channel_username (str): The username of the channel.
|
1274
|
+
|
1275
|
+
# Returns:
|
1276
|
+
# dict: A dictionary containing the full information of the channel.
|
1277
|
+
|
1278
|
+
# Raises:
|
1279
|
+
# Exception: If there is an error during the retrieval of channel information.
|
1280
|
+
# """
|
1281
|
+
# client = TelegramClient('session_name', api_id, api_hash)
|
1282
|
+
# try:
|
1283
|
+
# await client.start(phone_number)
|
1284
|
+
# channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
|
1285
|
+
# channel_full_info_json = channel_full_info.to_dict()
|
1286
|
+
# img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
|
1287
|
+
# finally:
|
1288
|
+
# # Disconnect the client
|
1289
|
+
# await client.disconnect()
|
1290
|
+
|
1291
|
+
# return channel_full_info_json
|
1292
|
+
|
1293
|
+
|
1294
|
+
# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_project :str, DL_profile_pic : bool = False) -> dict:
|
1295
|
+
# """
|
1296
|
+
# Retrieves information about a Telegram channel.
|
1297
|
+
|
1298
|
+
# Args:
|
1299
|
+
# api_id (int): The API ID of the Telegram application.
|
1300
|
+
# api_hash (str): The API hash of the Telegram application.
|
1301
|
+
# phone_number (str): The phone number associated with the Telegram account.
|
1302
|
+
# channel_username (str): The username of the channel.
|
1303
|
+
|
1304
|
+
# Returns:
|
1305
|
+
# dict: A dictionary containing the full information of the channel.
|
1306
|
+
|
1307
|
+
# Raises:
|
1308
|
+
# Exception: If there is an error during the retrieval of channel information.
|
1309
|
+
# """
|
1310
|
+
# client = TelegramClient('session_name', api_id, api_hash)
|
1311
|
+
# path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
|
1312
|
+
# path_json = create_dir(os.path.join(path_project, "JSON"))
|
1313
|
+
|
1314
|
+
# try:
|
1315
|
+
# await client.start(phone_number)
|
1316
|
+
# try:
|
1317
|
+
# channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
|
1318
|
+
|
1319
|
+
# if channel_full_info:
|
1320
|
+
# channel_full_info_dict = channel_full_info.to_dict()
|
1321
|
+
# channel_full_info_json = JSONEncoder().encode(channel_full_info_dict)
|
1322
|
+
# else:
|
1323
|
+
# channel_full_info_dict = {'_': 'Channel', 'id': channel_username, 'title':'private_channel'}
|
1324
|
+
# write_json(channel_full_info_json, path_json, f"{str(channel_username)}")
|
1325
|
+
|
1326
|
+
# if DL_profile_pic:
|
1327
|
+
# img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
|
1328
|
+
|
1329
|
+
# except Exception as e:
|
1330
|
+
# pass
|
1331
|
+
# print(channel_username, e)
|
1332
|
+
# finally:
|
1333
|
+
# # Disconnect the client
|
1334
|
+
# await client.disconnect()
|
1335
|
+
|
1336
|
+
# return channel_full_info_dict
|
1337
|
+
|
1338
|
+
def dump_json(json_dict: dict, path: str, name: str) -> str:
|
1339
|
+
"""
|
1340
|
+
Write a dictionary to a JSON file.
|
1341
|
+
|
1342
|
+
Args:
|
1343
|
+
json_dict (dict): The dictionary to be written to the JSON file.
|
1344
|
+
path (str): The directory path where the JSON file will be saved.
|
1345
|
+
name (str): The name of the JSON file (without the extension).
|
1346
|
+
|
1347
|
+
Returns:
|
1348
|
+
str: The full path to the saved JSON file.
|
1349
|
+
"""
|
1350
|
+
file_path = os.path.join(path, name + '.json')
|
1351
|
+
with open(file_path, 'w') as outfile:
|
1352
|
+
json.dump(json_dict, outfile, cls=JSONEncoder)
|
1353
|
+
return file_path
|
1354
|
+
|
1355
|
+
async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, path_project: str, DL_profile_pic: bool = False) -> dict:
|
958
1356
|
"""
|
959
1357
|
Retrieves information about a Telegram channel.
|
960
1358
|
|
@@ -971,16 +1369,49 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
|
|
971
1369
|
Exception: If there is an error during the retrieval of channel information.
|
972
1370
|
"""
|
973
1371
|
client = TelegramClient('session_name', api_id, api_hash)
|
1372
|
+
path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
|
1373
|
+
path_json = create_dir(os.path.join(path_project, "JSON"))
|
1374
|
+
|
974
1375
|
try:
|
975
1376
|
await client.start(phone_number)
|
976
|
-
|
977
|
-
|
1377
|
+
try:
|
1378
|
+
# Fetch full channel info
|
1379
|
+
channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
|
1380
|
+
|
1381
|
+
# If channel info is retrieved
|
1382
|
+
if channel_full_info:
|
1383
|
+
channel_full_info_dict = channel_full_info.to_dict()
|
1384
|
+
else:
|
1385
|
+
channel_full_info_dict = {'_': 'ChatFull',
|
1386
|
+
'full_chat': {'_': 'ChannelFull',
|
1387
|
+
'id': channel_username,
|
1388
|
+
},
|
1389
|
+
'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
|
1390
|
+
}
|
1391
|
+
|
1392
|
+
# Save the dictionary as JSON (no need to pre-encode it to a string)
|
1393
|
+
dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
|
1394
|
+
|
1395
|
+
# Optionally download profile picture
|
1396
|
+
if DL_profile_pic:
|
1397
|
+
img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
|
1398
|
+
|
1399
|
+
except Exception as e:
|
1400
|
+
print(channel_username, e)
|
1401
|
+
channel_full_info_dict = {'_': 'ChatFull',
|
1402
|
+
'full_chat': {'_': 'ChannelFull',
|
1403
|
+
'id': channel_username,
|
1404
|
+
},
|
1405
|
+
'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
|
1406
|
+
}
|
1407
|
+
dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
|
1408
|
+
return {'_': 'Channel', 'id': channel_username, 'title': 'private_channel'}
|
1409
|
+
|
978
1410
|
finally:
|
979
1411
|
# Disconnect the client
|
980
1412
|
await client.disconnect()
|
981
1413
|
|
982
|
-
return
|
983
|
-
|
1414
|
+
return channel_full_info_dict
|
984
1415
|
|
985
1416
|
def parse_channel(channel : dict) -> pd.DataFrame:
|
986
1417
|
"""
|
@@ -1016,7 +1447,6 @@ def parse_channel(channel : dict) -> pd.DataFrame:
|
|
1016
1447
|
chats = channel.get("chats", [])
|
1017
1448
|
if chats:
|
1018
1449
|
for chat in chats:
|
1019
|
-
print(chat)
|
1020
1450
|
if chat.get("_") == "Channel":
|
1021
1451
|
if chat.get("id") == channel_id:
|
1022
1452
|
creation_date = chat.get("date", datetime(1970,1,1))
|