opsci-toolbox 0.0.14__tar.gz → 0.0.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/PKG-INFO +1 -1
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/telegram.py +384 -44
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/webscraping.py +8 -3
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/common.py +2 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/nlp.py +172 -26
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/nlp_cuml.py +8 -4
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/sna.py +34 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/PKG-INFO +1 -1
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/setup.py +1 -1
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/MANIFEST.in +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/README.md +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/rapidapi_helpers.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/reddit.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/youtube_helpers.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/cv.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/dataviz.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/dates.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/gliner.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/sql.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/surreaction.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/lexicons/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/lexicons/stop_words_en.csv +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/lexicons/stop_words_fr.csv +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/SOURCES.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/dependency_links.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/requires.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/top_level.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/setup.cfg +0 -0
opsci_toolbox/apis/telegram.py

@@ -1,17 +1,47 @@
 from datetime import datetime
 from telethon.sync import TelegramClient
 from telethon.tl.functions.channels import GetFullChannelRequest
+from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
 import pandas as pd
 # from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
-from opsci_toolbox.helpers.common import create_dir, write_pickle
+from opsci_toolbox.helpers.common import create_dir, write_pickle, write_json
 import os
 import nest_asyncio
 from telethon.tl.types import Message
-
-
+import json
+from tqdm import tqdm
 nest_asyncio.apply()


+class JSONEncoder(json.JSONEncoder):
+    '''
+    JSONEncoder subclass that knows how to encode date/time and bytes.
+    '''
+    def default(self, o):
+        if isinstance(o, datetime) or isinstance(o, bytes):
+            return str(o)
+        return super().default(o)
+
+def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+    """
+    Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+
+    Args:
+        data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+        path (str): The directory path where the JSON Lines file will be saved.
+        name (str): The name of the JSON Lines file (without the extension).
+
+    Returns:
+        str: The full path to the saved JSON Lines file.
+    """
+    file_path = os.path.join(path, name + '.jsonl')
+    with open(file_path, 'w') as file:
+        for entry in data:
+            json.dump(entry, file, cls=JSONEncoder)
+            file.write('\n')
+    return file_path
+
+
 def parse_mediafileid(message: Message) -> str:
     """
     Parse the media file ID from a Telegram message.
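The JSONEncoder/dump_jsonl pair added here is what lets raw Telethon message dicts, which contain datetime and bytes values, be persisted as JSON Lines. A minimal usage sketch (the sample records and output directory are illustrative, not part of the package):

    from datetime import datetime
    from opsci_toolbox.apis.telegram import dump_jsonl

    records = [
        {"id": 1, "date": datetime(2024, 1, 1), "message": "hello"},
        {"id": 2, "date": datetime(2024, 1, 2), "raw": b"\x00\x01"},
    ]
    # writes ./example.jsonl with one JSON object per line; datetime/bytes are stringified
    path = dump_jsonl(records, ".", "example")
    print(path)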
@@ -38,7 +68,7 @@ def parse_mediafileid(message: Message) -> str:
         else:
             grouped_id = message_id

-        media_fileid = str(channel_id)+'_'+str(grouped_id)+'_'+str(media_id)
+        media_fileid = str(int(channel_id))+'_'+str(int(grouped_id))+'_'+str(int(media_id))
         return media_fileid
     else:
         return None
@@ -55,10 +85,10 @@ def parse_message_entities(messages : list) -> pd.DataFrame:
         pd.DataFrame : a DataFrame containing the parsed entities.
     """
     all_records = []
-    for
-        raw_text = message.raw_text
+    for data in messages:
+        # raw_text = message.raw_text

-        data = message.to_dict()
+        # data = message.to_dict()

         message_id = data.get("id")

@@ -90,10 +120,10 @@ def parse_message_entities(messages : list) -> pd.DataFrame:
             url = entity.get("url")
             document_id = entity.get("document_id")

-            entity_record = (message_id, channel_id, from_id, grouped_id, message,
+            entity_record = (message_id, channel_id, from_id, grouped_id, message, entity_type, offset, length, url, document_id)
             all_records.append(entity_record)

-    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "entity_type", "offset", "length", "url", "document_id"])
     return df


@@ -110,9 +140,9 @@ def parse_messages(messages : list) -> pd.DataFrame:

     all_records = []
     for message in messages:
-        raw_text = message.raw_text
+        # raw_text = message.raw_text

-        data = message
+        data = message

         message_id = data.get("id")

@@ -166,10 +196,10 @@ def parse_messages(messages : list) -> pd.DataFrame:
         engagements = forwards + replies + total_reactions


-        post_record = (message_id, channel_id, from_id, grouped_id, date, message,
+        post_record = (message_id, channel_id, from_id, grouped_id, date, message, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
         all_records.append(post_record)

-    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                          "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                          "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                          "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])
@@ -196,8 +226,6 @@ def parse_reply(reply:dict) -> tuple:

     return reply_to_message_id, reply_to_channel_id

-
-
 def parse_from(data : dict) -> int:
     """
     Parse a peer object from Telegram message.
@@ -288,12 +316,16 @@ def parse_media(media : dict) -> tuple:
     if media.get("_") == "MessageMediaPhoto":
         photo = media.get("photo", {})
         media_id = photo.get("id")
+        if media_id:
+            media_id = str(int(media_id))
         media_date = photo.get("date")
         media_mime_type = "photo"

     elif media.get("_") == "MessageMediaDocument":
         document = media.get("document", {})
         media_id = document.get("id")
+        if media_id:
+            media_id = str(int(media_id))
         media_date = document.get("date")
         media_mime_type = document.get("mime_type")
         media_size = document.get("size")
@@ -344,6 +376,107 @@ def parse_media_id(media : dict) -> int:
         media_id = None
     return media_id

+
+class JSONEncoder(json.JSONEncoder):
+    '''
+    JSONEncoder subclass that knows how to encode date/time and bytes.
+    '''
+    def default(self, o):
+        if isinstance(o, datetime) or isinstance(o, bytes):
+            return str(o)
+        return super().default(o)
+
+def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+    """
+    Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+
+    Args:
+        data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+        path (str): The directory path where the JSON Lines file will be saved.
+        name (str): The name of the JSON Lines file (without the extension).
+
+    Returns:
+        str: The full path to the saved JSON Lines file.
+    """
+    file_path = os.path.join(path, name + '.jsonl')
+    with open(file_path, 'w') as file:
+        for entry in data:
+            json.dump(entry, file, cls=JSONEncoder)
+            file.write('\n')
+    return file_path
+
+
+async def get_forwarded_messages(client: TelegramClient, phone_number: str, channel_username: int, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+    try:
+        await client.start(phone_number)
+
+        path_json = create_dir(os.path.join(path_file, "JSON"))
+
+        # Fetch the messages from the channel
+        # forwarded_messages = []
+        forwarded_messages_dict = []
+        new_channels = set()
+
+        async for message in client.iter_messages(channel_username,
+                                                  limit=limit,
+                                                  offset_date=offset_date,
+                                                  reverse=reverse):
+            # Check if the message is a forward
+            if message.forward and hasattr(message.forward.chat, 'username'):
+                # forwarded_messages.append(message)
+                forwarded_messages_dict.append(message.to_dict())
+
+                if message.forward.chat:
+                    new_channel = message.forward.chat.username
+                    if new_channel:
+                        new_channels.add(new_channel)
+
+        if forwarded_messages_dict:
+            dump_jsonl(forwarded_messages_dict, path_json, str(channel_username))
+
+    except (ChannelPrivateError, ChannelInvalidError):
+        print(f"Cannot access channel: {channel_username}")
+
+    return forwarded_messages_dict, new_channels
+
+
+async def recursive_forward_scraper(seed_channels, depth, client: TelegramClient, phone_number: str, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+    """
+    Recursively collects forwarded messages from channels, starting from the seed channels up to a specific depth.
+    """
+    all_forwarded_messages = []
+    visited_channels = set(seed_channels)
+    current_level_channels = set(seed_channels)
+
+    path_json = create_dir(os.path.join(path_file, "CHANNELS"))
+
+    for level in range(depth):
+        print(level)
+        print(f"Processing level {level + 1} with {len(current_level_channels)} channels...")
+        next_level_channels = set()
+
+        # Iterate through channels at the current level
+        for channel in tqdm(current_level_channels, total=len(current_level_channels), desc="get messages"):
+            forwarded_msgs, discovered_channels = await get_forwarded_messages(client, phone_number, channel, reverse, limit, offset_date, path_file)
+
+            # Collect forwarded messages
+            all_forwarded_messages.extend(forwarded_msgs)
+
+            # Add newly discovered channels to the next level, excluding already visited ones
+            for new_channel in discovered_channels:
+                if new_channel not in visited_channels:
+                    next_level_channels.add(new_channel)
+                    visited_channels.add(new_channel)
+        # Update the set of channels for the next level of recursion
+        current_level_channels = next_level_channels
+
+        if not current_level_channels:
+            break
+
+    write_json(visited_channels, path_json, "visited_channels")
+    return all_forwarded_messages, visited_channels
+
+
 def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     """
     Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.
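Taken together, get_forwarded_messages and recursive_forward_scraper implement a snowball collection: forwards found in the seed channels reveal new channels, which are crawled at the next depth level, with a JSONL dump per channel and the visited-channel list written to disk. A hedged driver sketch (API credentials and the seed channel are placeholders):

    from telethon.sync import TelegramClient
    from opsci_toolbox.apis.telegram import recursive_forward_scraper

    api_id, api_hash, phone_number = 12345, "your_api_hash", "+33600000000"  # placeholders
    client = TelegramClient("session_name", api_id, api_hash)

    # crawl two hops out from the seed; per-channel JSONL files land in files/JSON,
    # the set of visited channels under files/CHANNELS
    messages, channels = client.loop.run_until_complete(
        recursive_forward_scraper(["some_seed_channel"], depth=2, client=client,
                                  phone_number=phone_number, limit=200)
    )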
@@ -392,7 +525,80 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
     return df

-async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
+# async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
+#     """
+#     Retrieves messages from a Telegram channel by date.
+
+#     Args:
+#         client (TelegramClient): The Telegram client instance.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel to retrieve messages from.
+#         dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
+#         reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
+#         limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
+#         offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
+#         path_file (str, optional): The path to save the downloaded files. Defaults to "files".
+
+#     Returns:
+#         list: A list of messages retrieved from the channel.
+
+#     Raises:
+#         Exception: If there is an error during the retrieval process.
+
+#     """
+#     try:
+#         await client.start(phone_number)
+
+#         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+#         path_messages = create_dir(os.path.join(path_file, "messages"))
+#         path_entities = create_dir(os.path.join(path_file, "entities"))
+
+#         if dl_files:
+#             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
+#         if dl_thumbs:
+#             current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
+
+#         # Get the message history
+#         messages = []
+
+#         async for message in client.iter_messages(channel_username,
+#                                                   limit=limit,
+#                                                   offset_date=offset_date,
+#                                                   reverse=reverse):
+#             messages.append(message)
+
+#             if dl_files:
+
+#                 media_fileid = parse_mediafileid(message)
+#                 if media_fileid:
+#                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
+
+#             if dl_thumbs:
+#                 media_fileid = parse_mediafileid(message)
+#                 if media_fileid:
+#                     try:
+#                         await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
+#                     except Exception as e:
+#                         pass
+#                         print(e)
+
+#         df_exploded = parse_messages(messages)
+#         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
+#         write_pickle(df_exploded, path_messages, str(channel_username))
+
+#         df_entities = parse_message_entities(messages)
+#         write_pickle(df_entities, path_entities, str(channel_username))
+
+#         # df_messages = group_by_post(df_exploded)
+#         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
+#         # write_pickle(df_messages, current_path_file, str(channel_username))
+
+#         return messages
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+
+async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files", dl_thumbs : bool = False) -> list:
     """
     Retrieves messages from a Telegram channel by date.

@@ -417,6 +623,7 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
         await client.start(phone_number)

         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_json = create_dir(os.path.join(path_file, "JSON"))
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

@@ -426,13 +633,16 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
             current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))

         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []

         async for message in client.iter_messages(channel_username,
                                                   limit=limit,
                                                   offset_date=offset_date,
                                                   reverse=reverse):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())
+

         if dl_files:

@@ -449,18 +659,19 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
                         pass
                         print(e)

-
+        dump_jsonl(messages_dict, path_json, str(channel_username))
+        df_exploded = parse_messages(messages_dict)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         write_pickle(df_exploded, path_messages, str(channel_username))

-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(messages_dict)
         write_pickle(df_entities, path_entities, str(channel_username))

         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))

-        return
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
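After this refactor get_messages_by_date accumulates message.to_dict() entries, dumps them to JSONL next to the pickled DataFrames, and returns the list of dicts instead of None. A hedged usage sketch (credentials and channel name are placeholders):

    from datetime import datetime
    from telethon.sync import TelegramClient
    from opsci_toolbox.apis.telegram import get_messages_by_date

    client = TelegramClient("session_name", 12345, "your_api_hash")  # placeholder credentials
    messages = client.loop.run_until_complete(
        get_messages_by_date(client, "+33600000000", "some_channel",
                             dl_files=False, limit=100,
                             offset_date=datetime(2024, 1, 1), path_file="files")
    )
    # messages is a list of dicts; parsed DataFrames are pickled under files/messages and
    # files/entities, and the raw dump is written under files/JSON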
@@ -486,16 +697,18 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
     """
     try:
         await client.start(phone_number)
-
+        path_json = create_dir(os.path.join(path_file, "JSON"))
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []

         async for message in client.iter_messages(channel_username,
                                                   ids = ids):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())

         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
@@ -503,11 +716,13 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))

-
+
+        dump_jsonl(messages_dict, path_json, str(channel_username))
+        df_exploded = parse_messages(messages_dict)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         write_pickle(df_exploded, path_messages, str(channel_username))

-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(messages_dict)
         write_pickle(df_entities, path_entities, str(channel_username))


@@ -515,7 +730,7 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))

-        return
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
@@ -544,6 +759,7 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
         await client.start(phone_number)

         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_json = create_dir(os.path.join(path_file, "JSON"))
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

@@ -551,12 +767,14 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []

         async for message in client.iter_messages(channel_username,
                                                   search=search,
                                                   limit=limit):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())

         if dl_files:

@@ -564,12 +782,13 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))

-
+
+        df_exploded = parse_messages(messages_dict)
         df_exploded['search']=search
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(messages_dict)

         if channel_username:
             write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
@@ -578,7 +797,7 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
             write_pickle(df_exploded, path_messages, str(search))
             write_pickle(df_entities, path_entities, str(search))

-        return
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
@@ -592,10 +811,10 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

-
+        comments_dict = []

         async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
-
+            comments_dict.append(comment.to_dict())


         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
@@ -603,14 +822,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
                 if media_fileid:
                     await comment.download_media(file=os.path.join(current_path_img, media_fileid))

-        df_comments = parse_messages(
+        df_comments = parse_messages(comments_dict)
         df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
         write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))

-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(comments_dict)
         write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))

-        return
+        return comments_dict

     finally:
         # Disconnect the client
@@ -1043,7 +1262,97 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
 #     # Disconnect the client
 #     await client.disconnect()

-async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
+# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
+#     """
+#     Retrieves information about a Telegram channel.
+
+#     Args:
+#         api_id (int): The API ID of the Telegram application.
+#         api_hash (str): The API hash of the Telegram application.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel.
+
+#     Returns:
+#         dict: A dictionary containing the full information of the channel.
+
+#     Raises:
+#         Exception: If there is an error during the retrieval of channel information.
+#     """
+#     client = TelegramClient('session_name', api_id, api_hash)
+#     try:
+#         await client.start(phone_number)
+#         channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+#         channel_full_info_json = channel_full_info.to_dict()
+#         img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+
+#     return channel_full_info_json
+
+
+# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_project :str, DL_profile_pic : bool = False) -> dict:
+#     """
+#     Retrieves information about a Telegram channel.
+
+#     Args:
+#         api_id (int): The API ID of the Telegram application.
+#         api_hash (str): The API hash of the Telegram application.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel.
+
+#     Returns:
+#         dict: A dictionary containing the full information of the channel.
+
+#     Raises:
+#         Exception: If there is an error during the retrieval of channel information.
+#     """
+#     client = TelegramClient('session_name', api_id, api_hash)
+#     path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+#     path_json = create_dir(os.path.join(path_project, "JSON"))
+
+#     try:
+#         await client.start(phone_number)
+#         try:
+#             channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+
+#             if channel_full_info:
+#                 channel_full_info_dict = channel_full_info.to_dict()
+#                 channel_full_info_json = JSONEncoder().encode(channel_full_info_dict)
+#             else:
+#                 channel_full_info_dict = {'_': 'Channel', 'id': channel_username, 'title':'private_channel'}
+#             write_json(channel_full_info_json, path_json, f"{str(channel_username)}")
+
+#             if DL_profile_pic:
+#                 img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+
+#         except Exception as e:
+#             pass
+#             print(channel_username, e)
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+
+#     return channel_full_info_dict
+
+def dump_json(json_dict: dict, path: str, name: str) -> str:
+    """
+    Write a dictionary to a JSON file.
+
+    Args:
+        json_dict (dict): The dictionary to be written to the JSON file.
+        path (str): The directory path where the JSON file will be saved.
+        name (str): The name of the JSON file (without the extension).
+
+    Returns:
+        str: The full path to the saved JSON file.
+    """
+    file_path = os.path.join(path, name + '.json')
+    with open(file_path, 'w') as outfile:
+        json.dump(json_dict, outfile, cls=JSONEncoder)
+    return file_path
+
+async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, path_project: str, DL_profile_pic: bool = False) -> dict:
     """
     Retrieves information about a Telegram channel.

@@ -1060,17 +1369,49 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
         Exception: If there is an error during the retrieval of channel information.
     """
     client = TelegramClient('session_name', api_id, api_hash)
+    path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+    path_json = create_dir(os.path.join(path_project, "JSON"))
+
     try:
         await client.start(phone_number)
-
-
-
+        try:
+            # Fetch full channel info
+            channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+
+            # If channel info is retrieved
+            if channel_full_info:
+                channel_full_info_dict = channel_full_info.to_dict()
+            else:
+                channel_full_info_dict = {'_': 'ChatFull',
+                                          'full_chat': {'_': 'ChannelFull',
+                                                        'id': channel_username,
+                                                        },
+                                          'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                          }
+
+            # Save the dictionary as JSON (no need to pre-encode it to a string)
+            dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+
+            # Optionally download profile picture
+            if DL_profile_pic:
+                img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
+
+        except Exception as e:
+            print(channel_username, e)
+            channel_full_info_dict = {'_': 'ChatFull',
+                                      'full_chat': {'_': 'ChannelFull',
+                                                    'id': channel_username,
+                                                    },
+                                      'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                      }
+            dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+            return {'_': 'Channel', 'id': channel_username, 'title': 'private_channel'}
+
     finally:
         # Disconnect the client
         await client.disconnect()

-    return
-
+    return channel_full_info_dict

 def parse_channel(channel : dict) -> pd.DataFrame:
     """
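The rewritten get_channel_info now creates its output folders itself, falls back to a minimal ChatFull-shaped dict when the request fails, and always leaves a JSON copy on disk. A hedged call sketch (credentials are placeholders; since the module applies nest_asyncio, a plain asyncio.run is assumed to work here):

    import asyncio
    from opsci_toolbox.apis.telegram import get_channel_info

    info = asyncio.run(
        get_channel_info(12345, "your_api_hash", "+33600000000",
                         "some_channel", path_project="project", DL_profile_pic=False)
    )
    # info is the ChatFull dict; a JSON copy is saved under project/JSON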
@@ -1106,7 +1447,6 @@ def parse_channel(channel : dict) -> pd.DataFrame:
     chats = channel.get("chats", [])
     if chats:
         for chat in chats:
-            print(chat)
             if chat.get("_") == "Channel":
                 if chat.get("id") == channel_id:
                     creation_date = chat.get("date", datetime(1970,1,1))
opsci_toolbox/apis/webscraping.py

@@ -97,9 +97,14 @@ def url_get_domain(url: str) -> str:
     Returns:
         str: The domain name extracted from the URL.
     """
-
-
-
+    try:
+        parsed_url = urlparse(url)
+        domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
+        return domain
+    except Exception as e:
+        pass
+        print(url, e)
+        return url


 def url_get_extension(url: str) -> str:
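The reworked url_get_domain prefers urlparse().hostname and falls back to netloc, returning the raw input if parsing raises. A quick illustrative check (example URLs, not package tests):

    from opsci_toolbox.apis.webscraping import url_get_domain

    print(url_get_domain("https://sub.example.com/path?q=1"))       # sub.example.com
    print(url_get_domain("https://user:pw@example.com:8080/page"))  # example.com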
opsci_toolbox/helpers/nlp.py

@@ -30,7 +30,7 @@ from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
-
+from nltk.tokenize import PunktSentenceTokenizer

 ####################################################################
 # CLEANING
@@ -126,7 +126,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
     df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
-    df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
+    # df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
     return df
@@ -911,6 +911,8 @@ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_c
     """
     metrics_dict = dict()
     # metrics_dict[col_id]=(col_id,'first')
+    # if col_id != col_chunk_id:
+    #     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
     metrics_dict[col_engagement]=(col_engagement,'first')

@@ -1578,10 +1580,10 @@ def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100,

     return df

-def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
+def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False, threshold: int = None) -> pd.DataFrame:
     """
-    Split a text into chunks of n sentences
-
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns.
+
     Parameters:
     nlp : spacy.language.Language
         The spaCy language processing pipeline.
@@ -1597,41 +1599,142 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
         The number of processes to use for text processing. Default is 1.
     stats : bool, optional
         Flag indicating whether to compute statistics about the splitting process. Default is False.
-
+    threshold : int, optional
+        Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+
     Returns:
     pd.DataFrame
-        DataFrame containing the split sentences.
+        DataFrame containing the split sentences with their start and end indexes in separate columns.

-    Description:
-    This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
-    Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
     """
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total=len(text), desc="Sentence splitting"):
+        sentences = []
+
+
+        # Extract sentences and their positions
+        for sent in doc.sents:
+            sentences.append((sent.text, sent.start_char, sent.end_char))

-    text=list(df[col_text].astype('unicode').values)
-
-    count_sentences=[]
-    count_batches=[]
-    results=[]
-    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "Sentence splitting"):
-        # Split the text into sentences
-        sentences = [sent.text for sent in doc.sents]
         if stats:
             count_sentences.append(len(sentences))
-
-
+
+        if n_sentences > 1:
+            # # Split sentences into batches of size n_sentences
             batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
-
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
             results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
             if stats:
                 count_batches.append(len(concatenate_batches))
-
         else:
-
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])

     df['sentences'] = results
-
-
-
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences','start_indexes', 'end_indexes']).reset_index(drop=True)
+
+    return df
+
+
+def split_n_sentences_nltk(df: pd.DataFrame, col_text: str, n_sentences: int = 1, threshold: int = None, stats: bool = False) -> pd.DataFrame:
+    """
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns using NLTK PunktSentenceTokenizer.
+
+    Parameters:
+    df : pd.DataFrame
+        DataFrame containing the text data to split.
+    col_text : str
+        The name of the column containing the text data.
+    n_sentences : int, optional
+        The number of sentences to group together. Default is 1.
+    threshold : int, optional
+        Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the split sentences with their start and end indexes in separate columns.
+
+    """
+    tokenizer = PunktSentenceTokenizer()
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(text, total=len(text), desc="Sentence splitting"):
+        sentences = []
+        start_pos = 0
+
+        # Tokenize sentences and compute positions
+        for sent in tokenizer.tokenize(doc):
+            start_idx = doc.find(sent, start_pos)
+            end_idx = start_idx + len(sent)
+            sentences.append((sent, start_idx, end_idx))
+            start_pos = end_idx
+
+        if stats:
+            count_sentences.append(len(sentences))
+
+        if n_sentences > 1:
+            # Split sentences into batches of size n_sentences
+            batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
+            results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
+            if stats:
+                count_batches.append(len(concatenate_batches))
+        else:
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])
+
+    df['sentences'] = results
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences', 'start_indexes', 'end_indexes']).reset_index(drop=True)
+
     return df


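Both splitters now return one row per chunk, carrying the chunk text plus its character span in the source string. A hedged sketch of the NLTK variant (sample data is illustrative; nltk must be installed):

    import pandas as pd
    from opsci_toolbox.helpers.nlp import split_n_sentences_nltk

    df = pd.DataFrame({"text": ["First sentence. Second sentence. Third sentence."]})
    out = split_n_sentences_nltk(df, col_text="text", n_sentences=2)
    print(out[["sentences", "start_indexes", "end_indexes"]])
    # row 0: "First sentence. Second sentence."  span (0, 32)
    # row 1: "Third sentence."                   span (33, 48)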
@@ -2404,3 +2507,46 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
     write_json(results, dir_json , str(filename))

     return results
+
+
+def add_tag_libretranslate_not_translate(text):
+    """
+    This function add fake html tag around words such as mentions, hashtags, urls and emojis to avoid translation of those tokens.
+
+    Args:
+        text (str): The text to process
+
+    Returns:
+        str: The text with the fake html tags
+    """
+    # This regex finds words starting with # and followed by alphanumeric characters or underscores
+    mention_pattern = r"(?:RT\s|QT\s)?(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z0-9_]{4,15})"
+    hashtag_pattern = r"(\B#\w+)"
+    url_pattern = r"(https?://[^ ]+)"
+    emoji_pattern = r':[a-zA-Z_]+:'
+
+    pattern = re.compile(emoji_pattern+ "|" + mention_pattern + "|" + hashtag_pattern + "|" + url_pattern)
+
+    # This function replaces the hashtag with an HTML link tag
+    def replace_with_link(match):
+        matcher_group = match.group(0)
+        return f'<a href="{matcher_group}"></a>'
+
+    # Use re.sub to substitute the hashtags with the HTML link tags
+    text_no_emojis = emoji.demojize(text)
+    result = re.sub(pattern, replace_with_link, text_no_emojis)
+
+    return result
+
+def clean_libre_translate_tags(text):
+    """
+    This function remove fake tags added by add_tag_libretranslate_not_translate() function.
+
+    Args:
+        text (str): The text to process
+
+    Returns:
+        str: The text with the fake html tags
+    """
+    cleaned_string = text.replace('<a href="', '').replace('"></a>', '')
+    return cleaned_string
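These two helpers are meant to bracket a LibreTranslate call: mentions, hashtags, URLs and demojized emoji shortcodes get wrapped in dummy anchor tags so that an HTML-aware translator is expected to leave them untouched, and the tags are stripped afterwards. A hedged round-trip sketch (no actual translation request shown):

    from opsci_toolbox.helpers.nlp import (add_tag_libretranslate_not_translate,
                                           clean_libre_translate_tags)

    text = "RT @toolbox_user check https://example.com #opensource"
    protected = add_tag_libretranslate_not_translate(text)
    # e.g. the URL is now wrapped as <a href="https://example.com"></a>
    # ... send `protected` to the translation service here ...
    restored = clean_libre_translate_tags(protected)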
opsci_toolbox/helpers/nlp_cuml.py

@@ -18,7 +18,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
                           metric: str = "cosine",
                           spread: float = 1.0,
                           learning_rate: float = 1.0,
-                          n_epochs:int = 300
+                          n_epochs:int = 300,
+                          random_state:int = None
                           ) -> tuple:
     """
     Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -41,7 +42,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
                         metric=metric,
                         spread = spread,
                         n_epochs=n_epochs,
-                        learning_rate=learning_rate
+                        learning_rate=learning_rate,
+                        random_state=random_state).fit(embeddings)

     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
@@ -56,7 +58,8 @@ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
                                      learning_rate: float = 1.0,
                                      n_epochs:int = 300,
                                      y: np.ndarray = None,
-                                     convert_dtype: bool = False
+                                     convert_dtype: bool = False,
+                                     random_state:int=None
                                      ) -> tuple:
     """
     Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -79,7 +82,8 @@ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
                         metric=metric,
                         spread = spread,
                         n_epochs=n_epochs,
-                        learning_rate=learning_rate
+                        learning_rate=learning_rate,
+                        random_state=random_state).fit(X = embeddings, y = y, convert_dtype = convert_dtype)

     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
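The new random_state parameter is passed straight through to cuML's UMAP, so a fixed seed makes the reduction reproducible across runs. A hedged sketch, assuming a CUDA environment with cuml installed:

    import numpy as np
    from opsci_toolbox.helpers.nlp_cuml import reduce_with_cuml_UMAP

    embeddings = np.random.rand(1000, 384).astype(np.float32)  # illustrative embeddings
    reducer, reduced = reduce_with_cuml_UMAP(embeddings, random_state=42)
    # calling again with random_state=42 on the same embeddings is expected to
    # give the same projection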
opsci_toolbox/helpers/sna.py

@@ -11,6 +11,40 @@ from collections import Counter
 from opsci_toolbox.helpers.dataviz import boxplot
 from fa2_modified import ForceAtlas2

+def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: float = 2) -> nx.Graph:
+    """
+    Creates a subgraph containing only the nodes that have at least the specified minimum value for a given metric.
+
+    Args:
+        G (nx.Graph): The input graph.
+        metric (str, optional): The node metric to filter nodes by (e.g., "degree", "in_degree", "out_degree", "degree_centrality"). Default is "degree".
+        min_value (float, optional): The minimum value required for nodes to be included in the subgraph. Default is 2.
+
+    Returns:
+        subgraph (nx.Graph): A subgraph containing only the nodes with at least the specified minimum metric value.
+    """
+
+    if metric == "degree":
+        nodes_with_min_metric = [node for node, value in G.degree() if value >= min_value]
+    elif metric == "in_degree" and G.is_directed():
+        nodes_with_min_metric = [node for node, value in G.in_degree() if value >= min_value]
+    elif metric == "out_degree" and G.is_directed():
+        nodes_with_min_metric = [node for node, value in G.out_degree() if value >= min_value]
+    elif metric == "degree_centrality":
+        centrality = nx.degree_centrality(G)
+        nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+    elif metric == "betweenness_centrality":
+        centrality = nx.betweenness_centrality(G)
+        nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+    elif metric == "closeness_centrality":
+        centrality = nx.closeness_centrality(G)
+        nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+    else:
+        raise ValueError(f"Unsupported metric: {metric}")
+
+    subgraph = G.subgraph(nodes_with_min_metric).copy()
+    return subgraph
+
 def group_nodes_by_values(dictionnary : dict) -> dict:
     """
     Group nodes by their values from a dictionary.
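create_subgraph_min_metric prunes a graph down to the nodes that clear a threshold on one of the supported metrics. A short illustrative run on a toy graph (data is not from the package):

    import networkx as nx
    from opsci_toolbox.helpers.sna import create_subgraph_min_metric

    G = nx.karate_club_graph()
    core = create_subgraph_min_metric(G, metric="degree", min_value=5)
    print(core.number_of_nodes(), "of", G.number_of_nodes(), "nodes kept")

    # centrality-based metrics take fractional thresholds
    central = create_subgraph_min_metric(G, metric="degree_centrality", min_value=0.15)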