opsci-toolbox 0.0.14__tar.gz → 0.0.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/PKG-INFO +1 -1
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/telegram.py +384 -44
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/webscraping.py +8 -3
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/common.py +2 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/nlp.py +172 -26
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/nlp_cuml.py +8 -4
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/sna.py +34 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/PKG-INFO +1 -1
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/setup.py +1 -1
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/MANIFEST.in +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/README.md +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/rapidapi_helpers.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/reddit.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/apis/youtube_helpers.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/cv.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/dataviz.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/dates.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/gliner.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/sql.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/helpers/surreaction.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/lexicons/__init__.py +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/lexicons/stop_words_en.csv +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox/lexicons/stop_words_fr.csv +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/SOURCES.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/dependency_links.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/requires.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/opsci_toolbox.egg-info/top_level.txt +0 -0
- {opsci_toolbox-0.0.14 → opsci_toolbox-0.0.16}/setup.cfg +0 -0
opsci_toolbox/apis/telegram.py

@@ -1,17 +1,47 @@
 from datetime import datetime
 from telethon.sync import TelegramClient
 from telethon.tl.functions.channels import GetFullChannelRequest
+from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
 import pandas as pd
 # from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
-from opsci_toolbox.helpers.common import create_dir, write_pickle
+from opsci_toolbox.helpers.common import create_dir, write_pickle, write_json
 import os
 import nest_asyncio
 from telethon.tl.types import Message
-
-
+import json
+from tqdm import tqdm
 nest_asyncio.apply()


+class JSONEncoder(json.JSONEncoder):
+    '''
+    JSONEncoder subclass that knows how to encode date/time and bytes.
+    '''
+    def default(self, o):
+        if isinstance(o, datetime) or isinstance(o, bytes):
+            return str(o)
+        return super().default(o)
+
+def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+    """
+    Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+
+    Args:
+        data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+        path (str): The directory path where the JSON Lines file will be saved.
+        name (str): The name of the JSON Lines file (without the extension).
+
+    Returns:
+        str: The full path to the saved JSON Lines file.
+    """
+    file_path = os.path.join(path, name + '.jsonl')
+    with open(file_path, 'w') as file:
+        for entry in data:
+            json.dump(entry, file, cls=JSONEncoder)
+            file.write('\n')
+    return file_path
+
+
 def parse_mediafileid(message: Message) -> str:
     """
     Parse the media file ID from a Telegram message.
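The JSONEncoder/dump_jsonl pair added here is what lets raw Telethon message dicts, which contain datetime and bytes values, be persisted as JSON Lines. A minimal usage sketch (the sample records and output directory are illustrative, not part of the package):

    from datetime import datetime
    from opsci_toolbox.apis.telegram import dump_jsonl

    records = [
        {"id": 1, "date": datetime(2024, 1, 1), "message": "hello"},
        {"id": 2, "date": datetime(2024, 1, 2), "raw": b"\x00\x01"},
    ]
    # writes ./example.jsonl with one JSON object per line; datetime/bytes are stringified
    path = dump_jsonl(records, ".", "example")
    print(path)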
@@ -38,7 +68,7 @@ def parse_mediafileid(message: Message) -> str:
         else:
             grouped_id = message_id

-        media_fileid = str(channel_id)+'_'+str(grouped_id)+'_'+str(media_id)
+        media_fileid = str(int(channel_id))+'_'+str(int(grouped_id))+'_'+str(int(media_id))
         return media_fileid
     else:
         return None
@@ -55,10 +85,10 @@ def parse_message_entities(messages : list) -> pd.DataFrame:
         pd.DataFrame : a DataFrame containing the parsed entities.
     """
     all_records = []
-    for
-        raw_text = message.raw_text
+    for data in messages:
+        # raw_text = message.raw_text

-        data = message.to_dict()
+        # data = message.to_dict()

         message_id = data.get("id")

@@ -90,10 +120,10 @@ def parse_message_entities(messages : list) -> pd.DataFrame:
             url = entity.get("url")
             document_id = entity.get("document_id")

-            entity_record = (message_id, channel_id, from_id, grouped_id, message,
+            entity_record = (message_id, channel_id, from_id, grouped_id, message, entity_type, offset, length, url, document_id)
             all_records.append(entity_record)

-    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "entity_type", "offset", "length", "url", "document_id"])
     return df


@@ -110,9 +140,9 @@ def parse_messages(messages : list) -> pd.DataFrame:

     all_records = []
     for message in messages:
-        raw_text = message.raw_text
+        # raw_text = message.raw_text

-        data = message
+        data = message

         message_id = data.get("id")

@@ -166,10 +196,10 @@ def parse_messages(messages : list) -> pd.DataFrame:
         engagements = forwards + replies + total_reactions


-        post_record = (message_id, channel_id, from_id, grouped_id, date, message,
+        post_record = (message_id, channel_id, from_id, grouped_id, date, message, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
         all_records.append(post_record)

-    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                          "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                          "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                          "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])
@@ -196,8 +226,6 @@ def parse_reply(reply:dict) -> tuple:

     return reply_to_message_id, reply_to_channel_id

-
-
 def parse_from(data : dict) -> int:
     """
     Parse a peer object from Telegram message.
@@ -288,12 +316,16 @@ def parse_media(media : dict) -> tuple:
     if media.get("_") == "MessageMediaPhoto":
         photo = media.get("photo", {})
         media_id = photo.get("id")
+        if media_id:
+            media_id = str(int(media_id))
         media_date = photo.get("date")
         media_mime_type = "photo"

     elif media.get("_") == "MessageMediaDocument":
         document = media.get("document", {})
         media_id = document.get("id")
+        if media_id:
+            media_id = str(int(media_id))
         media_date = document.get("date")
         media_mime_type = document.get("mime_type")
         media_size = document.get("size")
@@ -344,6 +376,107 @@ def parse_media_id(media : dict) -> int:
         media_id = None
     return media_id

+
+class JSONEncoder(json.JSONEncoder):
+    '''
+    JSONEncoder subclass that knows how to encode date/time and bytes.
+    '''
+    def default(self, o):
+        if isinstance(o, datetime) or isinstance(o, bytes):
+            return str(o)
+        return super().default(o)
+
+def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+    """
+    Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+
+    Args:
+        data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+        path (str): The directory path where the JSON Lines file will be saved.
+        name (str): The name of the JSON Lines file (without the extension).
+
+    Returns:
+        str: The full path to the saved JSON Lines file.
+    """
+    file_path = os.path.join(path, name + '.jsonl')
+    with open(file_path, 'w') as file:
+        for entry in data:
+            json.dump(entry, file, cls=JSONEncoder)
+            file.write('\n')
+    return file_path
+
+
+async def get_forwarded_messages(client: TelegramClient, phone_number: str, channel_username: int, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+    try:
+        await client.start(phone_number)
+
+        path_json = create_dir(os.path.join(path_file, "JSON"))
+
+        # Fetch the messages from the channel
+        # forwarded_messages = []
+        forwarded_messages_dict = []
+        new_channels = set()
+
+        async for message in client.iter_messages(channel_username,
+                                                  limit=limit,
+                                                  offset_date=offset_date,
+                                                  reverse=reverse):
+            # Check if the message is a forward
+            if message.forward and hasattr(message.forward.chat, 'username'):
+                # forwarded_messages.append(message)
+                forwarded_messages_dict.append(message.to_dict())
+
+                if message.forward.chat:
+                    new_channel = message.forward.chat.username
+                    if new_channel:
+                        new_channels.add(new_channel)
+
+        if forwarded_messages_dict:
+            dump_jsonl(forwarded_messages_dict, path_json, str(channel_username))
+
+    except (ChannelPrivateError, ChannelInvalidError):
+        print(f"Cannot access channel: {channel_username}")
+
+    return forwarded_messages_dict, new_channels
+
+
+async def recursive_forward_scraper(seed_channels, depth, client: TelegramClient, phone_number: str, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+    """
+    Recursively collects forwarded messages from channels, starting from the seed channels up to a specific depth.
+    """
+    all_forwarded_messages = []
+    visited_channels = set(seed_channels)
+    current_level_channels = set(seed_channels)
+
+    path_json = create_dir(os.path.join(path_file, "CHANNELS"))
+
+    for level in range(depth):
+        print(level)
+        print(f"Processing level {level + 1} with {len(current_level_channels)} channels...")
+        next_level_channels = set()
+
+        # Iterate through channels at the current level
+        for channel in tqdm(current_level_channels, total=len(current_level_channels), desc="get messages"):
+            forwarded_msgs, discovered_channels = await get_forwarded_messages(client, phone_number, channel, reverse, limit, offset_date, path_file)
+
+            # Collect forwarded messages
+            all_forwarded_messages.extend(forwarded_msgs)
+
+            # Add newly discovered channels to the next level, excluding already visited ones
+            for new_channel in discovered_channels:
+                if new_channel not in visited_channels:
+                    next_level_channels.add(new_channel)
+                    visited_channels.add(new_channel)
+        # Update the set of channels for the next level of recursion
+        current_level_channels = next_level_channels
+
+        if not current_level_channels:
+            break
+
+    write_json(visited_channels, path_json, "visited_channels")
+    return all_forwarded_messages, visited_channels
+
+
 def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     """
     Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.
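Taken together, get_forwarded_messages and recursive_forward_scraper implement a snowball collection: forwards found in the seed channels reveal new channels, which are crawled at the next depth level, with a JSONL dump per channel and the visited-channel list written to disk. A hedged driver sketch (API credentials and the seed channel are placeholders):

    from telethon.sync import TelegramClient
    from opsci_toolbox.apis.telegram import recursive_forward_scraper

    api_id, api_hash, phone_number = 12345, "your_api_hash", "+33600000000"  # placeholders
    client = TelegramClient("session_name", api_id, api_hash)

    # crawl two hops out from the seed; per-channel JSONL files land in files/JSON,
    # the set of visited channels under files/CHANNELS
    messages, channels = client.loop.run_until_complete(
        recursive_forward_scraper(["some_seed_channel"], depth=2, client=client,
                                  phone_number=phone_number, limit=200)
    )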
@@ -392,7 +525,80 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
     return df

-async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
+# async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
+#     """
+#     Retrieves messages from a Telegram channel by date.
+
+#     Args:
+#         client (TelegramClient): The Telegram client instance.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel to retrieve messages from.
+#         dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
+#         reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
+#         limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
+#         offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
+#         path_file (str, optional): The path to save the downloaded files. Defaults to "files".
+
+#     Returns:
+#         list: A list of messages retrieved from the channel.
+
+#     Raises:
+#         Exception: If there is an error during the retrieval process.
+
+#     """
+#     try:
+#         await client.start(phone_number)
+
+#         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+#         path_messages = create_dir(os.path.join(path_file, "messages"))
+#         path_entities = create_dir(os.path.join(path_file, "entities"))
+
+#         if dl_files:
+#             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
+#         if dl_thumbs:
+#             current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
+
+#         # Get the message history
+#         messages = []
+
+#         async for message in client.iter_messages(channel_username,
+#                                                   limit=limit,
+#                                                   offset_date=offset_date,
+#                                                   reverse=reverse):
+#             messages.append(message)
+
+#             if dl_files:
+
+#                 media_fileid = parse_mediafileid(message)
+#                 if media_fileid:
+#                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
+
+#             if dl_thumbs:
+#                 media_fileid = parse_mediafileid(message)
+#                 if media_fileid:
+#                     try:
+#                         await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
+#                     except Exception as e:
+#                         pass
+#                         print(e)
+
+#         df_exploded = parse_messages(messages)
+#         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
+#         write_pickle(df_exploded, path_messages, str(channel_username))
+
+#         df_entities = parse_message_entities(messages)
+#         write_pickle(df_entities, path_entities, str(channel_username))
+
+#         # df_messages = group_by_post(df_exploded)
+#         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
+#         # write_pickle(df_messages, current_path_file, str(channel_username))
+
+#         return messages
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+
+async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files", dl_thumbs : bool = False) -> list:
     """
     Retrieves messages from a Telegram channel by date.

@@ -417,6 +623,7 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
         await client.start(phone_number)

         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_json = create_dir(os.path.join(path_file, "JSON"))
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

@@ -426,13 +633,16 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
             current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))

         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []

         async for message in client.iter_messages(channel_username,
                                                   limit=limit,
                                                   offset_date=offset_date,
                                                   reverse=reverse):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())
+

         if dl_files:

@@ -449,18 +659,19 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
                         pass
                         print(e)

-
+        dump_jsonl(messages_dict, path_json, str(channel_username))
+        df_exploded = parse_messages(messages_dict)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         write_pickle(df_exploded, path_messages, str(channel_username))

-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(messages_dict)
         write_pickle(df_entities, path_entities, str(channel_username))

         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))

-        return
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
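After this refactor get_messages_by_date accumulates message.to_dict() entries, dumps them to JSONL next to the pickled DataFrames, and returns the list of dicts instead of None. A hedged usage sketch (credentials and channel name are placeholders):

    from datetime import datetime
    from telethon.sync import TelegramClient
    from opsci_toolbox.apis.telegram import get_messages_by_date

    client = TelegramClient("session_name", 12345, "your_api_hash")  # placeholder credentials
    messages = client.loop.run_until_complete(
        get_messages_by_date(client, "+33600000000", "some_channel",
                             dl_files=False, limit=100,
                             offset_date=datetime(2024, 1, 1), path_file="files")
    )
    # messages is a list of dicts; parsed DataFrames are pickled under files/messages and
    # files/entities, and the raw dump is written under files/JSON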
@@ -486,16 +697,18 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
     """
     try:
         await client.start(phone_number)
-
+        path_json = create_dir(os.path.join(path_file, "JSON"))
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []

         async for message in client.iter_messages(channel_username,
                                                   ids = ids):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())

         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
@@ -503,11 +716,13 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))

-
+
+        dump_jsonl(messages_dict, path_json, str(channel_username))
+        df_exploded = parse_messages(messages_dict)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         write_pickle(df_exploded, path_messages, str(channel_username))

-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(messages_dict)
         write_pickle(df_entities, path_entities, str(channel_username))


@@ -515,7 +730,7 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))

-        return
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
@@ -544,6 +759,7 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
         await client.start(phone_number)

         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_json = create_dir(os.path.join(path_file, "JSON"))
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

@@ -551,12 +767,14 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))

         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []

         async for message in client.iter_messages(channel_username,
                                                   search=search,
                                                   limit=limit):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())

         if dl_files:

@@ -564,12 +782,13 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))

-
+
+        df_exploded = parse_messages(messages_dict)
         df_exploded['search']=search
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(messages_dict)

         if channel_username:
             write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
@@ -578,7 +797,7 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
             write_pickle(df_exploded, path_messages, str(search))
             write_pickle(df_entities, path_entities, str(search))

-        return
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
@@ -592,10 +811,10 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
         path_messages = create_dir(os.path.join(path_file, "messages"))
         path_entities = create_dir(os.path.join(path_file, "entities"))

-
+        comments_dict = []

         async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
-
+            comments_dict.append(comment.to_dict())


         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
@@ -603,14 +822,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
                 if media_fileid:
                     await comment.download_media(file=os.path.join(current_path_img, media_fileid))

-        df_comments = parse_messages(
+        df_comments = parse_messages(comments_dict)
         df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
         write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))

-        df_entities = parse_message_entities(
+        df_entities = parse_message_entities(comments_dict)
         write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))

-        return
+        return comments_dict

     finally:
         # Disconnect the client
@@ -1043,7 +1262,97 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
 #     # Disconnect the client
 #     await client.disconnect()

-async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
+# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
+#     """
+#     Retrieves information about a Telegram channel.
+
+#     Args:
+#         api_id (int): The API ID of the Telegram application.
+#         api_hash (str): The API hash of the Telegram application.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel.
+
+#     Returns:
+#         dict: A dictionary containing the full information of the channel.
+
+#     Raises:
+#         Exception: If there is an error during the retrieval of channel information.
+#     """
+#     client = TelegramClient('session_name', api_id, api_hash)
+#     try:
+#         await client.start(phone_number)
+#         channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+#         channel_full_info_json = channel_full_info.to_dict()
+#         img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+
+#     return channel_full_info_json
+
+
+# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_project :str, DL_profile_pic : bool = False) -> dict:
+#     """
+#     Retrieves information about a Telegram channel.
+
+#     Args:
+#         api_id (int): The API ID of the Telegram application.
+#         api_hash (str): The API hash of the Telegram application.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel.
+
+#     Returns:
+#         dict: A dictionary containing the full information of the channel.
+
+#     Raises:
+#         Exception: If there is an error during the retrieval of channel information.
+#     """
+#     client = TelegramClient('session_name', api_id, api_hash)
+#     path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+#     path_json = create_dir(os.path.join(path_project, "JSON"))
+
+#     try:
+#         await client.start(phone_number)
+#         try:
+#             channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+
+#             if channel_full_info:
+#                 channel_full_info_dict = channel_full_info.to_dict()
+#                 channel_full_info_json = JSONEncoder().encode(channel_full_info_dict)
+#             else:
+#                 channel_full_info_dict = {'_': 'Channel', 'id': channel_username, 'title':'private_channel'}
+#             write_json(channel_full_info_json, path_json, f"{str(channel_username)}")
+
+#             if DL_profile_pic:
+#                 img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+
+#         except Exception as e:
+#             pass
+#             print(channel_username, e)
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+
+#     return channel_full_info_dict
+
+def dump_json(json_dict: dict, path: str, name: str) -> str:
+    """
+    Write a dictionary to a JSON file.
+
+    Args:
+        json_dict (dict): The dictionary to be written to the JSON file.
+        path (str): The directory path where the JSON file will be saved.
+        name (str): The name of the JSON file (without the extension).
+
+    Returns:
+        str: The full path to the saved JSON file.
+    """
+    file_path = os.path.join(path, name + '.json')
+    with open(file_path, 'w') as outfile:
+        json.dump(json_dict, outfile, cls=JSONEncoder)
+    return file_path
+
+async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, path_project: str, DL_profile_pic: bool = False) -> dict:
     """
     Retrieves information about a Telegram channel.

@@ -1060,17 +1369,49 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
         Exception: If there is an error during the retrieval of channel information.
     """
     client = TelegramClient('session_name', api_id, api_hash)
+    path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+    path_json = create_dir(os.path.join(path_project, "JSON"))
+
     try:
         await client.start(phone_number)
-
-
-
+        try:
+            # Fetch full channel info
+            channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+
+            # If channel info is retrieved
+            if channel_full_info:
+                channel_full_info_dict = channel_full_info.to_dict()
+            else:
+                channel_full_info_dict = {'_': 'ChatFull',
+                                          'full_chat': {'_': 'ChannelFull',
+                                                        'id': channel_username,
+                                                        },
+                                          'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                          }
+
+            # Save the dictionary as JSON (no need to pre-encode it to a string)
+            dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+
+            # Optionally download profile picture
+            if DL_profile_pic:
+                img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
+
+        except Exception as e:
+            print(channel_username, e)
+            channel_full_info_dict = {'_': 'ChatFull',
+                                      'full_chat': {'_': 'ChannelFull',
+                                                    'id': channel_username,
+                                                    },
+                                      'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                      }
+            dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+            return {'_': 'Channel', 'id': channel_username, 'title': 'private_channel'}
+
     finally:
         # Disconnect the client
         await client.disconnect()

-    return
-
+    return channel_full_info_dict

 def parse_channel(channel : dict) -> pd.DataFrame:
     """
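The rewritten get_channel_info now creates its output folders itself, falls back to a minimal ChatFull-shaped dict when the request fails, and always leaves a JSON copy on disk. A hedged call sketch (credentials are placeholders; since the module applies nest_asyncio, a plain asyncio.run is assumed to work here):

    import asyncio
    from opsci_toolbox.apis.telegram import get_channel_info

    info = asyncio.run(
        get_channel_info(12345, "your_api_hash", "+33600000000",
                         "some_channel", path_project="project", DL_profile_pic=False)
    )
    # info is the ChatFull dict; a JSON copy is saved under project/JSON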
@@ -1106,7 +1447,6 @@ def parse_channel(channel : dict) -> pd.DataFrame:
     chats = channel.get("chats", [])
     if chats:
         for chat in chats:
-            print(chat)
             if chat.get("_") == "Channel":
                 if chat.get("id") == channel_id:
                     creation_date = chat.get("date", datetime(1970,1,1))
opsci_toolbox/apis/webscraping.py

@@ -97,9 +97,14 @@ def url_get_domain(url: str) -> str:
     Returns:
         str: The domain name extracted from the URL.
     """
-
-
-
+    try:
+        parsed_url = urlparse(url)
+        domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
+        return domain
+    except Exception as e:
+        pass
+        print(url, e)
+        return url


 def url_get_extension(url: str) -> str:
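The reworked url_get_domain prefers urlparse().hostname and falls back to netloc, returning the raw input if parsing raises. A quick illustrative check (example URLs, not package tests):

    from opsci_toolbox.apis.webscraping import url_get_domain

    print(url_get_domain("https://sub.example.com/path?q=1"))       # sub.example.com
    print(url_get_domain("https://user:pw@example.com:8080/page"))  # example.com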
opsci_toolbox/helpers/nlp.py

@@ -30,7 +30,7 @@ from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
-
+from nltk.tokenize import PunktSentenceTokenizer

 ####################################################################
 # CLEANING
@@ -126,7 +126,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
     df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
-    df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
+    # df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
     return df
@@ -911,6 +911,8 @@ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_c
     """
     metrics_dict = dict()
     # metrics_dict[col_id]=(col_id,'first')
+    # if col_id != col_chunk_id:
+    #     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
     metrics_dict[col_engagement]=(col_engagement,'first')

@@ -1578,10 +1580,10 @@ def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100,

     return df

-def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
+def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False, threshold: int = None) -> pd.DataFrame:
     """
-    Split a text into chunks of n sentences
-
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns.
+
     Parameters:
     nlp : spacy.language.Language
         The spaCy language processing pipeline.
@@ -1597,41 +1599,142 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
         The number of processes to use for text processing. Default is 1.
     stats : bool, optional
         Flag indicating whether to compute statistics about the splitting process. Default is False.
-
+    threshold : int, optional
+        Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+
     Returns:
     pd.DataFrame
-        DataFrame containing the split sentences.
+        DataFrame containing the split sentences with their start and end indexes in separate columns.

-    Description:
-    This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
-    Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
     """
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total=len(text), desc="Sentence splitting"):
+        sentences = []
+
+
+        # Extract sentences and their positions
+        for sent in doc.sents:
+            sentences.append((sent.text, sent.start_char, sent.end_char))

-    text=list(df[col_text].astype('unicode').values)
-
-    count_sentences=[]
-    count_batches=[]
-    results=[]
-    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "Sentence splitting"):
-        # Split the text into sentences
-        sentences = [sent.text for sent in doc.sents]
         if stats:
             count_sentences.append(len(sentences))
-
-
+
+        if n_sentences > 1:
+            # # Split sentences into batches of size n_sentences
             batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
-
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
             results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
             if stats:
                 count_batches.append(len(concatenate_batches))
-
         else:
-
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])

     df['sentences'] = results
-
-
-
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences','start_indexes', 'end_indexes']).reset_index(drop=True)
+
+    return df
+
+
+def split_n_sentences_nltk(df: pd.DataFrame, col_text: str, n_sentences: int = 1, threshold: int = None, stats: bool = False) -> pd.DataFrame:
+    """
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns using NLTK PunktSentenceTokenizer.
+
+    Parameters:
+    df : pd.DataFrame
+        DataFrame containing the text data to split.
+    col_text : str
+        The name of the column containing the text data.
+    n_sentences : int, optional
+        The number of sentences to group together. Default is 1.
+    threshold : int, optional
+        Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the split sentences with their start and end indexes in separate columns.
+
+    """
+    tokenizer = PunktSentenceTokenizer()
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(text, total=len(text), desc="Sentence splitting"):
+        sentences = []
+        start_pos = 0
+
+        # Tokenize sentences and compute positions
+        for sent in tokenizer.tokenize(doc):
+            start_idx = doc.find(sent, start_pos)
+            end_idx = start_idx + len(sent)
+            sentences.append((sent, start_idx, end_idx))
+            start_pos = end_idx
+
+        if stats:
+            count_sentences.append(len(sentences))
+
+        if n_sentences > 1:
+            # Split sentences into batches of size n_sentences
+            batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
+            results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
+            if stats:
+                count_batches.append(len(concatenate_batches))
+        else:
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])
+
+    df['sentences'] = results
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences', 'start_indexes', 'end_indexes']).reset_index(drop=True)
+
     return df


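Both splitters now return one row per chunk, carrying the chunk text plus its character span in the source string. A hedged sketch of the NLTK variant (sample data is illustrative; nltk must be installed):

    import pandas as pd
    from opsci_toolbox.helpers.nlp import split_n_sentences_nltk

    df = pd.DataFrame({"text": ["First sentence. Second sentence. Third sentence."]})
    out = split_n_sentences_nltk(df, col_text="text", n_sentences=2)
    print(out[["sentences", "start_indexes", "end_indexes"]])
    # row 0: "First sentence. Second sentence."  span (0, 32)
    # row 1: "Third sentence."                   span (33, 48)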
@@ -2404,3 +2507,46 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
     write_json(results, dir_json , str(filename))

     return results
+
+
+def add_tag_libretranslate_not_translate(text):
+    """
+    This function add fake html tag around words such as mentions, hashtags, urls and emojis to avoid translation of those tokens.
+
+    Args:
+        text (str): The text to process
+
+    Returns:
+        str: The text with the fake html tags
+    """
+    # This regex finds words starting with # and followed by alphanumeric characters or underscores
+    mention_pattern = r"(?:RT\s|QT\s)?(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z0-9_]{4,15})"
+    hashtag_pattern = r"(\B#\w+)"
+    url_pattern = r"(https?://[^ ]+)"
+    emoji_pattern = r':[a-zA-Z_]+:'
+
+    pattern = re.compile(emoji_pattern+ "|" + mention_pattern + "|" + hashtag_pattern + "|" + url_pattern)
+
+    # This function replaces the hashtag with an HTML link tag
+    def replace_with_link(match):
+        matcher_group = match.group(0)
+        return f'<a href="{matcher_group}"></a>'
+
+    # Use re.sub to substitute the hashtags with the HTML link tags
+    text_no_emojis = emoji.demojize(text)
+    result = re.sub(pattern, replace_with_link, text_no_emojis)
+
+    return result
+
+def clean_libre_translate_tags(text):
+    """
+    This function remove fake tags added by add_tag_libretranslate_not_translate() function.
+
+    Args:
+        text (str): The text to process
+
+    Returns:
+        str: The text with the fake html tags
+    """
+    cleaned_string = text.replace('<a href="', '').replace('"></a>', '')
+    return cleaned_string
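These two helpers are meant to bracket a LibreTranslate call: mentions, hashtags, URLs and demojized emoji shortcodes get wrapped in dummy anchor tags so that an HTML-aware translator is expected to leave them untouched, and the tags are stripped afterwards. A hedged round-trip sketch (no actual translation request shown):

    from opsci_toolbox.helpers.nlp import (add_tag_libretranslate_not_translate,
                                           clean_libre_translate_tags)

    text = "RT @toolbox_user check https://example.com #opensource"
    protected = add_tag_libretranslate_not_translate(text)
    # e.g. the URL is now wrapped as <a href="https://example.com"></a>
    # ... send `protected` to the translation service here ...
    restored = clean_libre_translate_tags(protected)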
opsci_toolbox/helpers/nlp_cuml.py

@@ -18,7 +18,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
                           metric: str = "cosine",
                           spread: float = 1.0,
                           learning_rate: float = 1.0,
-                          n_epochs:int = 300
+                          n_epochs:int = 300,
+                          random_state:int = None
                           ) -> tuple:
     """
     Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -41,7 +42,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
                         metric=metric,
                         spread = spread,
                         n_epochs=n_epochs,
-                        learning_rate=learning_rate
+                        learning_rate=learning_rate,
+                        random_state=random_state).fit(embeddings)

     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
@@ -56,7 +58,8 @@ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
                                      learning_rate: float = 1.0,
                                      n_epochs:int = 300,
                                      y: np.ndarray = None,
-                                     convert_dtype: bool = False
+                                     convert_dtype: bool = False,
+                                     random_state:int=None
                                      ) -> tuple:
     """
     Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -79,7 +82,8 @@ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
                         metric=metric,
                         spread = spread,
                         n_epochs=n_epochs,
-                        learning_rate=learning_rate
+                        learning_rate=learning_rate,
+                        random_state=random_state).fit(X = embeddings, y = y, convert_dtype = convert_dtype)

     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
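The new random_state parameter is passed straight through to cuML's UMAP, so a fixed seed makes the reduction reproducible across runs. A hedged sketch, assuming a CUDA environment with cuml installed:

    import numpy as np
    from opsci_toolbox.helpers.nlp_cuml import reduce_with_cuml_UMAP

    embeddings = np.random.rand(1000, 384).astype(np.float32)  # illustrative embeddings
    reducer, reduced = reduce_with_cuml_UMAP(embeddings, random_state=42)
    # calling again with random_state=42 on the same embeddings is expected to
    # give the same projection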
opsci_toolbox/helpers/sna.py

@@ -11,6 +11,40 @@ from collections import Counter
 from opsci_toolbox.helpers.dataviz import boxplot
 from fa2_modified import ForceAtlas2

+def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: float = 2) -> nx.Graph:
+    """
+    Creates a subgraph containing only the nodes that have at least the specified minimum value for a given metric.
+
+    Args:
+        G (nx.Graph): The input graph.
+        metric (str, optional): The node metric to filter nodes by (e.g., "degree", "in_degree", "out_degree", "degree_centrality"). Default is "degree".
+        min_value (float, optional): The minimum value required for nodes to be included in the subgraph. Default is 2.
+
+    Returns:
+        subgraph (nx.Graph): A subgraph containing only the nodes with at least the specified minimum metric value.
+    """
+
+    if metric == "degree":
+        nodes_with_min_metric = [node for node, value in G.degree() if value >= min_value]
+    elif metric == "in_degree" and G.is_directed():
+        nodes_with_min_metric = [node for node, value in G.in_degree() if value >= min_value]
+    elif metric == "out_degree" and G.is_directed():
+        nodes_with_min_metric = [node for node, value in G.out_degree() if value >= min_value]
+    elif metric == "degree_centrality":
+        centrality = nx.degree_centrality(G)
+        nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+    elif metric == "betweenness_centrality":
+        centrality = nx.betweenness_centrality(G)
+        nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+    elif metric == "closeness_centrality":
+        centrality = nx.closeness_centrality(G)
+        nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+    else:
+        raise ValueError(f"Unsupported metric: {metric}")
+
+    subgraph = G.subgraph(nodes_with_min_metric).copy()
+    return subgraph
+
 def group_nodes_by_values(dictionnary : dict) -> dict:
     """
     Group nodes by their values from a dictionary.
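create_subgraph_min_metric prunes a graph down to the nodes that clear a threshold on one of the supported metrics. A short illustrative run on a toy graph (data is not from the package):

    import networkx as nx
    from opsci_toolbox.helpers.sna import create_subgraph_min_metric

    G = nx.karate_club_graph()
    core = create_subgraph_min_metric(G, metric="degree", min_value=5)
    print(core.number_of_nodes(), "of", G.number_of_nodes(), "nodes kept")

    # centrality-based metrics take fractional thresholds
    central = create_subgraph_min_metric(G, metric="degree_centrality", min_value=0.15)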