PyPI - opsci-toolbox - Versions diffs - 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl - Mend

opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

opsci_toolbox/apis/rapidapi_helpers.py +1 -2
opsci_toolbox/apis/reddit.py +342 -334
opsci_toolbox/apis/telegram.py +471 -41
opsci_toolbox/helpers/common.py +3 -1
opsci_toolbox/helpers/dates.py +1 -1
opsci_toolbox/helpers/nlp.py +178 -33
opsci_toolbox/helpers/nlp_cuml.py +47 -2
opsci_toolbox/helpers/sna.py +34 -0
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/METADATA +2 -2
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/RECORD +13 -12
opsci_toolbox-0.0.15.dist-info/dependency_links.txt +1 -0
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/WHEEL +0 -0
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/top_level.txt +0 -0

opsci_toolbox/apis/telegram.py CHANGED Viewed

@@ -1,17 +1,47 @@
 from datetime import datetime
 from telethon.sync import TelegramClient
 from telethon.tl.functions.channels import GetFullChannelRequest
+from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
 import pandas as pd
 # from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
-from opsci_toolbox.helpers.common import create_dir, write_pickle
+from opsci_toolbox.helpers.common import create_dir, write_pickle, write_json
 import os
 import nest_asyncio
 from telethon.tl.types import Message
-from typing import Optional
+import json
+from tqdm import tqdm
 nest_asyncio.apply()
+class JSONEncoder(json.JSONEncoder):
+    '''
+    JSONEncoder subclass that knows how to encode date/time and bytes.
+    '''
+    def default(self, o):
+        if isinstance(o, datetime) or isinstance(o, bytes):
+            return str(o)
+        return super().default(o)
+def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+    """
+    Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+    Args:
+        data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+        path (str): The directory path where the JSON Lines file will be saved.
+        name (str): The name of the JSON Lines file (without the extension).
+    Returns:
+        str: The full path to the saved JSON Lines file.
+    """
+    file_path = os.path.join(path, name + '.jsonl')
+    with open(file_path, 'w') as file:
+        for entry in data:
+            json.dump(entry, file, cls=JSONEncoder)
+            file.write('\n')
+    return file_path
 def parse_mediafileid(message: Message) -> str:
     """
     Parse the media file ID from a Telegram message.
@@ -38,12 +68,65 @@ def parse_mediafileid(message: Message) -> str:
         else:
             grouped_id = message_id
-        media_fileid = str(channel_id)+'_'+str(grouped_id)+'_'+str(media_id)
+        media_fileid = str(int(channel_id))+'_'+str(int(grouped_id))+'_'+str(int(media_id))
         return media_fileid
     else:
         return None
+def parse_message_entities(messages : list) -> pd.DataFrame:
+    """
+    Parse Telegram messages entities.
+    Args:
+        messages : a list of Telegram messages.
+    Returns:
+        pd.DataFrame : a DataFrame containing the parsed entities.
+    """
+    all_records = []
+    for data in messages:
+        # raw_text = message.raw_text
+        # data = message.to_dict()
+        message_id = data.get("id")
+        peer_id = data.get("peer_id", {})
+        if peer_id is None:
+            peer_id = {}
+        channel_id = parse_from(peer_id)
+        from_id = data.get("from_id", {})
+        if from_id is None :
+            from_id = {}
+        from_id = parse_from(from_id)
+        if from_id is None:
+            from_id = channel_id
+        grouped_id = data.get("grouped_id")
+        if grouped_id:
+            grouped_id = grouped_id
+        else:
+            grouped_id = message_id
+        message = data.get("message")
+        entities = data.get("entities", [])
+        for entity in entities:
+            entity_type = entity.get("_")
+            offset = entity.get("offset")
+            length = entity.get("length")
+            url = entity.get("url")
+            document_id = entity.get("document_id")
+            entity_record = (message_id, channel_id, from_id, grouped_id, message, entity_type, offset, length, url, document_id)
+            all_records.append(entity_record)
+    df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "entity_type", "offset", "length", "url", "document_id"])
+    return df
 def parse_messages(messages : list) -> pd.DataFrame:
     """
     Parse Telegram messages.
@@ -57,8 +140,9 @@ def parse_messages(messages : list) -> pd.DataFrame:
     all_records = []
     for message in messages:
+        # raw_text = message.raw_text
-        data = message.to_dict()
+        data = message
         message_id = data.get("id")
@@ -142,8 +226,6 @@ def parse_reply(reply:dict) -> tuple:
     return reply_to_message_id, reply_to_channel_id
 def parse_from(data : dict) -> int:
     """
     Parse a peer object from Telegram message.
@@ -234,12 +316,16 @@ def parse_media(media : dict) -> tuple:
         if media.get("_") == "MessageMediaPhoto":
             photo = media.get("photo", {})
             media_id = photo.get("id")
+            if media_id:
+                media_id = str(int(media_id))
             media_date = photo.get("date")
             media_mime_type = "photo"
         elif media.get("_") == "MessageMediaDocument":
             document = media.get("document", {})
             media_id = document.get("id")
+            if media_id:
+                media_id = str(int(media_id))
             media_date = document.get("date")
             media_mime_type = document.get("mime_type")
             media_size = document.get("size")
@@ -290,6 +376,107 @@ def parse_media_id(media : dict) -> int:
             media_id = None
     return media_id
+class JSONEncoder(json.JSONEncoder):
+    '''
+    JSONEncoder subclass that knows how to encode date/time and bytes.
+    '''
+    def default(self, o):
+        if isinstance(o, datetime) or isinstance(o, bytes):
+            return str(o)
+        return super().default(o)
+def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+    """
+    Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+    Args:
+        data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+        path (str): The directory path where the JSON Lines file will be saved.
+        name (str): The name of the JSON Lines file (without the extension).
+    Returns:
+        str: The full path to the saved JSON Lines file.
+    """
+    file_path = os.path.join(path, name + '.jsonl')
+    with open(file_path, 'w') as file:
+        for entry in data:
+            json.dump(entry, file, cls=JSONEncoder)
+            file.write('\n')
+    return file_path
+async def get_forwarded_messages(client: TelegramClient, phone_number: str, channel_username: int, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+    try:
+        await client.start(phone_number)
+        path_json = create_dir(os.path.join(path_file, "JSON"))
+        # Fetch the messages from the channel
+        # forwarded_messages = []
+        forwarded_messages_dict = []
+        new_channels = set()
+        async for message in client.iter_messages(channel_username,
+                                                  limit=limit,
+                                                  offset_date=offset_date,
+                                                  reverse=reverse):
+            # Check if the message is a forward
+            if message.forward and hasattr(message.forward.chat, 'username'):
+                # forwarded_messages.append(message)
+                forwarded_messages_dict.append(message.to_dict())
+                if message.forward.chat:
+                    new_channel = message.forward.chat.username
+                    if new_channel:
+                        new_channels.add(new_channel)
+        if forwarded_messages_dict:
+            dump_jsonl(forwarded_messages_dict, path_json, str(channel_username))
+    except (ChannelPrivateError, ChannelInvalidError):
+        print(f"Cannot access channel: {channel_username}")
+    return forwarded_messages_dict, new_channels
+async def recursive_forward_scraper(seed_channels, depth, client: TelegramClient, phone_number: str,  reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+    """
+    Recursively collects forwarded messages from channels, starting from the seed channels up to a specific depth.
+    """
+    all_forwarded_messages = []
+    visited_channels = set(seed_channels)
+    current_level_channels = set(seed_channels)
+    path_json = create_dir(os.path.join(path_file, "CHANNELS"))
+    for level in range(depth):
+        print(level)
+        print(f"Processing level {level + 1} with {len(current_level_channels)} channels...")
+        next_level_channels = set()
+        # Iterate through channels at the current level
+        for channel in tqdm(current_level_channels, total=len(current_level_channels), desc="get messages"):
+            forwarded_msgs, discovered_channels = await get_forwarded_messages(client, phone_number, channel, reverse, limit, offset_date, path_file)
+            # Collect forwarded messages
+            all_forwarded_messages.extend(forwarded_msgs)
+            # Add newly discovered channels to the next level, excluding already visited ones
+            for new_channel in discovered_channels:
+                if new_channel not in visited_channels:
+                    next_level_channels.add(new_channel)
+                    visited_channels.add(new_channel)
+        # Update the set of channels for the next level of recursion
+        current_level_channels = next_level_channels
+        if not current_level_channels:
+            break
+    write_json(visited_channels, path_json, "visited_channels")
+    return all_forwarded_messages, visited_channels
 def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     """
     Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.
@@ -338,7 +525,80 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
     df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
     return df
-async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files") -> list:
+# async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
+#     """
+#     Retrieves messages from a Telegram channel by date.
+#     Args:
+#         client (TelegramClient): The Telegram client instance.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel to retrieve messages from.
+#         dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
+#         reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
+#         limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
+#         offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
+#         path_file (str, optional): The path to save the downloaded files. Defaults to "files".
+#     Returns:
+#         list: A list of messages retrieved from the channel.
+#     Raises:
+#         Exception: If there is an error during the retrieval process.
+#     """
+#     try:
+#         await client.start(phone_number)
+#         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+#         path_messages = create_dir(os.path.join(path_file, "messages"))
+#         path_entities = create_dir(os.path.join(path_file, "entities"))
+#         if dl_files:
+#             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
+#         if dl_thumbs:
+#             current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
+#         # Get the message history
+#         messages = []
+#         async for message in client.iter_messages(channel_username,
+#                                                   limit=limit,
+#                                                   offset_date=offset_date,
+#                                                   reverse=reverse):
+#             messages.append(message)
+#             if dl_files:
+#                 media_fileid = parse_mediafileid(message)
+#                 if media_fileid:
+#                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
+#             if dl_thumbs:
+#                 media_fileid = parse_mediafileid(message)
+#                 if media_fileid:
+#                     try:
+#                         await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
+#                     except Exception as e:
+#                         pass
+#                         print(e)
+#         df_exploded = parse_messages(messages)
+#         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
+#         write_pickle(df_exploded, path_messages, str(channel_username))
+#         df_entities = parse_message_entities(messages)
+#         write_pickle(df_entities, path_entities, str(channel_username))
+#         # df_messages = group_by_post(df_exploded)
+#         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
+#         # write_pickle(df_messages, current_path_file, str(channel_username))
+#         return messages
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files", dl_thumbs : bool = False) -> list:
     """
     Retrieves messages from a Telegram channel by date.
@@ -362,18 +622,27 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
     try:
         await client.start(phone_number)
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_json = create_dir(os.path.join(path_file, "JSON"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
+        if dl_thumbs:
+            current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []
         async for message in client.iter_messages(channel_username,
                                                   limit=limit,
                                                   offset_date=offset_date,
                                                   reverse=reverse):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())
             if dl_files:
@@ -381,22 +650,35 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
-        df_exploded = parse_messages(messages)
+            if dl_thumbs:
+                media_fileid = parse_mediafileid(message)
+                if media_fileid:
+                    try:
+                        await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
+                    except Exception as e:
+                        pass
+                        print(e)
+        dump_jsonl(messages_dict, path_json, str(channel_username))
+        df_exploded = parse_messages(messages_dict)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
-        write_pickle(df_exploded, current_path_file, str(channel_username))
+        write_pickle(df_exploded, path_messages, str(channel_username))
+        df_entities = parse_message_entities(messages_dict)
+        write_pickle(df_entities, path_entities, str(channel_username))
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))
-        return messages
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
 async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel_username:int, dl_files:bool=False, ids:list=[], path_file:str="files")-> list:
     """
-    Retrieves messages from a Telegram channel by date.
+    Retrieves messages from a Telegram channel by IDS.
     Args:
         client (TelegramClient): The Telegram client instance.
@@ -415,15 +697,18 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
     """
     try:
         await client.start(phone_number)
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_json = create_dir(os.path.join(path_file, "JSON"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []
         async for message in client.iter_messages(channel_username,
                                                   ids = ids):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())
             if dl_files:
                 current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
@@ -431,14 +716,21 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
-        df_exploded = parse_messages(messages)
+        dump_jsonl(messages_dict, path_json, str(channel_username))
+        df_exploded = parse_messages(messages_dict)
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
-        write_pickle(df_exploded, current_path_file, str(channel_username))
+        write_pickle(df_exploded, path_messages, str(channel_username))
+        df_entities = parse_message_entities(messages_dict)
+        write_pickle(df_entities, path_entities, str(channel_username))
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
         # write_pickle(df_messages, current_path_file, str(channel_username))
-        return messages
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
@@ -466,17 +758,23 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
     try:
         await client.start(phone_number)
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_json = create_dir(os.path.join(path_file, "JSON"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
         # Get the message history
-        messages = []
+        # messages = []
+        messages_dict = []
         async for message in client.iter_messages(channel_username,
                                                   search=search,
                                                   limit=limit):
-            messages.append(message)
+            # messages.append(message)
+            messages_dict.append(message.to_dict())
             if dl_files:
@@ -484,17 +782,22 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
                 if media_fileid:
                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
-        df_exploded = parse_messages(messages)
+        df_exploded = parse_messages(messages_dict)
         df_exploded['search']=search
         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
         # df_messages = group_by_post(df_exploded)
         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
+        df_entities = parse_message_entities(messages_dict)
         if channel_username:
-            write_pickle(df_exploded, current_path_file, str(search)+'_'+str(channel_username))
+            write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
+            write_pickle(df_entities, path_entities, str(search)+'_'+str(channel_username))
         else:
-            write_pickle(df_exploded, current_path_file, str(search))
+            write_pickle(df_exploded, path_messages, str(search))
+            write_pickle(df_entities, path_entities, str(search))
-        return messages
+        return messages_dict
     finally:
         # Disconnect the client
         await client.disconnect()
@@ -504,12 +807,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
         # Connect the client
         await client.start(phone_number)
-        current_path_file = create_dir(os.path.join(path_file, "messages"))
+        # current_path_file = create_dir(os.path.join(path_file, "messages"))
+        path_messages = create_dir(os.path.join(path_file, "messages"))
+        path_entities = create_dir(os.path.join(path_file, "entities"))
-        comments = []
+        comments_dict = []
         async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
-            comments.append(comment)
+            comments_dict.append(comment.to_dict())
         if dl_files:
             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
@@ -517,11 +822,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
             if media_fileid:
                 await comment.download_media(file=os.path.join(current_path_img, media_fileid))
-        df_comments = parse_messages(comments)
+        df_comments = parse_messages(comments_dict)
         df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
-        write_pickle(df_comments, current_path_file, str(channel_entity)+"_"+str(message_id))
+        write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))
+        df_entities = parse_message_entities(comments_dict)
+        write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))
-        return comments
+        return comments_dict
     finally:
         # Disconnect the client
@@ -954,7 +1262,97 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
 #         # Disconnect the client
 #         await client.disconnect()
-async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str) -> dict:
+# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
+#     """
+#     Retrieves information about a Telegram channel.
+#     Args:
+#         api_id (int): The API ID of the Telegram application.
+#         api_hash (str): The API hash of the Telegram application.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel.
+#     Returns:
+#         dict: A dictionary containing the full information of the channel.
+#     Raises:
+#         Exception: If there is an error during the retrieval of channel information.
+#     """
+#     client = TelegramClient('session_name', api_id, api_hash)
+#     try:
+#         await client.start(phone_number)
+#         channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+#         channel_full_info_json = channel_full_info.to_dict()
+#         img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+#     return channel_full_info_json
+# async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_project :str, DL_profile_pic : bool = False) -> dict:
+#     """
+#     Retrieves information about a Telegram channel.
+#     Args:
+#         api_id (int): The API ID of the Telegram application.
+#         api_hash (str): The API hash of the Telegram application.
+#         phone_number (str): The phone number associated with the Telegram account.
+#         channel_username (str): The username of the channel.
+#     Returns:
+#         dict: A dictionary containing the full information of the channel.
+#     Raises:
+#         Exception: If there is an error during the retrieval of channel information.
+#     """
+#     client = TelegramClient('session_name', api_id, api_hash)
+#     path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+#     path_json = create_dir(os.path.join(path_project, "JSON"))
+#     try:
+#         await client.start(phone_number)
+#         try:
+#             channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+#             if channel_full_info:
+#                 channel_full_info_dict = channel_full_info.to_dict()
+#                 channel_full_info_json = JSONEncoder().encode(channel_full_info_dict)
+#             else:
+#                 channel_full_info_dict = {'_': 'Channel', 'id': channel_username, 'title':'private_channel'}
+#             write_json(channel_full_info_json, path_json, f"{str(channel_username)}")
+#             if DL_profile_pic:
+#                 img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+#         except Exception as e:
+#             pass
+#             print(channel_username, e)
+#     finally:
+#         # Disconnect the client
+#         await client.disconnect()
+#     return channel_full_info_dict
+def dump_json(json_dict: dict, path: str, name: str) -> str:
+    """
+    Write a dictionary to a JSON file.
+    Args:
+        json_dict (dict): The dictionary to be written to the JSON file.
+        path (str): The directory path where the JSON file will be saved.
+        name (str): The name of the JSON file (without the extension).
+    Returns:
+        str: The full path to the saved JSON file.
+    """
+    file_path = os.path.join(path, name + '.json')
+    with open(file_path, 'w') as outfile:
+        json.dump(json_dict, outfile, cls=JSONEncoder)
+    return file_path
+async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, path_project: str, DL_profile_pic: bool = False) -> dict:
     """
     Retrieves information about a Telegram channel.
@@ -971,16 +1369,49 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
         Exception: If there is an error during the retrieval of channel information.
     """
     client = TelegramClient('session_name', api_id, api_hash)
+    path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+    path_json = create_dir(os.path.join(path_project, "JSON"))
     try:
         await client.start(phone_number)
-        channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
-        channel_full_info = channel_full_info.to_dict()
+        try:
+            # Fetch full channel info
+            channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+            # If channel info is retrieved
+            if channel_full_info:
+                channel_full_info_dict = channel_full_info.to_dict()
+            else:
+                channel_full_info_dict = {'_': 'ChatFull',
+                                      'full_chat': {'_': 'ChannelFull',
+                                                    'id': channel_username,
+                                                    },
+                                        'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                    }
+            # Save the dictionary as JSON (no need to pre-encode it to a string)
+            dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+            # Optionally download profile picture
+            if DL_profile_pic:
+                img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
+        except Exception as e:
+            print(channel_username, e)
+            channel_full_info_dict = {'_': 'ChatFull',
+                                      'full_chat': {'_': 'ChannelFull',
+                                                    'id': channel_username,
+                                                    },
+                                        'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                    }
+            dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+            return {'_': 'Channel', 'id': channel_username, 'title': 'private_channel'}
     finally:
         # Disconnect the client
         await client.disconnect()
-    return channel_full_info
+    return channel_full_info_dict
 def parse_channel(channel : dict) -> pd.DataFrame:
     """
@@ -1016,7 +1447,6 @@ def parse_channel(channel : dict) -> pd.DataFrame:
     chats = channel.get("chats", [])
     if chats:
         for chat in chats:
-            print(chat)
             if chat.get("_") == "Channel":
                 if chat.get("id") == channel_id:
                     creation_date = chat.get("date", datetime(1970,1,1))

opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl