opsci-toolbox 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,47 @@
  from datetime import datetime
  from telethon.sync import TelegramClient
  from telethon.tl.functions.channels import GetFullChannelRequest
+ from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
  import pandas as pd
  # from telethon.tl.types import ReactionEmoji, ReactionCustomEmoji, PeerUser, PeerChannel
- from opsci_toolbox.helpers.common import create_dir, write_pickle
+ from opsci_toolbox.helpers.common import create_dir, write_pickle, write_json
  import os
  import nest_asyncio
  from telethon.tl.types import Message
- from typing import Optional
-
+ import json
+ from tqdm import tqdm
  nest_asyncio.apply()
 
 
+ class JSONEncoder(json.JSONEncoder):
+     '''
+     JSONEncoder subclass that knows how to encode date/time and bytes.
+     '''
+     def default(self, o):
+         if isinstance(o, datetime) or isinstance(o, bytes):
+             return str(o)
+         return super().default(o)
+
+ def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+     """
+     Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+
+     Args:
+         data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+         path (str): The directory path where the JSON Lines file will be saved.
+         name (str): The name of the JSON Lines file (without the extension).
+
+     Returns:
+         str: The full path to the saved JSON Lines file.
+     """
+     file_path = os.path.join(path, name + '.jsonl')
+     with open(file_path, 'w') as file:
+         for entry in data:
+             json.dump(entry, file, cls=JSONEncoder)
+             file.write('\n')
+     return file_path
+
+
  def parse_mediafileid(message: Message) -> str:
      """
      Parse the media file ID from a Telegram message.
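The `JSONEncoder`/`dump_jsonl` pair added above lets Telethon's `message.to_dict()` output be serialized directly, since the encoder stringifies the datetime and bytes values that plain `json.dump` would reject. A minimal usage sketch (assumes the 0.0.15 wheel is installed; the record contents below are illustrative):

    from datetime import datetime
    from opsci_toolbox.apis.telegram import dump_jsonl

    # datetime values would raise TypeError in plain json.dump();
    # the custom JSONEncoder converts them (and bytes) to strings.
    records = [
        {"id": 1, "date": datetime(2024, 1, 1), "message": "hello"},
        {"id": 2, "date": datetime(2024, 1, 2), "message": "world"},
    ]
    out = dump_jsonl(records, path=".", name="channel_123")
    print(out)  # ./channel_123.jsonl, one JSON object per line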
@@ -38,7 +68,7 @@ def parse_mediafileid(message: Message) -> str:
          else:
              grouped_id = message_id
 
-         media_fileid = str(channel_id)+'_'+str(grouped_id)+'_'+str(media_id)
+         media_fileid = str(int(channel_id))+'_'+str(int(grouped_id))+'_'+str(int(media_id))
          return media_fileid
      else:
          return None
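The added `int()` casts guard against IDs that arrive as floats, for example after a round-trip through pandas, which would otherwise yield file IDs like `123.0_456.0_789.0`. A quick illustration with made-up IDs:

    channel_id, grouped_id, media_id = 123.0, 456.0, 789.0
    old = str(channel_id) + '_' + str(grouped_id) + '_' + str(media_id)
    new = str(int(channel_id)) + '_' + str(int(grouped_id)) + '_' + str(int(media_id))
    print(old)  # 123.0_456.0_789.0  (breaks filename lookups)
    print(new)  # 123_456_789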
@@ -55,10 +85,10 @@ def parse_message_entities(messages : list) -> pd.DataFrame:
      pd.DataFrame : a DataFrame containing the parsed entities.
      """
      all_records = []
-     for message in messages:
-         raw_text = message.raw_text
+     for data in messages:
+         # raw_text = message.raw_text
 
-         data = message.to_dict()
+         # data = message.to_dict()
 
          message_id = data.get("id")
 
@@ -90,10 +120,10 @@ def parse_message_entities(messages : list) -> pd.DataFrame:
              url = entity.get("url")
              document_id = entity.get("document_id")
 
-             entity_record = (message_id, channel_id, from_id, grouped_id, message, raw_text, entity_type, offset, length, url, document_id)
+             entity_record = (message_id, channel_id, from_id, grouped_id, message, entity_type, offset, length, url, document_id)
              all_records.append(entity_record)
 
-     df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "raw_text", "entity_type", "offset", "length", "url", "document_id"])
+     df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "message", "entity_type", "offset", "length", "url", "document_id"])
      return df
 
 
@@ -110,9 +140,9 @@ def parse_messages(messages : list) -> pd.DataFrame:
 
      all_records = []
      for message in messages:
-         raw_text = message.raw_text
+         # raw_text = message.raw_text
 
-         data = message.to_dict()
+         data = message
 
          message_id = data.get("id")
 
@@ -166,10 +196,10 @@ def parse_messages(messages : list) -> pd.DataFrame:
          engagements = forwards + replies + total_reactions
 
 
-         post_record = (message_id, channel_id, from_id, grouped_id, date, message, raw_text, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
+         post_record = (message_id, channel_id, from_id, grouped_id, date, message, views, forwards, replies, total_reactions, reactions_details, engagements, reply_to_message_id, reply_to_channel_id) + media_record + fwd_record
          all_records.append(post_record)
 
-     df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "raw_text", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
+     df = pd.DataFrame.from_records(all_records, columns=["message_id", "channel_id", "from_id", "grouped_id", "date", "message", "views", "forwards", "replies", "reactions", "details_reactions", "engagements", "reply_to_message_id", "reply_to_channel_id",
                                                           "media_id", "media_date", "media_mime_type", "media_size", "media_filename", "duration", "width", "height",
                                                           "webpage_id", "webpage_url", "webpage_type", "webpage_site_name", "webpage_title", "webpage_description",
                                                           "fwd_date", "fwd_from_id", "fwd_from_post_id", "fwd_from_from_name"])
@@ -196,8 +226,6 @@ def parse_reply(reply:dict) -> tuple:
 
      return reply_to_message_id, reply_to_channel_id
 
-
-
  def parse_from(data : dict) -> int:
      """
      Parse a peer object from Telegram message.
@@ -288,12 +316,16 @@ def parse_media(media : dict) -> tuple:
      if media.get("_") == "MessageMediaPhoto":
          photo = media.get("photo", {})
          media_id = photo.get("id")
+         if media_id:
+             media_id = str(int(media_id))
          media_date = photo.get("date")
          media_mime_type = "photo"
 
      elif media.get("_") == "MessageMediaDocument":
          document = media.get("document", {})
          media_id = document.get("id")
+         if media_id:
+             media_id = str(int(media_id))
          media_date = document.get("date")
          media_mime_type = document.get("mime_type")
          media_size = document.get("size")
@@ -344,6 +376,107 @@ def parse_media_id(media : dict) -> int:
          media_id = None
      return media_id
 
+
+ class JSONEncoder(json.JSONEncoder):
+     '''
+     JSONEncoder subclass that knows how to encode date/time and bytes.
+     '''
+     def default(self, o):
+         if isinstance(o, datetime) or isinstance(o, bytes):
+             return str(o)
+         return super().default(o)
+
+ def dump_jsonl(data: list[dict], path: str, name: str) -> str:
+     """
+     Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+
+     Args:
+         data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+         path (str): The directory path where the JSON Lines file will be saved.
+         name (str): The name of the JSON Lines file (without the extension).
+
+     Returns:
+         str: The full path to the saved JSON Lines file.
+     """
+     file_path = os.path.join(path, name + '.jsonl')
+     with open(file_path, 'w') as file:
+         for entry in data:
+             json.dump(entry, file, cls=JSONEncoder)
+             file.write('\n')
+     return file_path
+
+
+ async def get_forwarded_messages(client: TelegramClient, phone_number: str, channel_username: int, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+     try:
+         await client.start(phone_number)
+
+         path_json = create_dir(os.path.join(path_file, "JSON"))
+
+         # Fetch the messages from the channel
+         # forwarded_messages = []
+         forwarded_messages_dict = []
+         new_channels = set()
+
+         async for message in client.iter_messages(channel_username,
+                                                   limit=limit,
+                                                   offset_date=offset_date,
+                                                   reverse=reverse):
+             # Check if the message is a forward
+             if message.forward and hasattr(message.forward.chat, 'username'):
+                 # forwarded_messages.append(message)
+                 forwarded_messages_dict.append(message.to_dict())
+
+                 if message.forward.chat:
+                     new_channel = message.forward.chat.username
+                     if new_channel:
+                         new_channels.add(new_channel)
+
+         if forwarded_messages_dict:
+             dump_jsonl(forwarded_messages_dict, path_json, str(channel_username))
+
+     except (ChannelPrivateError, ChannelInvalidError):
+         print(f"Cannot access channel: {channel_username}")
+
+     return forwarded_messages_dict, new_channels
+
+
+ async def recursive_forward_scraper(seed_channels, depth, client: TelegramClient, phone_number: str, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files"):
+     """
+     Recursively collects forwarded messages from channels, starting from the seed channels up to a specific depth.
+     """
+     all_forwarded_messages = []
+     visited_channels = set(seed_channels)
+     current_level_channels = set(seed_channels)
+
+     path_json = create_dir(os.path.join(path_file, "CHANNELS"))
+
+     for level in range(depth):
+         print(level)
+         print(f"Processing level {level + 1} with {len(current_level_channels)} channels...")
+         next_level_channels = set()
+
+         # Iterate through channels at the current level
+         for channel in tqdm(current_level_channels, total=len(current_level_channels), desc="get messages"):
+             forwarded_msgs, discovered_channels = await get_forwarded_messages(client, phone_number, channel, reverse, limit, offset_date, path_file)
+
+             # Collect forwarded messages
+             all_forwarded_messages.extend(forwarded_msgs)
+
+             # Add newly discovered channels to the next level, excluding already visited ones
+             for new_channel in discovered_channels:
+                 if new_channel not in visited_channels:
+                     next_level_channels.add(new_channel)
+                     visited_channels.add(new_channel)
+         # Update the set of channels for the next level of recursion
+         current_level_channels = next_level_channels
+
+         if not current_level_channels:
+             break
+
+     write_json(visited_channels, path_json, "visited_channels")
+     return all_forwarded_messages, visited_channels
+
+
  def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
      """
      Function to group message by "post". If a post embed multiple media, Telethon returns separate messages for each media. This function groups them together ensuring also type consistency.
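`recursive_forward_scraper` is a breadth-first crawl: level 0 is the seed list, and each subsequent level visits the channels discovered through forwards, skipping anything already seen. A call sketch (the credentials and channel name below are placeholders, not values from the package):

    import asyncio
    from telethon.sync import TelegramClient
    from opsci_toolbox.apis.telegram import recursive_forward_scraper

    client = TelegramClient("session_name", api_id=12345, api_hash="<api_hash>")

    async def main():
        # Crawl two levels out from the seed channel.
        msgs, visited = await recursive_forward_scraper(
            seed_channels=["some_public_channel"], depth=2,
            client=client, phone_number="+33600000000",
            limit=500, path_file="files")
        print(len(msgs), "forwarded messages;", len(visited), "channels visited")

    asyncio.run(main())

Each accessible channel is dumped to files/JSON/<channel>.jsonl as it is crawled, and the visited set is written to files/CHANNELS/visited_channels.json at the end.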
@@ -392,7 +525,80 @@ def group_by_post(df : pd.DataFrame) -> pd.DataFrame:
      df = df.groupby(['channel_id',"grouped_id"]).agg(**aggregations).reset_index()
      return df
 
- async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
+ # async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), ids: list = [], path_file: str = "files", dl_thumbs : bool = False) -> list:
+ #     """
+ #     Retrieves messages from a Telegram channel by date.
+
+ #     Args:
+ #         client (TelegramClient): The Telegram client instance.
+ #         phone_number (str): The phone number associated with the Telegram account.
+ #         channel_username (str): The username of the channel to retrieve messages from.
+ #         dl_files (bool, optional): Whether to download media files attached to the messages. Defaults to False.
+ #         reverse (bool, optional): Whether to retrieve messages in reverse order. Defaults to True.
+ #         limit (int, optional): The maximum number of messages to retrieve. Defaults to None (retrieve all messages).
+ #         offset_date (datetime, optional): The starting date to retrieve messages from. Defaults to datetime(1970,1,1).
+ #         path_file (str, optional): The path to save the downloaded files. Defaults to "files".
+
+ #     Returns:
+ #         list: A list of messages retrieved from the channel.
+
+ #     Raises:
+ #         Exception: If there is an error during the retrieval process.
+
+ #     """
+ #     try:
+ #         await client.start(phone_number)
+
+ #         # current_path_file = create_dir(os.path.join(path_file, "messages"))
+ #         path_messages = create_dir(os.path.join(path_file, "messages"))
+ #         path_entities = create_dir(os.path.join(path_file, "entities"))
+
+ #         if dl_files:
+ #             current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
+ #         if dl_thumbs:
+ #             current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
+
+ #         # Get the message history
+ #         messages = []
+
+ #         async for message in client.iter_messages(channel_username,
+ #                                                   limit=limit,
+ #                                                   offset_date=offset_date,
+ #                                                   reverse=reverse):
+ #             messages.append(message)
+
+ #             if dl_files:
+
+ #                 media_fileid = parse_mediafileid(message)
+ #                 if media_fileid:
+ #                     await message.download_media(file=os.path.join(current_path_img, media_fileid))
+
+ #             if dl_thumbs:
+ #                 media_fileid = parse_mediafileid(message)
+ #                 if media_fileid:
+ #                     try:
+ #                         await message.download_media(file=os.path.join(current_path_thumbs, media_fileid), thumb=-1)
+ #                     except Exception as e:
+ #                         pass
+ #                         print(e)
+
+ #         df_exploded = parse_messages(messages)
+ #         df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
+ #         write_pickle(df_exploded, path_messages, str(channel_username))
+
+ #         df_entities = parse_message_entities(messages)
+ #         write_pickle(df_entities, path_entities, str(channel_username))
+
+ #         # df_messages = group_by_post(df_exploded)
+ #         # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
+ #         # write_pickle(df_messages, current_path_file, str(channel_username))
+
+ #         return messages
+ #     finally:
+ #         # Disconnect the client
+ #         await client.disconnect()
+
+ async def get_messages_by_date(client: TelegramClient, phone_number: str, channel_username: int, dl_files: bool = False, reverse: bool = True, limit: int = None, offset_date: datetime = datetime(1970,1,1), path_file: str = "files", dl_thumbs : bool = False) -> list:
      """
      Retrieves messages from a Telegram channel by date.
 
@@ -417,6 +623,7 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
          await client.start(phone_number)
 
          # current_path_file = create_dir(os.path.join(path_file, "messages"))
+         path_json = create_dir(os.path.join(path_file, "JSON"))
          path_messages = create_dir(os.path.join(path_file, "messages"))
          path_entities = create_dir(os.path.join(path_file, "entities"))
 
@@ -426,13 +633,16 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
              current_path_thumbs = create_dir(os.path.join(path_file, "thumbs", str(channel_username)))
 
          # Get the message history
-         messages = []
+         # messages = []
+         messages_dict = []
 
          async for message in client.iter_messages(channel_username,
                                                    limit=limit,
                                                    offset_date=offset_date,
                                                    reverse=reverse):
-             messages.append(message)
+             # messages.append(message)
+             messages_dict.append(message.to_dict())
+
 
          if dl_files:
 
@@ -449,18 +659,19 @@ async def get_messages_by_date(client: TelegramClient, phone_number: str, channe
                          pass
                          print(e)
 
-         df_exploded = parse_messages(messages)
+         dump_jsonl(messages_dict, path_json, str(channel_username))
+         df_exploded = parse_messages(messages_dict)
          df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
          write_pickle(df_exploded, path_messages, str(channel_username))
 
-         df_entities = parse_message_entities(messages)
+         df_entities = parse_message_entities(messages_dict)
          write_pickle(df_entities, path_entities, str(channel_username))
 
          # df_messages = group_by_post(df_exploded)
          # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['from_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
          # write_pickle(df_messages, current_path_file, str(channel_username))
 
-         return messages
+         return messages_dict
      finally:
          # Disconnect the client
          await client.disconnect()
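Note the behavioural change: `get_messages_by_date` now accumulates `message.to_dict()` results, dumps them to <path_file>/JSON/, and returns plain dictionaries instead of Telethon `Message` objects, which is why `parse_messages` and `parse_message_entities` were reworked above to read dicts. A call sketch with placeholder credentials and channel name:

    import asyncio
    from datetime import datetime
    from telethon.sync import TelegramClient
    from opsci_toolbox.apis.telegram import get_messages_by_date

    client = TelegramClient("session_name", api_id=12345, api_hash="<api_hash>")

    async def main():
        messages = await get_messages_by_date(client, "+33600000000", "some_public_channel",
                                              offset_date=datetime(2024, 1, 1), limit=100)
        # 0.0.15 returns dicts, so fields are read by key rather than attribute.
        print(messages[0]["id"], messages[0]["date"])

    asyncio.run(main())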
@@ -486,16 +697,18 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
      """
      try:
          await client.start(phone_number)
-
+         path_json = create_dir(os.path.join(path_file, "JSON"))
          path_messages = create_dir(os.path.join(path_file, "messages"))
          path_entities = create_dir(os.path.join(path_file, "entities"))
 
          # Get the message history
-         messages = []
+         # messages = []
+         messages_dict = []
 
          async for message in client.iter_messages(channel_username,
                                                    ids = ids):
-             messages.append(message)
+             # messages.append(message)
+             messages_dict.append(message.to_dict())
 
          if dl_files:
              current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
@@ -503,11 +716,13 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
              if media_fileid:
                  await message.download_media(file=os.path.join(current_path_img, media_fileid))
 
-         df_exploded = parse_messages(messages)
+
+         dump_jsonl(messages_dict, path_json, str(channel_username))
+         df_exploded = parse_messages(messages_dict)
          df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
          write_pickle(df_exploded, path_messages, str(channel_username))
 
-         df_entities = parse_message_entities(messages)
+         df_entities = parse_message_entities(messages_dict)
          write_pickle(df_entities, path_entities, str(channel_username))
 
 
@@ -515,7 +730,7 @@ async def get_messages_by_ids(client: TelegramClient, phone_number: str, channel
          # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['message_id'].astype(str)
          # write_pickle(df_messages, current_path_file, str(channel_username))
 
-         return messages
+         return messages_dict
      finally:
          # Disconnect the client
          await client.disconnect()
@@ -544,6 +759,7 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
          await client.start(phone_number)
 
          # current_path_file = create_dir(os.path.join(path_file, "messages"))
+         path_json = create_dir(os.path.join(path_file, "JSON"))
          path_messages = create_dir(os.path.join(path_file, "messages"))
          path_entities = create_dir(os.path.join(path_file, "entities"))
 
@@ -551,12 +767,14 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
              current_path_img = create_dir(os.path.join(path_file, "img", str(channel_username)))
 
          # Get the message history
-         messages = []
+         # messages = []
+         messages_dict = []
 
          async for message in client.iter_messages(channel_username,
                                                    search=search,
                                                    limit=limit):
-             messages.append(message)
+             # messages.append(message)
+             messages_dict.append(message.to_dict())
 
          if dl_files:
 
@@ -564,12 +782,13 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
              if media_fileid:
                  await message.download_media(file=os.path.join(current_path_img, media_fileid))
 
-         df_exploded = parse_messages(messages)
+
+         df_exploded = parse_messages(messages_dict)
          df_exploded['search']=search
          df_exploded.loc[df_exploded['media_id'].notna(), "media_fileid"] = df_exploded['channel_id'].astype(str)+'_'+df_exploded['grouped_id'].astype(str)+'_'+df_exploded['media_id'].astype(str)
          # df_messages = group_by_post(df_exploded)
          # df_messages['uniq_id']=df_messages['channel_id'].astype(str)+'_'+df_messages['concatenated_message_id'].astype(str)
-         df_entities = parse_message_entities(messages)
+         df_entities = parse_message_entities(messages_dict)
 
          if channel_username:
              write_pickle(df_exploded, path_messages, str(search)+'_'+str(channel_username))
@@ -578,7 +797,7 @@ async def get_messages_by_search(client: TelegramClient, phone_number: str, sear
              write_pickle(df_exploded, path_messages, str(search))
              write_pickle(df_entities, path_entities, str(search))
 
-         return messages
+         return messages_dict
      finally:
          # Disconnect the client
          await client.disconnect()
@@ -592,10 +811,10 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
          path_messages = create_dir(os.path.join(path_file, "messages"))
          path_entities = create_dir(os.path.join(path_file, "entities"))
 
-         comments = []
+         comments_dict = []
 
          async for comment in client.iter_messages(int(channel_entity), reply_to=int(message_id), limit=limit, reverse=reverse):
-             comments.append(comment)
+             comments_dict.append(comment.to_dict())
 
          if dl_files:
              current_path_img = create_dir(os.path.join(path_file, "img", str(channel_entity)))
@@ -603,14 +822,14 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
              if media_fileid:
                  await comment.download_media(file=os.path.join(current_path_img, media_fileid))
 
-         df_comments = parse_messages(comments)
+         df_comments = parse_messages(comments_dict)
          df_comments.loc[df_comments['media_id'].notna(), "media_fileid"] = df_comments['channel_id'].astype(str)+'_'+df_comments['grouped_id'].astype(str)+'_'+df_comments['media_id'].astype(str)
          write_pickle(df_comments, path_messages, str(channel_entity)+"_"+str(message_id))
 
-         df_entities = parse_message_entities(comments)
+         df_entities = parse_message_entities(comments_dict)
          write_pickle(df_entities, path_entities, str(channel_entity)+"_"+str(message_id))
 
-         return comments
+         return comments_dict
 
      finally:
          # Disconnect the client
@@ -1043,7 +1262,97 @@ async def download_comments(client: TelegramClient, phone_number: str,channel_en
          # # Disconnect the client
          # await client.disconnect()
 
- async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
+ # async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_img :str) -> dict:
+ #     """
+ #     Retrieves information about a Telegram channel.
+
+ #     Args:
+ #         api_id (int): The API ID of the Telegram application.
+ #         api_hash (str): The API hash of the Telegram application.
+ #         phone_number (str): The phone number associated with the Telegram account.
+ #         channel_username (str): The username of the channel.
+
+ #     Returns:
+ #         dict: A dictionary containing the full information of the channel.
+
+ #     Raises:
+ #         Exception: If there is an error during the retrieval of channel information.
+ #     """
+ #     client = TelegramClient('session_name', api_id, api_hash)
+ #     try:
+ #         await client.start(phone_number)
+ #         channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+ #         channel_full_info_json = channel_full_info.to_dict()
+ #         img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+ #     finally:
+ #         # Disconnect the client
+ #         await client.disconnect()
+
+ #     return channel_full_info_json
+
+
+ # async def get_channel_info(api_id : int, api_hash : str, phone_number : str, channel_username : str, path_project :str, DL_profile_pic : bool = False) -> dict:
+ #     """
+ #     Retrieves information about a Telegram channel.
+
+ #     Args:
+ #         api_id (int): The API ID of the Telegram application.
+ #         api_hash (str): The API hash of the Telegram application.
+ #         phone_number (str): The phone number associated with the Telegram account.
+ #         channel_username (str): The username of the channel.
+
+ #     Returns:
+ #         dict: A dictionary containing the full information of the channel.
+
+ #     Raises:
+ #         Exception: If there is an error during the retrieval of channel information.
+ #     """
+ #     client = TelegramClient('session_name', api_id, api_hash)
+ #     path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+ #     path_json = create_dir(os.path.join(path_project, "JSON"))
+
+ #     try:
+ #         await client.start(phone_number)
+ #         try:
+ #             channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+
+ #             if channel_full_info:
+ #                 channel_full_info_dict = channel_full_info.to_dict()
+ #                 channel_full_info_json = JSONEncoder().encode(channel_full_info_dict)
+ #             else:
+ #                 channel_full_info_dict = {'_': 'Channel', 'id': channel_username, 'title':'private_channel'}
+ #             write_json(channel_full_info_json, path_json, f"{str(channel_username)}")
+
+ #             if DL_profile_pic:
+ #                 img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+
+ #         except Exception as e:
+ #             pass
+ #             print(channel_username, e)
+ #     finally:
+ #         # Disconnect the client
+ #         await client.disconnect()
+
+ #     return channel_full_info_dict
+
+ def dump_json(json_dict: dict, path: str, name: str) -> str:
+     """
+     Write a dictionary to a JSON file.
+
+     Args:
+         json_dict (dict): The dictionary to be written to the JSON file.
+         path (str): The directory path where the JSON file will be saved.
+         name (str): The name of the JSON file (without the extension).
+
+     Returns:
+         str: The full path to the saved JSON file.
+     """
+     file_path = os.path.join(path, name + '.json')
+     with open(file_path, 'w') as outfile:
+         json.dump(json_dict, outfile, cls=JSONEncoder)
+     return file_path
+
+ async def get_channel_info(api_id: int, api_hash: str, phone_number: str, channel_username: str, path_project: str, DL_profile_pic: bool = False) -> dict:
      """
      Retrieves information about a Telegram channel.
 
@@ -1060,17 +1369,49 @@ async def get_channel_info(api_id : int, api_hash : str, phone_number : str, cha
          Exception: If there is an error during the retrieval of channel information.
      """
      client = TelegramClient('session_name', api_id, api_hash)
+     path_img = create_dir(os.path.join(path_project, "THUMBNAILS"))
+     path_json = create_dir(os.path.join(path_project, "JSON"))
+
      try:
          await client.start(phone_number)
-         channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
-         channel_full_info_json = channel_full_info.to_dict()
-         img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file = path_img)
+         try:
+             # Fetch full channel info
+             channel_full_info = await client(GetFullChannelRequest(channel=channel_username))
+
+             # If channel info is retrieved
+             if channel_full_info:
+                 channel_full_info_dict = channel_full_info.to_dict()
+             else:
+                 channel_full_info_dict = {'_': 'ChatFull',
+                                           'full_chat': {'_': 'ChannelFull',
+                                                         'id': channel_username,
+                                                         },
+                                           'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                           }
+
+             # Save the dictionary as JSON (no need to pre-encode it to a string)
+             dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+
+             # Optionally download profile picture
+             if DL_profile_pic:
+                 img_path = await client.download_profile_photo(channel_full_info.chats[0], download_big=False, file=path_img)
+
+         except Exception as e:
+             print(channel_username, e)
+             channel_full_info_dict = {'_': 'ChatFull',
+                                       'full_chat': {'_': 'ChannelFull',
+                                                     'id': channel_username,
+                                                     },
+                                       'chats': [{'_': 'Channel', 'id': channel_username, 'title': 'private'}]
+                                       }
+             dump_json(channel_full_info_dict, path_json, f"{str(channel_username)}")
+             return {'_': 'Channel', 'id': channel_username, 'title': 'private_channel'}
+
      finally:
          # Disconnect the client
          await client.disconnect()
 
-     return channel_full_info_json
-
+     return channel_full_info_dict
 
  def parse_channel(channel : dict) -> pd.DataFrame:
      """
@@ -1106,7 +1447,6 @@ def parse_channel(channel : dict) -> pd.DataFrame:
      chats = channel.get("chats", [])
      if chats:
          for chat in chats:
-             print(chat)
              if chat.get("_") == "Channel":
                  if chat.get("id") == channel_id:
                      creation_date = chat.get("date", datetime(1970,1,1))
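The rewritten `get_channel_info` always produces a dict shaped like Telethon's `ChatFull` (falling back to a minimal stub for private or invalid channels) and persists it via the new `dump_json` helper. A call sketch, again with placeholder credentials:

    import asyncio
    from opsci_toolbox.apis.telegram import get_channel_info

    async def main():
        info = await get_channel_info(api_id=12345, api_hash="<api_hash>",
                                      phone_number="+33600000000",
                                      channel_username="some_public_channel",
                                      path_project="files", DL_profile_pic=True)
        # Side effects: files/JSON/<channel>.json, plus files/THUMBNAILS/ when DL_profile_pic is set.
        print(info.get("full_chat", {}).get("id"))

    asyncio.run(main())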
@@ -383,6 +383,8 @@ def write_json(json_dict: dict, path: str, name: str) -> str:
      return file_path
 
 
+
+
  def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str = 'records') -> str:
      """
      Write a DataFrame to a JSON file.
@@ -126,7 +126,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
      df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
      df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
      df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
-     df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
+     # df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
      df[col_clean] = df[col_clean].apply(remove_extra_spaces)
      # df = df.loc[(df[col_clean] != ""), :]
      return df
@@ -911,6 +911,8 @@ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_c
      """
      metrics_dict = dict()
      # metrics_dict[col_id]=(col_id,'first')
+     # if col_id != col_chunk_id:
+     #     metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
      metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
      metrics_dict[col_engagement]=(col_engagement,'first')
 
@@ -1578,10 +1580,10 @@ def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100,
 
      return df
 
- def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
+ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False, threshold: int = None) -> pd.DataFrame:
      """
-     Split a text into chunks of n sentences
-
+     Split a text into chunks of n sentences, returning their start and end indexes in separate columns.
+
      Parameters:
      nlp : spacy.language.Language
          The spaCy language processing pipeline.
@@ -1597,41 +1599,64 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
          The number of processes to use for text processing. Default is 1.
      stats : bool, optional
          Flag indicating whether to compute statistics about the splitting process. Default is False.
-
+     threshold : int, optional
+         Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+
      Returns:
      pd.DataFrame
-         DataFrame containing the split sentences.
+         DataFrame containing the split sentences with their start and end indexes in separate columns.
 
-     Description:
-     This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
-     Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
      """
+     text = list(df[col_text].astype('unicode').values)
+
+     count_sentences = []
+     count_batches = []
+     results = []
+     start_indexes = []
+     end_indexes = []
+
+     for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total=len(text), desc="Sentence splitting"):
+         sentences = []
+
+
+         # Extract sentences and their positions
+         for sent in doc.sents:
+             sentences.append((sent.text, sent.start_char, sent.end_char))
 
-     text=list(df[col_text].astype('unicode').values)
-
-     count_sentences=[]
-     count_batches=[]
-     results=[]
-     for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "Sentence splitting"):
-         # Split the text into sentences
-         sentences = [sent.text for sent in doc.sents]
          if stats:
              count_sentences.append(len(sentences))
-         if n_sentences>1:
-             # Split the sentences into batches of size n
+
+         if n_sentences > 1:
+             # # Split sentences into batches of size n_sentences
              batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
-             concatenate_batches=[" ".join(sublist) for sublist in batches]
+
+             # Concatenate batches of sentences and adjust spans accordingly
+             concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+             concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+             if threshold is not None:
+                 concatenate_batches = concatenate_batches[:threshold]
+                 concatenate_spans = concatenate_spans[:threshold]
+
              results.append(concatenate_batches)
+             start_indexes.append([span[0] for span in concatenate_spans])
+             end_indexes.append([span[1] for span in concatenate_spans])
+
              if stats:
                  count_batches.append(len(concatenate_batches))
-
          else:
-             results.append(sentences)
+             sentences = sentences[:threshold] if threshold is not None else sentences
+
+             results.append([sub[0] for sub in sentences])
+             start_indexes.append([sub[1] for sub in sentences])
+             end_indexes.append([sub[2] for sub in sentences])
 
      df['sentences'] = results
-     if stats:
-         df['sentences_count']=count_sentences
-         df['batch_sentences_count']=count_batches
+     df['start_indexes'] = start_indexes
+     df['end_indexes'] = end_indexes
+
+     df = df.explode(['sentences','start_indexes', 'end_indexes']).reset_index(drop=True)
+
      return df
 
 
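`split_n_sentences` now tracks each chunk's character span in the source text and explodes the result to one row per chunk; the optional `threshold` caps how many chunks are kept per document. A small sketch (any spaCy pipeline with sentence boundaries will do; the model name is an example):

    import pandas as pd
    import spacy
    from opsci_toolbox.helpers.nlp import split_n_sentences

    nlp = spacy.load("en_core_web_sm")
    df = pd.DataFrame({"text": ["First sentence. Second one. Third one. Fourth one."]})

    # Chunks of 2 sentences, keeping at most 1 chunk per text.
    out = split_n_sentences(nlp, df, col_text="text", n_sentences=2, threshold=1)
    print(out[["sentences", "start_indexes", "end_indexes"]])

Note that per this diff the old `sentences_count`/`batch_sentences_count` columns are no longer written, even when `stats=True`.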
@@ -2404,3 +2429,46 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
      write_json(results, dir_json , str(filename))
 
      return results
+
+
+ def add_tag_libretranslate_not_translate(text):
+     """
+     This function add fake html tag around words such as mentions, hashtags, urls and emojis to avoid translation of those tokens.
+
+     Args:
+         text (str): The text to process
+
+     Returns:
+         str: The text with the fake html tags
+     """
+     # This regex finds words starting with # and followed by alphanumeric characters or underscores
+     mention_pattern = r"(?:RT\s|QT\s)?(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z0-9_]{4,15})"
+     hashtag_pattern = r"(\B#\w+)"
+     url_pattern = r"(https?://[^ ]+)"
+     emoji_pattern = r':[a-zA-Z_]+:'
+
+     pattern = re.compile(emoji_pattern+ "|" + mention_pattern + "|" + hashtag_pattern + "|" + url_pattern)
+
+     # This function replaces the hashtag with an HTML link tag
+     def replace_with_link(match):
+         matcher_group = match.group(0)
+         return f'<a href="{matcher_group}"></a>'
+
+     # Use re.sub to substitute the hashtags with the HTML link tags
+     text_no_emojis = emoji.demojize(text)
+     result = re.sub(pattern, replace_with_link, text_no_emojis)
+
+     return result
+
+ def clean_libre_translate_tags(text):
+     """
+     This function remove fake tags added by add_tag_libretranslate_not_translate() function.
+
+     Args:
+         text (str): The text to process
+
+     Returns:
+         str: The text with the fake html tags
+     """
+     cleaned_string = text.replace('<a href="', '').replace('"></a>', '')
+     return cleaned_string
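The two LibreTranslate helpers work as a wrap/strip pair around the translation call. A quick round-trip (note that emojis are demojized first, so `:rocket:` style codes remain after cleaning):

    from opsci_toolbox.helpers.nlp import (add_tag_libretranslate_not_translate,
                                           clean_libre_translate_tags)

    text = "RT @someuser check https://example.com #breaking 🚀"
    protected = add_tag_libretranslate_not_translate(text)
    print(protected)   # mention, URL, hashtag and emoji wrapped in <a href="..."></a>
    # ... send `protected` to LibreTranslate, then strip the fake tags:
    print(clean_libre_translate_tags(protected))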
@@ -18,7 +18,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
                             metric: str = "cosine",
                             spread: float = 1.0,
                             learning_rate: float = 1.0,
-                            n_epochs:int = 300
+                            n_epochs:int = 300,
+                            random_state:int = None
                             ) -> tuple:
      """
      Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -41,7 +42,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
                         metric=metric,
                         spread = spread,
                         n_epochs=n_epochs,
-                        learning_rate=learning_rate).fit(embeddings)
+                        learning_rate=learning_rate,
+                        random_state=random_state).fit(embeddings)
 
      reduced_embeddings = reducer.transform(embeddings)
      return reducer, reduced_embeddings
@@ -56,7 +58,8 @@ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
                                       learning_rate: float = 1.0,
                                       n_epochs:int = 300,
                                       y: np.ndarray = None,
-                                      convert_dtype: bool = False
+                                      convert_dtype: bool = False,
+                                      random_state:int=None
                                       ) -> tuple:
      """
      Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -79,7 +82,8 @@ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
                         metric=metric,
                         spread = spread,
                         n_epochs=n_epochs,
-                        learning_rate=learning_rate).fit(X = embeddings, y = y, convert_dtype = convert_dtype)
+                        learning_rate=learning_rate,
+                        random_state=random_state).fit(X = embeddings, y = y, convert_dtype = convert_dtype)
 
      reduced_embeddings = reducer.transform(embeddings)
      return reducer, reduced_embeddings
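Both cuML UMAP wrappers now expose `random_state`, mirroring umap-learn: leaving it `None` keeps the faster non-deterministic path, while fixing a seed makes successive layouts reproducible at some cost in parallelism. A sketch relying on the wrapper's defaults for the other parameters (requires a GPU with cuML installed):

    import numpy as np
    from opsci_toolbox.helpers.nlp_cuml import reduce_with_cuml_UMAP

    embeddings = np.random.rand(1000, 384).astype(np.float32)

    # Same seed in, same layout out across runs.
    reducer, reduced = reduce_with_cuml_UMAP(embeddings, random_state=42)
    print(reduced.shape)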
@@ -11,6 +11,40 @@ from collections import Counter
  from opsci_toolbox.helpers.dataviz import boxplot
  from fa2_modified import ForceAtlas2
 
+ def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: float = 2) -> nx.Graph:
+     """
+     Creates a subgraph containing only the nodes that have at least the specified minimum value for a given metric.
+
+     Args:
+         G (nx.Graph): The input graph.
+         metric (str, optional): The node metric to filter nodes by (e.g., "degree", "in_degree", "out_degree", "degree_centrality"). Default is "degree".
+         min_value (float, optional): The minimum value required for nodes to be included in the subgraph. Default is 2.
+
+     Returns:
+         subgraph (nx.Graph): A subgraph containing only the nodes with at least the specified minimum metric value.
+     """
+
+     if metric == "degree":
+         nodes_with_min_metric = [node for node, value in G.degree() if value >= min_value]
+     elif metric == "in_degree" and G.is_directed():
+         nodes_with_min_metric = [node for node, value in G.in_degree() if value >= min_value]
+     elif metric == "out_degree" and G.is_directed():
+         nodes_with_min_metric = [node for node, value in G.out_degree() if value >= min_value]
+     elif metric == "degree_centrality":
+         centrality = nx.degree_centrality(G)
+         nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+     elif metric == "betweenness_centrality":
+         centrality = nx.betweenness_centrality(G)
+         nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+     elif metric == "closeness_centrality":
+         centrality = nx.closeness_centrality(G)
+         nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+     else:
+         raise ValueError(f"Unsupported metric: {metric}")
+
+     subgraph = G.subgraph(nodes_with_min_metric).copy()
+     return subgraph
+
  def group_nodes_by_values(dictionnary : dict) -> dict:
      """
      Group nodes by their values from a dictionary.
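`create_subgraph_min_metric` filters on raw degree counts or on normalized centralities, so `min_value` is an absolute count for the degree variants and typically a value in [0, 1] for the centrality ones. For example:

    import networkx as nx
    from opsci_toolbox.helpers.sna import create_subgraph_min_metric

    G = nx.karate_club_graph()
    # Keep only nodes with degree >= 5.
    core = create_subgraph_min_metric(G, metric="degree", min_value=5)
    print(core.number_of_nodes(), "of", G.number_of_nodes(), "nodes retained")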
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: opsci-toolbox
- Version: 0.0.14
+ Version: 0.0.15
  Summary: a complete toolbox
  Home-page: UNKNOWN
  Author: Erwan Le Nagard
@@ -2,25 +2,25 @@ opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
  opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
- opsci_toolbox/apis/telegram.py,sha256=IJYXMvXzA2R2Z7ywKJiny38pd-ryHK4jPxVG2Nj_dms,45676
+ opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWPA,60920
  opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
  opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
  opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=dlP6TnRggZsnPksgo7LPH7IghU_t9LFz42eMEzzg99o,53323
+ opsci_toolbox/helpers/common.py,sha256=zmi-FbN39Rci_hGEKj2bmkcucrVwnHhMgKU6AAIap3Q,53327
  opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
  opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
  opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
  opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
- opsci_toolbox/helpers/nlp.py,sha256=hXnP6rUkUzyurJ5O_fNUxqT2MZK3poC21L9zy6oa22c,102551
- opsci_toolbox/helpers/nlp_cuml.py,sha256=OBCRkaHibuyvJ8LQAE2EC7_J0KPe7Kf-ayN2jyxDlKg,30709
- opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
+ opsci_toolbox/helpers/nlp.py,sha256=TXf1_dvmfDY9tR0gjQ1C-KzPRib7t74_ZcvmcYZWcPs,105096
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=KfgC0hMqLCKoOME2DOu3Wje4ormV19fEB8Fyq8G7D-E,30901
+ opsci_toolbox/helpers/sna.py,sha256=3qx1WBQwLKpZNGR0bLSMB2-LBRx-vtNHp8puzoj-84A,33730
  opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
  opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
  opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
  opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.14.dist-info/METADATA,sha256=X2EgVw8JlZLdgnrN1nOP6aZRs1WyztbkCkN4UKkuTLE,1727
- opsci_toolbox-0.0.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- opsci_toolbox-0.0.14.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
- opsci_toolbox-0.0.14.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.14.dist-info/RECORD,,
+ opsci_toolbox-0.0.15.dist-info/METADATA,sha256=ppE13xf4E90LfW9Eir5U30xOI91F96wQqAam7kZwV1o,1727
+ opsci_toolbox-0.0.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ opsci_toolbox-0.0.15.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+ opsci_toolbox-0.0.15.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.15.dist-info/RECORD,,