opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -383,6 +383,8 @@ def write_json(json_dict: dict, path: str, name: str) -> str:
  return file_path


+
+
  def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str = 'records') -> str:
  """
  Write a DataFrame to a JSON file.
@@ -603,7 +605,7 @@ def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:

  return files

- def copy_file(source_path: str, destination_path: str, new_filename: str = '') -> str:
+ def copy_file(source_path: str, destination_path: str, new_filename: str = None) -> str:
  """
  Copy a file from a source path to a destination path.

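The default for new_filename changes from an empty string to None. A hypothetical call, assuming copy_file is exposed by opsci_toolbox.helpers.common:

    from opsci_toolbox.helpers.common import copy_file  # assumed module path

    # Keep the original filename (new_filename now defaults to None)
    backup_path = copy_file("data/raw/export.csv", "data/backup")

    # Copy under a different name
    renamed_path = copy_file("data/raw/export.csv", "data/backup", new_filename="export_backup.csv")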
@@ -58,7 +58,7 @@ def number_of_days(start_date: datetime, end_date: datetime) -> int:
  days_difference (int): The number of days between the start and end dates.
  """
  # Calculate the difference
- time_difference = start_date - end_date
+ time_difference = end_date - start_date
  # Extract the number of days from the timedelta object
  days_difference = time_difference.days
  return days_difference
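The fix reverses the subtraction so that a later end_date yields a positive day count. A minimal sketch of the corrected behaviour (the opsci_toolbox.helpers.dates import path is assumed from the package layout):

    from datetime import datetime
    from opsci_toolbox.helpers.dates import number_of_days  # assumed module path

    start = datetime(2024, 1, 1)
    end = datetime(2024, 1, 31)

    # end_date - start_date now gives 30; the previous order returned -30
    print(number_of_days(start, end))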
@@ -7,7 +7,7 @@ import os
  from sklearn.decomposition import TruncatedSVD
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
  from sklearn.manifold import TSNE
- from sklearn.preprocessing import StandardScaler, MinMaxScaler
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
  from sklearn.cluster import AgglomerativeClustering
  from sentence_transformers import SentenceTransformer
  from tqdm import tqdm
@@ -94,7 +94,7 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bo
  The filtered DataFrame.
  """
  eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
- df[col_text] = df[df[col_text].apply(eldar_query)]
+ df = df[df[col_text].apply(eldar_query)]
  df=df.reset_index(drop=True)
  return df

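The fix makes filter_by_query drop non-matching rows instead of overwriting the text column with a boolean mask. A small sketch using an eldar-style boolean query (module path assumed):

    import pandas as pd
    from opsci_toolbox.helpers.nlp import filter_by_query  # assumed module path

    df = pd.DataFrame({"text": ["python and pandas tips", "java tutorial", "cooking recipes"]})

    # Keep only rows matching the boolean query
    filtered = filter_by_query(
        df,
        col_text="text",
        query='("python" OR "pandas") AND NOT "java"',
        ignore_case=True,
        ignore_accent=True,
        match_word=True,
    )
    print(filtered)  # one remaining row: "python and pandas tips"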
@@ -126,7 +126,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
  df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
  df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
  df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
- df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
+ # df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
  df[col_clean] = df[col_clean].apply(remove_extra_spaces)
  # df = df.loc[(df[col_clean] != ""), :]
  return df
@@ -541,7 +541,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
  text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
  return text

- def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> dict:
  """
  Translate text using LibreTranslate service.

@@ -558,6 +558,8 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
  The directory to save the translation result JSON file.
  url : str, optional
  The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+ format_payload : str, optional
+ Possible values are html or text.

  Returns:
  json_data : dict
@@ -568,7 +570,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
  "q": text,
  "source": source,
  "target": target,
- "format": "text",
+ "format": format_payload,
  "api_key": ""
  }

@@ -580,7 +582,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
  write_json(json_data, dir_json , str(filename))
  return json_data

- def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+ def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> list:
  """
  Translate a batch of texts using LibreTranslate service.

@@ -597,6 +599,8 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
  The directory to save the translation result JSONL file.
  url : str, optional
  The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+ format_payload : str, optional
+ Possible values are html or text.

  Returns:
  json_results : list of dict
@@ -607,7 +611,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
  "q": batch_text,
  "source": source,
  "target": target,
- "format": "text",
+ "format": format_payload,
  "api_key": ""
  }

@@ -623,7 +627,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
  write_jsonl(json_results, dir_json , str(filename))
  return json_results

- def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> str:
  """
  Translate text using LibreTranslate service.

@@ -636,6 +640,8 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
  The target language code.
  url : str, optional
  The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+ format_payload : str, optional
+ Possible values are html or text.

  Returns:
  translatedText : str
@@ -646,7 +652,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
  "q": text,
  "source": source,
  "target": target,
- "format": "text",
+ "format": format_payload,
  "api_key": ""
  }

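All three LibreTranslate helpers now expose a format_payload argument that is passed through as the API's "format" field; "html" (the new default) tells LibreTranslate to leave markup untouched, while "text" reproduces the previous hard-coded behaviour. A hedged example (module path assumed):

    from opsci_toolbox.helpers.nlp import translate  # assumed module path

    translated = translate(
        "Bonjour tout le monde",
        source="fr",
        target="en",
        url="http://127.0.0.1:5000/translate",  # local LibreTranslate instance
        format_payload="text",                  # or "html" to keep tags such as <a href="#hashtag"></a> intact
    )
    print(translated)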
@@ -905,6 +911,8 @@ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_c
  """
  metrics_dict = dict()
  # metrics_dict[col_id]=(col_id,'first')
+ # if col_id != col_chunk_id:
+ # metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
  metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
  metrics_dict[col_engagement]=(col_engagement,'first')

@@ -1280,10 +1288,12 @@ def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "l
  NER_type.append(ent.label_)
  NER_text.append(ent.text)

+
  record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
  all_records.append(record)


+
  df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)

  return df
@@ -1570,10 +1580,10 @@ def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100,

  return df

- def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
+ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False, threshold: int = None) -> pd.DataFrame:
  """
- Split a text into chunks of n sentences
-
+ Split a text into chunks of n sentences, returning their start and end indexes in separate columns.
+
  Parameters:
  nlp : spacy.language.Language
  The spaCy language processing pipeline.
@@ -1589,41 +1599,64 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
  The number of processes to use for text processing. Default is 1.
  stats : bool, optional
  Flag indicating whether to compute statistics about the splitting process. Default is False.
-
+ threshold : int, optional
+ Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+
  Returns:
  pd.DataFrame
- DataFrame containing the split sentences.
+ DataFrame containing the split sentences with their start and end indexes in separate columns.

- Description:
- This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
- Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
  """
+ text = list(df[col_text].astype('unicode').values)
+
+ count_sentences = []
+ count_batches = []
+ results = []
+ start_indexes = []
+ end_indexes = []
+
+ for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total=len(text), desc="Sentence splitting"):
+ sentences = []
+
+
+ # Extract sentences and their positions
+ for sent in doc.sents:
+ sentences.append((sent.text, sent.start_char, sent.end_char))

- text=list(df[col_text].astype('unicode').values)
-
- count_sentences=[]
- count_batches=[]
- results=[]
- for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "Sentence splitting"):
- # Split the text into sentences
- sentences = [sent.text for sent in doc.sents]
  if stats:
  count_sentences.append(len(sentences))
- if n_sentences>1:
- # Split the sentences into batches of size n
+
+ if n_sentences > 1:
+ # # Split sentences into batches of size n_sentences
  batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
- concatenate_batches=[" ".join(sublist) for sublist in batches]
+
+ # Concatenate batches of sentences and adjust spans accordingly
+ concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+ concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+ if threshold is not None:
+ concatenate_batches = concatenate_batches[:threshold]
+ concatenate_spans = concatenate_spans[:threshold]
+
  results.append(concatenate_batches)
+ start_indexes.append([span[0] for span in concatenate_spans])
+ end_indexes.append([span[1] for span in concatenate_spans])
+
  if stats:
  count_batches.append(len(concatenate_batches))
-
  else:
- results.append(sentences)
+ sentences = sentences[:threshold] if threshold is not None else sentences
+
+ results.append([sub[0] for sub in sentences])
+ start_indexes.append([sub[1] for sub in sentences])
+ end_indexes.append([sub[2] for sub in sentences])

  df['sentences'] = results
- if stats:
- df['sentences_count']=count_sentences
- df['batch_sentences_count']=count_batches
+ df['start_indexes'] = start_indexes
+ df['end_indexes'] = end_indexes
+
+ df = df.explode(['sentences','start_indexes', 'end_indexes']).reset_index(drop=True)
+
  return df


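The rewritten split_n_sentences now returns one row per sentence chunk together with the character offsets of each chunk, and the new threshold argument caps the number of chunks kept per document. An illustrative call, assuming a spaCy pipeline with sentence boundaries and the usual opsci_toolbox.helpers.nlp import path:

    import pandas as pd
    import spacy
    from opsci_toolbox.helpers.nlp import split_n_sentences  # assumed module path

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")  # sentence boundaries are required for doc.sents

    df = pd.DataFrame({"text": ["First sentence. Second one. Third one. Fourth one."]})

    # Chunks of 2 sentences, at most 2 chunks per text, exploded to one row per chunk
    out = split_n_sentences(nlp, df, col_text="text", n_sentences=2, threshold=2)
    print(out[["sentences", "start_indexes", "end_indexes"]])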
@@ -1998,6 +2031,75 @@ def encode_chunked_files(chunk_files_paths: list,

  return new_file_paths

+ ####################################################################
+ # ENCODING FEATURES
+ ####################################################################
+
+ def encode_labels(data_to_encode: np.ndarray) -> tuple:
+ """
+ Encodes a list of labels using a LabelEncoder.
+
+ Args:
+ - data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type,
+ but strings or integers are typical.
+
+ Returns:
+ - Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array
+ of encoded labels.
+ """
+ label_encoder = LabelEncoder()
+ label_encoder.fit(data_to_encode)
+ encoded_labels = label_encoder.transform(data_to_encode)
+ return label_encoder, encoded_labels
+
+
+ def encode_new_labels(label_encoder : LabelEncoder, data_to_encode : np.ndarray) -> np.ndarray:
+ """
+ Encodes a list of new labels using an already fitted LabelEncoder.
+
+ Args:
+ - label_encoder (LabelEncoder): A pre-fitted LabelEncoder instance.
+ - data_to_encode (List[Union[str, int]]): The list of new labels to encode using the pre-fitted encoder.
+
+ Returns:
+ - np.ndarray: A numpy array of encoded labels.
+ """
+ encoded_labels = label_encoder.transform(data_to_encode)
+ return encoded_labels
+
+ def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
+ """
+ One-hot encodes a list of categorical values using OneHotEncoder.
+
+ Args:
+ - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
+ any hashable type, typically strings or integers.
+
+ Returns:
+ - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
+ of one-hot encoded values.
+ """
+ one_hot_encoder = OneHotEncoder(sparse=False)
+ data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1) # Reshape for OneHotEncoder
+ one_hot_encoder.fit(data_to_encode_reshaped)
+ encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+ return one_hot_encoder, encoded_array
+
+
+ def one_hot_encode_new_data(one_hot_encoder: OneHotEncoder, data_to_encode: np.ndarray) -> np.ndarray:
+ """
+ One-hot encodes a list of new categorical values using an already fitted OneHotEncoder.
+
+ Args:
+ - one_hot_encoder (OneHotEncoder): A pre-fitted OneHotEncoder instance.
+ - data_to_encode (List[Union[str, int]]): The list of new categorical values to encode using the pre-fitted encoder.
+
+ Returns:
+ - np.ndarray: A numpy array of one-hot encoded values.
+ """
+ data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1) # Reshape for OneHotEncoder
+ encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+ return encoded_array

  ####################################################################
  # SCALING FEATURES
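The new encoding helpers wrap scikit-learn's LabelEncoder and OneHotEncoder (note that OneHotEncoder(sparse=False) assumes a scikit-learn release that still accepts the old sparse argument). A quick sketch, with the module path assumed:

    import numpy as np
    from opsci_toolbox.helpers.nlp import (  # assumed module path
        encode_labels, encode_new_labels, one_hot_encode, one_hot_encode_new_data,
    )

    labels = np.array(["cat", "dog", "cat", "bird"])

    label_encoder, encoded = encode_labels(labels)
    print(encoded)                                              # [1 2 1 0]
    print(encode_new_labels(label_encoder, np.array(["dog"])))  # [2]

    one_hot_encoder, one_hot = one_hot_encode(labels)
    print(one_hot.shape)                                        # (4, 3)
    print(one_hot_encode_new_data(one_hot_encoder, np.array(["bird"])))  # [[1. 0. 0.]]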
@@ -2327,3 +2429,46 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
  write_json(results, dir_json , str(filename))

  return results
+
+
+ def add_tag_libretranslate_not_translate(text):
+ """
+ This function add fake html tag around words such as mentions, hashtags, urls and emojis to avoid translation of those tokens.
+
+ Args:
+ text (str): The text to process
+
+ Returns:
+ str: The text with the fake html tags
+ """
+ # This regex finds words starting with # and followed by alphanumeric characters or underscores
+ mention_pattern = r"(?:RT\s|QT\s)?(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z0-9_]{4,15})"
+ hashtag_pattern = r"(\B#\w+)"
+ url_pattern = r"(https?://[^ ]+)"
+ emoji_pattern = r':[a-zA-Z_]+:'
+
+ pattern = re.compile(emoji_pattern+ "|" + mention_pattern + "|" + hashtag_pattern + "|" + url_pattern)
+
+ # This function replaces the hashtag with an HTML link tag
+ def replace_with_link(match):
+ matcher_group = match.group(0)
+ return f'<a href="{matcher_group}"></a>'
+
+ # Use re.sub to substitute the hashtags with the HTML link tags
+ text_no_emojis = emoji.demojize(text)
+ result = re.sub(pattern, replace_with_link, text_no_emojis)
+
+ return result
+
+ def clean_libre_translate_tags(text):
+ """
+ This function remove fake tags added by add_tag_libretranslate_not_translate() function.
+
+ Args:
+ text (str): The text to process
+
+ Returns:
+ str: The text with the fake html tags
+ """
+ cleaned_string = text.replace('<a href="', '').replace('"></a>', '')
+ return cleaned_string
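These two helpers are meant to bracket a LibreTranslate call made with format_payload="html": mentions, hashtags, URLs and demojized emojis are wrapped in dummy <a> tags so they are not translated, then the tags are stripped afterwards. A round-trip sketch (module path assumed):

    from opsci_toolbox.helpers.nlp import (  # assumed module path
        add_tag_libretranslate_not_translate, clean_libre_translate_tags,
    )

    text = "RT @some_user great thread on #NLP 🚀 https://example.com/post"

    protected = add_tag_libretranslate_not_translate(text)
    # mentions, hashtags, URLs and :emoji: codes are now wrapped in <a href="..."></a>
    print(protected)

    # ... translate `protected` with format_payload="html", then remove the wrappers
    print(clean_libre_translate_tags(protected))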
@@ -18,7 +18,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
  metric: str = "cosine",
  spread: float = 1.0,
  learning_rate: float = 1.0,
- n_epochs:int = 300
+ n_epochs:int = 300,
+ random_state:int = None
  ) -> tuple:
  """
  Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -41,7 +42,48 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
  metric=metric,
  spread = spread,
  n_epochs=n_epochs,
- learning_rate=learning_rate).fit(embeddings)
+ learning_rate=learning_rate,
+ random_state=random_state).fit(embeddings)
+
+ reduced_embeddings = reducer.transform(embeddings)
+ return reducer, reduced_embeddings
+
+
+ def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
+ n_neighbors: int = 5,
+ n_components: int = 3,
+ min_dist: float = 0.0,
+ metric: str = "cosine",
+ spread: float = 1.0,
+ learning_rate: float = 1.0,
+ n_epochs:int = 300,
+ y: np.ndarray = None,
+ convert_dtype: bool = False,
+ random_state:int=None
+ ) -> tuple:
+ """
+ Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+ Args:
+ embeddings (np.ndarray): The input embeddings to be reduced.
+ n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+ n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+ min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+ metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+ spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+ Returns:
+ reducer (UMAP): The UMAP reducer object.
+ reduced_embeddings (np.ndarray): The reduced embeddings.
+ """
+ reducer = UMAP(n_neighbors=n_neighbors,
+ n_components=n_components,
+ min_dist=min_dist,
+ metric=metric,
+ spread = spread,
+ n_epochs=n_epochs,
+ learning_rate=learning_rate,
+ random_state=random_state).fit(X = embeddings, y = y, convert_dtype = convert_dtype)

  reduced_embeddings = reducer.transform(embeddings)
  return reducer, reduced_embeddings
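Both cuML UMAP wrappers now accept random_state, and the new supervised variant forwards a label array to UMAP's fit. A hedged sketch that requires a GPU environment with cuML installed (module path assumed):

    import numpy as np
    from opsci_toolbox.helpers.nlp_cuml import supervised_reduce_with_cuml_UMAP  # assumed module path

    embeddings = np.random.rand(1000, 384).astype(np.float32)
    labels = np.random.randint(0, 5, size=1000)

    reducer, reduced = supervised_reduce_with_cuml_UMAP(
        embeddings,
        n_components=2,
        y=labels,          # supervision signal passed to fit()
        random_state=42,   # reproducible layout
    )
    print(reduced.shape)   # (1000, 2)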
@@ -409,6 +451,9 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
  # Convert the result back to pandas DataFrame
  return df_count.to_pandas()

+
+
+
  # def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:

  # # Convert input lists to cuDF Series
@@ -11,6 +11,40 @@ from collections import Counter
  from opsci_toolbox.helpers.dataviz import boxplot
  from fa2_modified import ForceAtlas2

+ def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: float = 2) -> nx.Graph:
+ """
+ Creates a subgraph containing only the nodes that have at least the specified minimum value for a given metric.
+
+ Args:
+ G (nx.Graph): The input graph.
+ metric (str, optional): The node metric to filter nodes by (e.g., "degree", "in_degree", "out_degree", "degree_centrality"). Default is "degree".
+ min_value (float, optional): The minimum value required for nodes to be included in the subgraph. Default is 2.
+
+ Returns:
+ subgraph (nx.Graph): A subgraph containing only the nodes with at least the specified minimum metric value.
+ """
+
+ if metric == "degree":
+ nodes_with_min_metric = [node for node, value in G.degree() if value >= min_value]
+ elif metric == "in_degree" and G.is_directed():
+ nodes_with_min_metric = [node for node, value in G.in_degree() if value >= min_value]
+ elif metric == "out_degree" and G.is_directed():
+ nodes_with_min_metric = [node for node, value in G.out_degree() if value >= min_value]
+ elif metric == "degree_centrality":
+ centrality = nx.degree_centrality(G)
+ nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+ elif metric == "betweenness_centrality":
+ centrality = nx.betweenness_centrality(G)
+ nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+ elif metric == "closeness_centrality":
+ centrality = nx.closeness_centrality(G)
+ nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+ else:
+ raise ValueError(f"Unsupported metric: {metric}")
+
+ subgraph = G.subgraph(nodes_with_min_metric).copy()
+ return subgraph
+
  def group_nodes_by_values(dictionnary : dict) -> dict:
  """
  Group nodes by their values from a dictionary.
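create_subgraph_min_metric filters a networkx graph down to the nodes that clear a minimum value for the chosen metric. A minimal example (module path assumed from the surrounding imports):

    import networkx as nx
    from opsci_toolbox.helpers.sna import create_subgraph_min_metric  # assumed module path

    G = nx.karate_club_graph()

    # Keep nodes with degree >= 5
    core = create_subgraph_min_metric(G, metric="degree", min_value=5)
    print(core.number_of_nodes(), core.number_of_edges())

    # Centrality metrics use a float threshold between 0 and 1
    central = create_subgraph_min_metric(G, metric="betweenness_centrality", min_value=0.05)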
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: opsci-toolbox
- Version: 0.0.13
+ Version: 0.0.15
  Summary: a complete toolbox
  Home-page: UNKNOWN
  Author: Erwan Le Nagard
@@ -41,7 +41,7 @@ Requires-Dist: spacy-language-detection ==0.2.1
  Requires-Dist: spacymoji ==3.1.0
  Requires-Dist: supervision ==0.21.0
  Requires-Dist: textacy ==0.13.0
- Requires-Dist: torch ==2.0.1
+ Requires-Dist: torch >=2.4.0
  Requires-Dist: tqdm >=4.66.2
  Requires-Dist: trafilatura ==1.7.0
  Requires-Dist: transformers ==4.38.2
@@ -1,25 +1,26 @@
  opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
- opsci_toolbox/apis/reddit.py,sha256=zhK2CY9CkCezNcekQFdv1So3NmHHYxB7-tgMVErHOGI,15763
- opsci_toolbox/apis/telegram.py,sha256=GKDLpZg1fc9D_PGCgi9pfTaW7Jjm_2luQ-2trXTr38A,42208
+ opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
+ opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
+ opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWPA,60920
  opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
  opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
  opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=ZGjWIPEpqr-gIYjkfsS97PmCtQWHa_iF8tBbVxrQsOQ,53321
+ opsci_toolbox/helpers/common.py,sha256=zmi-FbN39Rci_hGEKj2bmkcucrVwnHhMgKU6AAIap3Q,53327
  opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
  opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
- opsci_toolbox/helpers/dates.py,sha256=CxbXSo61GPZ2L37PV0ujvp78vwl0DoBq7t0nkk9qHp8,4751
+ opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
  opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
- opsci_toolbox/helpers/nlp.py,sha256=I72F32ieofZaCIkjZ9kqpiJLktfRoM7mMhzzxyXDQ3I,99316
- opsci_toolbox/helpers/nlp_cuml.py,sha256=CGyThKNgo6fdFPV-iooPG0oNrzA__Hvv08t_sdEp3BE,28919
- opsci_toolbox/helpers/sna.py,sha256=E5D_1aGDmq_YQYseHxZggEtWQOwbXJJ0GHu3YtZLGtg,31906
+ opsci_toolbox/helpers/nlp.py,sha256=TXf1_dvmfDY9tR0gjQ1C-KzPRib7t74_ZcvmcYZWcPs,105096
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=KfgC0hMqLCKoOME2DOu3Wje4ormV19fEB8Fyq8G7D-E,30901
+ opsci_toolbox/helpers/sna.py,sha256=3qx1WBQwLKpZNGR0bLSMB2-LBRx-vtNHp8puzoj-84A,33730
  opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
  opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
  opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
  opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.13.dist-info/METADATA,sha256=G_JhKg5tmYPkRUhAN2Uj9B6orX7x3TKWqIOKU_TjeIA,1727
- opsci_toolbox-0.0.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- opsci_toolbox-0.0.13.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.13.dist-info/RECORD,,
+ opsci_toolbox-0.0.15.dist-info/METADATA,sha256=ppE13xf4E90LfW9Eir5U30xOI91F96wQqAam7kZwV1o,1727
+ opsci_toolbox-0.0.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ opsci_toolbox-0.0.15.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+ opsci_toolbox-0.0.15.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.15.dist-info/RECORD,,
@@ -0,0 +1 @@
+ https://download.pytorch.org/whl/cu124