opsci-toolbox 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -2
- opsci_toolbox/apis/reddit.py +342 -334
- opsci_toolbox/apis/telegram.py +471 -41
- opsci_toolbox/helpers/common.py +3 -1
- opsci_toolbox/helpers/dates.py +1 -1
- opsci_toolbox/helpers/nlp.py +178 -33
- opsci_toolbox/helpers/nlp_cuml.py +47 -2
- opsci_toolbox/helpers/sna.py +34 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/METADATA +2 -2
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/RECORD +13 -12
- opsci_toolbox-0.0.15.dist-info/dependency_links.txt +1 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/common.py
CHANGED
@@ -383,6 +383,8 @@ def write_json(json_dict: dict, path: str, name: str) -> str:
 return file_path
 
 
+
+
 def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str = 'records') -> str:
 """
 Write a DataFrame to a JSON file.
@@ -603,7 +605,7 @@ def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:
 
 return files
 
-def copy_file(source_path: str, destination_path: str, new_filename: str =
+def copy_file(source_path: str, destination_path: str, new_filename: str = None) -> str:
 """
 Copy a file from a source path to a destination path.
 
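A minimal usage sketch for the completed copy_file signature, assuming the helper copies the file and returns the destination path (the paths are illustrative, not taken from the package):

from opsci_toolbox.helpers.common import copy_file

# new_filename now defaults to None (presumably keeping the original filename)
copied_path = copy_file("data/report.csv", "backup/")

# pass new_filename to copy the file under a different name
renamed_path = copy_file("data/report.csv", "backup/", new_filename="report_v2.csv")
print(copied_path, renamed_path)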
opsci_toolbox/helpers/dates.py
CHANGED
@@ -58,7 +58,7 @@ def number_of_days(start_date: datetime, end_date: datetime) -> int:
 days_difference (int): The number of days between the start and end dates.
 """
 # Calculate the difference
-time_difference =
+time_difference = end_date - start_date
 # Extract the number of days from the timedelta object
 days_difference = time_difference.days
 return days_difference
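The fix restores a standard datetime subtraction; a small self-contained sketch of the same computation:

from datetime import datetime

start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 31)

# subtracting two datetimes yields a timedelta, whose .days attribute is the day count
time_difference = end_date - start_date
days_difference = time_difference.days
print(days_difference)  # 30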
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -7,7 +7,7 @@ import os
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.manifold import TSNE
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
 from sklearn.cluster import AgglomerativeClustering
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
@@ -94,7 +94,7 @@ def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bo
 The filtered DataFrame.
 """
 eldar_query=Query(query, ignore_case = ignore_case, ignore_accent=ignore_accent, match_word=match_word)
-df
+df = df[df[col_text].apply(eldar_query)]
 df=df.reset_index(drop=True)
 return df
 
@@ -126,7 +126,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
 df[col_clean] = df[col_clean].apply(lambda x : brackets(x))
 df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
 df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
-df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
+# df[col_clean] = df[col_clean].apply(remove_multiple_hashtags)
 df[col_clean] = df[col_clean].apply(remove_extra_spaces)
 # df = df.loc[(df[col_clean] != ""), :]
 return df
@@ -541,7 +541,7 @@ def substitute_punctuations_with_white_space(text : str) -> str:
 text = re.sub(r"[%s]" % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), " ", text)
 return text
 
-def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> dict:
+def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> dict:
 """
 Translate text using LibreTranslate service.
 
@@ -558,6 +558,8 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
 The directory to save the translation result JSON file.
 url : str, optional
 The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+format_payload : str, optional
+Possible values are html or text.
 
 Returns:
 json_data : dict
@@ -568,7 +570,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
 "q": text,
 "source": source,
 "target": target,
-"format":
+"format": format_payload,
 "api_key": ""
 }
 
@@ -580,7 +582,7 @@ def translate_wt_libre(text: str, source: str, target: str, filename: str, dir_j
 write_json(json_data, dir_json , str(filename))
 return json_data
 
-def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate") -> list:
+def translate_batch(batch_text: list, source: str, target: str, filename: str, dir_json: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> list:
 """
 Translate a batch of texts using LibreTranslate service.
 
@@ -597,6 +599,8 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
 The directory to save the translation result JSONL file.
 url : str, optional
 The URL of the WT Libre translation service. Default is "http://127.0.0.1:5000/translate".
+format_payload : str, optional
+Possible values are html or text.
 
 Returns:
 json_results : list of dict
@@ -607,7 +611,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
 "q": batch_text,
 "source": source,
 "target": target,
-"format":
+"format": format_payload,
 "api_key": ""
 }
 
@@ -623,7 +627,7 @@ def translate_batch(batch_text: list, source: str, target: str, filename: str, d
 write_jsonl(json_results, dir_json , str(filename))
 return json_results
 
-def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate") -> str:
+def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:5000/translate", format_payload="html") -> str:
 """
 Translate text using LibreTranslate service.
 
@@ -636,6 +640,8 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
 The target language code.
 url : str, optional
 The URL of the translation service. Default is "http://127.0.0.1:5000/translate".
+format_payload : str, optional
+Possible values are html or text.
 
 Returns:
 translatedText : str
@@ -646,7 +652,7 @@ def translate(text: str, source: str, target: str, url: str = "http://127.0.0.1:
 "q": text,
 "source": source,
 "target": target,
-"format":
+"format": format_payload,
 "api_key": ""
 }
 
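A hedged sketch of how the new format_payload parameter is used, assuming a LibreTranslate instance is running at the default local URL (the sample text is illustrative):

from opsci_toolbox.helpers.nlp import translate

# format_payload is passed through as the LibreTranslate "format" field: "html" or "text"
translated = translate(
    "Bonjour <b>le monde</b>",
    source="fr",
    target="en",
    format_payload="html",
)
print(translated)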
@@ -905,6 +911,8 @@ def topic_aggregate_chunks(df: pd.DataFrame, col_id: str, col_topic : str, col_c
 """
 metrics_dict = dict()
 # metrics_dict[col_id]=(col_id,'first')
+# if col_id != col_chunk_id:
+# metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
 metrics_dict[col_chunk_id]=(col_chunk_id,"nunique")
 metrics_dict[col_engagement]=(col_engagement,'first')
 
@@ -1280,10 +1288,12 @@ def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "l
 NER_type.append(ent.label_)
 NER_text.append(ent.text)
 
+
 record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
 all_records.append(record)
 
 
+
 df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
 
 return df
@@ -1570,10 +1580,10 @@ def extract_emojis(nlp, df: pd.DataFrame, col_text: str, batch_size: int = 100,
 
 return df
 
-def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False) -> pd.DataFrame:
+def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1, batch_size: int = 100, n_process: int = 1, stats: bool = False, threshold: int = None) -> pd.DataFrame:
 """
-Split a text into chunks of n sentences
-
+Split a text into chunks of n sentences, returning their start and end indexes in separate columns.
+
 Parameters:
 nlp : spacy.language.Language
 The spaCy language processing pipeline.
@@ -1589,41 +1599,64 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
 The number of processes to use for text processing. Default is 1.
 stats : bool, optional
 Flag indicating whether to compute statistics about the splitting process. Default is False.
-
+threshold : int, optional
+Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+
 Returns:
 pd.DataFrame
-DataFrame containing the split sentences.
+DataFrame containing the split sentences with their start and end indexes in separate columns.
 
-Description:
-This function splits text in a DataFrame into chunks of n sentences. It returns a DataFrame containing the split sentences.
-Optionally, it can compute statistics such as the count of sentences and batches if the 'stats' parameter is set to True.
 """
+text = list(df[col_text].astype('unicode').values)
+
+count_sentences = []
+count_batches = []
+results = []
+start_indexes = []
+end_indexes = []
+
+for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total=len(text), desc="Sentence splitting"):
+sentences = []
+
+
+# Extract sentences and their positions
+for sent in doc.sents:
+sentences.append((sent.text, sent.start_char, sent.end_char))
 
-text=list(df[col_text].astype('unicode').values)
-
-count_sentences=[]
-count_batches=[]
-results=[]
-for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "Sentence splitting"):
-# Split the text into sentences
-sentences = [sent.text for sent in doc.sents]
 if stats:
 count_sentences.append(len(sentences))
-
-
+
+if n_sentences > 1:
+# # Split sentences into batches of size n_sentences
 batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
-
+
+# Concatenate batches of sentences and adjust spans accordingly
+concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+if threshold is not None:
+concatenate_batches = concatenate_batches[:threshold]
+concatenate_spans = concatenate_spans[:threshold]
+
 results.append(concatenate_batches)
+start_indexes.append([span[0] for span in concatenate_spans])
+end_indexes.append([span[1] for span in concatenate_spans])
+
 if stats:
 count_batches.append(len(concatenate_batches))
-
 else:
-
+sentences = sentences[:threshold] if threshold is not None else sentences
+
+results.append([sub[0] for sub in sentences])
+start_indexes.append([sub[1] for sub in sentences])
+end_indexes.append([sub[2] for sub in sentences])
 
 df['sentences'] = results
-
-
-
+df['start_indexes'] = start_indexes
+df['end_indexes'] = end_indexes
+
+df = df.explode(['sentences','start_indexes', 'end_indexes']).reset_index(drop=True)
+
 return df
 
 
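A minimal sketch of the reworked split_n_sentences, assuming a spaCy pipeline with sentence boundaries is available (the sample data is illustrative):

import pandas as pd
import spacy
from opsci_toolbox.helpers.nlp import split_n_sentences

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # doc.sents requires sentence boundaries

df = pd.DataFrame({"text": ["First sentence. Second sentence. Third sentence."]})

# chunks of 2 sentences, at most 1 chunk kept per text (threshold);
# the result is exploded so each row carries one chunk plus its start/end character indexes
df_split = split_n_sentences(nlp, df, col_text="text", n_sentences=2, threshold=1)
print(df_split[["sentences", "start_indexes", "end_indexes"]])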
@@ -1998,6 +2031,75 @@ def encode_chunked_files(chunk_files_paths: list,
 
 return new_file_paths
 
+####################################################################
+# ENCODING FEATURES
+####################################################################
+
+def encode_labels(data_to_encode: np.ndarray) -> tuple:
+"""
+Encodes a list of labels using a LabelEncoder.
+
+Args:
+- data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type,
+but strings or integers are typical.
+
+Returns:
+- Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array
+of encoded labels.
+"""
+label_encoder = LabelEncoder()
+label_encoder.fit(data_to_encode)
+encoded_labels = label_encoder.transform(data_to_encode)
+return label_encoder, encoded_labels
+
+
+def encode_new_labels(label_encoder : LabelEncoder, data_to_encode : np.ndarray) -> np.ndarray:
+"""
+Encodes a list of new labels using an already fitted LabelEncoder.
+
+Args:
+- label_encoder (LabelEncoder): A pre-fitted LabelEncoder instance.
+- data_to_encode (List[Union[str, int]]): The list of new labels to encode using the pre-fitted encoder.
+
+Returns:
+- np.ndarray: A numpy array of encoded labels.
+"""
+encoded_labels = label_encoder.transform(data_to_encode)
+return encoded_labels
+
+def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
+"""
+One-hot encodes a list of categorical values using OneHotEncoder.
+
+Args:
+- data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
+any hashable type, typically strings or integers.
+
+Returns:
+- Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
+of one-hot encoded values.
+"""
+one_hot_encoder = OneHotEncoder(sparse=False)
+data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1) # Reshape for OneHotEncoder
+one_hot_encoder.fit(data_to_encode_reshaped)
+encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+return one_hot_encoder, encoded_array
+
+
+def one_hot_encode_new_data(one_hot_encoder: OneHotEncoder, data_to_encode: np.ndarray) -> np.ndarray:
+"""
+One-hot encodes a list of new categorical values using an already fitted OneHotEncoder.
+
+Args:
+- one_hot_encoder (OneHotEncoder): A pre-fitted OneHotEncoder instance.
+- data_to_encode (List[Union[str, int]]): The list of new categorical values to encode using the pre-fitted encoder.
+
+Returns:
+- np.ndarray: A numpy array of one-hot encoded values.
+"""
+data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1) # Reshape for OneHotEncoder
+encoded_array = one_hot_encoder.transform(data_to_encode_reshaped)
+return encoded_array
 
 ####################################################################
 # SCALING FEATURES
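A short sketch of the new encoding helpers, which expose a fit-then-reuse split over scikit-learn's LabelEncoder and OneHotEncoder (toy data, not from the package; note that one_hot_encode constructs OneHotEncoder(sparse=False), which assumes a scikit-learn version that still accepts the sparse keyword):

import numpy as np
from opsci_toolbox.helpers.nlp import (
    encode_labels,
    encode_new_labels,
    one_hot_encode,
    one_hot_encode_new_data,
)

labels = np.array(["positive", "negative", "neutral", "positive"])

# fit once, then reuse the fitted encoders on new data
label_encoder, encoded = encode_labels(labels)
print(encoded)                                    # [2 0 1 2]
print(encode_new_labels(label_encoder, np.array(["neutral"])))

one_hot_encoder, one_hot = one_hot_encode(labels)
print(one_hot.shape)                              # (4, 3)
print(one_hot_encode_new_data(one_hot_encoder, np.array(["negative"])))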
@@ -2327,3 +2429,46 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
 write_json(results, dir_json , str(filename))
 
 return results
+
+
+def add_tag_libretranslate_not_translate(text):
+"""
+This function add fake html tag around words such as mentions, hashtags, urls and emojis to avoid translation of those tokens.
+
+Args:
+text (str): The text to process
+
+Returns:
+str: The text with the fake html tags
+"""
+# This regex finds words starting with # and followed by alphanumeric characters or underscores
+mention_pattern = r"(?:RT\s|QT\s)?(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z0-9_]{4,15})"
+hashtag_pattern = r"(\B#\w+)"
+url_pattern = r"(https?://[^ ]+)"
+emoji_pattern = r':[a-zA-Z_]+:'
+
+pattern = re.compile(emoji_pattern+ "|" + mention_pattern + "|" + hashtag_pattern + "|" + url_pattern)
+
+# This function replaces the hashtag with an HTML link tag
+def replace_with_link(match):
+matcher_group = match.group(0)
+return f'<a href="{matcher_group}"></a>'
+
+# Use re.sub to substitute the hashtags with the HTML link tags
+text_no_emojis = emoji.demojize(text)
+result = re.sub(pattern, replace_with_link, text_no_emojis)
+
+return result
+
+def clean_libre_translate_tags(text):
+"""
+This function remove fake tags added by add_tag_libretranslate_not_translate() function.
+
+Args:
+text (str): The text to process
+
+Returns:
+str: The text with the fake html tags
+"""
+cleaned_string = text.replace('<a href="', '').replace('"></a>', '')
+return cleaned_string
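A quick sketch of the two new tag helpers around a translation call (the tweet-like sample text is illustrative):

from opsci_toolbox.helpers.nlp import (
    add_tag_libretranslate_not_translate,
    clean_libre_translate_tags,
)

text = "RT @openai Nouveau modèle dispo 🚀 #IA https://example.com"

# mentions, hashtags, URLs and demojized emojis are wrapped in fake <a> tags
protected = add_tag_libretranslate_not_translate(text)
print(protected)

# ... send `protected` to LibreTranslate with format_payload="html" ...

# strip the fake tags from the translated output
print(clean_libre_translate_tags(protected))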
opsci_toolbox/helpers/nlp_cuml.py
CHANGED
@@ -18,7 +18,8 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
 metric: str = "cosine",
 spread: float = 1.0,
 learning_rate: float = 1.0,
-n_epochs:int = 300
+n_epochs:int = 300,
+random_state:int = None
 ) -> tuple:
 """
 Reduces the dimensionality of embeddings using UMAP with cuML library.
@@ -41,7 +42,48 @@ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
 metric=metric,
 spread = spread,
 n_epochs=n_epochs,
-learning_rate=learning_rate
+learning_rate=learning_rate,
+random_state=random_state).fit(embeddings)
+
+reduced_embeddings = reducer.transform(embeddings)
+return reducer, reduced_embeddings
+
+
+def supervised_reduce_with_cuml_UMAP(embeddings: np.ndarray,
+n_neighbors: int = 5,
+n_components: int = 3,
+min_dist: float = 0.0,
+metric: str = "cosine",
+spread: float = 1.0,
+learning_rate: float = 1.0,
+n_epochs:int = 300,
+y: np.ndarray = None,
+convert_dtype: bool = False,
+random_state:int=None
+) -> tuple:
+"""
+Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+Args:
+embeddings (np.ndarray): The input embeddings to be reduced.
+n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+Returns:
+reducer (UMAP): The UMAP reducer object.
+reduced_embeddings (np.ndarray): The reduced embeddings.
+"""
+reducer = UMAP(n_neighbors=n_neighbors,
+n_components=n_components,
+min_dist=min_dist,
+metric=metric,
+spread = spread,
+n_epochs=n_epochs,
+learning_rate=learning_rate,
+random_state=random_state).fit(X = embeddings, y = y, convert_dtype = convert_dtype)
 
 reduced_embeddings = reducer.transform(embeddings)
 return reducer, reduced_embeddings
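A hedged sketch of the new random_state and supervised options, assuming a RAPIDS cuML environment with a GPU (embeddings and labels are random toy data):

import numpy as np
from opsci_toolbox.helpers.nlp_cuml import (
    reduce_with_cuml_UMAP,
    supervised_reduce_with_cuml_UMAP,
)

embeddings = np.random.rand(1000, 384).astype(np.float32)
labels = np.random.randint(0, 5, size=1000)

# passing random_state makes the projection reproducible across runs
reducer, reduced = reduce_with_cuml_UMAP(embeddings, n_components=3, random_state=42)

# the supervised variant additionally takes target labels via y
sup_reducer, sup_reduced = supervised_reduce_with_cuml_UMAP(
    embeddings, n_components=3, y=labels, random_state=42
)
print(reduced.shape, sup_reduced.shape)  # (1000, 3) (1000, 3)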
@@ -409,6 +451,9 @@ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat:
 # Convert the result back to pandas DataFrame
 return df_count.to_pandas()
 
+
+
+
 # def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
 
 # # Convert input lists to cuDF Series
opsci_toolbox/helpers/sna.py
CHANGED
@@ -11,6 +11,40 @@ from collections import Counter
 from opsci_toolbox.helpers.dataviz import boxplot
 from fa2_modified import ForceAtlas2
 
+def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: float = 2) -> nx.Graph:
+"""
+Creates a subgraph containing only the nodes that have at least the specified minimum value for a given metric.
+
+Args:
+G (nx.Graph): The input graph.
+metric (str, optional): The node metric to filter nodes by (e.g., "degree", "in_degree", "out_degree", "degree_centrality"). Default is "degree".
+min_value (float, optional): The minimum value required for nodes to be included in the subgraph. Default is 2.
+
+Returns:
+subgraph (nx.Graph): A subgraph containing only the nodes with at least the specified minimum metric value.
+"""
+
+if metric == "degree":
+nodes_with_min_metric = [node for node, value in G.degree() if value >= min_value]
+elif metric == "in_degree" and G.is_directed():
+nodes_with_min_metric = [node for node, value in G.in_degree() if value >= min_value]
+elif metric == "out_degree" and G.is_directed():
+nodes_with_min_metric = [node for node, value in G.out_degree() if value >= min_value]
+elif metric == "degree_centrality":
+centrality = nx.degree_centrality(G)
+nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+elif metric == "betweenness_centrality":
+centrality = nx.betweenness_centrality(G)
+nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+elif metric == "closeness_centrality":
+centrality = nx.closeness_centrality(G)
+nodes_with_min_metric = [node for node, value in centrality.items() if value >= min_value]
+else:
+raise ValueError(f"Unsupported metric: {metric}")
+
+subgraph = G.subgraph(nodes_with_min_metric).copy()
+return subgraph
+
 def group_nodes_by_values(dictionnary : dict) -> dict:
 """
 Group nodes by their values from a dictionary.
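A minimal sketch of the new create_subgraph_min_metric helper on a toy graph; degree-based metrics take raw counts, while the centrality metrics are normalized, so min_value should be chosen accordingly:

import networkx as nx
from opsci_toolbox.helpers.sna import create_subgraph_min_metric

G = nx.karate_club_graph()

# keep only nodes with degree >= 5
core = create_subgraph_min_metric(G, metric="degree", min_value=5)
print(core.number_of_nodes(), core.number_of_edges())

# centrality values lie in [0, 1], so use a fractional threshold
central = create_subgraph_min_metric(G, metric="degree_centrality", min_value=0.2)
print(sorted(central.nodes()))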
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.
+Version: 0.0.15
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -41,7 +41,7 @@ Requires-Dist: spacy-language-detection ==0.2.1
 Requires-Dist: spacymoji ==3.1.0
 Requires-Dist: supervision ==0.21.0
 Requires-Dist: textacy ==0.13.0
-Requires-Dist: torch
+Requires-Dist: torch >=2.4.0
 Requires-Dist: tqdm >=4.66.2
 Requires-Dist: trafilatura ==1.7.0
 Requires-Dist: transformers ==4.38.2
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/RECORD
CHANGED
@@ -1,25 +1,26 @@
 opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/rapidapi_helpers.py,sha256=
-opsci_toolbox/apis/reddit.py,sha256=
-opsci_toolbox/apis/telegram.py,sha256=
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
+opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
+opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWPA,60920
 opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=
+opsci_toolbox/helpers/common.py,sha256=zmi-FbN39Rci_hGEKj2bmkcucrVwnHhMgKU6AAIap3Q,53327
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
 opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
-opsci_toolbox/helpers/dates.py,sha256=
+opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
 opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
-opsci_toolbox/helpers/nlp.py,sha256=
-opsci_toolbox/helpers/nlp_cuml.py,sha256=
-opsci_toolbox/helpers/sna.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=TXf1_dvmfDY9tR0gjQ1C-KzPRib7t74_ZcvmcYZWcPs,105096
+opsci_toolbox/helpers/nlp_cuml.py,sha256=KfgC0hMqLCKoOME2DOu3Wje4ormV19fEB8Fyq8G7D-E,30901
+opsci_toolbox/helpers/sna.py,sha256=3qx1WBQwLKpZNGR0bLSMB2-LBRx-vtNHp8puzoj-84A,33730
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
 opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.15.dist-info/METADATA,sha256=ppE13xf4E90LfW9Eir5U30xOI91F96wQqAam7kZwV1o,1727
+opsci_toolbox-0.0.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.15.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+opsci_toolbox-0.0.15.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.15.dist-info/RECORD,,
opsci_toolbox-0.0.15.dist-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+https://download.pytorch.org/whl/cu124
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/WHEEL
File without changes
{opsci_toolbox-0.0.13.dist-info → opsci_toolbox-0.0.15.dist-info}/top_level.txt
File without changes