opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/helpers/common.py +39 -20
- opsci_toolbox/helpers/dataviz.py +4262 -1975
- opsci_toolbox/helpers/nlp.py +53 -32
- opsci_toolbox-0.0.11.dist-info/METADATA +53 -0
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.11.dist-info}/RECORD +7 -7
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.11.dist-info}/WHEEL +1 -1
- opsci_toolbox-0.0.10.dist-info/METADATA +0 -53
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.11.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -48,6 +48,28 @@ def remove_html_tags(text: str) -> str:
     soup = BeautifulSoup(text, "html.parser")
     return soup.get_text()
 
+def remove_rt(text: str) -> str:
+    """
+    Remove the retweet tag from a given text.
+
+    Args:
+        - text (str): The input text possibly containing a retweet tag in the format "RT @username: ".
+
+    Returns:
+        - str: The cleaned text with the retweet tag removed.
+
+    Example:
+        >>> remove_rt("RT @user123: Check out this tweet!")
+        'Check out this tweet!'
+    """
+    # Regular expression pattern to match "RT @username: "
+    pattern = r'RT @\w+: '
+
+    # Substitute the pattern with an empty string
+    cleaned_text = re.sub(pattern, '', text)
+
+    return cleaned_text
+
 def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
     """
     Filter DataFrame rows by a query on a specific text column.
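The new `remove_rt` helper depends only on `re`, which the module already imports. A quick standalone sketch of the pattern's behavior (re-implemented here for illustration, not imported from the package):

```python
import re

def remove_rt(text: str) -> str:
    # Drop every "RT @username: " marker; \w+ matches letters, digits and underscores.
    return re.sub(r'RT @\w+: ', '', text)

print(remove_rt("RT @user123: Check out this tweet!"))  # -> 'Check out this tweet!'
print(remove_rt("RT @user123 no colon-space"))          # unchanged: the pattern requires ': '
```

Note the pattern removes markers anywhere in the string, not only at the start, so quoted retweets inside a longer text are stripped as well.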
@@ -91,6 +113,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     df : pandas DataFrame
         The DataFrame with cleaned text data.
     """
+    df[col_clean] = df[col].apply(remove_rt)
     df[col_clean] = df[col].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
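`TM_clean_text` now strips retweet markers first. Note, though, that the line right after the insertion still reads from `df[col]`, so the `remove_rt` pass appears to be overwritten before the @-mention filter runs. A toy trace with stand-in helpers shows the effect (the lambdas are assumptions for illustration; the real `urls` and `remove_extra_spaces` live elsewhere in the toolbox):

```python
import re
import pandas as pd

# Stand-ins for the toolbox helpers, assumed for this trace only.
remove_rt = lambda t: re.sub(r'RT @\w+: ', '', t)
urls = lambda t, repl='': re.sub(r'https?://\S+', repl, t)
remove_extra_spaces = lambda t: re.sub(r'\s+', ' ', t).strip()

df = pd.DataFrame({"text": ["RT @user123: hello @world http://x.co !"]})
col, col_clean = "text", "clean"

df[col_clean] = df[col].apply(remove_rt)
df[col_clean] = df[col].apply(lambda x: urls(x, repl=''))  # reads df[col] again, discarding the previous step
df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
df[col_clean] = df[col_clean].apply(remove_extra_spaces)

print(df[col_clean].iloc[0])  # 'RT hello !' : the bare "RT" survives under these stand-ins
```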
@@ -1358,14 +1381,14 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
     return df
 
 
-def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True) -> pd.DataFrame:
+def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True, batch_size : int = 100, n_process: int =1) -> pd.DataFrame:
     """
     Spacy implementation of NER.
     To define entities type to keep, call get_labels(nlp, pipe_step="ner", explanations=False)
     explode = False means it returns 1 list of entities per document
     explode = True means it returns 1 entity per row
 
-
+    Args:
     nlp : spacy.language.Language
         The spaCy language processing pipeline.
     df : pd.DataFrame
@@ -1376,6 +1399,10 @@ def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['P
         List of entity types to keep. Default is ['PERSON','ORG'].
     explode : bool, optional
         Flag indicating whether to explode the DataFrame to have one entity per row. Default is True.
+    batch_size : int, optional
+        Batch sizes
+    n_process : int, optional
+        Number of processes
 
     Returns:
         pd.DataFrame
@@ -1385,43 +1412,40 @@ def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['P
     This function performs Named Entity Recognition (NER) using spaCy on a DataFrame with text data. It extracts entities of the specified types
     and stores the NER information in separate columns. If 'explode' is set to True, it returns one entity per row in the DataFrame.
     """
-
-
-
-
-
-
-
-    def process_row(row):
-        doc = nlp(row[col_text])
-        entities_data = []
+    l_text = df[col_text].tolist()
+    all_records = []
+    for doc in tqdm(nlp.pipe(l_text, batch_size=batch_size, n_process=n_process), total= len(l_text), desc = "NLP Process"):
+        NER_type = []
+        NER_text = []
+        NER_start_char = []
+        NER_end_char=[]
+        # entities_data = []
 
         if len(entities_to_keep)>0:
             for ent in doc.ents:
                 if ent.label_ in entities_to_keep:
-                    entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+                    NER_start_char.append(ent.start_char)
+                    NER_end_char.append(ent.end_char)
+                    # entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
        else:
             for ent in doc.ents:
-                entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+                NER_start_char.append(ent.start_char)
+                NER_end_char.append(ent.end_char)
+                # entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
+        record = (NER_type, NER_text, NER_start_char, NER_end_char)
+        all_records.append(record)
 
-
-        entity_label, entity_text, start_char, end_char = zip(*entities_data)
-        row['NER_type'] = entity_label
-        row['NER_text'] = entity_text
-        row['NER_start_char'] = start_char
-        row['NER_end_char'] = end_char
-
-        return row
-
-    # Apply the processing function to each row
-    df = df.apply(process_row, axis=1)
+    df[['NER_type', 'NER_text','NER_start_char','NER_end_char']] = pd.DataFrame(all_records, index=df.index)
 
     if explode:
         df= df.explode(['NER_type', 'NER_text','NER_start_char','NER_end_char'])
 
     return df
 
-
 def tokenize(nlp, df: pd.DataFrame, col_text: str, col_tokens: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
     """
     Spacy implementation to tokenize text
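The rewrite replaces a per-row `df.apply` (one `nlp(...)` call per document) with a single streamed `nlp.pipe` pass, which batches documents and can fan work out across `n_process` workers. A minimal sketch of the same pattern, assuming an installed `en_core_web_sm` model:

```python
import pandas as pd
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")
df = pd.DataFrame({"text": ["Tim Cook leads Apple.", "Paris is lovely."]})

texts = df["text"].tolist()
records = []
# Single streamed pass: spaCy batches the texts internally.
for doc in tqdm(nlp.pipe(texts, batch_size=100, n_process=1), total=len(texts), desc="NER"):
    kept = [(e.label_, e.text, e.start_char, e.end_char)
            for e in doc.ents if e.label_ in {"PERSON", "ORG"}]
    records.append(tuple(zip(*kept)) if kept else ((), (), (), ()))

df[["NER_type", "NER_text", "NER_start_char", "NER_end_char"]] = pd.DataFrame(records, index=df.index)
print(df.explode(["NER_type", "NER_text", "NER_start_char", "NER_end_char"]))
```

Collecting one record per document and assigning all four columns in a single write, as the new code does, also avoids mutating a Series per row on top of the per-document `nlp(...)` calls.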
@@ -1901,15 +1925,13 @@ def agglomerative_clustering(embeddings, n_clusters=15, metric="euclidean", link
 
 
 
-def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selection_epsilon=0.0, approx_min_span_tree=True,
-                       gen_min_span_tree=True, leaf_size=40, metric='euclidean', min_cluster_size=5, min_samples=None,
-                       p=None, cluster_selection_method='eom', prediction_data = True):
-
+def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selection_epsilon=0.0, approx_min_span_tree=True, gen_min_span_tree=True, leaf_size=40, metric='euclidean', min_cluster_size=5, min_samples=None, p=None, cluster_selection_method='eom', prediction_data = True):
     """
     This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm. It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the probability of each sample being an outlier.
+
     Args
         embeddings : array-like or sparse matrix, shape (n_samples, n_features). The input data to be clustered.
-        algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is
+        algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is best.
         alpha : float, optional. Scaling factor determining the individual weight of the (unnormalized) density estimate. Default is 1.0.
         cluster_selection_epsilon : float, optional. The epsilon value to specify a minimum cluster size. Default is 0.0.
         approx_min_span_tree : bool, optional. Whether to compute an approximation of the minimum spanning tree. Default is True.
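The signature reflow is cosmetic; the parameters map one-to-one onto `hdbscan.HDBSCAN`. A small sketch with the same defaults on synthetic embeddings:

```python
import hdbscan
import numpy as np

# Two well-separated Gaussian blobs stand in for real embeddings.
rng = np.random.default_rng(42)
embeddings = np.vstack([rng.normal(0, 0.3, (50, 2)), rng.normal(5, 0.3, (50, 2))])

clusterer = hdbscan.HDBSCAN(algorithm='best', alpha=1.0, cluster_selection_epsilon=0.0,
                            approx_min_span_tree=True, gen_min_span_tree=True, leaf_size=40,
                            metric='euclidean', min_cluster_size=5, min_samples=None,
                            p=None, cluster_selection_method='eom', prediction_data=True)
labels = clusterer.fit_predict(embeddings)
print(np.unique(labels))             # -1, if present, marks noise points
print(clusterer.probabilities_[:5])  # per-point cluster membership strength
```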
@@ -2054,7 +2076,6 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
     proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]
     label = model.config.id2label[proba.argmax()]
     results = {"label":label, "score" : float(proba.max()), col_text : text}
-    print(results)
     write_json(results, dir_json , str(filename))
 
     return results
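The only change in this hunk drops a debug `print`. For context, a self-contained sketch of the scoring pattern around the changed lines, using an arbitrary public checkpoint in place of whatever `tokenizer` and `model` the caller passes in:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

name = "distilbert-base-uncased-finetuned-sst-2-english"  # arbitrary example checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

text = "I love this toolbox"
inputs = tokenizer(text, return_tensors="pt", truncation=True)
with torch.no_grad():
    proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]

label = model.config.id2label[proba.argmax()]
results = {"label": label, "score": float(proba.max()), "text": text}
print(results)  # e.g. {'label': 'POSITIVE', 'score': 0.99..., 'text': 'I love this toolbox'}
```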
opsci_toolbox-0.0.11.dist-info/METADATA
ADDED
@@ -0,0 +1,53 @@
+Metadata-Version: 2.1
+Name: opsci-toolbox
+Version: 0.0.11
+Summary: a complete toolbox
+Home-page: UNKNOWN
+Author: Erwan Le Nagard
+Author-email: erwan@opsci.ai
+License: MIT
+Platform: UNKNOWN
+Requires-Dist: requests <3,>=2.31.0
+Requires-Dist: beautifulsoup4 ==4.9.3
+Requires-Dist: chardet >=4.0.0
+Requires-Dist: chart-studio ==1.1.0
+Requires-Dist: eldar ==0.0.8
+Requires-Dist: emoji ==2.10.1
+Requires-Dist: fa2-modified ==0.3.10
+Requires-Dist: google-api-python-client ==2.122.0
+Requires-Dist: gspread ==6.1.2
+Requires-Dist: hdbscan ==0.8.33
+Requires-Dist: jusText ==3.0.0
+Requires-Dist: langchain ==0.1.20
+Requires-Dist: matplotlib >=3.9.0
+Requires-Dist: mysql-connector-python >=9.0.0
+Requires-Dist: networkx ==3.2.1
+Requires-Dist: nltk ==3.8.1
+Requires-Dist: numpy <1.25.0,>=1.21.5
+Requires-Dist: opencv-python-headless ==4.9.0.80
+Requires-Dist: openpyxl ==3.1.3
+Requires-Dist: pandas >=1.5.3
+Requires-Dist: Pillow >=9.0.1
+Requires-Dist: plotly ==5.19.0
+Requires-Dist: protobuf ==4.23.4
+Requires-Dist: pyarrow >=14.0.2
+Requires-Dist: python-louvain ==0.16
+Requires-Dist: scikit-learn ==1.4.1.post1
+Requires-Dist: scipy <2.0.0,>=1.8.0
+Requires-Dist: sentence-transformers ==2.5.1
+Requires-Dist: setuptools ==59.6.0
+Requires-Dist: spacy ==3.7.4
+Requires-Dist: spacy-language-detection ==0.2.1
+Requires-Dist: spacymoji ==3.1.0
+Requires-Dist: supervision ==0.21.0
+Requires-Dist: textacy ==0.13.0
+Requires-Dist: torch ==2.0.1
+Requires-Dist: tqdm >=4.66.2
+Requires-Dist: trafilatura ==1.7.0
+Requires-Dist: transformers ==4.38.2
+Requires-Dist: umap-learn ==0.5.5
+Requires-Dist: urlextract ==1.9.0
+Requires-Dist: wordcloud ==1.9.3
+
+UNKNOWN
+
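Once the wheel is installed (`pip install opsci-toolbox==0.0.11`), this METADATA is what the standard library surfaces at runtime; a quick sketch:

```python
from importlib.metadata import metadata, requires, version

print(version("opsci-toolbox"))    # 0.0.11
md = metadata("opsci-toolbox")
print(md["Author-email"])          # erwan@opsci.ai
for req in requires("opsci-toolbox") or []:
    print(req)                     # one line per Requires-Dist entry above
```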
{opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.11.dist-info}/RECORD
CHANGED
@@ -4,11 +4,11 @@ opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSl
 opsci_toolbox/apis/webscraping.py,sha256=Gz3hOfhOHUpwHU1Pzj3mB2WdBAcKa2WisYBHMi3lcVE,18343
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=
+opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
-opsci_toolbox/helpers/dataviz.py,sha256=
+opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
 opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
-opsci_toolbox/helpers/nlp.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=baq4BsSgeLBgToPOU5RTmDA80dFJwH9xf0jppuAVseU,88947
 opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
 opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -16,7 +16,7 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.11.dist-info/METADATA,sha256=5h-cfwhi31VKlzrOfdAeZuoKTLB1iyDIA4qqsz-bZGQ,1633
+opsci_toolbox-0.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.11.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.11.dist-info/RECORD,,
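Each RECORD row is `path,hash,size`, where the hash is the urlsafe-base64 SHA-256 digest of the file with `=` padding stripped (PEP 376/427); that is why the `RECORD` entry itself ends in `,,`. A sketch that recomputes the hashes for an installed copy, assuming a standard site-packages layout:

```python
import base64
import csv
import hashlib
from pathlib import Path

import opsci_toolbox  # assumes the wheel is installed

def record_hash(path: Path) -> str:
    # urlsafe-base64 SHA-256 with padding stripped, per the wheel RECORD format
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

site = Path(opsci_toolbox.__path__[0]).parent
record = site / "opsci_toolbox-0.0.11.dist-info" / "RECORD"
with record.open(newline="") as fh:
    for name, expected, _size in csv.reader(fh):
        if expected:  # the RECORD line itself carries no hash
            print("OK " if record_hash(site / name) == expected else "BAD", name)
```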
opsci_toolbox-0.0.10.dist-info/METADATA
DELETED
@@ -1,53 +0,0 @@
-Metadata-Version: 2.1
-Name: opsci-toolbox
-Version: 0.0.10
-Summary: a complete toolbox
-Home-page: UNKNOWN
-Author: Erwan Le Nagard
-Author-email: erwan@opsci.ai
-License: MIT
-Platform: UNKNOWN
-Requires-Dist: Pillow (>=9.0.1)
-Requires-Dist: Requests (==2.32.3)
-Requires-Dist: beautifulsoup4 (==4.10.0)
-Requires-Dist: chardet (>=4.0.0)
-Requires-Dist: chart-studio (==1.1.0)
-Requires-Dist: eldar (==0.0.8)
-Requires-Dist: emoji (==2.10.1)
-Requires-Dist: fa2-modified (==0.3.10)
-Requires-Dist: google-api-python-client (==2.122.0)
-Requires-Dist: gspread (==6.1.2)
-Requires-Dist: hdbscan (==0.8.33)
-Requires-Dist: jusText (==3.0.0)
-Requires-Dist: langchain (==0.1.20)
-Requires-Dist: matplotlib (>=3.9.0)
-Requires-Dist: mysql-connector-repackaged (==0.3.1)
-Requires-Dist: networkx (==3.2.1)
-Requires-Dist: nltk (==3.8.1)
-Requires-Dist: numpy (<1.25.0,>=1.21.5)
-Requires-Dist: opencv-python-headless (==4.9.0.80)
-Requires-Dist: openpyxl (==3.1.3)
-Requires-Dist: pandas (>=1.5.3)
-Requires-Dist: plotly (==5.19.0)
-Requires-Dist: protobuf (==4.23.4)
-Requires-Dist: pyarrow (>=14.0.2)
-Requires-Dist: python-louvain (==0.16)
-Requires-Dist: scikit-learn (==1.4.1.post1)
-Requires-Dist: scipy (<2.0.0,>=1.8.0)
-Requires-Dist: sentence-transformers (==2.5.1)
-Requires-Dist: setuptools (==59.6.0)
-Requires-Dist: spacy (==3.7.4)
-Requires-Dist: spacy-language-detection (==0.2.1)
-Requires-Dist: spacymoji (==3.1.0)
-Requires-Dist: supervision (==0.21.0)
-Requires-Dist: textacy (==0.13.0)
-Requires-Dist: torch (==2.0.1)
-Requires-Dist: tqdm (==4.66.2)
-Requires-Dist: trafilatura (==1.7.0)
-Requires-Dist: transformers (==4.38.2)
-Requires-Dist: umap-learn (==0.5.5)
-Requires-Dist: urlextract (==1.9.0)
-Requires-Dist: wordcloud (==1.9.3)
-
-UNKNOWN
-
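Against 0.0.11, the notable dependency moves are `Requests ==2.32.3` to `requests <3,>=2.31.0`, `beautifulsoup4 ==4.10.0` to `==4.9.3`, `mysql-connector-repackaged` to `mysql-connector-python >=9.0.0`, and `tqdm` relaxed from a pin to `>=4.66.2`. A sketch normalizing both pin styles with `packaging` (not itself a dependency of the toolbox):

```python
from packaging.requirements import Requirement

old = ["Requests (==2.32.3)", "beautifulsoup4 (==4.10.0)",
       "mysql-connector-repackaged (==0.3.1)", "tqdm (==4.66.2)"]
new = ["requests <3,>=2.31.0", "beautifulsoup4 ==4.9.3",
       "mysql-connector-python >=9.0.0", "tqdm >=4.66.2"]

# Requirement() accepts both the old parenthesized style and the new bare specifiers.
for o, n in zip(old, new):
    ro, rn = Requirement(o), Requirement(n)
    print(f"{ro.name:30}{str(ro.specifier):15} -> {rn.name}: {rn.specifier}")
```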
{opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.11.dist-info}/top_level.txt
File without changes