opsci-toolbox 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in the public registry.
- opsci_toolbox/apis/webscraping.py +8 -3
- opsci_toolbox/helpers/nlp.py +79 -1
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.16.dist-info}/METADATA +1 -1
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.16.dist-info}/RECORD +7 -7
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.16.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.16.dist-info}/dependency_links.txt +0 -0
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.16.dist-info}/top_level.txt +0 -0
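To reproduce this comparison locally, one approach (a sketch assuming pip, unzip, and diff are available; the directory names are arbitrary) is to download both wheels and diff their unpacked contents:

    pip download opsci-toolbox==0.0.15 --no-deps -d v15
    pip download opsci-toolbox==0.0.16 --no-deps -d v16
    unzip -q v15/*.whl -d v15/src && unzip -q v16/*.whl -d v16/src
    diff -ru v15/src v16/src

Wheels are plain zip archives, so any unzip tool works.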
opsci_toolbox/apis/webscraping.py
CHANGED
@@ -97,9 +97,14 @@ def url_get_domain(url: str) -> str:
     Returns:
         str: The domain name extracted from the URL.
     """
-
-
-
+    try:
+        parsed_url = urlparse(url)
+        domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
+        return domain
+    except Exception as e:
+        pass
+        print(url, e)
+        return url
 
 
 def url_get_extension(url: str) -> str:
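The rewritten url_get_domain wraps the parse in a try/except and prefers urlparse().hostname over .netloc. The distinction matters because hostname strips credentials and the port and lowercases the host, while netloc keeps them verbatim. A minimal illustration using only the standard library:

    from urllib.parse import urlparse

    u = urlparse("https://user:pw@Example.COM:8080/page")
    print(u.netloc)    # user:pw@Example.COM:8080
    print(u.hostname)  # example.com

On a failed parse, the new code prints the error and falls back to returning the raw URL rather than raising.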
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -30,7 +30,7 @@ from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
-
+from nltk.tokenize import PunktSentenceTokenizer
 
 ####################################################################
 # CLEANING
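The new import pulls in NLTK's PunktSentenceTokenizer. A quick check, assuming nltk is installed: instantiating the class directly should work without downloading the pre-trained 'punkt' resource, since it falls back on Punkt's built-in heuristics (abbreviation handling will be weaker than with the trained model):

    from nltk.tokenize import PunktSentenceTokenizer

    tokenizer = PunktSentenceTokenizer()
    print(tokenizer.tokenize("First sentence. Second one! A third?"))
    # Expected: ['First sentence.', 'Second one!', 'A third?']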
@@ -1660,6 +1660,84 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
     return df
 
 
+def split_n_sentences_nltk(df: pd.DataFrame, col_text: str, n_sentences: int = 1, threshold: int = None, stats: bool = False) -> pd.DataFrame:
+    """
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns using NLTK PunktSentenceTokenizer.
+
+    Parameters:
+    df : pd.DataFrame
+        DataFrame containing the text data to split.
+    col_text : str
+        The name of the column containing the text data.
+    n_sentences : int, optional
+        The number of sentences to group together. Default is 1.
+    threshold : int, optional
+        Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+    stats : bool, optional
+        Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+    pd.DataFrame
+        DataFrame containing the split sentences with their start and end indexes in separate columns.
+
+    """
+    tokenizer = PunktSentenceTokenizer()
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(text, total=len(text), desc="Sentence splitting"):
+        sentences = []
+        start_pos = 0
+
+        # Tokenize sentences and compute positions
+        for sent in tokenizer.tokenize(doc):
+            start_idx = doc.find(sent, start_pos)
+            end_idx = start_idx + len(sent)
+            sentences.append((sent, start_idx, end_idx))
+            start_pos = end_idx
+
+        if stats:
+            count_sentences.append(len(sentences))
+
+        if n_sentences > 1:
+            # Split sentences into batches of size n_sentences
+            batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
+            results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
+            if stats:
+                count_batches.append(len(concatenate_batches))
+        else:
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])
+
+    df['sentences'] = results
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences', 'start_indexes', 'end_indexes']).reset_index(drop=True)
+
+    return df
+
+
 def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True, batch_size : int = 100, n_process: int =1) -> pd.DataFrame:
     """
     Spacy implementation of NER.
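A sketch of how the new helper might be called once 0.0.16 is installed; the DataFrame contents and the 'text' column name are illustrative:

    import pandas as pd
    from opsci_toolbox.helpers.nlp import split_n_sentences_nltk

    # Hypothetical two-row corpus.
    df = pd.DataFrame({"text": ["First sentence. Second one. Third one.",
                                "Only one here."]})

    # Group sentences two at a time; after the explode, each output row holds
    # one chunk plus its character start/end offsets in the source text.
    out = split_n_sentences_nltk(df, col_text="text", n_sentences=2)
    print(out[["sentences", "start_indexes", "end_indexes"]])

Unlike split_n_sentences just above it, this variant needs no spaCy nlp object, which makes it lighter for pure sentence chunking.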
{opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.16.dist-info}/RECORD
CHANGED
@@ -3,7 +3,7 @@ opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
 opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
 opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWPA,60920
-opsci_toolbox/apis/webscraping.py,sha256=
+opsci_toolbox/apis/webscraping.py,sha256=fo6H2OaH0m_LHJB9IyN-q0Vkk8L9OvHxNn4O_A6a6yc,21572
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/helpers/common.py,sha256=zmi-FbN39Rci_hGEKj2bmkcucrVwnHhMgKU6AAIap3Q,53327
@@ -11,7 +11,7 @@ opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,2
 opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
 opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
 opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
-opsci_toolbox/helpers/nlp.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=4edA5JZ4vzpU4U9w-INNspW2oTQ-yYpm5rFXExKB4YI,108324
 opsci_toolbox/helpers/nlp_cuml.py,sha256=KfgC0hMqLCKoOME2DOu3Wje4ormV19fEB8Fyq8G7D-E,30901
 opsci_toolbox/helpers/sna.py,sha256=3qx1WBQwLKpZNGR0bLSMB2-LBRx-vtNHp8puzoj-84A,33730
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -19,8 +19,8 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.16.dist-info/METADATA,sha256=-SCFUBnwnWlUrOGgQwxib8ZfCjWxXm3iVVwnfErQ9Fk,1727
+opsci_toolbox-0.0.16.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.16.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+opsci_toolbox-0.0.16.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.16.dist-info/RECORD,,
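Each RECORD entry has the form path,sha256=digest,size, where the digest is the urlsafe-base64-encoded SHA-256 of the file with the trailing '=' padding stripped (per the wheel spec). A minimal sketch to recompute an entry for a file unpacked from the wheel (the path is illustrative):

    import base64
    import hashlib

    def record_entry(path: str) -> str:
        data = open(path, "rb").read()
        b64 = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
        # RECORD strips the '=' padding from the base64 digest.
        return f"{path},sha256={b64.rstrip(b'=').decode('ascii')},{len(data)}"

    print(record_entry("opsci_toolbox/helpers/nlp.py"))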
The remaining files (WHEEL, dependency_links.txt, top_level.txt) are unchanged between 0.0.15 and 0.0.16.