opsci-toolbox 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
--- a/opsci_toolbox/apis/webscraping.py
+++ b/opsci_toolbox/apis/webscraping.py
@@ -97,9 +97,14 @@ def url_get_domain(url: str) -> str:
     Returns:
         str: The domain name extracted from the URL.
     """
-    parsed_url = urlparse(url)
-    domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
-    return domain
+    try:
+        parsed_url = urlparse(url)
+        domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
+        return domain
+    except Exception as e:
+        pass
+        print(url, e)
+        return url
 
 
 def url_get_extension(url: str) -> str:
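In 0.0.16, url_get_domain no longer raises on unparsable input: it prints the offending URL and the error, then falls back to returning the raw URL. A minimal sketch of the new behavior (the dead `pass` statement in the released code is omitted here):

from urllib.parse import urlparse

def url_get_domain(url: str) -> str:
    # 0.0.16 behavior: fall back to the raw URL when parsing fails
    try:
        parsed_url = urlparse(url)
        return parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
    except Exception as e:
        print(url, e)
        return url

print(url_get_domain("https://example.com/some/path"))  # -> example.com
print(url_get_domain("http://[::1"))  # urlparse raises ValueError ("Invalid IPv6 URL"); the raw URL is returned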
--- a/opsci_toolbox/helpers/nlp.py
+++ b/opsci_toolbox/helpers/nlp.py
@@ -30,7 +30,7 @@ from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
-
+from nltk.tokenize import PunktSentenceTokenizer
 
 ####################################################################
 # CLEANING
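Worth noting about this import: instantiating PunktSentenceTokenizer() directly applies the Punkt algorithm with default, untrained parameters, so it should work without the pretrained punkt model that nltk.sent_tokenize loads via nltk.download('punkt'). A quick check (the expected split is what the default heuristics should produce on plain prose):

from nltk.tokenize import PunktSentenceTokenizer

tokenizer = PunktSentenceTokenizer()  # untrained defaults; no nltk.download('punkt') needed
print(tokenizer.tokenize("First sentence. Second one! A third?"))
# expected: ['First sentence.', 'Second one!', 'A third?']

This import supports the new split_n_sentences_nltk helper added to nlp.py in the next hunk.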
@@ -1660,6 +1660,84 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
     return df
 
 
+def split_n_sentences_nltk(df: pd.DataFrame, col_text: str, n_sentences: int = 1, threshold: int = None, stats: bool = False) -> pd.DataFrame:
+    """
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns using NLTK PunktSentenceTokenizer.
+
+    Parameters:
+        df : pd.DataFrame
+            DataFrame containing the text data to split.
+        col_text : str
+            The name of the column containing the text data.
+        n_sentences : int, optional
+            The number of sentences to group together. Default is 1.
+        threshold : int, optional
+            Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+        stats : bool, optional
+            Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+        pd.DataFrame
+            DataFrame containing the split sentences with their start and end indexes in separate columns.
+
+    """
+    tokenizer = PunktSentenceTokenizer()
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(text, total=len(text), desc="Sentence splitting"):
+        sentences = []
+        start_pos = 0
+
+        # Tokenize sentences and compute positions
+        for sent in tokenizer.tokenize(doc):
+            start_idx = doc.find(sent, start_pos)
+            end_idx = start_idx + len(sent)
+            sentences.append((sent, start_idx, end_idx))
+            start_pos = end_idx
+
+        if stats:
+            count_sentences.append(len(sentences))
+
+        if n_sentences > 1:
+            # Split sentences into batches of size n_sentences
+            batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
+            results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
+            if stats:
+                count_batches.append(len(concatenate_batches))
+        else:
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])
+
+    df['sentences'] = results
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences', 'start_indexes', 'end_indexes']).reset_index(drop=True)
+
+    return df
+
+
 def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True, batch_size : int = 100, n_process: int =1) -> pd.DataFrame:
     """
     Spacy implementation of NER.
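A usage sketch of the new helper, assuming pandas, tqdm, and the PunktSentenceTokenizer import are available as they are in nlp.py (the "text" column name and sample string are illustrative):

import pandas as pd
from opsci_toolbox.helpers.nlp import split_n_sentences_nltk

df = pd.DataFrame({"text": ["First sentence. Second one. Third here. Fourth."]})

# Group sentences two by two; after the explode, each row holds one chunk
# plus the character span of that chunk in the original text
out = split_n_sentences_nltk(df, col_text="text", n_sentences=2)
print(out[["sentences", "start_indexes", "end_indexes"]])
# expected rows (given Punkt's default splitting):
#   "First sentence. Second one."  start 0,  end 27
#   "Third here. Fourth."          start 28, end 47

Design note: the helper tracks positions manually via doc.find(sent, start_pos); PunktSentenceTokenizer also exposes span_tokenize(), which yields (start, end) pairs directly and could replace that bookkeeping.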
--- a/opsci_toolbox-0.0.15.dist-info/METADATA
+++ b/opsci_toolbox-0.0.16.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.15
+Version: 0.0.16
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
--- a/opsci_toolbox-0.0.15.dist-info/RECORD
+++ b/opsci_toolbox-0.0.16.dist-info/RECORD
@@ -3,7 +3,7 @@ opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
 opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
 opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWPA,60920
-opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
+opsci_toolbox/apis/webscraping.py,sha256=fo6H2OaH0m_LHJB9IyN-q0Vkk8L9OvHxNn4O_A6a6yc,21572
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/helpers/common.py,sha256=zmi-FbN39Rci_hGEKj2bmkcucrVwnHhMgKU6AAIap3Q,53327
@@ -11,7 +11,7 @@ opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,2
 opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
 opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
 opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
-opsci_toolbox/helpers/nlp.py,sha256=TXf1_dvmfDY9tR0gjQ1C-KzPRib7t74_ZcvmcYZWcPs,105096
+opsci_toolbox/helpers/nlp.py,sha256=4edA5JZ4vzpU4U9w-INNspW2oTQ-yYpm5rFXExKB4YI,108324
 opsci_toolbox/helpers/nlp_cuml.py,sha256=KfgC0hMqLCKoOME2DOu3Wje4ormV19fEB8Fyq8G7D-E,30901
 opsci_toolbox/helpers/sna.py,sha256=3qx1WBQwLKpZNGR0bLSMB2-LBRx-vtNHp8puzoj-84A,33730
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -19,8 +19,8 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.15.dist-info/METADATA,sha256=ppE13xf4E90LfW9Eir5U30xOI91F96wQqAam7kZwV1o,1727
-opsci_toolbox-0.0.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-opsci_toolbox-0.0.15.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
-opsci_toolbox-0.0.15.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
-opsci_toolbox-0.0.15.dist-info/RECORD,,
+opsci_toolbox-0.0.16.dist-info/METADATA,sha256=-SCFUBnwnWlUrOGgQwxib8ZfCjWxXm3iVVwnfErQ9Fk,1727
+opsci_toolbox-0.0.16.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.16.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+opsci_toolbox-0.0.16.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.16.dist-info/RECORD,,