opsci-toolbox 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
--- a/opsci_toolbox/apis/webscraping.py
+++ b/opsci_toolbox/apis/webscraping.py
@@ -97,9 +97,14 @@ def url_get_domain(url: str) -> str:
     Returns:
         str: The domain name extracted from the URL.
     """
-    parsed_url = urlparse(url)
-    domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
-    return domain
+    try:
+        parsed_url = urlparse(url)
+        domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
+        return domain
+    except Exception as e:
+        pass
+        print(url, e)
+        return url
 
 
 def url_get_extension(url: str) -> str:
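In 0.0.16, url_get_domain no longer raises on unparsable input: it prints the offending URL and the error, then falls back to returning the raw URL. A minimal sketch of the new behavior (the dead `pass` statement in the released code is omitted here):

from urllib.parse import urlparse

def url_get_domain(url: str) -> str:
    # 0.0.16 behavior: fall back to the raw URL when parsing fails
    try:
        parsed_url = urlparse(url)
        return parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
    except Exception as e:
        print(url, e)
        return url

print(url_get_domain("https://example.com/some/path"))  # -> example.com
print(url_get_domain("http://[::1"))  # urlparse raises ValueError ("Invalid IPv6 URL"); the raw URL is returned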
--- a/opsci_toolbox/helpers/nlp.py
+++ b/opsci_toolbox/helpers/nlp.py
@@ -30,7 +30,7 @@ from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
-
+from nltk.tokenize import PunktSentenceTokenizer
 
 ####################################################################
 # CLEANING
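Worth noting about this import: instantiating PunktSentenceTokenizer() directly applies the Punkt algorithm with default, untrained parameters, so it should work without the pretrained punkt model that nltk.sent_tokenize loads via nltk.download('punkt'). A quick check (the expected split is what the default heuristics should produce on plain prose):

from nltk.tokenize import PunktSentenceTokenizer

tokenizer = PunktSentenceTokenizer()  # untrained defaults; no nltk.download('punkt') needed
print(tokenizer.tokenize("First sentence. Second one! A third?"))
# expected: ['First sentence.', 'Second one!', 'A third?']

This import supports the new split_n_sentences_nltk helper added to nlp.py in the next hunk.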
@@ -1660,6 +1660,84 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
     return df
 
 
+def split_n_sentences_nltk(df: pd.DataFrame, col_text: str, n_sentences: int = 1, threshold: int = None, stats: bool = False) -> pd.DataFrame:
+    """
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns using NLTK PunktSentenceTokenizer.
+
+    Parameters:
+        df : pd.DataFrame
+            DataFrame containing the text data to split.
+        col_text : str
+            The name of the column containing the text data.
+        n_sentences : int, optional
+            The number of sentences to group together. Default is 1.
+        threshold : int, optional
+            Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+        stats : bool, optional
+            Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+        pd.DataFrame
+            DataFrame containing the split sentences with their start and end indexes in separate columns.
+
+    """
+    tokenizer = PunktSentenceTokenizer()
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(text, total=len(text), desc="Sentence splitting"):
+        sentences = []
+        start_pos = 0
+
+        # Tokenize sentences and compute positions
+        for sent in tokenizer.tokenize(doc):
+            start_idx = doc.find(sent, start_pos)
+            end_idx = start_idx + len(sent)
+            sentences.append((sent, start_idx, end_idx))
+            start_pos = end_idx
+
+        if stats:
+            count_sentences.append(len(sentences))
+
+        if n_sentences > 1:
+            # Split sentences into batches of size n_sentences
+            batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
+            results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
+            if stats:
+                count_batches.append(len(concatenate_batches))
+        else:
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])
+
+    df['sentences'] = results
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences', 'start_indexes', 'end_indexes']).reset_index(drop=True)
+
+    return df
+
+
 def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True, batch_size : int = 100, n_process: int =1) -> pd.DataFrame:
     """
     Spacy implementation of NER.
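A usage sketch of the new helper, assuming pandas, tqdm, and the PunktSentenceTokenizer import are available as they are in nlp.py (the "text" column name and sample string are illustrative):

import pandas as pd
from opsci_toolbox.helpers.nlp import split_n_sentences_nltk

df = pd.DataFrame({"text": ["First sentence. Second one. Third here. Fourth."]})

# Group sentences two by two; after the explode, each row holds one chunk
# plus the character span of that chunk in the original text
out = split_n_sentences_nltk(df, col_text="text", n_sentences=2)
print(out[["sentences", "start_indexes", "end_indexes"]])
# expected rows (given Punkt's default splitting):
#   "First sentence. Second one."  start 0,  end 27
#   "Third here. Fourth."          start 28, end 47

Design note: the helper tracks positions manually via doc.find(sent, start_pos); PunktSentenceTokenizer also exposes span_tokenize(), which yields (start, end) pairs directly and could replace that bookkeeping.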
--- a/opsci_toolbox-0.0.15.dist-info/METADATA
+++ b/opsci_toolbox-0.0.16.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.15
+Version: 0.0.16
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
--- a/opsci_toolbox-0.0.15.dist-info/RECORD
+++ b/opsci_toolbox-0.0.16.dist-info/RECORD
@@ -3,7 +3,7 @@ opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
 opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
 opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWPA,60920
-opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
+opsci_toolbox/apis/webscraping.py,sha256=fo6H2OaH0m_LHJB9IyN-q0Vkk8L9OvHxNn4O_A6a6yc,21572
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/helpers/common.py,sha256=zmi-FbN39Rci_hGEKj2bmkcucrVwnHhMgKU6AAIap3Q,53327
@@ -11,7 +11,7 @@ opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,2
 opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
 opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
 opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
-opsci_toolbox/helpers/nlp.py,sha256=TXf1_dvmfDY9tR0gjQ1C-KzPRib7t74_ZcvmcYZWcPs,105096
+opsci_toolbox/helpers/nlp.py,sha256=4edA5JZ4vzpU4U9w-INNspW2oTQ-yYpm5rFXExKB4YI,108324
 opsci_toolbox/helpers/nlp_cuml.py,sha256=KfgC0hMqLCKoOME2DOu3Wje4ormV19fEB8Fyq8G7D-E,30901
 opsci_toolbox/helpers/sna.py,sha256=3qx1WBQwLKpZNGR0bLSMB2-LBRx-vtNHp8puzoj-84A,33730
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -19,8 +19,8 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.15.dist-info/METADATA,sha256=ppE13xf4E90LfW9Eir5U30xOI91F96wQqAam7kZwV1o,1727
-opsci_toolbox-0.0.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-opsci_toolbox-0.0.15.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
-opsci_toolbox-0.0.15.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
-opsci_toolbox-0.0.15.dist-info/RECORD,,
+opsci_toolbox-0.0.16.dist-info/METADATA,sha256=-SCFUBnwnWlUrOGgQwxib8ZfCjWxXm3iVVwnfErQ9Fk,1727
+opsci_toolbox-0.0.16.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.16.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+opsci_toolbox-0.0.16.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.16.dist-info/RECORD,,