opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/webscraping.py +75 -0
- opsci_toolbox/helpers/common.py +39 -20
- opsci_toolbox/helpers/dataviz.py +4262 -1975
- opsci_toolbox/helpers/nlp.py +121 -33
- opsci_toolbox-0.0.12.dist-info/METADATA +53 -0
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD +8 -8
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/WHEEL +1 -1
- opsci_toolbox-0.0.10.dist-info/METADATA +0 -53
- {opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -48,6 +48,28 @@ def remove_html_tags(text: str) -> str:
     soup = BeautifulSoup(text, "html.parser")
     return soup.get_text()
 
+def remove_rt(text: str) -> str:
+    """
+    Remove the retweet tag from a given text.
+
+    Args:
+        - text (str): The input text possibly containing a retweet tag in the format "RT @username: ".
+
+    Returns:
+        - str: The cleaned text with the retweet tag removed.
+
+    Example:
+        >>> remove_rt("RT @user123: Check out this tweet!")
+        'Check out this tweet!'
+    """
+    # Regular expression pattern to match "RT @username: "
+    pattern = r'RT @\w+: '
+
+    # Substitute the pattern with an empty string
+    cleaned_text = re.sub(pattern, '', text)
+
+    return cleaned_text
+
 def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
     """
     Filter DataFrame rows by a query on a specific text column.
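For reference, here is a minimal standalone sketch of what the new remove_rt helper does, reimplemented so it runs without installing the package:

```python
import re

def remove_rt(text: str) -> str:
    # Substitute every "RT @username: " retweet marker with an empty string.
    return re.sub(r'RT @\w+: ', '', text)

print(remove_rt("RT @user123: Check out this tweet!"))  # -> Check out this tweet!
print(remove_rt("No retweet tag here"))                 # -> unchanged
```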
@@ -91,7 +113,8 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     df : pandas DataFrame
         The DataFrame with cleaned text data.
     """
-    df[col_clean] = df[col].apply(
+    df[col_clean] = df[col].apply(remove_rt)
+    df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
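The updated TM_clean_text thus strips retweet tags first, then URLs, @-mentions, and redundant whitespace. A self-contained sketch of that order of operations on a toy DataFrame (plain regexes stand in for the package's urls and remove_extra_spaces helpers):

```python
import pandas as pd

df = pd.DataFrame({"text": ["RT @user123: Great read https://example.com @friend thanks"]})

clean = df["text"].str.replace(r'RT @\w+: ', '', regex=True)  # retweet tag
clean = clean.str.replace(r'https?://\S+', '', regex=True)    # URLs
clean = clean.apply(lambda t: " ".join(w for w in t.split() if not w.startswith("@")))  # mentions
df["clean_text"] = clean.str.replace(r'\s+', ' ', regex=True).str.strip()  # extra spaces

print(df["clean_text"].iloc[0])  # -> Great read thanks
```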
@@ -1019,6 +1042,73 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
 def get_lang_detector(nlp, name):
     return LanguageDetector(seed=42)  # We use the seed 42
 
+def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "lemmatized_text", pos_to_keep: list = ["VERB","NOUN","ADJ", "ADV", "PROPN"], entities_to_keep: list = ['PERSON','ORG', 'LOC'], stopwords: list = [], batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
+    """
+    Perform natural language processing tasks using spaCy for PR Army project.
+    Its main tasks are lemmatization and named entity recognition (NER).
+
+    Args:
+        nlp : spacy.Language
+            The spaCy language model.
+        df : pandas.DataFrame
+            The DataFrame containing the text data.
+        col_text : str
+            The name of the column containing the text data.
+        col_lemma : str
+            The name of the column to store the lemmatized text data.
+        pos_to_keep : list
+            A list of part-of-speech tags to keep during lemmatization.
+        entities_to_keep : list
+            A list of NER tags to keep.
+        stopwords : list
+            A list of stopwords to remove during processing.
+        batch_size : int, optional
+            The batch size for spaCy processing. Default is 100.
+        n_process : int, optional
+            The number of processes for parallel processing. Default is 1.
+    Returns:
+        pandas.DataFrame
+            The DataFrame with processed text data.
+
+    """
+    all_records = []
+    text=list(df[col_text].astype('unicode').values)
+
+    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "NLP Process"):
+        NER_type = []
+        NER_text = []
+
+        ### LEMMATIZATION
+
+        if len(pos_to_keep)>0 and len(stopwords)>0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep)>0 and len(stopwords) < 1:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep) < 1 and len(stopwords) > 0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords]
+        else :
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space)]
+
+        ### NER
+        if len(entities_to_keep)>0:
+            for ent in doc.ents:
+                if ent.label_ in entities_to_keep:
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+
+        else:
+            for ent in doc.ents:
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+
+        record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
+        all_records.append(record)
+
+
+    df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
+
+    return df
+
 def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
     Perform natural language processing tasks using spaCy for topic modeling.
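A minimal usage sketch of the new PRarmy_nlp_process helper, assuming a spaCy pipeline with a NER component (en_core_web_sm here) is installed:

```python
import spacy
import pandas as pd
from opsci_toolbox.helpers.nlp import PRarmy_nlp_process

nlp = spacy.load("en_core_web_sm")
df = pd.DataFrame({"text": ["Emmanuel Macron met Olaf Scholz in Paris.",
                            "The report was published by Reuters."]})

# Adds NER_type and NER_text (lists of kept entities per document) plus a
# lemmatized_text column of lowercased lemmas filtered by POS and stopwords.
df = PRarmy_nlp_process(nlp, df, col_text="text")
print(df[["NER_type", "NER_text", "lemmatized_text"]])
```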
@@ -1358,14 +1448,14 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
     return df
 
 
-def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True) -> pd.DataFrame:
+def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True, batch_size : int = 100, n_process: int =1) -> pd.DataFrame:
     """
     Spacy implementation of NER.
     To define entities type to keep, call get_labels(nlp, pipe_step="ner", explanations=False)
     explode = False means it returns 1 list of entities per document
     explode = True means it returns 1 entity per row
 
-
+    Args:
     nlp : spacy.language.Language
         The spaCy language processing pipeline.
     df : pd.DataFrame
@@ -1376,6 +1466,10 @@ def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['P
         List of entity types to keep. Default is ['PERSON','ORG'].
     explode : bool, optional
         Flag indicating whether to explode the DataFrame to have one entity per row. Default is True.
+    batch_size : int, optional
+        Batch sizes
+    n_process : int, optional
+        Number of processes
 
     Returns:
     pd.DataFrame
@@ -1385,43 +1479,40 @@ def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['P
     This function performs Named Entity Recognition (NER) using spaCy on a DataFrame with text data. It extracts entities of the specified types
     and stores the NER information in separate columns. If 'explode' is set to True, it returns one entity per row in the DataFrame.
     """
-
-
-
-
-
-
-
-
-        doc = nlp(row[col_text])
-        entities_data = []
+    l_text = df[col_text].tolist()
+    all_records = []
+    for doc in tqdm(nlp.pipe(l_text, batch_size=batch_size, n_process=n_process), total= len(l_text), desc = "NLP Process"):
+        NER_type = []
+        NER_text = []
+        NER_start_char = []
+        NER_end_char=[]
+        # entities_data = []
 
         if len(entities_to_keep)>0:
             for ent in doc.ents:
                 if ent.label_ in entities_to_keep:
-
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+                    NER_start_char.append(ent.start_char)
+                    NER_end_char.append(ent.end_char)
+                    # entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
         else:
             for ent in doc.ents:
-
-
-
-
-
-
-
-                row['NER_end_char'] = end_char
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+                NER_start_char.append(ent.start_char)
+                NER_end_char.append(ent.end_char)
+                # entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
+        record = (NER_type, NER_text, NER_start_char, NER_end_char)
+        all_records.append(record)
 
-
-
-    # Apply the processing function to each row
-    df = df.apply(process_row, axis=1)
+    df[['NER_type', 'NER_text','NER_start_char','NER_end_char']] = pd.DataFrame(all_records, index=df.index)
 
     if explode:
         df= df.explode(['NER_type', 'NER_text','NER_start_char','NER_end_char'])
 
     return df
 
-
 def tokenize(nlp, df: pd.DataFrame, col_text: str, col_tokens: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
     """
     Spacy implementation to tokenize text
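The rewritten spacy_NER streams texts through nlp.pipe instead of calling nlp() row by row, so the new batch_size and n_process arguments now control throughput. A usage sketch under the same model assumption as above:

```python
import spacy
import pandas as pd
from opsci_toolbox.helpers.nlp import spacy_NER

nlp = spacy.load("en_core_web_sm")
df = pd.DataFrame({"text": ["Apple hired Tim Cook.", "Paris is in France."]})

df = spacy_NER(nlp, df, col_text="text", entities_to_keep=["PERSON", "ORG"],
               explode=True, batch_size=50, n_process=1)
# With explode=True, each output row holds one entity and its character offsets.
print(df[["NER_type", "NER_text", "NER_start_char", "NER_end_char"]])
```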
@@ -1901,15 +1992,13 @@ def agglomerative_clustering(embeddings, n_clusters=15, metric="euclidean", link
 
 
 
-def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selection_epsilon=0.0, approx_min_span_tree=True,
-                       gen_min_span_tree=True, leaf_size=40, metric='euclidean', min_cluster_size=5, min_samples=None,
-                       p=None, cluster_selection_method='eom', prediction_data = True):
-
+def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selection_epsilon=0.0, approx_min_span_tree=True, gen_min_span_tree=True, leaf_size=40, metric='euclidean', min_cluster_size=5, min_samples=None, p=None, cluster_selection_method='eom', prediction_data = True):
     """
     This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm. It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the probability of each sample being an outlier.
+
     Args
         embeddings : array-like or sparse matrix, shape (n_samples, n_features). The input data to be clustered.
-        algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is
+        algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is best.
         alpha : float, optional. Scaling factor determining the individual weight of the (unnormalized) density estimate. Default is 1.0.
         cluster_selection_epsilon : float, optional. The epsilon value to specify a minimum cluster size. Default is 0.0.
         approx_min_span_tree : bool, optional. Whether to compute an approximation of the minimum spanning tree. Default is True.
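A sketch of calling hdbscan_clustering on toy embeddings; per the docstring above, the helper returns the fitted clusterer, one label per point (with -1 marking noise), and per-sample probabilities:

```python
import numpy as np
from opsci_toolbox.helpers.nlp import hdbscan_clustering

# Two well-separated Gaussian blobs plus a few uniform outliers.
rng = np.random.default_rng(42)
embeddings = np.vstack([rng.normal(0, 0.2, (50, 5)),
                        rng.normal(5, 0.2, (50, 5)),
                        rng.uniform(-3, 8, (5, 5))])

clusterer, labels, probabilities = hdbscan_clustering(
    embeddings, min_cluster_size=5, metric="euclidean")
print(np.unique(labels))  # expect two clusters, plus -1 if noise is found
```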
@@ -2054,7 +2143,6 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
     proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]
     label = model.config.id2label[proba.argmax()]
     results = {"label":label, "score" : float(proba.max()), col_text : text}
-    print(results)
     write_json(results, dir_json , str(filename))
 
     return results
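HF_sentiment_classifier now writes its result to JSON and returns it without also printing it. A hedged usage sketch; the checkpoint name is illustrative, and any sequence-classification model with an id2label mapping should fit the tokenizer/model arguments:

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from opsci_toolbox.helpers.nlp import HF_sentiment_classifier

name = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

# Applies a sigmoid to the logits, keeps the top label and score, writes the
# record as JSON under dir_json, and returns it as a dict.
result = HF_sentiment_classifier(tokenizer, model,
                                 text="I love this toolbox!",
                                 col_text="text",
                                 filename="demo",
                                 dir_json="./json_outputs")
print(result)
```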
opsci_toolbox-0.0.12.dist-info/METADATA
ADDED
@@ -0,0 +1,53 @@
+Metadata-Version: 2.1
+Name: opsci-toolbox
+Version: 0.0.12
+Summary: a complete toolbox
+Home-page: UNKNOWN
+Author: Erwan Le Nagard
+Author-email: erwan@opsci.ai
+License: MIT
+Platform: UNKNOWN
+Requires-Dist: requests <3,>=2.31.0
+Requires-Dist: beautifulsoup4 ==4.9.3
+Requires-Dist: chardet >=4.0.0
+Requires-Dist: chart-studio ==1.1.0
+Requires-Dist: eldar ==0.0.8
+Requires-Dist: emoji ==2.10.1
+Requires-Dist: fa2-modified ==0.3.10
+Requires-Dist: google-api-python-client ==2.122.0
+Requires-Dist: gspread ==6.1.2
+Requires-Dist: hdbscan ==0.8.33
+Requires-Dist: jusText ==3.0.0
+Requires-Dist: langchain ==0.1.20
+Requires-Dist: matplotlib >=3.9.0
+Requires-Dist: mysql-connector-python >=9.0.0
+Requires-Dist: networkx ==3.2.1
+Requires-Dist: nltk ==3.8.1
+Requires-Dist: numpy <1.25.0,>=1.21.5
+Requires-Dist: opencv-python-headless ==4.9.0.80
+Requires-Dist: openpyxl ==3.1.3
+Requires-Dist: pandas >=1.5.3
+Requires-Dist: Pillow >=9.0.1
+Requires-Dist: plotly ==5.19.0
+Requires-Dist: protobuf ==4.23.4
+Requires-Dist: pyarrow >=14.0.2
+Requires-Dist: python-louvain ==0.16
+Requires-Dist: scikit-learn ==1.4.1.post1
+Requires-Dist: scipy <2.0.0,>=1.8.0
+Requires-Dist: sentence-transformers ==2.5.1
+Requires-Dist: setuptools ==59.6.0
+Requires-Dist: spacy ==3.7.4
+Requires-Dist: spacy-language-detection ==0.2.1
+Requires-Dist: spacymoji ==3.1.0
+Requires-Dist: supervision ==0.21.0
+Requires-Dist: textacy ==0.13.0
+Requires-Dist: torch ==2.0.1
+Requires-Dist: tqdm >=4.66.2
+Requires-Dist: trafilatura ==1.7.0
+Requires-Dist: transformers ==4.38.2
+Requires-Dist: umap-learn ==0.5.5
+Requires-Dist: urlextract ==1.9.0
+Requires-Dist: wordcloud ==1.9.3
+
+UNKNOWN
+
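Compared with the 0.0.10 metadata below, the notable pin changes are: requests is relaxed from ==2.32.3 to <3,>=2.31.0, beautifulsoup4 moves from 4.10.0 to 4.9.3, tqdm is loosened to >=4.66.2, and mysql-connector-repackaged is replaced by mysql-connector-python >=9.0.0. A quick sketch for inspecting the pins of whichever wheel is installed:

```python
from importlib.metadata import requires, version

# Prints the installed version and its Requires-Dist pins; with 0.0.12 the
# list matches the metadata shown above.
print(version("opsci-toolbox"))
for requirement in requires("opsci-toolbox") or []:
    print(requirement)
```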
{opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
-opsci_toolbox/apis/webscraping.py,sha256=
+opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=
+opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
-opsci_toolbox/helpers/dataviz.py,sha256=
+opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
 opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
-opsci_toolbox/helpers/nlp.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
 opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
 opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -16,7 +16,7 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
+opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.12.dist-info/RECORD,,
opsci_toolbox-0.0.10.dist-info/METADATA
DELETED
@@ -1,53 +0,0 @@
-Metadata-Version: 2.1
-Name: opsci-toolbox
-Version: 0.0.10
-Summary: a complete toolbox
-Home-page: UNKNOWN
-Author: Erwan Le Nagard
-Author-email: erwan@opsci.ai
-License: MIT
-Platform: UNKNOWN
-Requires-Dist: Pillow (>=9.0.1)
-Requires-Dist: Requests (==2.32.3)
-Requires-Dist: beautifulsoup4 (==4.10.0)
-Requires-Dist: chardet (>=4.0.0)
-Requires-Dist: chart-studio (==1.1.0)
-Requires-Dist: eldar (==0.0.8)
-Requires-Dist: emoji (==2.10.1)
-Requires-Dist: fa2-modified (==0.3.10)
-Requires-Dist: google-api-python-client (==2.122.0)
-Requires-Dist: gspread (==6.1.2)
-Requires-Dist: hdbscan (==0.8.33)
-Requires-Dist: jusText (==3.0.0)
-Requires-Dist: langchain (==0.1.20)
-Requires-Dist: matplotlib (>=3.9.0)
-Requires-Dist: mysql-connector-repackaged (==0.3.1)
-Requires-Dist: networkx (==3.2.1)
-Requires-Dist: nltk (==3.8.1)
-Requires-Dist: numpy (<1.25.0,>=1.21.5)
-Requires-Dist: opencv-python-headless (==4.9.0.80)
-Requires-Dist: openpyxl (==3.1.3)
-Requires-Dist: pandas (>=1.5.3)
-Requires-Dist: plotly (==5.19.0)
-Requires-Dist: protobuf (==4.23.4)
-Requires-Dist: pyarrow (>=14.0.2)
-Requires-Dist: python-louvain (==0.16)
-Requires-Dist: scikit-learn (==1.4.1.post1)
-Requires-Dist: scipy (<2.0.0,>=1.8.0)
-Requires-Dist: sentence-transformers (==2.5.1)
-Requires-Dist: setuptools (==59.6.0)
-Requires-Dist: spacy (==3.7.4)
-Requires-Dist: spacy-language-detection (==0.2.1)
-Requires-Dist: spacymoji (==3.1.0)
-Requires-Dist: supervision (==0.21.0)
-Requires-Dist: textacy (==0.13.0)
-Requires-Dist: torch (==2.0.1)
-Requires-Dist: tqdm (==4.66.2)
-Requires-Dist: trafilatura (==1.7.0)
-Requires-Dist: transformers (==4.38.2)
-Requires-Dist: umap-learn (==0.5.5)
-Requires-Dist: urlextract (==1.9.0)
-Requires-Dist: wordcloud (==1.9.3)
-
-UNKNOWN
-
{opsci_toolbox-0.0.10.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt
RENAMED
File without changes