opsci-toolbox 0.0.15__py3-none-any.whl → 0.0.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/webscraping.py +8 -3
- opsci_toolbox/helpers/common.py +27 -4
- opsci_toolbox/helpers/dataviz.py +110 -3
- opsci_toolbox/helpers/nlp.py +85 -11
- opsci_toolbox/helpers/nlp_cuml.py +7 -4
- opsci_toolbox/helpers/sna.py +1 -1
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/METADATA +1 -1
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/RECORD +11 -11
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/dependency_links.txt +0 -0
- {opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/webscraping.py
CHANGED
@@ -97,9 +97,14 @@ def url_get_domain(url: str) -> str:
     Returns:
         str: The domain name extracted from the URL.
     """
-
-
-
+    try:
+        parsed_url = urlparse(url)
+        domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
+        return domain
+    except Exception as e:
+        pass
+        print(url, e)
+        return url
 
 
 def url_get_extension(url: str) -> str:
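The rewritten url_get_domain now wraps URL parsing in a try/except and falls back to returning the input string if anything goes wrong. A minimal standalone sketch of that behaviour (adapted from the hunk above; the sample URLs are illustrative):

```python
from urllib.parse import urlparse

def url_get_domain(url: str) -> str:
    """Return the hostname of a URL, or the raw input if parsing fails."""
    try:
        parsed_url = urlparse(url)
        # hostname strips credentials/port and lowercases; netloc is the raw authority
        return parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
    except Exception as e:
        print(url, e)
        return url

print(url_get_domain("https://User@WWW.Example.com:8080/path?q=1"))  # www.example.com
print(url_get_domain("not a url"))  # "" -- urlparse does not raise here, netloc is simply empty
```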
opsci_toolbox/helpers/common.py
CHANGED
@@ -1502,6 +1502,28 @@ def custom_ordering(df : pd.DataFrame, col_to_order : str, custom_order : list)
     df[col_to_order] = pd.Categorical(df[col_to_order], categories=custom_order, ordered=True).to_numpy()
     return df
 
+# def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
+#     """
+#     Calculates the total and percentage values for the given metrics based on a grouping column.
+#     Args:
+#         df (DataFrame): The input DataFrame.
+#         col_gb (list): Names of the columns to group by.
+#         metrics (dict): A dictionary of metrics to calculate.
+#     Returns:
+#         DataFrame: The modified DataFrame with total and percentage values added.
+
+#     """
+#     percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
+
+#     df = (df.join(df.groupby(col_gb)
+#                   .agg(metrics)
+#                   .add_prefix("total_"), on=col_gb
+#                   )
+#           .assign(**percentage_agregations).fillna(0)
+#           )
+
+#     return df
+
 def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
     """
     Calculates the total and percentage values for the given metrics based on a grouping column.
@@ -1513,14 +1535,15 @@ def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict
         DataFrame: The modified DataFrame with total and percentage values added.
 
     """
-    percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
+    # percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
 
     df = (df.join(df.groupby(col_gb)
                   .agg(metrics)
                   .add_prefix("total_"), on=col_gb
                   )
-          .assign(**percentage_agregations).fillna(0)
           )
-
-
+    for key in list(metrics.keys()):
+        df['per_' + key] = df[key] / df['total_' + key]
+        df['per_' + key] = df['per_' + key].fillna(0)
 
+    return df
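The percentage columns are now computed with an explicit loop rather than the earlier dict-of-lambdas passed to `.assign` (where every lambda closes over the same `key` due to Python's late binding). A small usage sketch; the data and column names are invented for illustration:

```python
import pandas as pd
from opsci_toolbox.helpers.common import calcul_total_et_pourcentage

# Illustrative data: engagement per post, grouped by channel.
df = pd.DataFrame({
    "channel": ["a", "a", "b"],
    "views": [10, 30, 60],
    "likes": [1, 3, 6],
})

# Adds total_views / total_likes per channel and the per_views / per_likes shares,
# each share taken against its own total.
out = calcul_total_et_pourcentage(df, col_gb=["channel"], metrics={"views": "sum", "likes": "sum"})
print(out[["channel", "views", "total_views", "per_views"]])
```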
opsci_toolbox/helpers/dataviz.py
CHANGED
@@ -2007,6 +2007,115 @@ def horizontal_stacked_bars(df: pd.DataFrame,
 
     return fig
 
+def bar_stacked(df: pd.DataFrame,
+                col_x: str,
+                col_y: str,
+                col_cat: str,
+                col_color: str,
+                **kwargs) -> go.Figure:
+    """
+    Create horizontal stacked bar plots.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing data for the bar plots.
+        col_x (str): Name of the column containing x-axis values.
+        col_y (str): Name of the column containing y-axis values.
+        col_percentage (str): Name of the column containing percentage values.
+        col_cat (str): Name of the column containing categories.
+        col_color (str): Name of the column containing colors.
+        **kwargs: Additional keyword arguments to update default plotting parameters.
+
+    Returns:
+        go.Figure: Plotly Figure object representing the horizontal stacked bar plots.
+    """
+    params = general_kwargs()
+    params.update(kwargs)
+
+    categories = df[col_cat].unique()
+
+    col_hover = params["col_hover"]
+
+    fig = go.Figure()
+
+    for cat in categories:
+        current_df = df[df[col_cat] == cat]
+        hovertemplate= "<b>Catégorie</b> : "+str(cat)+"<br><b>"+str(col_x)+"</b> : "+current_df[col_x].astype(str)+ str(col_y) + "</b> : "+current_df[col_y].astype(str)
+
+        for c in col_hover:
+            hovertemplate += (
+                "<br><b>"
+                + str(c)
+                + "</b>:"
+                + current_df[c].astype(str).apply(wrap_text)
+            )
+
+        fig.add_trace(
+            go.Bar(
+                x=current_df[col_x],
+                y=current_df[col_y],
+                orientation=params['orientation'],
+                text = current_df[col_x],
+                textposition=params["textposition"],
+                name=cat,
+                marker=dict(color=current_df[col_color]),
+                hovertemplate=hovertemplate+'<extra></extra>',
+                textangle=params["xaxis_tickangle"],
+            )
+        )
+
+    fig.update_layout(
+        barmode='stack',
+        title_text=params["title_text"],
+        showlegend=params['showlegend'],
+        width = params["width"],
+        height= params["height"],
+        font_family=params["font_family"],
+        font_size=params["font_size"],
+        template=params["template"],
+        plot_bgcolor=params["plot_bgcolor"], # background color (plot)
+        paper_bgcolor=params["paper_bgcolor"],
+        uniformtext_minsize=params["uniformtext_minsize"],
+        uniformtext_mode=params["uniformtext_mode"],
+
+    )
+
+    fig.update_yaxes(
+        # title=params["yaxis_title"],
+        title_font_size=params["yaxis_title_font_size"],
+        tickangle=params["yaxis_tickangle"],
+        tickfont_size=params["yaxis_tickfont_size"],
+        range=params["yaxis_range"],
+        showgrid=params["yaxis_showgrid"],
+        showline=params["yaxis_showline"],
+        zeroline=params["yaxis_zeroline"],
+        gridwidth=params["yaxis_gridwidth"],
+        gridcolor=params["yaxis_gridcolor"],
+        linewidth=params["yaxis_linewidth"],
+        linecolor=params["yaxis_linecolor"],
+        mirror=params["yaxis_mirror"],
+    )
+
+    fig.update_xaxes(
+        # title=params["xaxis_title"],
+        title_font_size=params["xaxis_title_font_size"],
+        tickangle=params["xaxis_tickangle"],
+        tickfont_size=params["xaxis_tickfont_size"],
+        # range=params["xaxis_range"],
+        showgrid=params["xaxis_showgrid"],
+        showline=params["xaxis_showline"],
+        zeroline=params["xaxis_zeroline"],
+        gridwidth=params["xaxis_gridwidth"],
+        gridcolor=params["xaxis_gridcolor"],
+        linewidth=params["xaxis_linewidth"],
+        linecolor=params["xaxis_linecolor"],
+        mirror=params["xaxis_mirror"]
+    )
+    fig.update_xaxes(title_text=params["xaxis_title"])
+    fig.update_yaxes(title_text=params["yaxis_title"])
+    fig.update_yaxes(showticklabels = False)
+
+    return fig
+
 def bar_trend_per_cat(df: pd.DataFrame,
                       col_x: str,
                       col_cat: str,
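A sketch of how the new bar_stacked helper might be called. The DataFrame, its columns, and the extra kwargs below are illustrative; the remaining styling options come from the toolbox's general_kwargs defaults and are overridable through **kwargs:

```python
import pandas as pd
from opsci_toolbox.helpers.dataviz import bar_stacked

# Hypothetical data: one bar per month, one stacked segment per sentiment.
df = pd.DataFrame({
    "month": ["Jan", "Jan", "Feb", "Feb"],
    "posts": [120, 80, 90, 150],
    "sentiment": ["positive", "negative", "positive", "negative"],
    "color": ["#2ca02c", "#d62728", "#2ca02c", "#d62728"],
})

fig = bar_stacked(
    df,
    col_x="month",        # x-axis values
    col_y="posts",        # bar heights
    col_cat="sentiment",  # one go.Bar trace per category, stacked via barmode="stack"
    col_color="color",    # per-row marker colors
    col_hover=[],         # assumed kwarg consumed via general_kwargs(); extra hover columns
    title_text="Posts per month by sentiment",
)
fig.show()
```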
@@ -3597,13 +3706,11 @@ def density_map(df_posts: pd.DataFrame,
                 show_topics: bool = True,
                 show_halo: bool = False,
                 show_histogram: bool = True,
-
                 colorscale: str = "Portland",
                 marker_color: str = "#ff7f0e",
                 arrow_color: str = "#ff7f0e",
                 width: int = 1000,
                 height: int = 1000,
-
                 label_size_ratio: int = 100,
                 n_words: int = 3,
                 title_text: str = "Clustering",
@@ -3625,7 +3732,7 @@ def density_map(df_posts: pd.DataFrame,
         col_engagement (str): Column name corresponding to a metric.
         col_text (str): Column name corresponding to a text separated by |.
         col_text_dots (str): Column name corresponding to the text for dots.
-        colorscale (str, optional): Possible values are
+        colorscale (str, optional): Possible values are ``https://plotly.com/python/builtin-colorscales/``. Defaults to "Portland".
         marker_color (str, optional): Dots color value. Defaults to "#ff7f0e".
         arrow_color (str, optional): Arrow pointing to topic centroid color value. Defaults to "#ff7f0e".
         width (int, optional): Width of the plot. Defaults to 1000.
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -30,7 +30,7 @@ from eldar import Query
 import torch
 from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from bs4 import BeautifulSoup
-
+from nltk.tokenize import PunktSentenceTokenizer
 
 ####################################################################
 # CLEANING
@@ -1660,6 +1660,84 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
     return df
 
 
+def split_n_sentences_nltk(df: pd.DataFrame, col_text: str, n_sentences: int = 1, threshold: int = None, stats: bool = False) -> pd.DataFrame:
+    """
+    Split a text into chunks of n sentences, returning their start and end indexes in separate columns using NLTK PunktSentenceTokenizer.
+
+    Parameters:
+        df : pd.DataFrame
+            DataFrame containing the text data to split.
+        col_text : str
+            The name of the column containing the text data.
+        n_sentences : int, optional
+            The number of sentences to group together. Default is 1.
+        threshold : int, optional
+            Maximum number of sentence batches to return per text. If None, all batches are returned. Default is None.
+        stats : bool, optional
+            Flag indicating whether to compute statistics about the splitting process. Default is False.
+
+    Returns:
+        pd.DataFrame
+            DataFrame containing the split sentences with their start and end indexes in separate columns.
+
+    """
+    tokenizer = PunktSentenceTokenizer()
+    text = list(df[col_text].astype('unicode').values)
+
+    count_sentences = []
+    count_batches = []
+    results = []
+    start_indexes = []
+    end_indexes = []
+
+    for doc in tqdm(text, total=len(text), desc="Sentence splitting"):
+        sentences = []
+        start_pos = 0
+
+        # Tokenize sentences and compute positions
+        for sent in tokenizer.tokenize(doc):
+            start_idx = doc.find(sent, start_pos)
+            end_idx = start_idx + len(sent)
+            sentences.append((sent, start_idx, end_idx))
+            start_pos = end_idx
+
+        if stats:
+            count_sentences.append(len(sentences))
+
+        if n_sentences > 1:
+            # Split sentences into batches of size n_sentences
+            batches = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
+
+            # Concatenate batches of sentences and adjust spans accordingly
+            concatenate_batches = [" ".join([sub[0] for sub in sublist]) for sublist in batches]
+            concatenate_spans = [(sublist[0][1], sublist[-1][2]) for sublist in batches]
+
+            if threshold is not None:
+                concatenate_batches = concatenate_batches[:threshold]
+                concatenate_spans = concatenate_spans[:threshold]
+
+            results.append(concatenate_batches)
+            start_indexes.append([span[0] for span in concatenate_spans])
+            end_indexes.append([span[1] for span in concatenate_spans])
+
+            if stats:
+                count_batches.append(len(concatenate_batches))
+        else:
+            sentences = sentences[:threshold] if threshold is not None else sentences
+
+            results.append([sub[0] for sub in sentences])
+            start_indexes.append([sub[1] for sub in sentences])
+            end_indexes.append([sub[2] for sub in sentences])
+
+    df['sentences'] = results
+    df['start_indexes'] = start_indexes
+    df['end_indexes'] = end_indexes
+
+    df = df.explode(['sentences', 'start_indexes', 'end_indexes']).reset_index(drop=True)
+
+    return df
+
+
 def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True, batch_size : int = 100, n_process: int =1) -> pd.DataFrame:
     """
     Spacy implementation of NER.
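Unlike split_n_sentences, the new split_n_sentences_nltk variant does not need a spaCy pipeline: it relies on NLTK's PunktSentenceTokenizer and also records character offsets for each chunk. A usage sketch with invented data and column name:

```python
import pandas as pd
from opsci_toolbox.helpers.nlp import split_n_sentences_nltk

# Illustrative input; each row holds one document.
df = pd.DataFrame({"text": [
    "First sentence. Second one! Third, and last?",
    "A single sentence.",
]})

# Groups sentences two by two; the result is exploded to one row per chunk,
# with start_indexes / end_indexes giving character offsets in the original text.
out = split_n_sentences_nltk(df, col_text="text", n_sentences=2)
print(out[["sentences", "start_indexes", "end_indexes"]])
```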
@@ -1996,9 +2074,9 @@ def encode_chunked_files(chunk_files_paths: list,
     Encode text from files and save the results in another pickle file.
 
     Parameters:
-        chunk_files_paths (
+        chunk_files_paths (list): List of file paths containing documents.
         HF_encoder (Encoder): Encoder object for text vectorization.
-        cols (
+        cols (list): Columns to keep in the resulting DataFrame.
         col_text (str): Column containing text data in the DataFrame.
         path_embedded_chunks (str): Path to save the embedded chunks.
         reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
@@ -2040,12 +2118,10 @@ def encode_labels(data_to_encode: np.ndarray) -> tuple:
     Encodes a list of labels using a LabelEncoder.
 
     Args:
-
-        but strings or integers are typical.
+        data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type, but strings or integers are typical.
 
     Returns:
-
-        of encoded labels.
+        Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array of encoded labels.
     """
     label_encoder = LabelEncoder()
     label_encoder.fit(data_to_encode)
@@ -2072,12 +2148,10 @@ def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
     One-hot encodes a list of categorical values using OneHotEncoder.
 
     Args:
-        - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
-          any hashable type, typically strings or integers.
+        - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of any hashable type, typically strings or integers.
 
     Returns:
-        - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
-          of one-hot encoded values.
+        - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array of one-hot encoded values.
     """
     one_hot_encoder = OneHotEncoder(sparse=False)
     data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1)  # Reshape for OneHotEncoder
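The docstring changes above only restate the return contract of encode_labels and one_hot_encode. A quick sketch of that contract using scikit-learn directly; the labels are made up:

```python
from sklearn.preprocessing import LabelEncoder

# encode_labels is documented to return (fitted LabelEncoder, encoded array).
labels = ["neutral", "positive", "negative", "positive"]
label_encoder = LabelEncoder()
encoded = label_encoder.fit_transform(labels)
print(label_encoder.classes_)  # ['negative' 'neutral' 'positive']
print(encoded)                 # [1 2 0 2]
```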
opsci_toolbox/helpers/nlp_cuml.py
CHANGED
@@ -258,7 +258,7 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
     new_file_paths=[]
     for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
 
-        filename = os.path.splitext(os.path.basename(file_path))[0]
+        filename = os.path.splitext(os.path.basename(file_path))[0]
         new_filename = filename+"_reduce_embeddings.parquet"
         new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
 
@@ -309,7 +309,7 @@ def process_HDBSCAN(clusterer,
     new_file_paths=[]
     for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
 
-        filename = os.path.splitext(os.path.basename(file_path))[0]
+        filename = os.path.splitext(os.path.basename(file_path))[0]
         new_filename = filename+ "_predictions.parquet"
         new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
         if not os.path.exists(new_file_path) or reencode:
@@ -566,7 +566,7 @@ def cudf_write_parquet(df: cudf.DataFrame, path: str, filename: str) -> str:
     df.to_parquet(file_path)
     return file_path
 
-def cudf_read_parquet(path: str) -> cudf.DataFrame:
+def cudf_read_parquet(path: str, cols : list = None) -> cudf.DataFrame:
     """
     Read a Parquet file into a cuDF DataFrame.
 
@@ -576,7 +576,10 @@ def cudf_read_parquet(path: str) -> cudf.DataFrame:
     Returns:
         cudf.DataFrame: The read cuDF DataFrame.
     """
-
+    if cols :
+        df = cudf.read_parquet(path, columns=cols)
+    else :
+        df = cudf.read_parquet(path)
     return df
 
 def convert_df_to_cudf(df: pd.DataFrame) -> cudf.DataFrame:
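cudf_read_parquet now accepts an optional cols argument that is forwarded to cudf.read_parquet as a column subset. A sketch of the new call (requires a RAPIDS/CUDA environment; the path and column names are hypothetical):

```python
from opsci_toolbox.helpers.nlp_cuml import cudf_read_parquet

# Read everything, as before.
df_full = cudf_read_parquet("reduced_embeddings/chunk_0_reduce_embeddings.parquet")

# Read only a subset of columns, which avoids loading unused embedding columns.
df_xy = cudf_read_parquet("reduced_embeddings/chunk_0_reduce_embeddings.parquet",
                          cols=["doc_id", "x", "y"])
```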
opsci_toolbox/helpers/sna.py
CHANGED
@@ -44,7 +44,7 @@ def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: f
 
     subgraph = G.subgraph(nodes_with_min_metric).copy()
     return subgraph
-
+
 def group_nodes_by_values(dictionnary : dict) -> dict:
     """
     Group nodes by their values from a dictionary.
{opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/RECORD
CHANGED
@@ -3,24 +3,24 @@ opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=plX0uoGXWBEmeRqK7QfB_CVYJnW15kVUWtitESxPLNw,26669
 opsci_toolbox/apis/reddit.py,sha256=b_dJFZ_bOB9LLugGBBw5bCbUZdq8VnwtVCGaTYljIIg,21096
 opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWPA,60920
-opsci_toolbox/apis/webscraping.py,sha256=
+opsci_toolbox/apis/webscraping.py,sha256=fo6H2OaH0m_LHJB9IyN-q0Vkk8L9OvHxNn4O_A6a6yc,21572
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=
+opsci_toolbox/helpers/common.py,sha256=gM0QzLsdjMQTTT522CqzpFO86YWaxPaK48EXemjw9nI,54298
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
-opsci_toolbox/helpers/dataviz.py,sha256=
+opsci_toolbox/helpers/dataviz.py,sha256=viIrTrnxFzCRLY5sJDEz3jJtsB-gZTZb2uLoq0yvTlU,212762
 opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
 opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
-opsci_toolbox/helpers/nlp.py,sha256=
-opsci_toolbox/helpers/nlp_cuml.py,sha256=
-opsci_toolbox/helpers/sna.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=MC2ibMi0j9BCysloEPXpvpvRlzlMvRn8krOAcFF-4VU,108286
+opsci_toolbox/helpers/nlp_cuml.py,sha256=sLvaDfVL0aoGi3mNXUkW47tWVrrYK5wxbf8QPgljQNA,30991
+opsci_toolbox/helpers/sna.py,sha256=yzBTQXYXow_lKGhlSMz8hYl2JcSlle95YEDht9v-_fY,33734
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
 opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.17.dist-info/METADATA,sha256=RvPoecg-cflzmh0PcNj9dDZm_RLp5KsK2n-hRTXdEUs,1727
+opsci_toolbox-0.0.17.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.17.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
+opsci_toolbox-0.0.17.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.17.dist-info/RECORD,,
{opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/WHEEL: file without changes
{opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/dependency_links.txt: file without changes
{opsci_toolbox-0.0.15.dist-info → opsci_toolbox-0.0.17.dist-info}/top_level.txt: file without changes