opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,36 +1,47 @@
  from cuml import UMAP
+ import cudf
+ from sklearn.feature_selection import chi2
+ from cuml.feature_extraction.text import CountVectorizer
  from cuml.cluster.hdbscan import HDBSCAN, all_points_membership_vectors, approximate_predict, membership_vector
  import numpy as np
  from tqdm import tqdm
  import os
  from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle
+ import cudf.pandas
+ cudf.pandas.install()
+ import pandas as pd
 
  def reduce_with_cuml_UMAP(embeddings: np.ndarray,
  n_neighbors: int = 5,
  n_components: int = 3,
  min_dist: float = 0.0,
  metric: str = "cosine",
- spread: float = 1.0) -> tuple:
+ spread: float = 1.0,
+ learning_rate: float = 1.0,
+ n_epochs:int = 300
+ ) -> tuple:
  """
  Reduces the dimensionality of embeddings using UMAP with cuML library.
 
- Parameters:
- - embeddings (np.ndarray): The input embeddings to be reduced.
- - n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
- - n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
- - min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
- - metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
- - spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+ Args:
+ embeddings (np.ndarray): The input embeddings to be reduced.
+ n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+ n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+ min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+ metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+ spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
 
  Returns:
- - reducer (UMAP): The UMAP reducer object.
- - reduced_embeddings (np.ndarray): The reduced embeddings.
+ reducer (UMAP): The UMAP reducer object.
+ reduced_embeddings (np.ndarray): The reduced embeddings.
  """
  reducer = UMAP(n_neighbors=n_neighbors,
  n_components=n_components,
  min_dist=min_dist,
  metric=metric,
- spread = spread).fit(embeddings)
+ spread = spread,
+ n_epochs=n_epochs,
+ learning_rate=learning_rate).fit(embeddings)
 
  reduced_embeddings = reducer.transform(embeddings)
  return reducer, reduced_embeddings
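The hunk above adds learning_rate and n_epochs keyword arguments to reduce_with_cuml_UMAP and passes them through to cuML's UMAP. A minimal usage sketch, not part of the diff, assuming the functions are imported from the module shown here (import path, embedding shapes and values are placeholders):

import numpy as np

embeddings = np.random.rand(1000, 384).astype(np.float32)  # placeholder embedding matrix

# Fit the reducer, exercising the new 0.0.9 arguments.
reducer, reduced = reduce_with_cuml_UMAP(
    embeddings,
    n_neighbors=15,
    n_components=5,
    n_epochs=300,
    learning_rate=1.0,
)

# Project unseen points with the fitted reducer (see transform_with_cuml_UMAP below).
new_reduced = transform_with_cuml_UMAP(reducer, np.random.rand(100, 384).astype(np.float32))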
@@ -40,12 +51,12 @@ def transform_with_cuml_UMAP(reducer,
  """
  Transform new data points using a UMAP object.
 
- Parameters:
- - reducer (UMAP): The UMAP reducer object.
- - new_embeddings (np.ndarray): The new data points to be transformed.
+ Args:
+ reducer (UMAP): The UMAP reducer object.
+ new_embeddings (np.ndarray): The new data points to be transformed.
 
  Returns:
- - reduced_embeddings (np.ndarray): The transformed embeddings.
+ reduced_embeddings (np.ndarray): The transformed embeddings.
  """
  reduced_embeddings = reducer.transform(new_embeddings)
  return reduced_embeddings
@@ -68,7 +79,7 @@ def hdbscan_cuml_clustering(embeddings: np.ndarray,
  """
  Perform clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
 
- Parameters:
+ Args:
  embeddings : array-like or sparse matrix, shape (n_samples, n_features)
  The input data to be clustered.
  min_cluster_size : int, optional
@@ -100,7 +111,7 @@ def hdbscan_cuml_clustering(embeddings: np.ndarray,
  Whether the data is prediction data or not. Default is True.
 
  Returns:
- clusterer : hdbscan.hdbscan_.HDBSCAN
+ clusterer : hdbscan.HDBSCAN
  HDBSCAN clusterer object.
  labels : array, shape (n_samples,)
  Cluster labels for each point. Noisy samples are given the label -1.
@@ -129,8 +140,8 @@ def transform_with_cuml_HDBSCAN(clusterer, new_embeddings: np.ndarray) -> tuple:
  """
  Transform new data points using an HDBSCAN object.
 
- Parameters:
- clusterer : hdbscan.hdbscan_.HDBSCAN
+ Args:
+ clusterer : hdbscan.HDBSCAN
  The HDBSCAN clusterer object trained on the original data.
  new_embeddings : array-like or sparse matrix, shape (n_samples, n_features)
  The new data points to be transformed.
@@ -149,15 +160,13 @@ def cuml_soft_clustering(clusterer) -> tuple:
  """
  Perform soft clustering using HDBSCAN.
 
- Parameters:
- clusterer : hdbscan.hdbscan_.HDBSCAN
+ Args:
+ clusterer : hdbscan.HDBSCAN
  The HDBSCAN clusterer object trained on the original data.
 
  Returns:
- soft_clusters_val : list of str
- Predicted cluster labels for each data point, represented as strings.
- soft_clusters_proba : list of float
- The maximum probability of each data point belonging to any cluster.
+ soft_clusters_val : list of str. Predicted cluster labels for each data point, represented as strings.
+ soft_clusters_proba : list of float. The maximum probability of each data point belonging to any cluster.
  """
  soft_clusters = all_points_membership_vectors(clusterer)
  soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
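These hunks only touch docstrings (Parameters becomes Args, and the clusterer type reference is simplified), but together the two helpers cover hard and soft assignments. A short sketch, not part of the diff, assuming clusterer was fitted by hdbscan_cuml_clustering with prediction data enabled and reduced_embeddings is an (n_samples, n_components) array:

# Hard labels and probabilities for new points (used the same way in process_HDBSCAN below).
topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)

# Soft membership for the points the clusterer was trained on:
# argmax cluster id as a string, plus the maximum membership probability per point.
soft_topics, soft_probas = cuml_soft_clustering(clusterer)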
@@ -169,7 +178,7 @@ def soft_cuml_clustering_new_data(clusterer, embeddings: np.ndarray) -> tuple:
  """
  Predict cluster memberships for new data points using HDBSCAN soft clustering.
 
- Parameters:
+ Args:
  clusterer : hdbscan.hdbscan_.HDBSCAN
  The HDBSCAN clusterer object trained on the original data.
  embeddings : array-like or sparse matrix, shape (n_samples, n_features)
@@ -190,7 +199,7 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
  """
  Process embeddings using UMAP reduction.
 
- Parameters:
+ Args:
  embedded_chunks_paths : list of str
  List of file paths containing the embedded chunks.
  path_reduced_embeddings_id : str
@@ -208,20 +217,21 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
  for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
 
  filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
- new_filename = filename+"_reduce_embeddings.pickle"
+ new_filename = filename+"_reduce_embeddings.parquet"
  new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
 
  if not os.path.exists(new_file_path) or reencode:
- df = load_pickle(file_path)
+ df = cudf_read_parquet(file_path)
  create_dir(path_reduced_embeddings_id)
  # embeddings = df["embeddings"].to_list()
- embeddings = np.vstack(df['embeddings'].values)
+ # embeddings = np.vstack(df['embeddings'].values)
+ embeddings = np.vstack(df['embeddings'].to_pandas().tolist())
  reduced_embeddings = transform_with_cuml_UMAP(reducer, embeddings)
  reduced_embeddings_transformed=[list(e) for e in reduced_embeddings]
  df['reduced_embeddings'] = reduced_embeddings_transformed
  df.drop(columns=["embeddings"], inplace=True)
  print(path_reduced_embeddings_id, filename+"_reduce_embeddings")
- write_pickle(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
+ cudf_write_parquet(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
  new_file_paths.append(new_file_path)
  else:
  print("REDUCED EMBEDDINGS ALREADY EXISTS", file_path)
@@ -238,7 +248,7 @@ def process_HDBSCAN(clusterer,
  """
  Process reduced embeddings using HDBSCAN clustering.
 
- Parameters:
+ Args:
  clusterer : hdbscan.hdbscan_.HDBSCAN
  The HDBSCAN clusterer object.
  reduced_embeddings_paths : list of str
@@ -258,12 +268,13 @@ def process_HDBSCAN(clusterer,
  for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
 
  filename = os.path.splitext(os.path.basename(file_path))[0][:-18]
- new_filename = filename+ "_predictions.pickle"
+ new_filename = filename+ "_predictions.parquet"
  new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
  if not os.path.exists(new_file_path) or reencode:
- df = load_pickle(file_path)
+ df = cudf_read_parquet(file_path)
  # reduced_embeddings = df["reduced_embeddings"].to_list()
- reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+ # reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+ reduced_embeddings = np.vstack(df['reduced_embeddings'].to_pandas().tolist())
  topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)
  df["topic"]=topics.astype(int).astype(str)
  df["proba"]=probas
@@ -272,9 +283,351 @@ def process_HDBSCAN(clusterer,
  df["soft_topic"]=soft_clusters
  df["soft_proba"]=soft_proba
 
- write_pickle(df, path_predictions_dataset_id, filename+ "_predictions")
+ cudf_write_parquet(df, path_predictions_dataset_id, filename+ "_predictions")
  new_file_paths.append(new_file_path)
  else:
  print("CLUSTERING ALREADY EXISTS", file_path)
  new_file_paths.append(new_file_path)
  return new_file_paths
+
+ # def cuml_word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+ # """
+ # Calculate word frequency per category using cuML for GPU acceleration.
+
+ # Parameters:
+ # df : pandas DataFrame
+ # DataFrame containing text data and corresponding categories.
+ # col_text : str
+ # Name of the column containing the text data.
+ # col_cat : str
+ # Name of the column containing the categories.
+ # ngram_range : tuple, optional
+ # The range for n-grams. Default is (1, 1) for unigrams.
+ # stop_words : list, optional
+ # List of stopwords to be ignored during frequency calculation. Default is an empty list.
+ # n_words : int, optional
+ # Number of top words to display per category. Default is 20.
+ # min_freq : int, optional
+ # Minimum frequency threshold for word occurrences per category. Default is 3.
+
+ # Returns:
+ # DataFrame
+ # DataFrame containing word frequencies per category.
+
+ # Description:
+ # This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+ # It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+ # """
+ # # Convert pandas DataFrame to cuDF DataFrame
+ # gdf = cudf.DataFrame.from_pandas(df)
+
+ # # Initialize cuML's CountVectorizer
+ # count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
+
+ # # Fit and transform the text data
+ # X_train_count = count_vectorizer.fit_transform(gdf[col_text])
+ # X_names_count = count_vectorizer.get_feature_names()
+
+ # # Initialize the resulting DataFrame
+ # df_count = cudf.DataFrame()
+
+ # # Calculate word frequencies per category
+ # for cat in gdf[col_cat].unique().to_pandas().tolist():
+ # word_count = X_train_count[gdf[col_cat] == cat].sum(axis=0)
+ # df_count_tmp = cudf.DataFrame({col_cat: [cat]*len(X_names_count), "word": X_names_count, "freq": word_count.tolist()[0]}).sort_values(by="freq", ascending=False)
+
+ # # Apply frequency and n_words filters
+ # if n_words:
+ # df_count_tmp = df_count_tmp.head(n_words)
+ # if min_freq:
+ # df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
+
+ # # Concatenate the result to the main DataFrame
+ # df_count = cudf.concat([df_count, df_count_tmp])
+
+ # # Convert the result back to pandas DataFrame
+ # return df_count.to_pandas()
+
+ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+ """
+ Calculate word frequency per category using cuML for GPU acceleration.
+
+ Args:
+ df : pandas DataFrame
+ DataFrame containing text data and corresponding categories.
+ col_text : str
+ Name of the column containing the text data.
+ col_cat : str
+ Name of the column containing the categories.
+ ngram_range : tuple, optional
+ The range for n-grams. Default is (1, 1) for unigrams.
+ stop_words : list, optional
+ List of stopwords to be ignored during frequency calculation. Default is an empty list.
+ n_words : int, optional
+ Number of top words to display per category. Default is 20.
+ min_freq : int, optional
+ Minimum frequency threshold for word occurrences per category. Default is 3.
+
+ Returns:
+ DataFrame
+ DataFrame containing word frequencies per category.
+
+ Description:
+ This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+ It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+ """
+ # Convert pandas DataFrame to cuDF DataFrame
+ # gdf = cudf.DataFrame.from_pandas(df))
+ # print(type(gdf))
+ # gdf = convert_df_to_cudf(gdf)
+
+ # Initialize cuML's CountVectorizer
+ count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
+
+ print(type(gdf[col_text]))
+ # Fit and transform the text data
+ X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
+ X_names_count = count_vectorizer.get_feature_names()
+
+ # Initialize the resulting DataFrame
+ df_count = cudf.DataFrame()
+
+ # Calculate word frequencies per category
+ for cat in gdf[col_cat].unique().tolist():
+ word_count = X_train_count[gdf[col_cat] == cat].sum(axis=0)
+ df_count_tmp = cudf.DataFrame({col_cat: [cat]*len(X_names_count), "word": X_names_count, "freq": word_count.tolist()[0]}).sort_values(by="freq", ascending=False)
+
+ # Apply frequency and n_words filters
+ if n_words:
+ df_count_tmp = df_count_tmp.head(n_words)
+ if min_freq:
+ df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
+
+ # Concatenate the result to the main DataFrame
+ df_count = cudf.concat([df_count, df_count_tmp])
+
+ # Convert the result back to pandas DataFrame
+ return df_count.to_pandas()
+
+ # def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
+
+ # # Convert input lists to cuDF Series
+ # gdf_text = cudf.Series(lst_text)
+ # gdf_categorie = cudf.Series(lst_categorie)
+
+ # # Initialize cuML's CountVectorizer
+ # count_vectorizer = CountVectorizer(analyzer='word')
+
+ # # Fit and transform the text data
+ # X_train_count = count_vectorizer.fit_transform(gdf_text)
+ # X_names_count = count_vectorizer.get_feature_names()
+
+ # # Initialize the resulting DataFrame
+ # df_chi = cudf.DataFrame()
+
+ # # Calculate Chi-squared statistics per category
+ # unique_categories = gdf_categorie.unique().to_pandas().tolist()
+ # for cat in unique_categories:
+ # cat_series = (gdf_categorie == cat).astype(int).to_pandas()
+ # chi2_scores, p_values = chi2(X_train_count.get(), cat_series)
+ # word_count = X_train_count[cat_series.astype(bool)].sum(axis=0).get()[0]
+
+ # df_chi_tmp = cudf.DataFrame({
+ # col_cat: cat,
+ # "relevant_words_chi2": X_names_count,
+ # "chi2": chi2_scores,
+ # "p_values": 1 - p_values,
+ # "word_count_per_class": word_count
+ # }).sort_values(by="chi2", ascending=False).head(n_words)
+
+ # # Filter based on p_values and word_count
+ # df_chi_tmp = df_chi_tmp[df_chi_tmp["p_values"] > p_value_limit]
+ # df_chi_tmp = df_chi_tmp[df_chi_tmp["word_count_per_class"] > min_freq]
+
+ # df_chi = cudf.concat([df_chi, df_chi_tmp])
+
+ # # Reset index
+ # df_chi.reset_index(drop=True, inplace=True)
+ # return df_chi.to_pandas()
+
+ def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
+ """
+ Calculate Chi-squared statistics for each category and return a DataFrame
+ of relevant words per category.
+
+ Args:
+ lst_text (List[str]): List of text documents.
+ lst_categorie (List[str]): List of categories corresponding to each document.
+ col_cat (str): Name of the category column in the resulting DataFrame.
+ n_words (int, optional): Number of top words to return per category. Default is 10.
+ p_value_limit (float, optional): The minimum p-value to filter relevant words. Default is 0.95.
+ min_freq (int, optional): The minimum frequency of words to be considered relevant. Default is 3.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing the relevant words for each category.
+ """
+ # Convert input lists to cuDF Series
+ gdf_text = cudf.Series(lst_text)
+ gdf_categorie = lst_categorie
+
+ # Initialize cuML's CountVectorizer
+ count_vectorizer = CountVectorizer(analyzer='word')
+
+ # Fit and transform the text data
+ X_train_count = count_vectorizer.fit_transform(gdf_text)
+ X_names_count = count_vectorizer.get_feature_names()
+
+ # Initialize the resulting DataFrame
+ df_chi = cudf.DataFrame()
+
+ # Calculate Chi-squared statistics per category
+ unique_categories = gdf_categorie.unique().tolist()
+ for cat in unique_categories:
+ cat_series = (gdf_categorie == cat).astype(int)
+ chi2_scores, p_values = chi2(X_train_count.get(), cat_series)
+ word_count = X_train_count[cat_series.astype(bool)].sum(axis=0).get()[0]
+
+ df_chi_tmp = cudf.DataFrame({
+ col_cat: cat,
+ "relevant_words_chi2": X_names_count,
+ "chi2": chi2_scores,
+ "p_values": 1 - p_values,
+ "word_count_per_class": word_count
+ }).sort_values(by="chi2", ascending=False).head(n_words)
+
+ # Filter based on p_values and word_count
+ df_chi_tmp = df_chi_tmp[df_chi_tmp["p_values"] > p_value_limit]
+ df_chi_tmp = df_chi_tmp[df_chi_tmp["word_count_per_class"] > min_freq]
+
+ df_chi = cudf.concat([df_chi, df_chi_tmp])
+
+ # Reset index
+ df_chi.reset_index(drop=True, inplace=True)
+ return df_chi.to_pandas()
+
+ def cudf_write_parquet(df: cudf.DataFrame, path: str, filename: str) -> str:
+ """
+ Write a cuDF DataFrame to a Parquet file.
+
+ Args:
+ df (cudf.DataFrame): The cuDF DataFrame to be written.
+ path (str): The directory path where the file should be saved.
+ filename (str): The name of the file without extension.
+
+ Returns:
+ str: The file path of the saved Parquet file.
+ """
+ file_path = os.path.join(path, str(filename)+".parquet")
+ df.to_parquet(file_path)
+ return file_path
+
+ def cudf_read_parquet(path: str) -> cudf.DataFrame:
+ """
+ Read a Parquet file into a cuDF DataFrame.
+
+ Args:
+ path (str): The file path to the Parquet file.
+
+ Returns:
+ cudf.DataFrame: The read cuDF DataFrame.
+ """
+ df = cudf.read_parquet(path)
+ return df
+
+ def convert_df_to_cudf(df: pd.DataFrame) -> cudf.DataFrame:
+ """
+ Convert a pandas DataFrame to a cuDF DataFrame.
+
+ Args:
+ df (pd.DataFrame): The pandas DataFrame to convert.
+
+ Returns:
+ cudf.DataFrame: The resulting cuDF DataFrame.
+ """
+ return cudf.DataFrame.from_pandas(df)
+
+ def convert_cudf_to_df(cdf: cudf.DataFrame) -> pd.DataFrame:
+ """
+ Convert a cuDF DataFrame to a pandas DataFrame.
+
+ Args:
+ cdf (cudf.DataFrame): The cuDF DataFrame to convert.
+
+ Returns:
+ pd.DataFrame: The resulting pandas DataFrame.
+ """
+ return cdf.to_pandas()
+
+
+ def cudf_encode_chunked_files(chunk_files_paths: list,
+ HF_encoder,
+ cols: list,
+ col_text: str,
+ path_embedded_chunks: str,
+ reencode: bool = False) -> list:
+ """
+ Encode text from files and save the results in another pickle file.
+
+ Args:
+ chunk_files_paths (List[str]): List of file paths containing documents.
+ HF_encoder (Encoder): Encoder object for text vectorization.
+ cols (List[str]): Columns to keep in the resulting DataFrame.
+ col_text (str): Column containing text data in the DataFrame.
+ path_embedded_chunks (str): Path to save the embedded chunks.
+ reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
+
+ Returns:
+ List[str]: List of paths for newly created files.
+ """
+ new_file_paths=[]
+ for file in tqdm(chunk_files_paths, total=len(chunk_files_paths), desc="Encoding text from files"):
+ new_filename = os.path.splitext(os.path.basename(file))[0]+"_embedded"
+ new_file_path = os.path.join(path_embedded_chunks, new_filename+".parquet")
+ # check whether the encoding has already been done; if reencode == True, run the procedure anyway
+ if not os.path.exists(new_file_path) or reencode:
+ current_df = cudf_read_parquet(file)
+
+ text_list = current_df[col_text].to_arrow().to_pylist()
+
+ # text vectorization
+ embeddings = HF_encoder.embed_documents(text_list)
+
+ # build a dataframe with the embeddings
+ current_df = current_df[cols]
+ current_df['embeddings'] = embeddings
+
+ # save
+ new_file_path = cudf_write_parquet(current_df, path_embedded_chunks, new_filename)
+ new_file_paths.append(new_file_path)
+ else :
+ new_file_paths.append(new_file_path)
+
+ return new_file_paths
+
+ def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
+ """
+ Split a DataFrame into multiple pickle files with a specified chunk size.
+
+ Args:
+ df (pd.DataFrame): The DataFrame to be split.
+ path (str): The directory path where the pickle files will be saved.
+ name (str): The base name for the pickle files.
+ chunk_size (int, optional): The size of each chunk. Default is 10000.
+
+ Returns:
+ list[str]: A list of file paths to the saved pickle files.
+ """
+ num_chunks = -(-len(df) // chunk_size) # Calculate the number of chunks using ceil division
+
+ file_paths = []
+
+ # create smaller datasets of chunk_size each
+ for i in range(num_chunks):
+ start = i * chunk_size
+ end = (i + 1) * chunk_size
+ chunk = df.iloc[start:end]
+ filename = f"{name}_{i}" # Adjust the filename format as needed
+ file_path = cudf_write_parquet(chunk, path, filename)
+ file_paths.append(file_path)
+
+ return file_paths
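Taken together, the 0.0.9 additions form a GPU-backed chunked pipeline: split a corpus into Parquet chunks, encode them, reduce with UMAP, cluster with HDBSCAN, then compute per-category keyword statistics. An illustrative end-to-end sketch, not part of the diff: paths, column names and the HF_encoder, reducer and clusterer objects are placeholders assumed to exist (reducer from reduce_with_cuml_UMAP, clusterer from hdbscan_cuml_clustering), and the positional arguments follow the truncated signatures shown in the hunks above.

import pandas as pd

df = pd.DataFrame({"text": ["..."], "category": ["..."]})   # placeholder corpus

# 1. Split the corpus into Parquet chunks and encode each chunk on GPU.
chunk_paths = split_df_into_chunks(df, "chunks/", "corpus", chunk_size=10000)
embedded_paths = cudf_encode_chunked_files(chunk_paths, HF_encoder,
                                           cols=["text", "category"],
                                           col_text="text",
                                           path_embedded_chunks="embedded/")

# 2. Reduce and cluster chunk by chunk with the previously fitted reducer / clusterer.
reduced_paths = process_UMAP(embedded_paths, "reduced/", reducer)
prediction_paths = process_HDBSCAN(clusterer, reduced_paths, "predictions/")

# 3. GPU-accelerated keyword statistics per category. Note that despite the list type
#    hint, cuml_chi2_per_category calls .unique() on its category argument, so a Series
#    is passed here rather than a plain list.
df_freq = cuml_word_frequency_per_categorie(df, col_text="text", col_cat="category")
df_chi = cuml_chi2_per_category(df["text"].tolist(), df["category"], col_cat="category")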