opsci-toolbox 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,27 @@ from tqdm import tqdm
 import os
 from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle

- def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dist = 0.0, metric = "cosine", spread = 1.0):
+ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
+ n_neighbors: int = 5,
+ n_components: int = 3,
+ min_dist: float = 0.0,
+ metric: str = "cosine",
+ spread: float = 1.0) -> tuple:
+ """
+ Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+ Parameters:
+ - embeddings (np.ndarray): The input embeddings to be reduced.
+ - n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+ - n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+ - min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+ - metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+ - spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+ Returns:
+ - reducer (UMAP): The UMAP reducer object.
+ - reduced_embeddings (np.ndarray): The reduced embeddings.
+ """
 reducer = UMAP(n_neighbors=n_neighbors,
 n_components=n_components,
 min_dist=min_dist,
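Note: the following is a minimal usage sketch added for orientation, not part of the published diff. It assumes the helper lives in opsci_toolbox/helpers/nlp_cuml.py (as the RECORD below suggests), that RAPIDS cuML is installed with GPU support, and the embeddings are made up.

import numpy as np
from opsci_toolbox.helpers.nlp_cuml import reduce_with_cuml_UMAP  # assumed module path

# hypothetical sentence embeddings: 1000 vectors of dimension 768
embeddings = np.random.rand(1000, 768).astype(np.float32)

# fit UMAP on the GPU and keep the reducer for transforming future batches
reducer, reduced_embeddings = reduce_with_cuml_UMAP(embeddings, n_neighbors=15, n_components=5)
print(reduced_embeddings.shape)  # expected: (1000, 5)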
@@ -15,61 +35,77 @@ def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dis
 reduced_embeddings = reducer.transform(embeddings)
 return reducer, reduced_embeddings

- def transform_with_cuml_UMAP(reducer, new_embeddings):
+ def transform_with_cuml_UMAP(reducer,
+ new_embeddings: np.ndarray) -> np.ndarray:
 """
- Transform new data points using a UMAP object
+ Transform new data points using a UMAP object.
+
+ Parameters:
+ - reducer (UMAP): The UMAP reducer object.
+ - new_embeddings (np.ndarray): The new data points to be transformed.
+
+ Returns:
+ - reduced_embeddings (np.ndarray): The transformed embeddings.
 """
 reduced_embeddings = reducer.transform(new_embeddings)
 return reduced_embeddings


- def hdbscan_cuml_clustering(embeddings, min_cluster_size=5, min_samples=None, max_cluster_size = 0, metric='euclidean', alpha=1.0, p=2, cluster_selection_epsilon=0.0, cluster_selection_method='eom',
- approx_min_span_tree=True, gen_min_span_tree = False, gen_condensed_tree = False, gen_single_linkage_tree_ = False, prediction_data=True):
-
+ def hdbscan_cuml_clustering(embeddings: np.ndarray,
+ min_cluster_size: int = 5,
+ min_samples: int = None,
+ max_cluster_size: int = 0,
+ metric: str = 'euclidean',
+ alpha: float = 1.0,
+ p: int = 2,
+ cluster_selection_epsilon: float = 0.0,
+ cluster_selection_method: str = 'eom',
+ approx_min_span_tree: bool = True,
+ gen_min_span_tree: bool = False,
+ gen_condensed_tree: bool = False,
+ gen_single_linkage_tree_: bool = False,
+ prediction_data: bool = True) -> tuple:
 """
+ Perform clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
+
 Parameters:
- embeddings : array-like or sparse matrix, shape (n_samples, n_features)
- The input data to be clustered.
- min_cluster_size : int, optional
- The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
- min_samples : int or None, optional
- The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If ‘None’, it defaults to the min_cluster_size.
- max_cluster_size : int, optional (default=0)
- A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
- Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
- metric : str or callable, optional
- The metric to use for distance computation. Default is 'euclidean'.
- alpha : float, optional
- distance scaling parameter as used in robust single linkage.
- p : int, optional
- The Minkowski p-norm distance metric parameter. Default is None.
- cluster_selection_epsilon : float, optional
- A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
- cluster_selection_method : {'eom', 'leaf'}, optional
- The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
- approx_min_span_tree : bool, optional
- Whether to compute an approximation of the minimum spanning tree. Default is True.
- gen_min_span_tree : bool, optional
- Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed
- gen_condensed_tree : bool, optional
- Whether to populate the condensed_tree_ member for utilizing plotting tools.
- gen_single_linkage_tree_ : bool
- Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
- prediction_data : bool, optional
- Whether the data is prediction data or not. Default is True.
-
- Returns:
- clusterer : hdbscan.hdbscan_.HDBSCAN
- HDBSCAN clusterer object.
- labels : array, shape (n_samples,)
- Cluster labels for each point. Noisy samples are given the label -1.
- probabilities : array, shape (n_samples,)
- The probability of each sample being an outlier.
-
- Description:
- This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
- It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
- probability of each sample being an outlier.
+ embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+ The input data to be clustered.
+ min_cluster_size : int, optional
+ The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
+ min_samples : int or None, optional
+ The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If ‘None’, it defaults to the min_cluster_size.
+ max_cluster_size : int, optional (default=0)
+ A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
+ Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+ metric : str or callable, optional
+ The metric to use for distance computation. Default is 'euclidean'.
+ alpha : float, optional
+ Distance scaling parameter as used in robust single linkage.
+ p : int, optional
+ The Minkowski p-norm distance metric parameter. Default is None.
+ cluster_selection_epsilon : float, optional
+ A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+ cluster_selection_method : {'eom', 'leaf'}, optional
+ The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
+ approx_min_span_tree : bool, optional
+ Whether to compute an approximation of the minimum spanning tree. Default is True.
+ gen_min_span_tree : bool, optional
+ Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed.
+ gen_condensed_tree : bool, optional
+ Whether to populate the condensed_tree_ member for utilizing plotting tools.
+ gen_single_linkage_tree_ : bool
+ Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
+ prediction_data : bool, optional
+ Whether the data is prediction data or not. Default is True.
+
+ Returns:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ HDBSCAN clusterer object.
+ labels : array, shape (n_samples,)
+ Cluster labels for each point. Noisy samples are given the label -1.
+ probabilities : array, shape (n_samples,)
+ The probability of each sample being an outlier.
 """
 clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
 min_samples=min_samples,
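A hedged sketch of how the clustering helper documented above might be called on the reduced embeddings from the previous sketch (not part of the diff; the import path is an assumption):

from opsci_toolbox.helpers.nlp_cuml import hdbscan_cuml_clustering  # assumed module path

# cluster the UMAP output from the earlier sketch
clusterer, labels, probabilities = hdbscan_cuml_clustering(
    reduced_embeddings,
    min_cluster_size=20,
    min_samples=10,
    prediction_data=True,  # keeps prediction data so approximate_predict can be used later
)
print(set(labels))  # cluster ids; -1 marks noise points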
@@ -89,17 +125,39 @@ Description:

 return clusterer, clusterer.labels_, clusterer.probabilities_

- def transform_with_cuml_HDBSCAN(clusterer, new_embeddings):
+ def transform_with_cuml_HDBSCAN(clusterer, new_embeddings: np.ndarray) -> tuple:
 """
- Transform new data points using a HDBSCAN object
+ Transform new data points using an HDBSCAN object.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object trained on the original data.
+ new_embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+ The new data points to be transformed.
+
+ Returns:
+ new_data_topic : array, shape (n_samples,)
+ Predicted cluster labels for each new data point.
+ new_data_proba : array, shape (n_samples,)
+ The probability of each new data point being an outlier.
 """
 new_data_topic, new_data_proba = approximate_predict(clusterer, new_embeddings)
 return new_data_topic, new_data_proba


- def cuml_soft_clustering(clusterer):
+ def cuml_soft_clustering(clusterer) -> tuple:
 """
- HDBSCAN SOFT CLUSTERING
+ Perform soft clustering using HDBSCAN.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object trained on the original data.
+
+ Returns:
+ soft_clusters_val : list of str
+ Predicted cluster labels for each data point, represented as strings.
+ soft_clusters_proba : list of float
+ The maximum probability of each data point belonging to any cluster.
 """
 soft_clusters = all_points_membership_vectors(clusterer)
 soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
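Continuing the sketches above, the prediction and soft-clustering helpers documented in this hunk could be used on a fresh batch as follows (not part of the diff; the import path and data are assumptions, and the reducer/clusterer come from the earlier sketches):

import numpy as np
from opsci_toolbox.helpers.nlp_cuml import (
    transform_with_cuml_UMAP,
    transform_with_cuml_HDBSCAN,
    cuml_soft_clustering,
)  # assumed module path

# project and label a hypothetical new batch with the fitted reducer and clusterer
new_embeddings = np.random.rand(200, 768).astype(np.float32)
new_reduced = transform_with_cuml_UMAP(reducer, new_embeddings)
new_topics, new_probas = transform_with_cuml_HDBSCAN(clusterer, new_reduced)

# soft cluster memberships for the original training points
soft_labels, soft_probas = cuml_soft_clustering(clusterer)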
@@ -107,17 +165,45 @@ def cuml_soft_clustering(clusterer):
 return soft_clusters_val, soft_clusters_proba


- def soft_cuml_clustering_new_data(clusterer, embeddings):
+ def soft_cuml_clustering_new_data(clusterer, embeddings: np.ndarray) -> tuple:
 """
- PREDICT NEW DATA POINTS HDBSCAN SOFT CLUSTERING
+ Predict cluster memberships for new data points using HDBSCAN soft clustering.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object trained on the original data.
+ embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+ The new data points to be clustered.
+
+ Returns:
+ soft_clusters_val : list of str
+ Predicted cluster labels for each new data point, represented as strings.
+ soft_clusters_proba : list of float
+ The maximum probability of each new data point belonging to any cluster.
 """
- soft_clusters =membership_vector(clusterer, embeddings)
+ soft_clusters = membership_vector(clusterer, embeddings)
 soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
 soft_clusters_proba = [np.max(x) for x in soft_clusters]
 return soft_clusters_val, soft_clusters_proba

- def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, reencode = False):
+ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, reducer, reencode: bool = False) -> list:
+ """
+ Process embeddings using UMAP reduction.

+ Parameters:
+ embedded_chunks_paths : list of str
+ List of file paths containing the embedded chunks.
+ path_reduced_embeddings_id : str
+ Path to store the reduced embeddings.
+ reducer : UMAP object
+ The UMAP reducer object used for dimensionality reduction.
+ reencode : bool, optional
+ Whether to reencode the embeddings even if the reduced file already exists. Default is False.
+
+ Returns:
+ new_file_paths : list of str
+ List of file paths to the reduced embeddings.
+ """
 new_file_paths=[]
 for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):

@@ -144,7 +230,30 @@ def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, ree



- def process_HDBSCAN(clusterer, reduced_embeddings_paths, path_predictions_dataset_id, run_soft_clustering= False, reencode = False):
+ def process_HDBSCAN(clusterer,
+ reduced_embeddings_paths: list,
+ path_predictions_dataset_id: str,
+ run_soft_clustering: bool = False,
+ reencode: bool = False) -> list:
+ """
+ Process reduced embeddings using HDBSCAN clustering.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object.
+ reduced_embeddings_paths : list of str
+ List of file paths containing the reduced embeddings.
+ path_predictions_dataset_id : str
+ Path to store the clustering predictions.
+ run_soft_clustering : bool, optional
+ Whether to perform soft clustering in addition to regular clustering. Default is False.
+ reencode : bool, optional
+ Whether to reencode the embeddings even if the clustering file already exists. Default is False.
+
+ Returns:
+ new_file_paths : list of str
+ List of file paths to the clustering predictions.
+ """
 new_file_paths=[]
 for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):

@@ -8,7 +8,21 @@ from opsci_toolbox.helpers.common import scale_list
 import pandas as pd
 import math

- def create_collocations(lst_text, word_freq, coloc_freq, stop_words):
+ def create_collocations(lst_text : list, word_freq : int, coloc_freq : int, stop_words : list) -> tuple:
+ """
+ Creates collocations (bigrams) from a list of texts and returns their relative frequencies and a DataFrame of word sizes.
+
+ Args:
+ lst_text (List[str]): A list of text documents.
+ word_freq (int): Minimum document frequency for words to be included.
+ coloc_freq (int): Minimum frequency for collocations (bigrams) to be included.
+ stop_words (Set[str]): A set of stop words to be excluded from tokenization.
+
+ Returns:
+ Tuple[List[Tuple[str, str, float]], pd.DataFrame]:
+ - A list of tuples where each tuple contains two words and their relative bigram frequency.
+ - A DataFrame containing words and their sizes based on their counts in the documents.
+ """
 # Tokenize the documents into words using scikit-learn's CountVectorizer
 vectorizer = CountVectorizer(token_pattern=r'[^\s]+', stop_words=stop_words, min_df=word_freq)
 tokenized_documents = vectorizer.fit_transform(lst_text)
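A short usage sketch of the collocation helper documented above (not part of the diff; the module path opsci_toolbox.helpers.sna is an assumption based on the RECORD below, and the corpus is made up):

from opsci_toolbox.helpers.sna import create_collocations  # assumed module path

corpus = [
    "the cat sat on the mat",
    "the cat chased the mouse",
    "a mouse ran under the mat",
]
stop_words = ["the", "a", "on", "under"]

# bigram edges (word, word, relative frequency) and a node table with word sizes
edges, df_nodes = create_collocations(corpus, word_freq=1, coloc_freq=1, stop_words=stop_words)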
@@ -42,7 +56,19 @@ def create_collocations(lst_text, word_freq, coloc_freq, stop_words):
 return edges, df_nodes


- def create_maximum_tree(edges, df_nodes):
+ def create_maximum_tree(edges : list, df_nodes : pd.DataFrame) -> tuple:
+ """
+ Creates a network graph from edges and node attributes, then generates its maximum spanning tree.
+
+ Args:
+ edges (List[Tuple[str, str, float]]): A list of tuples where each tuple contains two nodes and the weight of the edge between them.
+ df_nodes (pd.DataFrame): A DataFrame containing node attributes, where 'word' is the node identifier.
+
+ Returns:
+ Tuple[nx.Graph, nx.Graph]:
+ - The original network graph with node attributes.
+ - The maximum spanning tree of the network graph.
+ """
 attributs=df_nodes.set_index('word')
 dictionnaire=attributs.to_dict('index')

@@ -54,7 +80,17 @@ def create_maximum_tree(edges, df_nodes):

 return network, tree

- def words_partitions(network, resolution = 1.0):
+ def words_partitions(network : nx.Graph, resolution : float = 1.0) -> None:
+ """
+ Partitions the network using the Louvain method and calculates the modularity of the partition.
+
+ Args:
+ network (nx.Graph): The network graph to partition.
+ resolution (float): The resolution parameter for the Louvain method. Higher values lead to smaller communities.
+
+ Returns:
+ None
+ """
 try:
 partition = community_louvain.best_partition(network, resolution=resolution)
 modularity = community_louvain.modularity(partition, network)
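Continuing the collocation sketch, the two graph helpers documented in the hunks above could be chained like this (not part of the diff; import path assumed as before):

from opsci_toolbox.helpers.sna import create_maximum_tree, words_partitions  # assumed module path

# build the co-occurrence graph and its maximum spanning tree, then detect Louvain communities
network, tree = create_maximum_tree(edges, df_nodes)
words_partitions(tree, resolution=1.0)  # stores community ids on nodes under "modularity"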
@@ -69,8 +105,16 @@ def words_partitions(network, resolution = 1.0):
 nx.set_node_attributes(network, partition, "modularity")


- def compute_metrics(network):
- ### CALCUL DE LA CENTRALITE DE DEGRES
+ def compute_metrics(network : nx.Graph) -> None :
+ """
+ Computes and sets centrality metrics for the nodes in the network graph.
+
+ Args:
+ network (nx.Graph): The network graph on which to compute centrality.
+
+ Returns:
+ None
+ """
 try:
 degree_cent = nx.degree_centrality(network)
 nx.set_node_attributes(network, degree_cent, "degree_centrality")
@@ -82,7 +126,6 @@ def compute_metrics(network):
 degree_cent = {node: 0 for node in network.nodes()}
 nx.set_node_attributes(network, degree_cent, "degree_centrality")

-
 ### CALCUL DE LA CENTRALITE DE VECTEUR PROPRE
 try:
 centrality = nx.eigenvector_centrality(network)
@@ -106,7 +149,20 @@ def compute_metrics(network):
 betweenness_cent = {node: 0 for node in network.nodes()}
 nx.set_node_attributes(network, betweenness_cent, "betweenness_centrality")

- def prepare_nodes(T, layout_positions, colormap, min_node_size = 8, max_node_size = 40):
+ def prepare_nodes(T : nx.Graph, layout_positions : dict, colormap : str, min_node_size : int = 8, max_node_size : int = 40) -> None:
+ """
+ Prepares and sets node attributes for a graph based on various centrality measures and colors them using a colormap.
+
+ Args:
+ T (nx.Graph): The input graph.
+ layout_positions (Dict[str, Tuple[float, float]]): A dictionary of node positions for layout.
+ colormap (Colormap): A colormap for generating node colors.
+ min_node_size (int): Minimum node size for scaling. Default is 8.
+ max_node_size (int): Maximum node size for scaling. Default is 40.
+
+ Returns:
+ None
+ """

 # on génère une palette de couleur à partir de colormap
 modularity_palette = generate_color_palette_with_colormap(set(nx.get_node_attributes(T,"modularity").values()), colormap=colormap)
@@ -147,17 +203,52 @@ def prepare_nodes(T, layout_positions, colormap, min_node_size = 8, max_node_siz
 for n, p in layout_positions.items():
 T.nodes[n]['pos'] = p

- def prepare_edges(T, min_edge_size=1, max_edge_size=5):
+ def prepare_edges(T : nx.Graph, min_edge_size : int =1, max_edge_size : int =5) -> None:
+ """
+ Prepares and sets edge attributes for a graph by scaling edge weights.
+
+ Args:
+ T (nx.Graph): The input graph.
+ min_edge_size (int): Minimum edge size for scaling. Default is 1.
+ max_edge_size (int): Maximum edge size for scaling. Default is 5.
+
+ Returns:
+ None
+ """
 w = [e[2]['weight'] for e in T.edges(data=True)]
 scaled_w = scale_list(w, min_edge_size, max_edge_size)
 edges_attributes_dict = {(e[0], e[1]): {'scaled_weight': scaled_w[i]} for i, e in enumerate(T.edges(data=True))}
 nx.set_edge_attributes(T, edges_attributes_dict)


- def layout_graphviz(network, layout ="fdp", args=""):
+ def layout_graphviz(network : nx.Graph, layout : str = "fdp", args : str ="") -> dict:
+ """
+ Generates node positions for a graph using Graphviz layout algorithms.
+
+ Args:
+ network (nx.Graph): The input graph.
+ layout (str): The Graphviz layout algorithm to use (e.g., "dot", "fdp", "sfdp"). Default is "fdp".
+ args (str): Additional arguments to pass to the Graphviz layout algorithm. Default is an empty string.
+
+ Returns:
+ Dict[str, Tuple[float, float]]: A dictionary of node positions.
+ """
 layout_positions = nx.nx_agraph.graphviz_layout(network, prog=layout, args=args)
 return layout_positions

- def layout_spring(network, k = 0.08, scale = 2, iterations = 200, weight="weight"):
+ def layout_spring(network : nx.Graph, k : float = 0.08, scale : int = 2, iterations : int = 200, weight : str ="weight") -> dict:
+ """
+ Generates node positions for a graph using the spring layout algorithm.
+
+ Args:
+ network (nx.Graph): The input graph.
+ k (float): Optimal distance between nodes. Default is 0.08.
+ scale (float): Scale factor for the layout. Default is 2.
+ iterations (int): Number of iterations for the spring layout algorithm. Default is 200.
+ weight (str): Edge attribute to use as weight. Default is "weight".
+
+ Returns:
+ Dict[str, Tuple[float, float]]: A dictionary of node positions.
+ """
 layout_positions = nx.spring_layout(network, k=k, scale=scale, iterations=iterations, weight=weight)
 return layout_positions
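To round off the graph sketches, the styling and layout helpers documented above might be applied to the spanning tree as follows (not part of the diff; the import path and the "tab20" colormap name are assumptions):

from opsci_toolbox.helpers.sna import compute_metrics, layout_spring, prepare_nodes, prepare_edges  # assumed module path

compute_metrics(tree)  # degree, eigenvector and betweenness centralities stored as node attributes
layout_positions = layout_spring(tree, k=0.08, scale=2, iterations=200)
prepare_nodes(tree, layout_positions, colormap="tab20", min_node_size=8, max_node_size=40)  # "tab20" assumed to be an accepted colormap name
prepare_edges(tree, min_edge_size=1, max_edge_size=5)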
@@ -1,9 +1,17 @@
 import pandas as pd
 from tqdm import tqdm

- def generate_index(df, col_author_id ='author_id', col_date='created_time'):
+ def generate_index(df : pd.DataFrame, col_author_id : str ='author_id', col_date : str = 'created_time') -> pd.DataFrame:
 """
- Generates an index based on user_id and date
+ Generates an index based on author ID and creation date.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing author IDs and creation dates.
+ col_author_id (str): The column name for author IDs. Default is 'author_id'.
+ col_date (str): The column name for creation dates. Default is 'created_time'.
+
+ Returns:
+ pd.DataFrame: The DataFrame with a new 'index' column containing the generated indices.
 """
 res=[]
 for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="generation des index"):
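A minimal sketch of the indexing helper documented above (not part of the diff; the module path opsci_toolbox.helpers.surreaction is an assumption based on the RECORD below, and the DataFrame is made up):

import pandas as pd
from opsci_toolbox.helpers.surreaction import generate_index  # assumed module path

df = pd.DataFrame({
    "author_id": ["a1", "a1", "a2"],
    "created_time": pd.to_datetime(["2024-01-01", "2024-01-05", "2024-01-03"]),
    "shares": [3, 10, 1],
    "comments": [2, 7, 0],
})

df = generate_index(df)  # adds an 'index' column built from author id and date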
@@ -13,15 +21,25 @@ def generate_index(df, col_author_id ='author_id', col_date='created_time'):

 return df

- def avg_performance(df,
- col_date='created_time',
- col_author_id='author_id',
- col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
+ def avg_performance(df : pd.DataFrame,
+ col_date : str ='created_time',
+ col_author_id : str ='author_id',
+ col_engagement : list =['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
 'sad', 'angry','total_engagement', 'replies', 'percentage_replies'],
- rolling_period='7D'):
+ rolling_period : str ='7D') -> pd.DataFrame:

 """
- Function to compute average performance on a rolling period for a list of metrics
+ Computes average performance on a rolling period for a list of engagement metrics.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ col_date (str): The column name for creation dates. Default is 'created_time'.
+ col_author_id (str): The column name for author IDs. Default is 'author_id'.
+ col_engagement (List[str]): A list of columns representing engagement metrics.
+ rolling_period (str): The rolling period for calculating the average. Default is '7D'.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional columns containing the rolling average of engagement metrics.
 """

 # Nettoyage au cas où
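Building on the DataFrame from the previous sketch, the rolling-average helper documented above could be driven like this (not part of the diff; import path and column choices are assumptions):

from opsci_toolbox.helpers.surreaction import avg_performance  # assumed module path

# 7-day rolling average of the selected engagement metrics, per author
df = avg_performance(df, col_engagement=["shares", "comments"], rolling_period="7D")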
@@ -47,18 +65,32 @@ def avg_performance(df,

 return df

- def kpi_reaction(df, cols):
+ def kpi_reaction(df : pd.DataFrame, cols : list) -> pd.DataFrame:
 """
- Cette fonction prend un dataframe et une liste de colonnes en entrée.
- Pour chaque colonne, on va calculer le taux de sur-réaction.
+ Computes the overreaction rate for each column in the DataFrame.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ cols (List[str]): A list of column names for which to calculate the overreaction rate.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional columns containing the overreaction rates.
 """
 for col in cols:
 df['tx_'+col]=(df[col]-df[col+'_avg'])/(df[col]+df[col+'_avg'])
 return df

- def get_reactions_type(df, cols, col_dest):
+ def get_reactions_type(df : pd.DataFrame, cols : list, col_dest : str) -> pd.DataFrame:
 """
- Conditional function to return the reaction type based on a list of metrics
+ Returns the reaction type based on a list of metrics for each row in the DataFrame.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ cols (List[str]): A list of column names for which to determine the reaction type.
+ col_dest (str): The name of the column to store the reaction type in.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional column containing the reaction types.
 """
 all_val=[]

@@ -80,10 +112,20 @@ def get_reactions_type(df, cols, col_dest):
 df[col_dest]=all_val
 return df

- def compute_surreaction(df, col_date, col_author_id, cols_sureaction_metrics, cols_typologie_sureaction, rolling_period_sureaction = '7D'):
+ def compute_surreaction(df : pd.DataFrame, col_date : str, col_author_id : str, cols_sureaction_metrics : list, cols_typologie_sureaction : list, rolling_period_sureaction : str = '7D') -> pd.DataFrame:
 """
- Helpers to compute surreaction and return a dataframe with reaction rates and typology
-
+ Computes surreaction rates and typology for a DataFrame containing engagement metrics.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ col_date (str): The column name for creation dates.
+ col_author_id (str): The column name for author IDs.
+ cols_sureaction_metrics (List[str]): A list of column names for which to calculate surreaction rates.
+ cols_typologie_sureaction (List[str]): A list of column names for categorizing the forms of reaction.
+ rolling_period_sureaction (str): The rolling period for calculating the average and surreaction rates. Default is '7D'.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional columns containing surreaction rates and typology.
 """
 # on désactive temporairement les messages d'alerte
 pd.options.mode.chained_assignment = None # default='warn'
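Finally, the top-level surreaction helper documented above could be applied to the same DataFrame (not part of the diff; import path and column choices are assumptions):

from opsci_toolbox.helpers.surreaction import compute_surreaction  # assumed module path

# overreaction rates ("tx_" columns) plus a reaction typology, computed over a 7-day window
df = compute_surreaction(
    df,
    col_date="created_time",
    col_author_id="author_id",
    cols_sureaction_metrics=["shares", "comments"],
    cols_typologie_sureaction=["shares", "comments"],
    rolling_period_sureaction="7D",
)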
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
- Version: 0.0.5
+ Version: 0.0.7
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -23,9 +23,10 @@ Requires-Dist: networkx (==3.2.1)
 Requires-Dist: nltk (==3.8.1)
 Requires-Dist: numpy (<1.25.0,>=1.21.5)
 Requires-Dist: opencv-python-headless (==4.9.0.80)
+ Requires-Dist: openpyxl (==3.1.3)
 Requires-Dist: pandas (==1.5.3)
 Requires-Dist: plotly (==5.19.0)
- Requires-Dist: protobuf (==5.26.1)
+ Requires-Dist: protobuf (<5,>=3.20)
 Requires-Dist: pyarrow (==14.0.2)
 Requires-Dist: python-louvain (==0.16)
 Requires-Dist: scikit-learn (==1.4.1.post1)
@@ -0,0 +1,21 @@
+ opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/apis/rapidapi_helpers.py,sha256=f2o4ItMZwoAt3ow5bSK-MPkqzP3wzJ857xU0CzDZIyI,23207
+ opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
+ opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
+ opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/helpers/common.py,sha256=TvlGcCdpkfKUaDkahILq3wFLgxwAtgRv5KJRoNy9brw,40339
+ opsci_toolbox/helpers/cv.py,sha256=-uXHncyAr8sDF0ip32LAz7Xae9Z4-T9MH6palpIzq-c,21109
+ opsci_toolbox/helpers/dataviz.py,sha256=JbudfwWPCEEEzP8Vpmu1CMEKaE6O2vtk9xsflW2pT1M,112451
+ opsci_toolbox/helpers/dates.py,sha256=EvNqut2s6S4CaaVFQhIDR-W00TZbt3J04yRYKYhxCkU,2638
+ opsci_toolbox/helpers/nlp.py,sha256=jpZRyTkYeoVH8tzqIT0opZn5unt8cdU1qPdFzXxEOw8,86638
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=w-pkch2Sk_FfVrm1j8NUmmxVvoJXJHuXzGnXGV_FWSE,14153
+ opsci_toolbox/helpers/sna.py,sha256=SZjS21qfBmlkHDJaXi7CaHpj6KhefcsDmJ1A9NRtVeQ,12006
+ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+ opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+ opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+ opsci_toolbox-0.0.7.dist-info/METADATA,sha256=ErIa8rDRfvT52LjZJcSKU7zougC_1hZa3oWnvPPTzJQ,1601
+ opsci_toolbox-0.0.7.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ opsci_toolbox-0.0.7.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.7.dist-info/RECORD,,
@@ -1,21 +0,0 @@
- opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/rapidapi_helpers.py,sha256=5QbF6ehsmmdTrzp7Q8cF5wrf4DmO91v8YexbybczyHA,23183
- opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
- opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
- opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=41EsQ2pTwQYnUUM1ggwaPueFVj2Qcm_UG7o_Zj41FU8,26152
- opsci_toolbox/helpers/cv.py,sha256=z0HecreIi-vqiOGpDa4VVnHIX_rvkObngrqwTwkWT44,12403
- opsci_toolbox/helpers/dataviz.py,sha256=4wFi0wCMgvIEQEL8okiVJOWxz-eJq5cZ7svHoBbZjnk,77393
- opsci_toolbox/helpers/dates.py,sha256=yQm9pUQAeLTFNPcgeumhi8oErustQJhaoL_HqxSxhiA,996
- opsci_toolbox/helpers/nlp.py,sha256=LGW8CIjrkQvGLKEnxYu7RNrBNViQ5dUygK67EhkBHZo,57999
- opsci_toolbox/helpers/nlp_cuml.py,sha256=Mkbtl9ewbv3aa9rFvhH9VOM5Y0G-XIsXtR_6IeYpebY,9450
- opsci_toolbox/helpers/sna.py,sha256=D6nwgUgbuApXGpT2zoIMip8262hynEwfppVdvaZ4Qm0,8053
- opsci_toolbox/helpers/surreaction.py,sha256=k5hcZZlXnJ-zczRpwfwthggEgFCr9lQsHHKVOPlm7fc,4606
- opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
- opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.5.dist-info/METADATA,sha256=Nhp2oK-KXD4JVivU37-T_MsN-VJfbPtJsWlUq7Kp5-A,1566
- opsci_toolbox-0.0.5.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- opsci_toolbox-0.0.5.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.5.dist-info/RECORD,,