opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -310,6 +310,22 @@ def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
         pickle.dump(data, f)
     return file_path
 
+def save_df_to_pickle(df: pd.DataFrame, path: str, filename: str) -> str:
+    """
+    Write a DataFrame into a pickle file.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to be written to the pickle file.
+        path (str): The directory where the pickle file will be saved.
+        filename (str): The name of the pickle file (without the extension).
+
+    Returns:
+        str: The full path to the saved pickle file.
+    """
+    file_path = os.path.join(path, filename + '.pickle')
+    df.to_pickle(file_path)
+    return file_path
+
 
 def write_list_to_txt(input_list: list, path: str, name: str) -> str:
     """
@@ -587,7 +603,7 @@ def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:
 
     return files
 
-def copy_file(source_path: str, destination_path: str, new_filename: str = '') -> str:
+def copy_file(source_path: str, destination_path: str, new_filename: str = None) -> str:
     """
     Copy a file from a source path to a destination path.
 
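
Switching the `new_filename` default from `''` to `None` makes "no rename requested" explicit. A sketch of the branch this enables (the body below is an assumption; the package's actual implementation is not shown in this hunk):

```python
import os
import shutil

def copy_file(source_path: str, destination_path: str, new_filename: str = None) -> str:
    # keep the original basename unless a new filename is explicitly provided
    filename = new_filename if new_filename is not None else os.path.basename(source_path)
    target = os.path.join(destination_path, filename)
    shutil.copy(source_path, target)
    return target
```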
@@ -1319,13 +1335,13 @@ def categorize_percentiles(percentile: float) -> str:
     Categorizes a percentile value into a string representing its range.
 
     Args:
-    - percentile (float): The percentile value (between 0 and 1).
+        percentile (float): The percentile value (between 0 and 1).
 
     Returns:
-    - str: The category of the percentile value.
+        str: The category of the percentile value.
 
     Raises:
-    - ValueError: If the percentile value is outside the range [0, 1].
+        ValueError: If the percentile value is outside the range [0, 1].
     """
     if not (0 <= percentile <= 1):
         raise ValueError("Percentile must be between 0 and 1 inclusive.")
@@ -1349,4 +1365,160 @@ def categorize_percentiles(percentile: float) -> str:
     elif percentile <= 0.9:
         return '80-90%'
     else:
-        return '90-100%'
+        return '90-100%'
+
+
+def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+    """
+    Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        columns_to_combine (list): List of column names to combine.
+        col_date (str): Name of the column containing dates.
+        date_format (str): Format of the dates in col_date.
+        rolling_period (str): Rolling period for grouping.
+        col_id (str): Name of the column containing unique IDs.
+        col_engagement (str): Name of the column containing engagement values.
+
+    Returns:
+        pd.DataFrame: The prepared DataFrame with combinations of columns.
+    """
+    df_wt_combinations = df.copy()
+    df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+
+    # If there are no columns to combine, just use the date for grouping
+    if not columns_to_combine:
+        df_wt_combinations = (df_wt_combinations
+                              .set_index("date")
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+    else:
+        df_wt_combinations = (df_wt_combinations
+                              .set_index(["date"])
+                              .groupby(columns_to_combine)
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+
+    return df_wt_combinations
+
+# def create_combination_index(df : pd.DataFrame, columns : list, date_column : str, rolling_period : str) -> pd.MultiIndex:
+#     """
+#     Create all possible combinations of unique values from specified columns and date range.
+#
+#     Args:
+#         df (pd.DataFrame): The input DataFrame
+#         columns (list): List of column names to create combinations from
+#         date_column (str): Name of the date column
+#         rolling_period (str): Frequency for date range (e.g., '1D', '1W', '1M')
+#
+#     Returns:
+#         pd.MultiIndex: MultiIndex with all combinations
+#     """
+#     # Create a list to store unique values for each column
+#     unique_values = []
+#
+#     # Get unique values for each specified column
+#     for col in columns:
+#         unique_values.append(df[col].unique())
+#
+#     # Create date range
+#     date_range = pd.date_range(start=df[date_column].min(),
+#                                end=df[date_column].max(),
+#                                freq=rolling_period)
+#
+#     # Add date range to the list of unique values
+#     unique_values.append(date_range)
+#
+#     # Create MultiIndex from product of all unique values
+#     all_combinations = pd.MultiIndex.from_product(unique_values,
+#                                                   names=columns + [date_column])
+#
+#     return all_combinations
+
+# def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+#     """
+#     Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+#     Args:
+#         df (pd.DataFrame): The input DataFrame.
+#         columns_to_combine (list): List of column names to combine.
+#         col_date (str): Name of the column containing dates.
+#         date_format (str): Format of the dates in col_date.
+#         rolling_period (str): Rolling period for grouping.
+#         col_id (str): Name of the column containing unique IDs.
+#         col_engagement (str): Name of the column containing engagement values.
+#     Returns:
+#         pd.DataFrame: The prepared DataFrame with combinations of columns.
+#     """
+#     df_wt_combinations = df.copy()
+#     df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+#
+#     # Create all possible combinations of columns indexes
+#     all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+#
+#     # If no columns to combine, just use the date for grouping
+#     if not columns_to_combine:
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index("date")
+#                               .groupby(pd.Grouper(freq=rolling_period))
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .fillna(0)
+#                               .reset_index())
+#     else:
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index(["date"])
+#                               .groupby([*columns_to_combine, pd.Grouper(freq=rolling_period)])
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .reindex(all_combinations, fill_value=0)
+#                               .reset_index())
+#
+#     return df_wt_combinations
+
+def custom_ordering(df : pd.DataFrame, col_to_order : str, custom_order : list) -> pd.DataFrame:
+    """
+    Orders the values in a DataFrame column based on a custom order.
+
+    Args:
+        df (DataFrame): The DataFrame containing the column to be ordered.
+        col_to_order (str): The name of the column to be ordered.
+        custom_order (list): The custom order of values.
+
+    Returns:
+        DataFrame: The DataFrame with the column values ordered according to the custom order.
+    """
+    # keep the ordered categorical dtype so that subsequent sorts respect custom_order
+    df[col_to_order] = pd.Categorical(df[col_to_order], categories=custom_order, ordered=True)
+    return df
+
+def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
+    """
+    Calculates the total and percentage values for the given metrics based on the grouping columns.
+
+    Args:
+        df (DataFrame): The input DataFrame.
+        col_gb (list): Names of the columns to group by.
+        metrics (dict): A dictionary of metrics to calculate.
+
+    Returns:
+        DataFrame: The modified DataFrame with total and percentage values added.
+    """
+    # bind key as a default argument so each lambda keeps its own metric
+    percentage_aggregations = {f'per_{key}': (lambda x, key=key: x[key] / x[f"total_{key}"]) for key in metrics.keys()}
+
+    df = (df.join(df.groupby(col_gb)
+                    .agg(metrics)
+                    .add_prefix("total_"), on=col_gb
+                  )
+            .assign(**percentage_aggregations).fillna(0)
+         )
+
+    return df
+
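
A usage sketch for the new `prepare_data_combinations` helper (data and column names are illustrative): with a grouping column, each (group, period) bucket gets a unique-ID count and an engagement sum.

```python
import pandas as pd

posts = pd.DataFrame({
    "created_at": ["2024-01-01", "2024-01-02", "2024-01-09"],
    "post_id": ["a", "b", "c"],
    "likes": [5, 3, 7],
    "channel": ["x", "x", "y"],
})

weekly = prepare_data_combinations(
    posts,
    columns_to_combine=["channel"],
    col_date="created_at",
    date_format="%Y-%m-%d",
    rolling_period="W",   # weekly buckets
    col_id="post_id",
    col_engagement="likes",
)
# resulting columns: channel, date, post_id (unique count), likes (sum)
```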
@@ -673,19 +673,14 @@ def create_scatter_plot(
     """
     params = general_kwargs()
     params.update(kwargs)
-
     marker_color = params["marker_color"]
     marker_line_color = params["marker_line_color"]
     marker_size = params["marker_size"]
     col_hover = params["col_hover"]
-
     xaxis_range = params["xaxis_range"]
-
     yaxis_range = params["yaxis_range"]
 
-
     fig = go.Figure()
-
     if marker_line_color is None:
         marker_line_color = marker_color
 
@@ -694,40 +689,41 @@ def create_scatter_plot(
     for i, category in enumerate(df[col_category].unique()):
 
         if color_palette:
-            marker_color = color_palette.get(category, generate_random_hexadecimal_color) # Default to black if category not found
+            marker_color = color_palette.get(category, generate_random_hexadecimal_color())  # Default to a random color if the category is not found
         else:
             marker_color = generate_random_hexadecimal_color()
 
         # hovertemplate generation
-        hovertemplate = (
-            "<b>"
-            + col_x
-            + "</b>:"
-            + df[df[col_category] == category][col_x].astype(str)
-            + "<br><b>"
-            + col_y
-            + "</b>:"
-            + df[df[col_category] == category][col_y].astype(str)
-            + "<br><b>"
-            + col_category
-            + "</b>:"
-            + str(category)
-        )
+        # hovertemplate = (
+        #     "<b>"
+        #     + col_x
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_x].astype(str)
+        #     + "<br><b>"
+        #     + col_y
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_y].astype(str)
+        #     + "<br><b>"
+        #     + col_category
+        #     + "</b>:"
+        #     + str(category)
+        # )
+        hovertemplate = ""
         if col_size is None:
             size = marker_size
         else:
             size = df[df[col_category] == category][col_size]
-            hovertemplate += "<br><b>" + col_size + "</b>:" + size.astype(str)
+            # hovertemplate += "<br><b>" + col_size + "</b>:" + size.astype(str)
 
         if len(col_hover) > 0:
             for c in col_hover:
                 hovertemplate += (
                     "<br><b>"
                     + str(c)
-                    + "</b>:"
+                    + "</b> : "
                     + df[df[col_category] == category][c]
+                    .apply(format_input)
                     .astype(str)
-                    .apply(wrap_text)
                 )
 
         fig.add_trace(
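
For context on the hover changes above: Plotly accepts a per-point array for `hovertemplate`, so building it as a pandas Series of strings gives each marker its own hover text. A minimal standalone sketch of that pattern (names are illustrative):

```python
import pandas as pd
import plotly.graph_objects as go

data = pd.DataFrame({"x": [1, 2], "y": [3, 4], "source": ["twitter", "blog"]})

# one hover string per point, mirroring the col_hover loop above
hovertemplate = "<br><b>source</b> : " + data["source"].astype(str)

fig = go.Figure(go.Scatter(x=data["x"], y=data["y"], hovertemplate=hovertemplate))
```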
@@ -2038,8 +2034,6 @@ def bar_trend_per_cat(df: pd.DataFrame,
     params = general_kwargs()
     params.update(kwargs)
 
-    col_hover = params["col_hover"]
-
     xaxis_title = params["xaxis_title"]
     yaxis_title = params["yaxis_title"]
     zaxis_title = params["zaxis_title"]
@@ -2060,7 +2054,7 @@ def bar_trend_per_cat(df: pd.DataFrame,
         current_df = df[df[col_cat] == cat]
 
 
-        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)+"<extra></extra>"
+        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)
         # hovertemplate='<b>Categorie : </b>'+str(cat)+'<br><b>Date : </b>'+ current_df[col_x].astype(str) + '<br><b>'+y1_axis_title+'</b> : '+ current_df[col_metric1].astype(str)+' ('+current_df["per_"+col_metric1].map("{:.1%}".format).astype(str)+')' +'<br><b>'+y2_axis_title+'</b> : '+ current_df[col_metric2].astype(int).astype(str)+' ('+current_df["per_"+col_metric2].map("{:.1%}".format).astype(str)+')'
         for c in col_hover:
             hovertemplate += (
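
Removing the trailing `<extra></extra>` is consistent with the `col_hover` loop that keeps appending fields to `hovertemplate` afterwards: in Plotly, that tag controls the secondary (trace-name) hover box and is normally appended once, after all fields are assembled. A sketch of the assumed intent:

```python
# assemble all hover fields first...
hovertemplate = "<b>x</b> : %{x}"
hovertemplate += "<br><b>extra field</b> : foo"
# ...then hide the secondary hover box once, at the very end
hovertemplate += "<extra></extra>"
```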
@@ -3946,6 +3940,170 @@ def create_radar(df: pd.DataFrame,
     )
     return fig
 
+def bar_subplots_per_cat(df: pd.DataFrame,
+                         col_x: str,
+                         col_y: str,
+                         col_cat: str,
+                         col_stack: str,
+                         color_palette: dict = None,
+                         n_top_words: int = 20,
+                         **kwargs
+                         ) -> go.Figure:
+    """
+    Create subplots of stacked bar charts.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing data for bar charts.
+        col_x (str): Name of the column containing x-axis values.
+        col_y (str): Name of the column containing y-axis values.
+        col_cat (str): Name of the column containing categories.
+        col_stack (str): Name of the column containing stacking values.
+        color_palette (dict, optional): Dictionary mapping stack values to colors. Defaults to None.
+        n_top_words (int, optional): Number of top words to display in each bar chart. Defaults to 20.
+        **kwargs: Additional keyword arguments to update default plotting parameters.
+
+    Returns:
+        go.Figure: Plotly Figure object representing the subplots of stacked bar charts.
+    """
+    params = general_kwargs()
+    params.update(kwargs)
+
+    marker_color = params['marker_color']
+    textposition = params["textposition"]
+    vertical_spacing = params['vertical_spacing']
+    horizontal_spacing = params["horizontal_spacing"]
+    col_hover = params['col_hover']
+    n_cols = params['n_cols']
+    categories = df[col_cat].unique()
+
+    # the user defines the number of columns; we compute the number of rows required
+    n_rows = math.ceil(len(categories) / n_cols)
+
+    # fine-tune the spacing according to the text position provided
+    if textposition == 'inside':
+        horizontal_spacing = (horizontal_spacing / n_cols) / 2
+    else:
+        horizontal_spacing = (horizontal_spacing / n_cols)
+
+    # create subplots
+    fig = make_subplots(
+        rows=n_rows,                                   # number of rows
+        cols=n_cols,                                   # number of columns
+        subplot_titles=list(categories),               # title for each subplot
+        vertical_spacing=vertical_spacing / n_rows,    # space between subplots
+        horizontal_spacing=horizontal_spacing,         # space between subplots
+        shared_xaxes=params["shared_xaxes"],
+        shared_yaxes=params["shared_yaxes"]
+    )
+
+    # create stacked bar traces for each subplot
+    row_id = 0
+    col_id = 0
+    for i, category in enumerate(categories):
+        # define row and column position
+        col_id += 1
+        if i % n_cols == 0:
+            row_id += 1
+        if col_id > n_cols:
+            col_id = 1
+
+        # select data
+        current_df = df[df[col_cat] == category].sort_values(by=col_x, ascending=True)
+        unique_stacks = current_df[col_stack].unique()
+
+        if textposition == 'inside':
+            text = current_df[col_y].head(n_top_words)
+        else:
+            textposition = "auto"
+            text = None
+
+        for stack in unique_stacks:
+            # use the palette color for the stack value or create a random color
+            if color_palette:
+                marker_color = color_palette.get(stack, generate_random_hexadecimal_color())
+            else:
+                marker_color = generate_random_hexadecimal_color()
+
+            stack_df = current_df[current_df[col_stack] == stack]
+            hovertemplate = '<b>'+col_cat+" : "+ stack_df[col_cat].astype(str)+ '</b><br>' + col_stack+" : "+ stack_df[col_stack].astype(str)
+
+            for col in col_hover:
+                # build hover fields from stack_df so they stay aligned with the trace points
+                hovertemplate += '<br><b>' + col + ': ' + stack_df[col].astype(str) + '</b>'
+
+            fig.add_trace(
+                go.Bar(
+                    x=stack_df[col_x].tail(n_top_words),
+                    y=stack_df[col_y].tail(n_top_words),
+                    opacity=params["marker_opacity"],
+                    orientation=params["orientation"],    # horizontal bars
+                    name=stack,                           # trace name for legend
+                    text=text,                            # text to display
+                    textposition=textposition,            # text position
+                    textangle=params["xaxis_tickangle"],  # text angle
+                    marker_color=marker_color,            # bar color
+                    hovertemplate=hovertemplate + "<extra></extra>"  # hover info
+                ),
+                row=row_id,
+                col=col_id
+            )
+
+    for row_id in range(1, n_rows+1):
+        for col_id in range(1, n_cols+1):
+            fig.update_yaxes(title=params["yaxis_title"], row=row_id, col=1)
+            fig.update_xaxes(title=params["xaxis_title"], row=row_id, col=col_id)
+
+    fig.update_layout(
+        margin=dict(l=75, r=75, t=75, b=50),
+        title_text=params["title_text"],
+        width=n_cols * params["width"],                  # plot size
+        height=n_rows * n_top_words * params["height"],  # plot size
+        showlegend=params["showlegend"],
+        font_family=params["font_family"],
+        font_size=params["font_size"],
+        template=params["template"],
+        plot_bgcolor=params["plot_bgcolor"],    # background color (plot)
+        paper_bgcolor=params["paper_bgcolor"],  # background color (around plot)
+        uniformtext_minsize=params["uniformtext_minsize"],
+        barmode=params['barmode']
+    )
+
+    fig.update_yaxes(
+        # title=params["yaxis_title"],
+        title_font_size=params["yaxis_title_font_size"],
+        tickangle=params["yaxis_tickangle"],
+        tickfont_size=params["yaxis_tickfont_size"],
+        range=params["yaxis_range"],
+        showgrid=params["yaxis_showgrid"],
+        showline=params["yaxis_showline"],
+        zeroline=params["yaxis_zeroline"],
+        gridwidth=params["yaxis_gridwidth"],
+        gridcolor=params["yaxis_gridcolor"],
+        linewidth=params["yaxis_linewidth"],
+        linecolor=params["yaxis_linecolor"],
+        mirror=params["yaxis_mirror"],
+        layer="below traces",
+    )
+
+    fig.update_xaxes(
+        # title=params["xaxis_title"],
+        title_font_size=params["xaxis_title_font_size"],
+        tickangle=params["xaxis_tickangle"],
+        tickfont_size=params["xaxis_tickfont_size"],
+        range=params["xaxis_range"],
+        showgrid=params["xaxis_showgrid"],
+        showline=params["xaxis_showline"],
+        zeroline=params["xaxis_zeroline"],
+        gridwidth=params["xaxis_gridwidth"],
+        gridcolor=params["xaxis_gridcolor"],
+        linewidth=params["xaxis_linewidth"],
+        linecolor=params["xaxis_linecolor"],
+        mirror=params["xaxis_mirror"],
+        layer="below traces"
+    )
+    return fig
+
 
 # def bar_subplots(df: pd.DataFrame,
 #                  col_x: str,
 #                  col_y: str,
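
A sketch of how the new `bar_subplots_per_cat` helper might be called (data, palette, and the forwarded `n_cols` kwarg are illustrative; the remaining defaults come from `general_kwargs`):

```python
import pandas as pd

terms = pd.DataFrame({
    "freq": [10, 7, 9, 4],
    "word": ["ai", "data", "ml", "nlp"],
    "topic": ["tech", "tech", "science", "science"],
    "sentiment": ["pos", "neg", "pos", "neg"],
})

fig = bar_subplots_per_cat(
    terms,
    col_x="freq",
    col_y="word",
    col_cat="topic",
    col_stack="sentiment",
    color_palette={"pos": "#2a9d8f", "neg": "#e76f51"},
    n_top_words=10,
    n_cols=2,  # picked up through **kwargs via general_kwargs
)
fig.show()
```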
@@ -58,7 +58,7 @@ def number_of_days(start_date: datetime, end_date: datetime) -> int:
         days_difference (int): The number of days between the start and end dates.
     """
     # Calculate the difference
-    time_difference = start_date - end_date
+    time_difference = end_date - start_date
     # Extract the number of days from the timedelta object
     days_difference = time_difference.days
     return days_difference
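
With the operand order fixed, a start date before the end date now yields a positive count:

```python
from datetime import datetime

start, end = datetime(2024, 1, 1), datetime(2024, 1, 31)
(end - start).days   # 30  (new behaviour)
(start - end).days   # -30 (old behaviour)
```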
@@ -77,3 +77,49 @@ def df_col_to_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
     df[col] = pd.to_datetime(df[col])
     return df
 
+
+# from dateutil import parser
+# from datetime import datetime
+
+# def detect_date_format(date_string):
+#     formats = [
+#         # Date formats
+#         "%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%m-%d-%Y",
+#         "%Y/%m/%d", "%d/%m/%Y", "%Y.%m.%d", "%d.%m.%Y",
+#         "%d %b %Y", "%d %B %Y", "%b %d, %Y", "%B %d, %Y",
+#         "%d-%b-%Y", "%d-%B-%Y", "%b-%d-%Y", "%B-%d-%Y",
+#         # Date and time formats
+#         "%Y-%m-%d %H:%M:%S", "%d-%m-%Y %H:%M:%S", "%m/%d/%Y %H:%M:%S", "%m-%d-%Y %H:%M:%S",
+#         "%Y/%m/%d %H:%M:%S", "%d/%m/%Y %H:%M:%S", "%Y.%m.%d %H:%M:%S", "%d.%m.%Y %H:%M:%S",
+#         "%d %b %Y %H:%M:%S", "%d %B %Y %H:%M:%S", "%b %d, %Y %H:%M:%S", "%B %d, %Y %H:%M:%S",
+#         "%d-%b-%Y %H:%M:%S", "%d-%B-%Y %H:%M:%S", "%b-%d-%Y %H:%M:%S", "%B-%d-%Y %H:%M:%S",
+#         # Time formats with milliseconds
+#         "%Y-%m-%d %H:%M:%S.%f", "%d-%m-%Y %H:%M:%S.%f", "%m/%d/%Y %H:%M:%S.%f", "%m-%d-%Y %H:%M:%S.%f",
+#         "%Y/%m/%d %H:%M:%S.%f", "%d/%m/%Y %H:%M:%S.%f", "%Y.%m.%d %H:%M:%S.%f", "%d.%m.%Y %H:%M:%S.%f",
+#         "%d %b %Y %H:%M:%S.%f", "%d %B %Y %H:%M:%S.%f", "%b %d, %Y %H:%M:%S.%f", "%B %d, %Y %H:%M:%S.%f",
+#         "%d-%b-%Y %H:%M:%S.%f", "%d-%B-%Y %H:%M:%S.%f", "%b-%d-%Y %H:%M:%S.%f", "%B-%d-%Y %H:%M:%S.%f",
+#         # ISO format
+#         "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f",
+#         "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f",
+#         "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z",
+#         # Additional formats
+#         "%y/%m/%d %H:%M:%S", "%d/%m/%y %H:%M:%S", "%y-%m-%d %H:%M:%S", "%d-%m-%y %H:%M:%S",
+#     ]
+
+#     for date_format in formats:
+#         try:
+#             # Try to parse the date string with each format
+#             parsed_date = datetime.strptime(date_string, date_format)
+#             return date_format
+#         except ValueError:
+#             continue
+
+#     return None
+
+# def detect_date_format(date_string):
+#     try:
+#         # Use dateutil parser to parse the date string
+#         parsed_date = parser.parse(date_string, fuzzy=False)
+#         return parsed_date
+#     except ValueError:
+#         return None
@@ -0,0 +1,88 @@
+from gliner import GLiNER
+
+
+def load_gliner_model(model_name : str, map_location="cpu") -> GLiNER:
+    """
+    Load a GLiNER named entity recognition (NER) model.
+
+    Args:
+        model_name: The name of the model to load.
+        map_location: The device to load the model on. Possible values are "cpu" or "cuda".
+
+    Returns:
+        The loaded GLiNER model.
+
+    """
+    model = GLiNER.from_pretrained(model_name, map_location=map_location)
+    return model
+
+def gliner_predict(model : GLiNER, text : str, labels : list, threshold : float = 0.5) -> list:
+    """
+    Predicts entities using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: The text to predict entities from.
+        labels: A list of entity labels to detect.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities.
+
+    """
+    entities = model.predict_entities(text, labels, threshold=threshold)
+    return entities
+
+def gliner_batch_predict(model : GLiNER, text : list, labels : list, threshold : float = 0.5) -> list:
+    """
+    Batch inference. Predicts entities using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: A list of texts to predict entities from.
+        labels: A list of entity labels to detect.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities, one list per input text.
+
+    """
+    entities = model.batch_predict_entities(text, labels, threshold=threshold)
+    return entities
+
+
+def parse_predictions(predictions : list) -> tuple:
+    """
+    Parse the predictions generated by a GLiNER named entity recognition (NER) model for batch processing.
+
+    Args:
+        predictions (list): A list of lists of dictionaries representing the predictions. Each dictionary contains the following keys:
+            - "start" (int): The starting index of the predicted entity in the input text.
+            - "end" (int): The ending index of the predicted entity in the input text.
+            - "text" (str): The predicted entity text.
+            - "label" (str): The predicted entity label.
+            - "score" (float): The confidence score of the prediction.
+
+    Returns:
+        tuple: A tuple containing lists of the extracted information from the predictions:
+            - starts (list): A list of lists, where each inner list contains the starting indices of the predicted entities.
+            - ends (list): A list of lists, where each inner list contains the ending indices of the predicted entities.
+            - texts (list): A list of lists, where each inner list contains the predicted entity texts.
+            - labels (list): A list of lists, where each inner list contains the predicted entity labels.
+            - scores (list): A list of lists, where each inner list contains the confidence scores of the predictions.
+    """
+    starts, ends, texts, labels, scores = [], [], [], [], []
+    for prediction in predictions:
+        start, end, text, label, score = [], [], [], [], []
+        for item in prediction:
+            start.append(item.get("start"))
+            end.append(item.get("end"))
+            text.append(item.get("text"))
+            label.append(item.get("label"))
+            score.append(item.get("score"))
+        starts.append(start)
+        ends.append(end)
+        texts.append(text)
+        labels.append(label)
+        scores.append(score)
+    return starts, ends, texts, labels, scores
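
An end-to-end sketch of the new GLiNER helpers (the checkpoint name and labels are illustrative assumptions):

```python
texts = ["Emmanuel Macron visited Berlin.", "OpenAI released a new model."]
labels = ["person", "location", "organization"]

model = load_gliner_model("urchade/gliner_multi-v2.1")  # checkpoint name is an assumption
predictions = gliner_batch_predict(model, texts, labels, threshold=0.5)
starts, ends, entity_texts, entity_labels, scores = parse_predictions(predictions)

# entity_texts[0]  -> e.g. ['Emmanuel Macron', 'Berlin']
# entity_labels[0] -> e.g. ['person', 'location']
```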