opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +81 -0
- opsci_toolbox/helpers/common.py +39 -14
- opsci_toolbox/helpers/dataviz.py +134 -29
- opsci_toolbox/helpers/nlp.py +22 -13
- opsci_toolbox/helpers/nlp_cuml.py +171 -0
- opsci_toolbox/helpers/surreaction.py +114 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/METADATA +8 -11
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/RECORD +10 -8
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/rapidapi_helpers.py
CHANGED

@@ -5,6 +5,7 @@ from tqdm import tqdm
 import re
 from datetime import datetime,timedelta
 from opsci_toolbox.helpers.dates import str_to_datetime
+from opsci_toolbox.helpers.common import write_jsonl
 
 def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "start_date", col_publishedBefore = "end_date", date_format = '%Y-%m-%d', rolling_days = 7 ):
     datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
@@ -278,6 +279,31 @@ def parse_tweet(json_data):
     df = pd.DataFrame.from_records(all_records, columns = all_cols)
     return df
 
+def parse_twitter_list_details(json_data):
+    """
+    Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154
+    """
+    list_id = json_data.get("list_id", "")
+    list_id_str = json_data.get("list_id_str", "")
+    member_count = json_data.get("member_count", 0)
+    name = json_data.get("name", "")
+    suscriber_count = json_data.get("subscriber_count", 0)
+    creation_date = json_data.get("creation_date", 0)
+    mode = json_data.get("mode", "0")
+
+    user_record = parse_user(json_data.get("user", {}))
+    record = (list_id, list_id_str, member_count, name, suscriber_count, creation_date, mode) + user_record
+    cols = ["list_id", "list_id_str", "member_count", "name", "suscriber_count", "creation_date", "mode", "user_creation_date", "user_id", "user_username", "user_name", "user_follower_count", "user_following_count", "user_favourites_count", "user_is_private", "user_is_verified", "user_is_blue_verified", "user_location", "user_profile_pic_url", "user_profile_banner_url", "user_description", "user_external_url", "user_number_of_tweets", "user_bot", "user_timestamp", "user_has_nft_avatar", "user_category", "user_default_profile", "user_default_profile_image", "user_listed_count", "user_verified_type"]
+
+    df = pd.DataFrame.from_records(record, cols)
+    return df
+
+######################################################################################
+# function to parse Instagram data
+# https://rapidapi.com/JoTucker/api/instagram-scraper2
+# https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
+######################################################################################
+
 def instagram_parse_hashtag_data(hashtag_data):
     hashtag_id = hashtag_data.get("id")
     hashtag_name = hashtag_data.get("name")
@@ -324,3 +350,58 @@ def instagram_parse_hashtag_data(hashtag_data):
     return df
 
 
+######################################################################################
+# function to parse Twitter data
+# https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
+######################################################################################
+def compile_list_entries(json_data, path_json, filename):
+    """
+    Function to return next cursor and list details from https://twttrapi.p.rapidapi.com/list-members
+    """
+    results = []
+    entries = json_data.get('data', {}).get('list', {}).get('timeline_response', {}).get("timeline", {}).get("instructions", [{}])[-1].get('entries',[])
+    if len(entries)>0:
+        for entry in entries:
+            content = entry.get("content")
+            if (content.get("__typename") == "TimelineTimelineCursor") & (content.get("cursorType") =="Bottom"):
+                next_cursor = content.get("value", None)
+                if next_cursor:
+                    if next_cursor.split('|')[0]=="0":
+                        next_cursor = None
+            if content.get("__typename") != "TimelineTimelineCursor":
+                legacy = content.get("content", {}).get('userResult', {}).get("result", {}).get("legacy", {})
+                results.append(legacy)
+
+    write_jsonl(results, path_json, filename)
+    return results, next_cursor
+
+
+def parse_list_entries(jsonl_data):
+    """
+    Function to parse list details from https://twttrapi.p.rapidapi.com/list-members
+    """
+    all_records=[]
+    for data in jsonl_data:
+        id_str = data.get("id_str","")
+        name = data.get("name","")
+        screen_name = data.get("screen_name", "")
+        created_at = data.get("created_at")
+        description = data.get("description")
+        statuses_count = data.get("statuses_count", 0)
+        followers_count = data.get("followers_count",0)
+        friends_count = data.get("friends_count",0)
+        favourites_count = data.get("favourites_count",0)
+        media_count = data.get("media_count",0)
+        protected = data.get("protected", False)
+        verified = data.get("verified", False)
+        verified_type = data.get("verified_type", "")
+        entities = data.get("entities")
+        urls = [url.get("expanded_url","") for url in entities.get('url', {}).get("urls",[])]
+        user_mentions = [um.get("screen_name","") for um in entities.get('description', {}).get('user_mentions', [])]
+        user_mentions_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('user_mentions', [])]
+        hashtags = [um.get("text","") for um in entities.get('description', {}).get('hashtags', [])]
+        hashtags_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('hashtags', [])]
+        record = (id_str, name, screen_name, created_at, description, statuses_count, followers_count, friends_count, favourites_count, media_count, protected, verified, verified_type, urls, user_mentions, user_mentions_indices, hashtags, hashtags_indices)
+        all_records.append(record)
+    df = pd.DataFrame.from_records(all_records, columns = ["id_str", "name", "screen_name", "created_at", "description", "statuses_count", "followers_count", "friends_count", "favourites_count", "media_count", "protected", "verified", "verified_type", "urls", "user_mentions", "user_mentions_indices", "hashtags", "hashtags_indices"])
+    return df
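A minimal usage sketch of the new twttrapi helpers (assuming the wheel is installed; `fetch_page` is a hypothetical stand-in for your own HTTP call to the list-members endpoint, and the .jsonl path layout is assumed):

from opsci_toolbox.apis.rapidapi_helpers import compile_list_entries, parse_list_entries
from opsci_toolbox.helpers.common import read_jsonl

cursor = None
while True:
    json_data = fetch_page(cursor)  # hypothetical wrapper around https://twttrapi.p.rapidapi.com/list-members
    members, cursor = compile_list_entries(json_data, "data/json", "list_members")  # writes a .jsonl, returns next cursor
    if not cursor:  # compile_list_entries resets the cursor to None on the last page
        break

df = parse_list_entries(read_jsonl("data/json/list_members.jsonl"))  # output path of write_jsonl assumed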
opsci_toolbox/helpers/common.py
CHANGED
@@ -50,14 +50,26 @@ def load_parquet(path):
         print(e)
     return df
 
-def load_pickle(path: str):
+# def load_pickle(path: str):
+#     """
+#     Load a pickle file into a dataframe
+#     """
+
+#     with open(path, 'rb') as f:
+#         df=pickle.load(f)
+#     return df
+def load_pickle(path):
+    return pd.read_pickle(path)
+
+def write_pickle(data, path, filename):
     """
-    Load a pickle file into a dataframe
+    Write a dataframe into a pickle file
     """
-
-    with open(path, 'rb') as f:
-        df=pickle.load(f)
-    return df
+    file_path=os.path.join(path, filename+'.pickle')
+    with open(file_path, 'wb') as f:
+        pickle.dump(data, f)
+    return file_path
+
 
 def load_json(path: str):
     """
@@ -164,15 +176,15 @@ def read_jsonl(path: str):
 #########################################################################################
 
 
-def write_pickle(df: pd.DataFrame, path: str, name: str):
-    """
-    Write a dataframe into a pickle file
-    """
-    file_path=os.path.join(path, name+'.pickle')
+# def write_pickle(df: pd.DataFrame, path: str, name: str):
+#     """
+#     Write a dataframe into a pickle file
+#     """
+#     file_path=os.path.join(path, name+'.pickle')
 
-    with open(file_path, 'wb') as f:
-        pickle.dump(df, f)
-    return file_path
+#     with open(file_path, 'wb') as f:
+#         pickle.dump(df, f)
+#     return file_path
 
 
 def write_list_to_txt(input_list: list, path: str, name: str):
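A quick round-trip sketch of the reworked pickle helpers (paths are illustrative):

import pandas as pd
from opsci_toolbox.helpers.common import write_pickle, load_pickle

df = pd.DataFrame({"a": [1, 2, 3]})
file_path = write_pickle(df, "data", "sample")  # writes data/sample.pickle and returns the path
assert load_pickle(file_path).equals(df)        # load_pickle is now a thin wrapper around pd.read_pickle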
@@ -842,3 +854,16 @@ def top_rows_per_category(df, col_to_sort, col_to_gb, cols_to_keep, top_rows) :
             .reset_index(drop=True)
             )[cols_to_keep]
     return df_gb
+
+def format_number(number):
+    """
+    Function to format a number in K, M or B
+    """
+    if number < 1000:
+        return str(number)
+    elif number < 1000000:
+        return f"{number / 1000:.1f}K"
+    elif number < 1000000000:
+        return f"{number / 1000000:.1f}M"
+    else:
+        return f"{number / 1000000000:.1f}B"
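The new format_number helper rounds to one decimal and picks a K/M/B suffix; for example:

from opsci_toolbox.helpers.common import format_number

print(format_number(950))            # '950'
print(format_number(12_500))         # '12.5K'
print(format_number(3_400_000))      # '3.4M'
print(format_number(7_200_000_000))  # '7.2B'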
opsci_toolbox/helpers/dataviz.py
CHANGED
@@ -282,7 +282,7 @@ def get_convex_hull_coord(points: np.array, interpolate_curve: bool = True) -> t
 
     # return fig
 
-def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color, col_size, col_text, title="Scatter Plot", x_axis_label="X-axis", y_axis_label="Y-axis", width=1000, height=1000, xaxis_range=None, yaxis_range=None,
+def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color, col_size, col_text, col_legend = [], title="Scatter Plot", x_axis_label="X-axis", y_axis_label="Y-axis", width=1000, height=1000, xaxis_range=None, yaxis_range=None,
                         size_value =4, opacity=0.8, maxdisplayed=0, mode = "markers", textposition="bottom center", plot_bgcolor=None, paper_bgcolor=None, yaxis_showgrid = False, xaxis_showgrid = False, color="indianred", line_width=0.5, line_color="white", colorscale='Viridis', showscale=True, template="plotly"):
     """
     Create a scatter plot :
@@ -327,8 +327,9 @@ def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color
             size = df[df[col_category] == category][col_size]
             hovertemplate += '<br><b>'+col_size+'</b>:'+size.astype(str)
 
-        if
-
+        if len(col_legend)>0:
+            for c in col_legend:
+                hovertemplate +='<br><b>'+str(c)+'</b>:'+ df[df[col_category]==category][c].astype(str).apply(wrap_text)
 
         fig.add_trace(
             go.Scatter(
@@ -365,13 +366,16 @@ def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color
     else :
         if color is None:
             color = generate_random_hexadecimal_color()
-        if
-
+        if len(col_legend)>0:
+            for c in col_legend:
+                hovertemplate +='<br><b>'+str(c)+'</b>:'+ df[c].astype(str).apply(wrap_text)
 
         fig = go.Figure( go.Scatter(
             x=df[col_x],
             y=df[col_y],
             mode=mode,
+            text = df[col_text],
+            textposition=textposition,
             marker=dict(color=color, #dots color
                         size=size, #dots size
                         opacity=opacity, #dots opacity
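With the new col_legend parameter, extra columns can be appended to the hover template. A sketch with illustrative column names (the dataframe is hypothetical):

fig = create_scatter_plot(
    df, col_x="x", col_y="y", col_category="topic",
    color_palette={"A": "#1f77b4", "B": "#ff7f0e"},
    col_color=None, col_size="engagement", col_text="title",
    col_legend=["author", "created_time"],  # each listed column is wrapped and added to the hover text
)
fig.show()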
@@ -582,7 +586,8 @@ def scatter3D(df, col_x, col_y, col_z, col_category, color_palette, col_size, co
 
     return fig
 
-def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
+
+def fig_bar_trend(df, col_x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
     """
     Display a graph that combine bar and trend chart to compare 2 metrics :
     - x = x axis data
@@ -597,42 +602,43 @@ def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1"
     - opacity = opacity of bars
     """
 
-    nk = np.empty(shape=(len(x), 3, 1), dtype="object")
-    nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
-    nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
-    nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
+    # nk = np.empty(shape=(len(x), 3, 1), dtype="object")
+    # nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
+    # nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
+    # nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
 
     fig = make_subplots(specs=[[{"secondary_y": True}]])
 
     fig.add_trace(
         go.Scatter(
-            x=x,
-            y=trend_measure,
+            x=df[col_x].apply(wrap_text),
+            y=df[trend_measure],
             name=trend_name,
             mode='lines',
             line_color=line_color,
             line_width=4,
             textfont=dict(size=8),
-            customdata=nk,
-            hovertemplate=("<br>"+x_name+"
+            # customdata=nk,
+            hovertemplate=("<br>"+x_name+" :"+df[col_x].astype(str)+"<br>"+bar_name+" - "+df[bar_measure].astype(str)+"<br>"+trend_name+" : "+df[trend_measure].astype(str)+"<extra></extra>"),
         ),
         secondary_y=True,
     )
     # Add traces
     fig.add_trace(
         go.Bar(
-            x=x,
-            y = bar_measure,
+            x=df[col_x].apply(wrap_text),
+            y = df[bar_measure],
             name=bar_name,
             marker_color=marker_color,
             opacity=opacity,
-            hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+            # customdata=nk,
+            hovertemplate=("<br>"+x_name+" :"+df[col_x].astype(str)+"<br>"+bar_name+" - "+df[bar_measure].astype(str)+"<br>"+trend_name+" : "+df[trend_measure].astype(str)+"<extra></extra>"),
         ),
         secondary_y=False,
 
     )
-    first_axis_range=[-0.5,bar_measure.max()*1.01]
-    secondary_axis_range=[-0.5,trend_measure.max()*1.01]
+    first_axis_range=[-0.5,df[bar_measure].max()*1.01]
+    secondary_axis_range=[-0.5,df[trend_measure].max()*1.01]
 
     # Add figure title
     fig.update_layout(
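The signature change means fig_bar_trend now takes the dataframe plus column names rather than raw series. A sketch with illustrative data:

import pandas as pd

df = pd.DataFrame({
    "week": ["W1", "W2", "W3"],
    "posts": [120, 90, 150],       # bar metric
    "resonance": [3.2, 4.1, 2.7],  # trend metric
})
fig = fig_bar_trend(df, col_x="week", bar_measure="posts", trend_measure="resonance",
                    x_name="Week", bar_name="Posts", trend_name="Resonance")
fig.show()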
@@ -668,6 +674,92 @@ def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1"
     return fig
 
 
+# def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
+#     """
+#     Display a graph that combine bar and trend chart to compare 2 metrics :
+#     - x = x axis data
+#     - bar_measure = data represented as bar diagram
+#     - trend_measure = data represented as trend line
+#     - x_name / bar_name / trend_name : axis labels
+#     - marker_color = color code for bars
+#     - line_color = color code for trend line
+#     - title_text = graph title
+#     - width / height = size of plot
+#     - xaxis_tickangle = angle for x ticks
+#     - opacity = opacity of bars
+#     """
+
+#     nk = np.empty(shape=(len(x), 3, 1), dtype="object")
+#     nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
+#     nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
+#     nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
+
+#     fig = make_subplots(specs=[[{"secondary_y": True}]])
+
+#     fig.add_trace(
+#         go.Scatter(
+#             x=x,
+#             y=trend_measure,
+#             name=trend_name,
+#             mode='lines',
+#             line_color=line_color,
+#             line_width=4,
+#             textfont=dict(size=8),
+#             customdata=nk,
+#             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+#         ),
+#         secondary_y=True,
+#     )
+#     # Add traces
+#     fig.add_trace(
+#         go.Bar(
+#             x=x,
+#             y = bar_measure,
+#             name=bar_name,
+#             marker_color=marker_color,
+#             opacity=opacity,
+#             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+#         ),
+#         secondary_y=False,
+
+#     )
+#     first_axis_range=[-0.5,bar_measure.max()*1.01]
+#     secondary_axis_range=[-0.5,trend_measure.max()*1.01]
+
+#     # Add figure title
+#     fig.update_layout(
+
+#         title_text=title_text,
+#         showlegend=True,
+#         width = width,
+#         height= height,
+#         xaxis_tickangle=xaxis_tickangle,
+#         xaxis_showline=False,
+#         xaxis_showgrid=False,
+#         yaxis_showline=False,
+#         yaxis_showgrid=False,
+#         font_family="Segoe UI Semibold",
+#         template=template,
+#         plot_bgcolor=plot_bgcolor, #background color (plot)
+#         paper_bgcolor=paper_bgcolor, #background color (around plot)
+#         margin=dict(
+#             t=width / 15,
+#             b=width / 20,
+#             r=width / 20,
+#             l=width / 20,
+#         ),
+#     )
+
+#     # # Set x-axis title
+#     fig.update_xaxes(title_text=x_name)
+
+#     # Set y-axes titles
+#     fig.update_yaxes(title_text=bar_name, range = first_axis_range, secondary_y=False)
+#     fig.update_yaxes(title_text=trend_name, range = secondary_axis_range, secondary_y=True)
+
+#     return fig
+
+
 def density_map(df_posts,
                 df_dots,
                 df_topics,
@@ -947,16 +1039,16 @@ def bar_subplots(df, col_x, col_y, col_cat, color_palette, n_cols=4, n_top_words
 
     # fine tune parameter according to the text position provided
     if textposition == 'inside':
-        horizontal_spacing = (horizontal_spacing /
+        horizontal_spacing = (horizontal_spacing / n_cols)/2
     else:
-        horizontal_spacing = (horizontal_spacing /
+        horizontal_spacing = (horizontal_spacing / n_cols)
 
     # create subplots
     fig = make_subplots(
         rows = n_rows, # number of rows
         cols = n_cols, # number of columns
         subplot_titles = list(categories), # title for each subplot
-        vertical_spacing = vertical_spacing /
+        vertical_spacing = vertical_spacing / n_rows, # space between subplots
         horizontal_spacing = horizontal_spacing # space between subplots
     )
 
@@ -1040,8 +1132,6 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
 
     # user define a number of columns, we compute the number of rows requires
     n_rows = math.ceil(len(categories) / n_cols)
-
-    horizontal_spacing = (horizontal_spacing / n_rows)
 
     specs = [[{'type':'domain'}] * n_cols] * n_rows
     # create subplots
@@ -1049,8 +1139,8 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
         rows=n_rows,
         cols=n_cols,
         subplot_titles=list(categories),
-        horizontal_spacing=horizontal_spacing,
-        vertical_spacing=vertical_spacing,
+        horizontal_spacing=horizontal_spacing / n_cols,
+        vertical_spacing=vertical_spacing / n_rows,
         specs=specs
     )
 
@@ -1103,7 +1193,7 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
     return fig
 
 
-def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color, title_text = "Sentiment per topic", width=1200, height=1200, xaxis_tickangle=0, horizontal_spacing = 0
+def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color, title_text = "Sentiment per topic", width=1200, height=1200, xaxis_tickangle=0, horizontal_spacing = 0, vertical_spacing = 0.08, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
 
     categories = df[col_cat].unique()
 
@@ -1112,8 +1202,8 @@ def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color
         rows = 1, # number of rows
         cols = 2, # number of columns
         # subplot_titles = list(categories), # title for each subplot
-        vertical_spacing = vertical_spacing
-        horizontal_spacing =
+        vertical_spacing = vertical_spacing, # space between subplots
+        horizontal_spacing = horizontal_spacing / n_cols # space between subplots
     )
 
     for cat in categories:
@@ -1688,4 +1778,19 @@ def add_shape(fig, shape_type = "rect", x0= -1, y0= -1, x1 = 0, y1=0, fillcolor=
 
         }
     )
+    return fig
+
+def add_image(fig, xref = "paper", yref = "paper", x = 0, y=0, sizex = 0.08, sizey=0.08, xanchor="right", yanchor="bottom", source = "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNDc1IiBoZWlnaHQ9IjM4OCIgdmlld0JveD0iMCAwIDQ3NSAzODgiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxwYXRoIGQ9Ik0xMDUuNzI3IDI5My4zOTFDMTA1LjcyNyAyNjYuNzc0IDg0LjEyOTMgMjQ1LjE3NyA1Ny42MDEzIDI0NS4xNzdDMzAuOTg0IDI0NS4xNzcgOS4yOTYgMjY2Ljc3NCA5LjI5NiAyOTMuMzkxQzkuMjk2IDMyMC4wMDkgMzAuOTg0IDM0MS42MDcgNTcuNjAxMyAzNDEuNjA3Qzg0LjEyOTMgMzQxLjYwNyAxMDUuNzI3IDMyMC4wMDkgMTA1LjcyNyAyOTMuMzkxWk0wLjg3MDY2NyAyOTMuMzkxQzAuODcwNjY3IDI2Mi4yMDMgMjYuMzI0IDIzNi43NTMgNTcuNjAxMyAyMzYuNzUzQzg4LjY5ODcgMjM2Ljc1MyAxMTQuMTUxIDI2Mi4yMDMgMTE0LjE1MSAyOTMuMzkxQzExNC4xNTEgMzI0LjU3OSA4OC42OTg3IDM1MC4wMyA1Ny42MDEzIDM1MC4wM0MyNi4zMjQgMzUwLjAzIDAuODcwNjY3IDMyNC41NzkgMC44NzA2NjcgMjkzLjM5MVoiIGZpbGw9ImJsYWNrIi8+CjxwYXRoIGQ9Ik0yMzIuNTMxIDI5My40ODFDMjMyLjUzMSAyNjMuNjM3IDIwOS4zMTkgMjQ1LjI2NSAxODYuMjg2IDI0NS4yNjVDMTY2LjU3IDI0NS4yNjUgMTQ3LjQ4MiAyNTguNjIgMTQ1LjI0MSAyODAuMDM4VjMwNi42NTZDMTQ3LjM5MyAzMjguOTcgMTY2LjM5MSAzNDEuNjk2IDE4Ni4yODYgMzQxLjY5NkMyMDkuMzE5IDM0MS42OTYgMjMyLjUzMSAzMjMuMzI1IDIzMi41MzEgMjkzLjQ4MVpNMjQwLjg2NiAyOTMuNDgxQzI0MC44NjYgMzI4LjA3NCAyMTQuNjk3IDM1MC4xMiAxODcuMTgzIDM1MC4xMkMxNjkuOTc3IDM1MC4xMiAxNTMuNTc1IDM0Mi4zMjQgMTQ1LjI0MSAzMjcuNjI1VjM4Ny40OTNIMTM2Ljk5N1YyMzkuNjJIMTQ0Ljg4M0wxNDUuMjQxIDI1Ny41NDRWMjYwLjE0MkMxNTMuNjY2IDI0NS42MjQgMTcwLjE1NSAyMzYuODQyIDE4Ny4yNzMgMjM2Ljg0MkMyMTQuNjA3IDIzNi44NDIgMjQwLjg2NiAyNTguODg4IDI0MC44NjYgMjkzLjQ4MVoiIGZpbGw9ImJsYWNrIi8+CjxwYXRoIGQ9Ik0yNTUuNjQyIDMyOC40MzNMMjYwLjc1MSAzMjIuNzg4QzI2OC4xMDEgMzM1LjUxMyAyODEuMDk1IDM0MS45NjUgMjk0LjE3OCAzNDEuOTY1QzMwOC41MTggMzQxLjk2NSAzMjMuMTI2IDMzMy42MyAzMjMuMTI2IDMxOS41NjFDMzIzLjEyNiAzMDUuNDkgMzA0LjkzNCAyOTkuNjY1IDI4OS43ODcgMjkzLjc0OUMyODAuMzc4IDI4OS45ODYgMjYwLjc1MSAyODMuMzUzIDI2MC43NTEgMjY0LjYyNEMyNjAuNzUxIDI0OS41NjggMjc0LjI4MyAyMzYuNjYyIDI5NC4yNjkgMjM2LjY2MkMzMDkuODYyIDIzNi42NjIgMzIzLjEyNiAyNDUuMzU0IDMyNy41MTggMjU2LjM3OEwzMjEuNjAzIDI2MS4wMzhDMzE2LjMxNSAyNDkuODM3IDMwNC4yMTcgMjQ0LjkwNiAyOTQuMDAxIDI0NC45MDZDMjc5LjEyMiAyNDQuOTA2IDI2OS4xNzQgMjU0LjEzNyAyNjkuMTc0IDI2NC4yNjVDMjY5LjE3NCAyNzcuNDQgMjg0LjIzMSAyODIuOTA1IDI5OS4xMDkgMjg4LjU1MkMzMTEuMDI3IDI5My4yMTIgMzMxLjU1MSAzMDAuNjUgMzMxLjU1MSAzMTkuMDIyQzMzMS41NTEgMzM4LjExMiAzMTMuMjY5IDM1MC4yMSAyOTQuMDAxIDM1MC4yMUMyNzYuNzAzIDM1MC4yMSAyNjEuODI3IDM0MC40NDIgMjU1LjY0MiAzMjguNDMzWiIgZmlsbD0iYmxhY2siLz4KPHBhdGggZD0iTTM0Ni43OCAyOTMuMzkxQzM0Ni43OCAyNTguNTMgMzc1LjAxMSAyMzYuMDM0IDQwMy4yNDEgMjM2LjAzNEM0MTUuNzg4IDIzNi4wMzQgNDMwLjMwNyAyNDAuNTE3IDQzOS45ODUgMjQ4LjU4Mkw0MzUuMzI1IDI1NS40ODJDNDI4Ljc4MyAyNDkuMjk5IDQxNS41MiAyNDQuNDU5IDQwMy4zMzEgMjQ0LjQ1OUMzNzkuMTMzIDI0NC40NTkgMzU1LjIwNCAyNjMuNDU5IDM1NS4yMDQgMjkzLjM5MUMzNTUuMjA0IDMyMy41OTMgMzc5LjQwMyAzNDIuMzIzIDQwMy4yNDEgMzQyLjMyM0M0MTUuNjA4IDM0Mi4zMjMgNDI5LjIzMSAzMzcuMTI2IDQzNi4yMjEgMzMwLjQ5NEw0NDEuMzI5IDMzNy4xMjZDNDMxLjQ3MiAzNDYuMTc4IDQxNi40MTYgMzUwLjc0OSA0MDMuNDIgMzUwLjc0OUMzNzUuMSAzNTAuNzQ5IDM0Ni43OCAzMjguNDMzIDM0Ni43OCAyOTMuMzkxWiIgZmlsbD0iYmxhY2siLz4KPHBhdGggZD0iTTQ2My42MzcgMjM5LjYxOUg0NzIuMDYxVjM0Ny4xNjNINDYzLjYzN1YyMzkuNjE5Wk00NjEuMTI4IDIxMi40NjRDNDYxLjEyOCAyMDguNzAxIDQ2NC4wODUgMjA1Ljc0MyA0NjcuODQ5IDIwNS43NDNDNDcxLjUyNCAyMDUuNzQzIDQ3NC41NzEgMjA4LjcwMSA0NzQuNTcxIDIxMi40NjRDNDc0LjU3MSAyMTYuMjI4IDQ3MS41MjQgMjE5LjE4NSA0NjcuODQ5IDIxOS4xODVDNDY0LjA4NSAyMTkuMTg1IDQ2MS4xMjggMjE2LjIyOCA0NjEuMTI4IDIxMi40NjRaIiBmaWxsPSJibGFjayIvPgo8cGF0aCBkPSJNMjE3Ljg1MyAzMS4zOTE0TDIzNy43MjEgNTEuMjU4TDI1Ny41ODggMzEuMzkxNEwyMzcuNzIxIDExLjUyNDdMMjE3Ljg1MyAzMS4zOTE0Wk0yMzcuNzIxIDYyLjU3MjdMMjA2LjU0IDMxLjM5MTRMMjM3LjcyMSAwLjIwMDE2TDI2OC45MDEgMzEuMzkxNEwyMzcuNzIxIDYyLjU3MjdaTTE1NC4xMDEgNTkuNzU5NEwxNjEuNDM5IDg2Ljk2NDdMMTg4LjY2IDc5LjYyMkwxODEuMzIzIDUyLjU5NTRMMTU0LjEwMSA1OS43NTk0Wk0xNTUuNzk3IDk2Ljc3MTRMMTQ0LjI4IDU0LjA3MTRMMTg2Ljk2MyA0Mi44Mzk0TDE5OC40ODEgODUuMjU4TDE1NS43OTcgOTYuNzcxNFpNMjg2Ljc4MSA3OS42MjJMMzE0LjAwMyA4Ni45NjQ3TDMyMS4zNDEgNTkuNzU5NEwyOTQuMTIgNTIuNTk1NEwyODYuNzgxIDc5LjYyMlpNMzE5LjY0MyA5Ni43NzE0TDI3Ni45NjEgODUuMjU4TDI4OC40NzkgNDIuODM5NEwzMzEuMTYyIDU0LjA3MTRMMzE5LjY0MyA5Ni43NzE0Wk0xNTQuMTAxIDE1Ni4xNjlMMTgxLjMyMyAxNjMuMzMzTDE4OC42NiAxMzYuMzA3TDE2MS40MzkgMTI4Ljk2NUwxNTQuMTAxIDE1Ni4xNjlaTTE4Ni45NjMgMTczLjA4OUwxNDQuMjggMTYxLjg1N0wxNTUuNzk3IDExOS4xNTdMMTk4LjQ4MSAxMzAuNjdMMTg2Ljk2MyAxNzMuMDg5Wk0yODYuNzc1IDEzNi4zMDlMMjk0LjEyIDE2My41MzdMMzIxLjM0OCAxNTYuMTkzTDMxNC4wMDMgMTI4Ljk2NUwyODYuNzc1IDEzNi4zMDlaTTI4OC40NzkgMTczLjM0NUwyNzYuOTY3IDEzMC42NjlMMzE5LjY0MyAxMTkuMTU3TDMzMS4xNTUgMTYxLjgzNEwyODguNDc5IDE3My4zNDVaTTIxNy44NTMgMTg0LjUzN0wyMzcuNzIxIDIwNC40MDVMMjU3LjU4OCAxODQuNTM3TDIzNy43MjEgMTY0LjY3TDIxNy44NTMgMTg0LjUzN1pNMjM3LjcyMSAyMTUuNzE4TDIwNi41NCAxODQuNTM3TDIzNy43MjEgMTUzLjM1N0wyNjguOTAxIDE4NC41MzdMMjM3LjcyMSAyMTUuNzE4WiIgZmlsbD0iYmxhY2siLz4KPC9zdmc+Cg=="):
+    fig.add_layout_image(
+        dict(
+            source=source,
+            xref=xref,
+            yref=yref,
+            x=x, y=y,
+            sizex=sizex,
+            sizey=sizey,
+            xanchor=xanchor,
+            yanchor=yanchor
+        )
+    )
     return fig
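A short sketch of the new add_image helper (the default source is the base64-encoded SVG logo shown above; any image URL or data URI should work):

import plotly.graph_objects as go
from opsci_toolbox.helpers.dataviz import add_image

fig = go.Figure(go.Scatter(x=[0, 1], y=[0, 1]))
fig = add_image(fig, x=1, y=0, sizex=0.1, sizey=0.1)  # paper coordinates; anchored bottom-right by default
fig.show()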
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -613,6 +613,10 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
     will be included in the spaCy pipeline.
 
     """
+    if torch.cuda.is_available():
+
+        spacy.prefer_gpu()
+
     if len(disable_components)>0:
         nlp = spacy.load(model, disable=disable_components)
     else:
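In practice the change is transparent to callers; the loader simply prefers the GPU when one is visible (the model name is illustrative):

from opsci_toolbox.helpers.nlp import load_spacy_model

nlp = load_spacy_model("fr_core_news_lg")  # calls spacy.prefer_gpu() first when torch sees a CUDA device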
@@ -1345,18 +1349,23 @@ def df_transform_column_as_list(column):
 def check_gpu():
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     return device
-
-def
-
-
-
-
-
-
+
+def HF_load_model(model_checkpoint):
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+    if torch.cuda.is_available():
+        model.cuda()
+    return model, tokenizer
+
+def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
+    """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
     file_path= os.path.join(dir_json , str(filename)+'.json')
     if not os.path.exists(file_path):
-
-
-
-
-
+        with torch.no_grad():
+            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
+            proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]
+        label = model.config.id2label[proba.argmax()]
+        results = {"label":label, "score" : float(proba.max()), col_text : text}
+        print(results)
+        write_json(results, dir_json , str(filename))
+        return results
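A minimal sketch of the new Hugging Face helpers (the checkpoint is illustrative; any sequence-classification checkpoint should work):

from opsci_toolbox.helpers.nlp import HF_load_model, HF_sentiment_classifier

model, tokenizer = HF_load_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
results = HF_sentiment_classifier(tokenizer, model, "great product!", col_text="text",
                                  filename="post_001", dir_json="data/json")
# results -> {"label": ..., "score": ..., "text": "great product!"}, also written to data/json/post_001.json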
opsci_toolbox/helpers/nlp_cuml.py
ADDED

@@ -0,0 +1,171 @@
+from cuml import UMAP
+from cuml.cluster.hdbscan import HDBSCAN, all_points_membership_vectors, approximate_predict, membership_vector
+import numpy as np
+from tqdm import tqdm
+import os
+from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle
+
+def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dist = 0.0, metric = "cosine", spread = 1.0):
+    reducer = UMAP(n_neighbors=n_neighbors,
+                   n_components=n_components,
+                   min_dist=min_dist,
+                   metric=metric,
+                   spread = spread).fit(embeddings)
+
+    reduced_embeddings = reducer.transform(embeddings)
+    return reducer, reduced_embeddings
+
+def transform_with_cuml_UMAP(reducer, new_embeddings):
+    """
+    Transform new data points using a UMAP object
+    """
+    reduced_embeddings = reducer.transform(new_embeddings)
+    return reduced_embeddings
+
+
+def hdbscan_cuml_clustering(embeddings, min_cluster_size=5, min_samples=None, max_cluster_size = 0, metric='euclidean', alpha=1.0, p=2, cluster_selection_epsilon=0.0, cluster_selection_method='eom',
+                            approx_min_span_tree=True, gen_min_span_tree = False, gen_condensed_tree = False, gen_single_linkage_tree_ = False, prediction_data=True):
+
+    """
+    Parameters:
+        embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+            The input data to be clustered.
+        min_cluster_size : int, optional
+            The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
+        min_samples : int or None, optional
+            The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If 'None', it defaults to the min_cluster_size.
+        max_cluster_size : int, optional (default=0)
+            A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
+            Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+        metric : str or callable, optional
+            The metric to use for distance computation. Default is 'euclidean'.
+        alpha : float, optional
+            distance scaling parameter as used in robust single linkage.
+        p : int, optional
+            The Minkowski p-norm distance metric parameter. Default is None.
+        cluster_selection_epsilon : float, optional
+            A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+        cluster_selection_method : {'eom', 'leaf'}, optional
+            The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
+        approx_min_span_tree : bool, optional
+            Whether to compute an approximation of the minimum spanning tree. Default is True.
+        gen_min_span_tree : bool, optional
+            Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed
+        gen_condensed_tree : bool, optional
+            Whether to populate the condensed_tree_ member for utilizing plotting tools.
+        gen_single_linkage_tree_ : bool
+            Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
+        prediction_data : bool, optional
+            Whether the data is prediction data or not. Default is True.
+
+    Returns:
+        clusterer : hdbscan.hdbscan_.HDBSCAN
+            HDBSCAN clusterer object.
+        labels : array, shape (n_samples,)
+            Cluster labels for each point. Noisy samples are given the label -1.
+        probabilities : array, shape (n_samples,)
+            The probability of each sample being an outlier.
+
+    Description:
+        This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
+        It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
+        probability of each sample being an outlier.
+    """
+    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
+                        min_samples=min_samples,
+                        max_cluster_size = max_cluster_size,
+                        metric=metric,
+                        alpha=alpha,
+                        p=p,
+                        cluster_selection_epsilon=cluster_selection_epsilon,
+                        cluster_selection_method=cluster_selection_method,
+                        approx_min_span_tree=approx_min_span_tree,
+                        gen_min_span_tree = gen_min_span_tree,
+                        gen_condensed_tree = gen_condensed_tree,
+                        gen_single_linkage_tree_ = gen_single_linkage_tree_,
+                        prediction_data=prediction_data)
+
+    clusterer.fit_predict(embeddings)
+
+    return clusterer, clusterer.labels_, clusterer.probabilities_
+
+def transform_with_cuml_HDBSCAN(clusterer, new_embeddings):
+    """
+    Transform new data points using a HDBSCAN object
+    """
+    new_data_topic, new_data_proba = approximate_predict(clusterer, new_embeddings)
+    return new_data_topic, new_data_proba
+
+
+def cuml_soft_clustering(clusterer):
+    """
+    HDBSCAN SOFT CLUSTERING
+    """
+    soft_clusters = all_points_membership_vectors(clusterer)
+    soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
+    soft_clusters_proba = [np.max(x) for x in soft_clusters]
+    return soft_clusters_val, soft_clusters_proba
+
+
+def soft_cuml_clustering_new_data(clusterer, embeddings):
+    """
+    PREDICT NEW DATA POINTS HDBSCAN SOFT CLUSTERING
+    """
+    soft_clusters =membership_vector(clusterer, embeddings)
+    soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
+    soft_clusters_proba = [np.max(x) for x in soft_clusters]
+    return soft_clusters_val, soft_clusters_proba
+
+def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, reencode = False):
+
+    new_file_paths=[]
+    for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
+
+        filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
+        new_filename = filename+"_reduce_embeddings.pickle"
+        new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
+
+        if not os.path.exists(new_file_path) or reencode:
+            df = load_pickle(file_path)
+            create_dir(path_reduced_embeddings_id)
+            # embeddings = df["embeddings"].to_list()
+            embeddings = np.vstack(df['embeddings'].values)
+            reduced_embeddings = transform_with_cuml_UMAP(reducer, embeddings)
+            reduced_embeddings_transformed=[list(e) for e in reduced_embeddings]
+            df['reduced_embeddings'] = reduced_embeddings_transformed
+            df.drop(columns=["embeddings"], inplace=True)
+            print(path_reduced_embeddings_id, filename+"_reduce_embeddings")
+            write_pickle(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
+            new_file_paths.append(new_file_path)
+        else:
+            print("REDUCED EMBEDDINGS ALREADY EXISTS", file_path)
+            new_file_paths.append(new_file_path)
+    return new_file_paths
+
+
+
+def process_HDBSCAN(clusterer, reduced_embeddings_paths, path_predictions_dataset_id, run_soft_clustering= False, reencode = False):
+    new_file_paths=[]
+    for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
+
+        filename = os.path.splitext(os.path.basename(file_path))[0][:-18]
+        new_filename = filename+ "_predictions.pickle"
+        new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
+        if not os.path.exists(new_file_path) or reencode:
+            df = load_pickle(file_path)
+            # reduced_embeddings = df["reduced_embeddings"].to_list()
+            reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+            topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)
+            df["topic"]=topics.astype(int).astype(str)
+            df["proba"]=probas
+            if run_soft_clustering:
+                soft_clusters, soft_proba = soft_cuml_clustering_new_data(clusterer, np.array(reduced_embeddings))
+                df["soft_topic"]=soft_clusters
+                df["soft_proba"]=soft_proba
+
+            write_pickle(df, path_predictions_dataset_id, filename+ "_predictions")
+            new_file_paths.append(new_file_path)
+        else:
+            print("CLUSTERING ALREADY EXISTS", file_path)
+            new_file_paths.append(new_file_path)
+    return new_file_paths
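A pipeline sketch for the new module (requires a RAPIDS/cuML environment with a CUDA GPU; the random embeddings are illustrative):

import numpy as np
from opsci_toolbox.helpers.nlp_cuml import (
    reduce_with_cuml_UMAP, transform_with_cuml_UMAP,
    hdbscan_cuml_clustering, transform_with_cuml_HDBSCAN,
)

embeddings = np.random.rand(1000, 384).astype(np.float32)
reducer, reduced = reduce_with_cuml_UMAP(embeddings, n_components=5)
clusterer, labels, probas = hdbscan_cuml_clustering(reduced, min_cluster_size=10)

# project and label new points with the fitted models
new_reduced = transform_with_cuml_UMAP(reducer, np.random.rand(10, 384).astype(np.float32))
new_topics, new_probas = transform_with_cuml_HDBSCAN(clusterer, new_reduced)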
opsci_toolbox/helpers/surreaction.py
ADDED

@@ -0,0 +1,114 @@
+import pandas as pd
+from tqdm import tqdm
+
+def generate_index(df, col_author_id ='author_id', col_date='created_time'):
+    """
+    Generates an index based on user_id and date
+    """
+    res=[]
+    for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="generation des index"):
+        new_index=".".join([ str(i) for i in [ row[col_author_id], row[col_date].year, row[col_date].month, row[col_date].day]])
+        res.append(new_index)
+    df["index"]=res
+
+    return df
+
+def avg_performance(df,
+                    col_date='created_time',
+                    col_author_id='author_id',
+                    col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
+                                    'sad', 'angry','total_engagement', 'replies', 'percentage_replies'],
+                    rolling_period='7D'):
+
+    """
+    Function to compute average performance on a rolling period for a list of metrics
+    """
+
+    # Clean-up, just in case
+    df[col_date] = pd.to_datetime(df[col_date])
+    df = df.sort_values([col_author_id, col_date])
+
+    # The pivot is the created_time column, so we set it as the index.
+    # Then we group by author_id, keeping the value columns.
+    # We apply a rolling mean over the window; pandas automatically uses the index (created_time) as the pivot.
+    # Then we flatten everything back.
+    average = df.set_index(col_date).groupby(col_author_id)[col_engagement].rolling(rolling_period).mean(numeric_only=True).reset_index()
+
+    # From the previous result, we simplify down to one day / author_id pair per row
+    average = average.set_index(col_date).groupby([col_author_id]).resample('1D').last(numeric_only=True).reset_index()
+
+    # We generate our composite indexes
+    df=generate_index(df, col_author_id =col_author_id, col_date=col_date)
+
+    average = generate_index(average, col_author_id = col_author_id, col_date=col_date)
+
+    # We merge
+    df = pd.merge(df, average[['index']+col_engagement], how='left', on=['index'], suffixes=('', '_avg'))
+
+    return df
+
+def kpi_reaction(df, cols):
+    """
+    This function takes a dataframe and a list of columns as input.
+    For each column, we compute the over-reaction rate.
+    """
+    for col in cols:
+        df['tx_'+col]=(df[col]-df[col+'_avg'])/(df[col]+df[col+'_avg'])
+    return df
+
+def get_reactions_type(df, cols, col_dest):
+    """
+    Conditional function to return the reaction type based on a list of metrics
+    """
+    all_val=[]
+
+    for i,row in tqdm(df.iterrows(), total=df.shape[0], desc="qualification des posts"):
+
+        str_val=''
+        count=0
+        for col in cols:
+            if row[col]>0:
+                str_val=str_val+' '+col.replace('tx_', 'sur-')
+                count=count+1
+        if count==0:
+            str_val="sous reaction"
+        if count==len(cols):
+            str_val="sur reaction totale"
+
+        all_val.append(str_val.strip())
+
+    df[col_dest]=all_val
+    return df
+
+def compute_surreaction(df, col_date, col_author_id, cols_sureaction_metrics, cols_typologie_sureaction, rolling_period_sureaction = '7D'):
+    """
+    Helpers to compute surreaction and return a dataframe with reaction rates and typology
+
+    """
+    # temporarily silence warnings
+    pd.options.mode.chained_assignment = None  # default='warn'
+    # compute the average performance for a list of metrics
+    df= avg_performance(
+        df,
+        col_date=col_date,
+        col_author_id=col_author_id,
+        col_engagement= cols_sureaction_metrics,
+        rolling_period=rolling_period_sureaction
+    )
+
+    # compute the over-reaction rates for our list of metrics
+    df=kpi_reaction(df, cols_sureaction_metrics)
+    cols_tx_engagement=['tx_'+c for c in cols_sureaction_metrics]
+    df[cols_tx_engagement]=df[cols_tx_engagement].fillna(-1)
+
+    # drop the columns holding the average performance (we shouldn't need them any more)
+    cols_to_drop = [c for c in df.columns if c.lower()[-4:] == '_avg']
+    df.drop(columns=cols_to_drop, inplace=True)
+
+    # categorise the reaction types
+    cols_typologie = ["tx_"+ col for col in cols_typologie_sureaction]
+    df=get_reactions_type(df, cols_typologie, 'type_engagement')
+
+    # re-enable warnings
+    pd.options.mode.chained_assignment = 'warn'  # default='warn'
+    return df
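A small end-to-end sketch of compute_surreaction (columns and values are illustrative; the listed metrics must exist in the dataframe):

import pandas as pd
from opsci_toolbox.helpers.surreaction import compute_surreaction

df = pd.DataFrame({
    "author_id": ["a", "a", "b"],
    "created_time": pd.to_datetime(["2024-01-01", "2024-01-05", "2024-01-02"]),
    "likes": [10, 50, 3],
    "shares": [1, 12, 0],
})
df = compute_surreaction(df, col_date="created_time", col_author_id="author_id",
                         cols_sureaction_metrics=["likes", "shares"],
                         cols_typologie_sureaction=["likes", "shares"])
# adds tx_likes / tx_shares over-reaction rates plus a type_engagement label per post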
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/METADATA
CHANGED

@@ -1,44 +1,41 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.2
+Version: 0.0.5
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
 Author-email: erwan@opsci.ai
 License: MIT
 Platform: UNKNOWN
-Requires-Dist: Pillow (
-Requires-Dist: Pillow (>=9.0.1)
+Requires-Dist: Pillow (<11.0.0,>=9.0.1)
 Requires-Dist: Requests (==2.31.0)
 Requires-Dist: beautifulsoup4 (==4.10.0)
 Requires-Dist: chart-studio (==1.1.0)
 Requires-Dist: eldar (==0.0.8)
 Requires-Dist: emoji (==2.10.1)
 Requires-Dist: google-api-python-client (==2.122.0)
-Requires-Dist: gspread (==6.1.
+Requires-Dist: gspread (==6.1.1)
 Requires-Dist: hdbscan (==0.8.33)
 Requires-Dist: jusText (==3.0.0)
-Requires-Dist: langchain (==0.1.
-Requires-Dist: matplotlib (==3.8.3)
+Requires-Dist: langchain (==0.1.20)
 Requires-Dist: matplotlib (>=3.5.1)
 Requires-Dist: networkx (==3.2.1)
 Requires-Dist: nltk (==3.8.1)
-Requires-Dist: numpy (
-Requires-Dist: numpy (>=1.21.5)
+Requires-Dist: numpy (<1.25.0,>=1.21.5)
 Requires-Dist: opencv-python-headless (==4.9.0.80)
 Requires-Dist: pandas (==1.5.3)
 Requires-Dist: plotly (==5.19.0)
-Requires-Dist: protobuf
+Requires-Dist: protobuf (==5.26.1)
 Requires-Dist: pyarrow (==14.0.2)
 Requires-Dist: python-louvain (==0.16)
 Requires-Dist: scikit-learn (==1.4.1.post1)
-Requires-Dist: scipy
+Requires-Dist: scipy (<2.0.0,>=1.8.0)
 Requires-Dist: sentence-transformers (==2.5.1)
 Requires-Dist: setuptools (==59.6.0)
 Requires-Dist: spacy (==3.7.4)
 Requires-Dist: spacy-language-detection (==0.2.1)
 Requires-Dist: spacymoji (==3.1.0)
-Requires-Dist: supervision (==0.
+Requires-Dist: supervision (==0.20.0)
 Requires-Dist: textacy (==0.13.0)
 Requires-Dist: torch (==2.0.1)
 Requires-Dist: tqdm (==4.66.2)
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/RECORD
CHANGED

@@ -1,19 +1,21 @@
 opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/rapidapi_helpers.py,sha256=
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=5QbF6ehsmmdTrzp7Q8cF5wrf4DmO91v8YexbybczyHA,23183
 opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
 opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=
+opsci_toolbox/helpers/common.py,sha256=41EsQ2pTwQYnUUM1ggwaPueFVj2Qcm_UG7o_Zj41FU8,26152
 opsci_toolbox/helpers/cv.py,sha256=z0HecreIi-vqiOGpDa4VVnHIX_rvkObngrqwTwkWT44,12403
-opsci_toolbox/helpers/dataviz.py,sha256=
+opsci_toolbox/helpers/dataviz.py,sha256=4wFi0wCMgvIEQEL8okiVJOWxz-eJq5cZ7svHoBbZjnk,77393
 opsci_toolbox/helpers/dates.py,sha256=yQm9pUQAeLTFNPcgeumhi8oErustQJhaoL_HqxSxhiA,996
-opsci_toolbox/helpers/nlp.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=LGW8CIjrkQvGLKEnxYu7RNrBNViQ5dUygK67EhkBHZo,57999
+opsci_toolbox/helpers/nlp_cuml.py,sha256=Mkbtl9ewbv3aa9rFvhH9VOM5Y0G-XIsXtR_6IeYpebY,9450
 opsci_toolbox/helpers/sna.py,sha256=D6nwgUgbuApXGpT2zoIMip8262hynEwfppVdvaZ4Qm0,8053
+opsci_toolbox/helpers/surreaction.py,sha256=k5hcZZlXnJ-zczRpwfwthggEgFCr9lQsHHKVOPlm7fc,4606
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.5.dist-info/METADATA,sha256=Nhp2oK-KXD4JVivU37-T_MsN-VJfbPtJsWlUq7Kp5-A,1566
+opsci_toolbox-0.0.5.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+opsci_toolbox-0.0.5.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.5.dist-info/RECORD,,
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/WHEEL
File without changes

{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/top_level.txt
File without changes