opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ from tqdm import tqdm
  import re
  from datetime import datetime,timedelta
  from opsci_toolbox.helpers.dates import str_to_datetime
+ from opsci_toolbox.helpers.common import write_jsonl
 
  def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "start_date", col_publishedBefore = "end_date", date_format = '%Y-%m-%d', rolling_days = 7 ):
      datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
@@ -278,6 +279,31 @@ def parse_tweet(json_data):
      df = pd.DataFrame.from_records(all_records, columns = all_cols)
      return df
 
+ def parse_twitter_list_details(json_data):
+     """
+     Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154
+     """
+     list_id = json_data.get("list_id", "")
+     list_id_str = json_data.get("list_id_str", "")
+     member_count = json_data.get("member_count", 0)
+     name = json_data.get("name", "")
+     suscriber_count = json_data.get("subscriber_count", 0)
+     creation_date = json_data.get("creation_date", 0)
+     mode = json_data.get("mode", "0")
+
+     user_record = parse_user(json_data.get("user", {}))
+     record = (list_id, list_id_str, member_count, name, suscriber_count, creation_date, mode) + user_record
+     cols = ["list_id", "list_id_str", "member_count", "name", "suscriber_count", "creation_date", "mode", "user_creation_date", "user_id", "user_username", "user_name", "user_follower_count", "user_following_count", "user_favourites_count", "user_is_private", "user_is_verified", "user_is_blue_verified", "user_location", "user_profile_pic_url", "user_profile_banner_url", "user_description", "user_external_url", "user_number_of_tweets", "user_bot", "user_timestamp", "user_has_nft_avatar", "user_category", "user_default_profile", "user_default_profile_image", "user_listed_count", "user_verified_type"]
+
+     df = pd.DataFrame.from_records([record], columns = cols)
+     return df
+
+ ######################################################################################
+ # function to parse Instagram data
+ # https://rapidapi.com/JoTucker/api/instagram-scraper2
+ # https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
+ ######################################################################################
+
  def instagram_parse_hashtag_data(hashtag_data):
      hashtag_id = hashtag_data.get("id")
      hashtag_name = hashtag_data.get("name")
@@ -324,3 +350,58 @@ def instagram_parse_hashtag_data(hashtag_data):
      return df
 
 
+ ######################################################################################
+ # function to parse Twitter data
+ # https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
+ ######################################################################################
+ def compile_list_entries(json_data, path_json, filename):
+     """
+     Function to return next cursor and list details from https://twttrapi.p.rapidapi.com/list-members
+     """
+     results = []
+     next_cursor = None
+     entries = json_data.get('data', {}).get('list', {}).get('timeline_response', {}).get("timeline", {}).get("instructions", [{}])[-1].get('entries', [])
+     if len(entries) > 0:
+         for entry in entries:
+             content = entry.get("content")
+             if (content.get("__typename") == "TimelineTimelineCursor") and (content.get("cursorType") == "Bottom"):
+                 next_cursor = content.get("value", None)
+                 if next_cursor:
+                     if next_cursor.split('|')[0] == "0":
+                         next_cursor = None
+             if content.get("__typename") != "TimelineTimelineCursor":
+                 legacy = content.get("content", {}).get('userResult', {}).get("result", {}).get("legacy", {})
+                 results.append(legacy)
+
+     write_jsonl(results, path_json, filename)
+     return results, next_cursor
+
+
+ def parse_list_entries(jsonl_data):
+     """
+     Function to parse list details from https://twttrapi.p.rapidapi.com/list-members
+     """
+     all_records = []
+     for data in jsonl_data:
+         id_str = data.get("id_str", "")
+         name = data.get("name", "")
+         screen_name = data.get("screen_name", "")
+         created_at = data.get("created_at")
+         description = data.get("description")
+         statuses_count = data.get("statuses_count", 0)
+         followers_count = data.get("followers_count", 0)
+         friends_count = data.get("friends_count", 0)
+         favourites_count = data.get("favourites_count", 0)
+         media_count = data.get("media_count", 0)
+         protected = data.get("protected", False)
+         verified = data.get("verified", False)
+         verified_type = data.get("verified_type", "")
+         entities = data.get("entities", {})
+         urls = [url.get("expanded_url", "") for url in entities.get('url', {}).get("urls", [])]
+         user_mentions = [um.get("screen_name", "") for um in entities.get('description', {}).get('user_mentions', [])]
+         user_mentions_indices = [um.get("indices", []) for um in entities.get('description', {}).get('user_mentions', [])]
+         hashtags = [um.get("text", "") for um in entities.get('description', {}).get('hashtags', [])]
+         hashtags_indices = [um.get("indices", []) for um in entities.get('description', {}).get('hashtags', [])]
+         record = (id_str, name, screen_name, created_at, description, statuses_count, followers_count, friends_count, favourites_count, media_count, protected, verified, verified_type, urls, user_mentions, user_mentions_indices, hashtags, hashtags_indices)
+         all_records.append(record)
+     df = pd.DataFrame.from_records(all_records, columns = ["id_str", "name", "screen_name", "created_at", "description", "statuses_count", "followers_count", "friends_count", "favourites_count", "media_count", "protected", "verified", "verified_type", "urls", "user_mentions", "user_mentions_indices", "hashtags", "hashtags_indices"])
+     return df
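
Illustrative usage sketch (not part of the package diff): paginating through the twttrapi list-members endpoint with compile_list_entries, then flattening the accumulated records with parse_list_entries. The endpoint URL, query parameters, header names and output directory below are assumptions, not values taken from the package.

import requests

url = "https://twttrapi.p.rapidapi.com/list-members"                       # assumed endpoint
headers = {"X-RapidAPI-Key": "YOUR_KEY", "X-RapidAPI-Host": "twttrapi.p.rapidapi.com"}
params = {"list_id": "123456"}                                             # hypothetical list id

all_members, cursor, page = [], None, 0
while True:
    if cursor:
        params["cursor"] = cursor                                          # assumed pagination parameter
    payload = requests.get(url, headers=headers, params=params).json()
    members, cursor = compile_list_entries(payload, "data/json", f"list_members_{page}")
    all_members.extend(members)
    page += 1
    if cursor is None:
        break

df_members = parse_list_entries(all_members)   # one row per user "legacy" record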
@@ -50,14 +50,26 @@ def load_parquet(path):
      print(e)
      return df
 
- def load_pickle(path: str):
+ # def load_pickle(path: str):
+ #     """
+ #     Load a pickle file into a dataframe
+ #     """
+
+ #     with open(path, 'rb') as f:
+ #         df=pickle.load(f)
+ #     return df
+ def load_pickle(path):
+     return pd.read_pickle(path)
+
+ def write_pickle(data, path, filename):
      """
-     Load a pickle file into a dataframe
+     Write a dataframe into a pickle file
      """
-
-     with open(path, 'rb') as f:
-         df=pickle.load(f)
-     return df
+     file_path=os.path.join(path, filename+'.pickle')
+     with open(file_path, 'wb') as f:
+         pickle.dump(data, f)
+     return file_path
+
 
 
  def load_json(path: str):
      """
@@ -164,15 +176,15 @@ def read_jsonl(path: str):
  #########################################################################################
 
 
- def write_pickle(df: pd.DataFrame, path: str, name: str):
-     """
-     Write a dataframe into a pickle file
-     """
-     file_path=os.path.join(path, name+'.pickle')
+ # def write_pickle(df: pd.DataFrame, path: str, name: str):
+ #     """
+ #     Write a dataframe into a pickle file
+ #     """
+ #     file_path=os.path.join(path, name+'.pickle')
 
-     with open(file_path, 'wb') as f:
-         pickle.dump(df, f)
-     return file_path
+     # with open(file_path, 'wb') as f:
+     #     pickle.dump(df, f)
+     # return file_path
 
 
  def write_list_to_txt(input_list: list, path: str, name: str):
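
A minimal round-trip sketch for the reworked pickle helpers (illustrative only; the paths are hypothetical and the target directory is assumed to exist): write_pickle now accepts any picklable object and returns the file path it wrote, while load_pickle delegates to pd.read_pickle.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
file_path = write_pickle(df, "data/pickles", "sample")   # -> data/pickles/sample.pickle
df_back = load_pickle(file_path)                         # same dataframe read back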
@@ -842,3 +854,16 @@ def top_rows_per_category(df, col_to_sort, col_to_gb, cols_to_keep, top_rows) :
          .reset_index(drop=True)
      )[cols_to_keep]
      return df_gb
+
+ def format_number(number):
+     """
+     Function to format a number in K, M or B
+     """
+     if number < 1000:
+         return str(number)
+     elif number < 1000000:
+         return f"{number / 1000:.1f}K"
+     elif number < 1000000000:
+         return f"{number / 1000000:.1f}M"
+     else:
+         return f"{number / 1000000000:.1f}B"
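
A few example calls of format_number for reference (values chosen arbitrarily):

format_number(950)            # '950'
format_number(12_300)         # '12.3K'
format_number(4_500_000)      # '4.5M'
format_number(7_200_000_000)  # '7.2B'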
@@ -282,7 +282,7 @@ def get_convex_hull_coord(points: np.array, interpolate_curve: bool = True) -> t
 
  # return fig
 
- def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color, col_size, col_text, title="Scatter Plot", x_axis_label="X-axis", y_axis_label="Y-axis", width=1000, height=1000, xaxis_range=None, yaxis_range=None,
+ def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color, col_size, col_text, col_legend = [], title="Scatter Plot", x_axis_label="X-axis", y_axis_label="Y-axis", width=1000, height=1000, xaxis_range=None, yaxis_range=None,
      size_value =4, opacity=0.8, maxdisplayed=0, mode = "markers", textposition="bottom center", plot_bgcolor=None, paper_bgcolor=None, yaxis_showgrid = False, xaxis_showgrid = False, color="indianred", line_width=0.5, line_color="white", colorscale='Viridis', showscale=True, template="plotly"):
      """
      Create a scatter plot :
@@ -327,8 +327,9 @@ def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color
              size = df[df[col_category] == category][col_size]
              hovertemplate += '<br><b>'+col_size+'</b>:'+size.astype(str)
 
-         if col_text is not None:
-             hovertemplate +='<br><b>'+col_text+'</b>:'+ df[df[col_category]==category][col_text].apply(wrap_text)
+         if len(col_legend)>0:
+             for c in col_legend:
+                 hovertemplate +='<br><b>'+str(c)+'</b>:'+ df[df[col_category]==category][c].astype(str).apply(wrap_text)
 
          fig.add_trace(
              go.Scatter(
@@ -365,13 +366,16 @@ def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color
      else :
          if color is None:
              color = generate_random_hexadecimal_color()
-         if col_text is not None:
-             hovertemplate +='<br><b>'+col_text+'</b>:'+ df[col_text].apply(wrap_text)
+         if len(col_legend)>0:
+             for c in col_legend:
+                 hovertemplate +='<br><b>'+str(c)+'</b>:'+ df[c].astype(str).apply(wrap_text)
 
          fig = go.Figure( go.Scatter(
              x=df[col_x],
              y=df[col_y],
              mode=mode,
+             text = df[col_text],
+             textposition=textposition,
              marker=dict(color=color, #dots color
                  size=size, #dots size
                  opacity=opacity, #dots opacity
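
Illustrative call (not from the package) showing the new col_legend argument, which now drives the hover template instead of col_text; the dataframe, column names and the None handling for col_color are assumptions.

import pandas as pd

df = pd.DataFrame({
    "x": [0.1, 0.5, 0.9],
    "y": [1.0, 0.2, 0.7],
    "topic": ["A", "B", "A"],
    "engagement": [10, 40, 25],
    "title": ["post 1", "post 2", "post 3"],
})
palette = {"A": "#1f77b4", "B": "#ff7f0e"}
fig = create_scatter_plot(df, col_x="x", col_y="y", col_category="topic", color_palette=palette,
                          col_color=None, col_size="engagement", col_text="title",
                          col_legend=["title", "engagement"])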
@@ -582,7 +586,8 @@ def scatter3D(df, col_x, col_y, col_z, col_category, color_palette, col_size, co
 
      return fig
 
- def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
+
+ def fig_bar_trend(df, col_x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
      """
      Display a graph that combine bar and trend chart to compare 2 metrics :
      - x = x axis data
@@ -597,42 +602,43 @@ def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1"
      - opacity = opacity of bars
      """
 
-     nk = np.empty(shape=(len(x), 3, 1), dtype="object")
-     nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
-     nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
-     nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
+     # nk = np.empty(shape=(len(x), 3, 1), dtype="object")
+     # nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
+     # nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
+     # nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
 
      fig = make_subplots(specs=[[{"secondary_y": True}]])
 
      fig.add_trace(
          go.Scatter(
-             x=x,
-             y=trend_measure,
+             x=df[col_x].apply(wrap_text),
+             y=df[trend_measure],
              name=trend_name,
              mode='lines',
              line_color=line_color,
              line_width=4,
              textfont=dict(size=8),
-             customdata=nk,
-             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+             # customdata=nk,
+             hovertemplate=("<br>"+x_name+" :"+df[col_x].astype(str)+"<br>"+bar_name+" - "+df[bar_measure].astype(str)+"<br>"+trend_name+" : "+df[trend_measure].astype(str)+"<extra></extra>"),
          ),
          secondary_y=True,
      )
      # Add traces
      fig.add_trace(
          go.Bar(
-             x=x,
-             y = bar_measure,
+             x=df[col_x].apply(wrap_text),
+             y = df[bar_measure],
              name=bar_name,
              marker_color=marker_color,
              opacity=opacity,
-             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+             # customdata=nk,
+             hovertemplate=("<br>"+x_name+" :"+df[col_x].astype(str)+"<br>"+bar_name+" - "+df[bar_measure].astype(str)+"<br>"+trend_name+" : "+df[trend_measure].astype(str)+"<extra></extra>"),
          ),
          secondary_y=False,
 
      )
-     first_axis_range=[-0.5,bar_measure.max()*1.01]
-     secondary_axis_range=[-0.5,trend_measure.max()*1.01]
+     first_axis_range=[-0.5,df[bar_measure].max()*1.01]
+     secondary_axis_range=[-0.5,df[trend_measure].max()*1.01]
 
      # Add figure title
      fig.update_layout(
@@ -668,6 +674,92 @@ def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1"
      return fig
 
 
+ # def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
+ #     """
+ #     Display a graph that combine bar and trend chart to compare 2 metrics :
+ #     - x = x axis data
+ #     - bar_measure = data represented as bar diagram
+ #     - trend_measure = data represented as trend line
+ #     - x_name / bar_name / trend_name : axis labels
+ #     - marker_color = color code for bars
+ #     - line_color = color code for trend line
+ #     - title_text = graph title
+ #     - width / height = size of plot
+ #     - xaxis_tickangle = angle for x ticks
+ #     - opacity = opacity of bars
+ #     """
+
+ #     nk = np.empty(shape=(len(x), 3, 1), dtype="object")
+ #     nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
+ #     nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
+ #     nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
+
+ #     fig = make_subplots(specs=[[{"secondary_y": True}]])
+
+ #     fig.add_trace(
+ #         go.Scatter(
+ #             x=x,
+ #             y=trend_measure,
+ #             name=trend_name,
+ #             mode='lines',
+ #             line_color=line_color,
+ #             line_width=4,
+ #             textfont=dict(size=8),
+ #             customdata=nk,
+ #             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+ #         ),
+ #         secondary_y=True,
+ #     )
+ #     # Add traces
+ #     fig.add_trace(
+ #         go.Bar(
+ #             x=x,
+ #             y = bar_measure,
+ #             name=bar_name,
+ #             marker_color=marker_color,
+ #             opacity=opacity,
+ #             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+ #         ),
+ #         secondary_y=False,
+
+ #     )
+ #     first_axis_range=[-0.5,bar_measure.max()*1.01]
+ #     secondary_axis_range=[-0.5,trend_measure.max()*1.01]
+
+ #     # Add figure title
+ #     fig.update_layout(
+
+ #         title_text=title_text,
+ #         showlegend=True,
+ #         width = width,
+ #         height= height,
+ #         xaxis_tickangle=xaxis_tickangle,
+ #         xaxis_showline=False,
+ #         xaxis_showgrid=False,
+ #         yaxis_showline=False,
+ #         yaxis_showgrid=False,
+ #         font_family="Segoe UI Semibold",
+ #         template=template,
+ #         plot_bgcolor=plot_bgcolor, #background color (plot)
+ #         paper_bgcolor=paper_bgcolor, #background color (around plot)
+ #         margin=dict(
+ #             t=width / 15,
+ #             b=width / 20,
+ #             r=width / 20,
+ #             l=width / 20,
+ #         ),
+ #     )
+
+ #     # # Set x-axis title
+ #     fig.update_xaxes(title_text=x_name)
+
+ #     # Set y-axes titles
+ #     fig.update_yaxes(title_text=bar_name, range = first_axis_range, secondary_y=False)
+ #     fig.update_yaxes(title_text=trend_name, range = secondary_axis_range, secondary_y=True)
+
+ #     return fig
+
+
  def density_map(df_posts,
      df_dots,
      df_topics,
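
A sketch of calling the rewritten fig_bar_trend, which now takes a dataframe plus column names instead of separate series (the dataframe and column names below are hypothetical):

import pandas as pd

df_kpi = pd.DataFrame({
    "week": ["W1", "W2", "W3"],
    "posts": [120, 90, 150],
    "engagements": [3400, 2100, 5200],
})
fig = fig_bar_trend(df_kpi, col_x="week", bar_measure="posts", trend_measure="engagements",
                    x_name="Week", bar_name="Posts", trend_name="Engagements")
fig.show()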
@@ -947,16 +1039,16 @@ def bar_subplots(df, col_x, col_y, col_cat, color_palette, n_cols=4, n_top_words
 
      # fine tune parameter according to the text position provided
      if textposition == 'inside':
-         horizontal_spacing = (horizontal_spacing / n_rows)/2
+         horizontal_spacing = (horizontal_spacing / n_cols)/2
      else:
-         horizontal_spacing = (horizontal_spacing / n_rows)
+         horizontal_spacing = (horizontal_spacing / n_cols)
 
      # create subplots
      fig = make_subplots(
          rows = n_rows, # number of rows
          cols = n_cols, # number of columns
          subplot_titles = list(categories), # title for each subplot
-         vertical_spacing = vertical_spacing / n_cols, # space between subplots
+         vertical_spacing = vertical_spacing / n_rows, # space between subplots
          horizontal_spacing = horizontal_spacing # space between subplots
      )
 
@@ -1040,8 +1132,6 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
 
      # user define a number of columns, we compute the number of rows requires
      n_rows = math.ceil(len(categories) / n_cols)
-
-     horizontal_spacing = (horizontal_spacing / n_rows)
 
      specs = [[{'type':'domain'}] * n_cols] * n_rows
      # create subplots
@@ -1049,8 +1139,8 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
          rows=n_rows,
          cols=n_cols,
          subplot_titles=list(categories),
-         horizontal_spacing=horizontal_spacing,
-         vertical_spacing=vertical_spacing,
+         horizontal_spacing=horizontal_spacing / n_cols,
+         vertical_spacing=vertical_spacing / n_rows,
          specs=specs
      )
 
@@ -1103,7 +1193,7 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
      return fig
 
 
- def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color, title_text = "Sentiment per topic", width=1200, height=1200, xaxis_tickangle=0, horizontal_spacing = 0.2, vertical_spacing = 0.08, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
+ def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color, title_text = "Sentiment per topic", width=1200, height=1200, xaxis_tickangle=0, horizontal_spacing = 0, vertical_spacing = 0.08, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
 
      categories = df[col_cat].unique()
 
@@ -1112,8 +1202,8 @@ def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color
          rows = 1, # number of rows
          cols = 2, # number of columns
          # subplot_titles = list(categories), # title for each subplot
-         vertical_spacing = vertical_spacing / n_cols, # space between subplots
-         horizontal_spacing = 0 # space between subplots
+         vertical_spacing = vertical_spacing, # space between subplots
+         horizontal_spacing = horizontal_spacing / n_cols # space between subplots
      )
 
      for cat in categories:
@@ -1688,4 +1778,19 @@ def add_shape(fig, shape_type = "rect", x0= -1, y0= -1, x1 = 0, y1=0, fillcolor=
 
      }
      )
+     return fig
+
+ def add_image(fig, xref = "paper", yref = "paper", x = 0, y=0, sizex = 0.08, sizey=0.08, xanchor="right", yanchor="bottom", source = "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNDc1IiBoZWlnaHQ9IjM4OCIgdmlld0JveD0iMCAwIDQ3NSAzODgiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxwYXRoIGQ9Ik0xMDUuNzI3IDI5My4zOTFDMTA1LjcyNyAyNjYuNzc0IDg0LjEyOTMgMjQ1LjE3NyA1Ny42MDEzIDI0NS4xNzdDMzAuOTg0IDI0NS4xNzcgOS4yOTYgMjY2Ljc3NCA5LjI5NiAyOTMuMzkxQzkuMjk2IDMyMC4wMDkgMzAuOTg0IDM0MS42MDcgNTcuNjAxMyAzNDEuNjA3Qzg0LjEyOTMgMzQxLjYwNyAxMDUuNzI3IDMyMC4wMDkgMTA1LjcyNyAyOTMuMzkxWk0wLjg3MDY2NyAyOTMuMzkxQzAuODcwNjY3IDI2Mi4yMDMgMjYuMzI0IDIzNi43NTMgNTcuNjAxMyAyMzYuNzUzQzg4LjY5ODcgMjM2Ljc1MyAxMTQuMTUxIDI2Mi4yMDMgMTE0LjE1MSAyOTMuMzkxQzExNC4xNTEgMzI0LjU3OSA4OC42OTg3IDM1MC4wMyA1Ny42MDEzIDM1MC4wM0MyNi4zMjQgMzUwLjAzIDAuODcwNjY3IDMyNC41NzkgMC44NzA2NjcgMjkzLjM5MVoiIGZpbGw9ImJsYWNrIi8+CjxwYXRoIGQ9Ik0yMzIuNTMxIDI5My40ODFDMjMyLjUzMSAyNjMuNjM3IDIwOS4zMTkgMjQ1LjI2NSAxODYuMjg2IDI0NS4yNjVDMTY2LjU3IDI0NS4yNjUgMTQ3LjQ4MiAyNTguNjIgMTQ1LjI0MSAyODAuMDM4VjMwNi42NTZDMTQ3LjM5MyAzMjguOTcgMTY2LjM5MSAzNDEuNjk2IDE4Ni4yODYgMzQxLjY5NkMyMDkuMzE5IDM0MS42OTYgMjMyLjUzMSAzMjMuMzI1IDIzMi41MzEgMjkzLjQ4MVpNMjQwLjg2NiAyOTMuNDgxQzI0MC44NjYgMzI4LjA3NCAyMTQuNjk3IDM1MC4xMiAxODcuMTgzIDM1MC4xMkMxNjkuOTc3IDM1MC4xMiAxNTMuNTc1IDM0Mi4zMjQgMTQ1LjI0MSAzMjcuNjI1VjM4Ny40OTNIMTM2Ljk5N1YyMzkuNjJIMTQ0Ljg4M0wxNDUuMjQxIDI1Ny41NDRWMjYwLjE0MkMxNTMuNjY2IDI0NS42MjQgMTcwLjE1NSAyMzYuODQyIDE4Ny4yNzMgMjM2Ljg0MkMyMTQuNjA3IDIzNi44NDIgMjQwLjg2NiAyNTguODg4IDI0MC44NjYgMjkzLjQ4MVoiIGZpbGw9ImJsYWNrIi8+CjxwYXRoIGQ9Ik0yNTUuNjQyIDMyOC40MzNMMjYwLjc1MSAzMjIuNzg4QzI2OC4xMDEgMzM1LjUxMyAyODEuMDk1IDM0MS45NjUgMjk0LjE3OCAzNDEuOTY1QzMwOC41MTggMzQxLjk2NSAzMjMuMTI2IDMzMy42MyAzMjMuMTI2IDMxOS41NjFDMzIzLjEyNiAzMDUuNDkgMzA0LjkzNCAyOTkuNjY1IDI4OS43ODcgMjkzLjc0OUMyODAuMzc4IDI4OS45ODYgMjYwLjc1MSAyODMuMzUzIDI2MC43NTEgMjY0LjYyNEMyNjAuNzUxIDI0OS41NjggMjc0LjI4MyAyMzYuNjYyIDI5NC4yNjkgMjM2LjY2MkMzMDkuODYyIDIzNi42NjIgMzIzLjEyNiAyNDUuMzU0IDMyNy41MTggMjU2LjM3OEwzMjEuNjAzIDI2MS4wMzhDMzE2LjMxNSAyNDkuODM3IDMwNC4yMTcgMjQ0LjkwNiAyOTQuMDAxIDI0NC45MDZDMjc5LjEyMiAyNDQuOTA2IDI2OS4xNzQgMjU0LjEzNyAyNjkuMTc0IDI2NC4yNjVDMjY5LjE3NCAyNzcuNDQgMjg0LjIzMSAyODIuOTA1IDI5OS4xMDkgMjg4LjU1MkMzMTEuMDI3IDI5My4yMTIgMzMxLjU1MSAzMDAuNjUgMzMxLjU1MSAzMTkuMDIyQzMzMS41NTEgMzM4LjExMiAzMTMuMjY5IDM1MC4yMSAyOTQuMDAxIDM1MC4yMUMyNzYuNzAzIDM1MC4yMSAyNjEuODI3IDM0MC40NDIgMjU1LjY0MiAzMjguNDMzWiIgZmlsbD0iYmxhY2siLz4KPHBhdGggZD0iTTM0Ni43OCAyOTMuMzkxQzM0Ni43OCAyNTguNTMgMzc1LjAxMSAyMzYuMDM0IDQwMy4yNDEgMjM2LjAzNEM0MTUuNzg4IDIzNi4wMzQgNDMwLjMwNyAyNDAuNTE3IDQzOS45ODUgMjQ4LjU4Mkw0MzUuMzI1IDI1NS40ODJDNDI4Ljc4MyAyNDkuMjk5IDQxNS41MiAyNDQuNDU5IDQwMy4zMzEgMjQ0LjQ1OUMzNzkuMTMzIDI0NC40NTkgMzU1LjIwNCAyNjMuNDU5IDM1NS4yMDQgMjkzLjM5MUMzNTUuMjA0IDMyMy41OTMgMzc5LjQwMyAzNDIuMzIzIDQwMy4yNDEgMzQyLjMyM0M0MTUuNjA4IDM0Mi4zMjMgNDI5LjIzMSAzMzcuMTI2IDQzNi4yMjEgMzMwLjQ5NEw0NDEuMzI5IDMzNy4xMjZDNDMxLjQ3MiAzNDYuMTc4IDQxNi40MTYgMzUwLjc0OSA0MDMuNDIgMzUwLjc0OUMzNzUuMSAzNTAuNzQ5IDM0Ni43OCAzMjguNDMzIDM0Ni43OCAyOTMuMzkxWiIgZmlsbD0iYmxhY2siLz4KPHBhdGggZD0iTTQ2My42MzcgMjM5LjYxOUg0NzIuMDYxVjM0Ny4xNjNINDYzLjYzN1YyMzkuNjE5Wk00NjEuMTI4IDIxMi40NjRDNDYxLjEyOCAyMDguNzAxIDQ2NC4wODUgMjA1Ljc0MyA0NjcuODQ5IDIwNS43NDNDNDcxLjUyNCAyMDUuNzQzIDQ3NC41NzEgMjA4LjcwMSA0NzQuNTcxIDIxMi40NjRDNDc0LjU3MSAyMTYuMjI4IDQ3MS41MjQgMjE5LjE4NSA0NjcuODQ5IDIxOS4xODVDNDY0LjA4NSAyMTkuMTg1IDQ2MS4xMjggMjE2LjIyOCA0NjEuMTI4IDIxMi40NjRaIiBmaWxsPSJibGFjayIvPgo8cGF0aCBkPSJNMjE3Ljg1MyAzMS4zOTE0TDIzNy43MjEgNTEuMjU4TDI1Ny41ODggMzEuMzkxNEwyMzcuNzIxIDExLjUyNDdMMjE3Ljg1MyAzMS4zOTE0Wk0yMzcuNzIxIDYyLjU3MjdMMjA2LjU0IDMxLjM5MTRMMjM3LjcyMSAwL
jIxMDAxNkwyNjguOTAxIDMxLjM5MTRMMjM3LjcyMSA2Mi41NzI3Wk0xNTQuMTAxIDU5Ljc1OTRMMTYxLjQzOSA4Ni45NjQ3TDE4OC42NiA3OS42MjJMMTgxLjMyMyA1Mi41OTU0TDE1NC4xMDEgNTkuNzU5NFpNMTU1Ljc5NyA5Ni43NzE0TDE0NC4yOCA1NC4wNzE0TDE4Ni45NjMgNDIuODM5NEwxOTguNDgxIDg1LjI1OEwxNTUuNzk3IDk2Ljc3MTRaTTI4Ni43ODEgNzkuNjIyTDMxNC4wMDMgODYuOTY0N0wzMjEuMzQxIDU5Ljc1OTRMMjk0LjEyIDUyLjU5NTRMMjg2Ljc4MSA3OS42MjJaTTMxOS42NDMgOTYuNzcxNEwyNzYuOTYxIDg1LjI1OEwyODguNDc5IDQyLjgzOTRMMzMxLjE2MiA1NC4wNzE0TDMxOS42NDMgOTYuNzcxNFpNMTU0LjEwMSAxNTYuMTY5TDE4MS4zMjMgMTYzLjMzM0wxODguNjYgMTM2LjMwN0wxNjEuNDM5IDEyOC45NjVMMTU0LjEwMSAxNTYuMTY5Wk0xODYuOTYzIDE3My4wODlMMTQ0LjI4IDE2MS44NTdMMTU1Ljc5NyAxMTkuMTU3TDE5OC40ODEgMTMwLjY3TDE4Ni45NjMgMTczLjA4OVpNMjg2Ljc3NSAxMzYuMzA5TDI5NC4xMiAxNjMuNTM3TDMyMS4zNDggMTU2LjE5M0wzMTQuMDAzIDEyOC45NjVMMjg2Ljc3NSAxMzYuMzA5Wk0yODguNDc5IDE3My4zNDVMMjc2Ljk2NyAxMzAuNjY5TDMxOS42NDMgMTE5LjE1N0wzMzEuMTU1IDE2MS44MzRMMjg4LjQ3OSAxNzMuMzQ1Wk0yMTcuODUzIDE4NC41MzdMMjM3LjcyMSAyMDQuNDA1TDI1Ny41ODggMTg0LjUzN0wyMzcuNzIxIDE2NC42N0wyMTcuODUzIDE4NC41MzdaTTIzNy43MjEgMjE1LjcxOEwyMDYuNTQgMTg0LjUzN0wyMzcuNzIxIDE1My4zNTdMMjY4LjkwMSAxODQuNTM3TDIzNy43MjEgMjE1LjcxOFoiIGZpbGw9ImJsYWNrIi8+Cjwvc3ZnPgo="):
+     fig.add_layout_image(
+         dict(
+             source=source,
+             xref=xref,
+             yref=yref,
+             x=x, y=y,
+             sizex=sizex,
+             sizey=sizey,
+             xanchor=xanchor,
+             yanchor=yanchor
+         )
+     )
      return fig
@@ -613,6 +613,10 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
      will be included in the spaCy pipeline.
 
      """
+     if torch.cuda.is_available():
+
+         spacy.prefer_gpu()
+
      if len(disable_components)>0:
          nlp = spacy.load(model, disable=disable_components)
      else:
@@ -1345,18 +1349,23 @@ def df_transform_column_as_list(column):
  def check_gpu():
      device = "cuda:0" if torch.cuda.is_available() else "cpu"
      return device
-
- def HF_load_model_classification(model_name, device):
-     model = AutoModelForSequenceClassification.from_pretrained(model_name, from_tf=True)
-     tokenizer = AutoTokenizer.from_pretrained(model_name, device=device)
-     classifier = TextClassificationPipeline(tokenizer=tokenizer, model=model.to(device))
-     return model, tokenizer, classifier
-
- def HF_classify_text(classifier, txt, col_text, filename, dir_json):
+
+ def HF_load_model(model_checkpoint):
+     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+     model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+     if torch.cuda.is_available():
+         model.cuda()
+     return model, tokenizer
+
+ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
+     """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
      file_path= os.path.join(dir_json , str(filename)+'.json')
      if not os.path.exists(file_path):
-         results=classifier(txt)
-         results=results[0]
-         results[col_text]=txt
-         write_json(results, dir_json , str(filename))
-     return results
+         with torch.no_grad():
+             inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
+             proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]
+             label = model.config.id2label[proba.argmax()]
+             results = {"label":label, "score" : float(proba.max()), col_text : text}
+             print(results)
+             write_json(results, dir_json , str(filename))
+         return results
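
Illustrative usage of the new Hugging Face helpers (the checkpoint name, text and output directory are placeholders; the available labels depend on the chosen model): HF_load_model returns the tokenizer and sequence-classification model, and HF_sentiment_classifier writes one JSON result per text and returns the scored record.

model, tokenizer = HF_load_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
res = HF_sentiment_classifier(tokenizer, model, "J'adore ce produit !",
                              col_text="text", filename="post_001", dir_json="data/sentiment")
# res -> {"label": "positive", "score": 0.93, "text": "J'adore ce produit !"}  (illustrative values)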
@@ -0,0 +1,171 @@
+ from cuml import UMAP
+ from cuml.cluster.hdbscan import HDBSCAN, all_points_membership_vectors, approximate_predict, membership_vector
+ import numpy as np
+ from tqdm import tqdm
+ import os
+ from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle
+
+ def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dist = 0.0, metric = "cosine", spread = 1.0):
+     reducer = UMAP(n_neighbors=n_neighbors,
+                    n_components=n_components,
+                    min_dist=min_dist,
+                    metric=metric,
+                    spread = spread).fit(embeddings)
+
+     reduced_embeddings = reducer.transform(embeddings)
+     return reducer, reduced_embeddings
+
+ def transform_with_cuml_UMAP(reducer, new_embeddings):
+     """
+     Transform new data points using a UMAP object
+     """
+     reduced_embeddings = reducer.transform(new_embeddings)
+     return reduced_embeddings
+
+
+ def hdbscan_cuml_clustering(embeddings, min_cluster_size=5, min_samples=None, max_cluster_size = 0, metric='euclidean', alpha=1.0, p=2, cluster_selection_epsilon=0.0, cluster_selection_method='eom',
+                             approx_min_span_tree=True, gen_min_span_tree = False, gen_condensed_tree = False, gen_single_linkage_tree_ = False, prediction_data=True):
+
+     """
+     Parameters:
+         embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+             The input data to be clustered.
+         min_cluster_size : int, optional
+             The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
+         min_samples : int or None, optional
+             The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If ‘None’, it defaults to the min_cluster_size.
+         max_cluster_size : int, optional (default=0)
+             A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
+             Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+         metric : str or callable, optional
+             The metric to use for distance computation. Default is 'euclidean'.
+         alpha : float, optional
+             distance scaling parameter as used in robust single linkage.
+         p : int, optional
+             The Minkowski p-norm distance metric parameter. Default is None.
+         cluster_selection_epsilon : float, optional
+             A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+         cluster_selection_method : {'eom', 'leaf'}, optional
+             The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
+         approx_min_span_tree : bool, optional
+             Whether to compute an approximation of the minimum spanning tree. Default is True.
+         gen_min_span_tree : bool, optional
+             Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed
+         gen_condensed_tree : bool, optional
+             Whether to populate the condensed_tree_ member for utilizing plotting tools.
+         gen_single_linkage_tree_ : bool
+             Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
+         prediction_data : bool, optional
+             Whether the data is prediction data or not. Default is True.
+
+     Returns:
+         clusterer : hdbscan.hdbscan_.HDBSCAN
+             HDBSCAN clusterer object.
+         labels : array, shape (n_samples,)
+             Cluster labels for each point. Noisy samples are given the label -1.
+         probabilities : array, shape (n_samples,)
+             The probability of each sample being an outlier.
+
+     Description:
+         This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
+         It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
+         probability of each sample being an outlier.
+     """
+     clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
+                         min_samples=min_samples,
+                         max_cluster_size = max_cluster_size,
+                         metric=metric,
+                         alpha=alpha,
+                         p=p,
+                         cluster_selection_epsilon=cluster_selection_epsilon,
+                         cluster_selection_method=cluster_selection_method,
+                         approx_min_span_tree=approx_min_span_tree,
+                         gen_min_span_tree = gen_min_span_tree,
+                         gen_condensed_tree = gen_condensed_tree,
+                         gen_single_linkage_tree_ = gen_single_linkage_tree_,
+                         prediction_data=prediction_data)
+
+     clusterer.fit_predict(embeddings)
+
+     return clusterer, clusterer.labels_, clusterer.probabilities_
+
+ def transform_with_cuml_HDBSCAN(clusterer, new_embeddings):
+     """
+     Transform new data points using a HDBSCAN object
+     """
+     new_data_topic, new_data_proba = approximate_predict(clusterer, new_embeddings)
+     return new_data_topic, new_data_proba
+
+
+ def cuml_soft_clustering(clusterer):
+     """
+     HDBSCAN SOFT CLUSTERING
+     """
+     soft_clusters = all_points_membership_vectors(clusterer)
+     soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
+     soft_clusters_proba = [np.max(x) for x in soft_clusters]
+     return soft_clusters_val, soft_clusters_proba
+
+
+ def soft_cuml_clustering_new_data(clusterer, embeddings):
+     """
+     PREDICT NEW DATA POINTS HDBSCAN SOFT CLUSTERING
+     """
+     soft_clusters = membership_vector(clusterer, embeddings)
+     soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
+     soft_clusters_proba = [np.max(x) for x in soft_clusters]
+     return soft_clusters_val, soft_clusters_proba
+
+ def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, reencode = False):
+
+     new_file_paths=[]
+     for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
+
+         filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
+         new_filename = filename+"_reduce_embeddings.pickle"
+         new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
+
+         if not os.path.exists(new_file_path) or reencode:
+             df = load_pickle(file_path)
+             create_dir(path_reduced_embeddings_id)
+             # embeddings = df["embeddings"].to_list()
+             embeddings = np.vstack(df['embeddings'].values)
+             reduced_embeddings = transform_with_cuml_UMAP(reducer, embeddings)
+             reduced_embeddings_transformed=[list(e) for e in reduced_embeddings]
+             df['reduced_embeddings'] = reduced_embeddings_transformed
+             df.drop(columns=["embeddings"], inplace=True)
+             print(path_reduced_embeddings_id, filename+"_reduce_embeddings")
+             write_pickle(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
+             new_file_paths.append(new_file_path)
+         else:
+             print("REDUCED EMBEDDINGS ALREADY EXISTS", file_path)
+             new_file_paths.append(new_file_path)
+     return new_file_paths
+
+
+
+ def process_HDBSCAN(clusterer, reduced_embeddings_paths, path_predictions_dataset_id, run_soft_clustering= False, reencode = False):
+     new_file_paths=[]
+     for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
+
+         filename = os.path.splitext(os.path.basename(file_path))[0][:-18]
+         new_filename = filename+ "_predictions.pickle"
+         new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
+         if not os.path.exists(new_file_path) or reencode:
+             df = load_pickle(file_path)
+             # reduced_embeddings = df["reduced_embeddings"].to_list()
+             reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+             topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)
+             df["topic"]=topics.astype(int).astype(str)
+             df["proba"]=probas
+             if run_soft_clustering:
+                 soft_clusters, soft_proba = soft_cuml_clustering_new_data(clusterer, np.array(reduced_embeddings))
+                 df["soft_topic"]=soft_clusters
+                 df["soft_proba"]=soft_proba
+
+             write_pickle(df, path_predictions_dataset_id, filename+ "_predictions")
+             new_file_paths.append(new_file_path)
+         else:
+             print("CLUSTERING ALREADY EXISTS", file_path)
+             new_file_paths.append(new_file_path)
+     return new_file_paths
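
End-to-end sketch of the new cuML helpers (requires a GPU with RAPIDS cuml installed; the array shapes and parameters are illustrative): fit UMAP and HDBSCAN on one batch of embeddings, then project and label new points.

import numpy as np

embeddings = np.random.rand(1000, 384).astype(np.float32)       # e.g. sentence-transformer vectors
new_embeddings = np.random.rand(100, 384).astype(np.float32)

reducer, reduced = reduce_with_cuml_UMAP(embeddings, n_neighbors=15, n_components=5)
clusterer, labels, probas = hdbscan_cuml_clustering(reduced, min_cluster_size=20)

new_reduced = transform_with_cuml_UMAP(reducer, new_embeddings)
new_topics, new_probas = transform_with_cuml_HDBSCAN(clusterer, new_reduced)
soft_topics, soft_probas = cuml_soft_clustering(clusterer)       # soft assignment of the training points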
@@ -0,0 +1,114 @@
+ import pandas as pd
+ from tqdm import tqdm
+
+ def generate_index(df, col_author_id ='author_id', col_date='created_time'):
+     """
+     Generates an index based on user_id and date
+     """
+     res=[]
+     for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="generation des index"):
+         new_index=".".join([ str(i) for i in [ row[col_author_id], row[col_date].year, row[col_date].month, row[col_date].day]])
+         res.append(new_index)
+     df["index"]=res
+
+     return df
+
+ def avg_performance(df,
+                     col_date='created_time',
+                     col_author_id='author_id',
+                     col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
+                                     'sad', 'angry','total_engagement', 'replies', 'percentage_replies'],
+                     rolling_period='7D'):
+
+     """
+     Function to compute average performance on a rolling period for a list of metrics
+     """
+
+     # Clean up, just in case
+     df[col_date] = pd.to_datetime(df[col_date])
+     df = df.sort_values([col_author_id, col_date])
+
+     # The created_time column is the pivot, so we set it as the index.
+     # We then group by author_id while keeping the value columns.
+     # We apply a rolling mean over the rolling_period window; pandas automatically uses the index (created_time) as the pivot.
+     # Then we flatten everything back.
+     average = df.set_index(col_date).groupby(col_author_id)[col_engagement].rolling(rolling_period).mean(numeric_only=True).reset_index()
+
+     # From the previous result, we simplify to keep one row per day / author_id
+     average = average.set_index(col_date).groupby([col_author_id]).resample('1D').last(numeric_only=True).reset_index()
+
+     # Generate our composite indexes
+     df=generate_index(df, col_author_id =col_author_id, col_date=col_date)
+
+     average = generate_index(average, col_author_id = col_author_id, col_date=col_date)
+
+     # Merge both dataframes
+     df = pd.merge(df, average[['index']+col_engagement], how='left', on=['index'], suffixes=('', '_avg'))
+
+     return df
+
+ def kpi_reaction(df, cols):
+     """
+     This function takes a dataframe and a list of columns as input.
+     For each column, it computes the over-reaction rate.
+     """
+     for col in cols:
+         df['tx_'+col]=(df[col]-df[col+'_avg'])/(df[col]+df[col+'_avg'])
+     return df
+
+ def get_reactions_type(df, cols, col_dest):
+     """
+     Conditional function to return the reaction type based on a list of metrics
+     """
+     all_val=[]
+
+     for i,row in tqdm(df.iterrows(), total=df.shape[0], desc="qualification des posts"):
+
+         str_val=''
+         count=0
+         for col in cols:
+             if row[col]>0:
+                 str_val=str_val+' '+col.replace('tx_', 'sur-')
+                 count=count+1
+         if count==0:
+             str_val="sous reaction"
+         if count==len(cols):
+             str_val="sur reaction totale"
+
+         all_val.append(str_val.strip())
+
+     df[col_dest]=all_val
+     return df
+
+ def compute_surreaction(df, col_date, col_author_id, cols_sureaction_metrics, cols_typologie_sureaction, rolling_period_sureaction = '7D'):
+     """
+     Helpers to compute surreaction and return a dataframe with reaction rates and typology
+
+     """
+     # temporarily silence pandas warnings
+     pd.options.mode.chained_assignment = None  # default='warn'
+     # compute the average performance for a list of metrics
+     df= avg_performance(
+         df,
+         col_date=col_date,
+         col_author_id=col_author_id,
+         col_engagement= cols_sureaction_metrics,
+         rolling_period=rolling_period_sureaction
+     )
+
+     # compute the over-reaction rates for our list of metrics
+     df=kpi_reaction(df, cols_sureaction_metrics)
+     cols_tx_engagement=['tx_'+c for c in cols_sureaction_metrics]
+     df[cols_tx_engagement]=df[cols_tx_engagement].fillna(-1)
+
+     # drop the columns holding the average performance (no longer needed)
+     cols_to_drop = [c for c in df.columns if c.lower()[-4:] == '_avg']
+     df.drop(columns=cols_to_drop, inplace=True)
+
+     # categorize the reaction types
+     cols_typologie = ["tx_"+ col for col in cols_typologie_sureaction]
+     df=get_reactions_type(df, cols_typologie, 'type_engagement')
+
+     # re-enable the warnings
+     pd.options.mode.chained_assignment = 'warn'  # default='warn'
+     return df
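
Illustrative call to compute_surreaction (the input file and column names are hypothetical; the metric columns must exist in the dataframe): it computes per-author rolling averages, an over-reaction rate tx_<metric> for each metric, and a type_engagement typology column.

import pandas as pd

df_posts = pd.read_csv("posts.csv", parse_dates=["created_time"])   # hypothetical input
df_scored = compute_surreaction(
    df_posts,
    col_date="created_time",
    col_author_id="author_id",
    cols_sureaction_metrics=["shares", "comments", "likes"],
    cols_typologie_sureaction=["shares", "comments"],
    rolling_period_sureaction="7D",
)
# adds tx_shares, tx_comments, tx_likes and the "type_engagement" column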
@@ -1,44 +1,41 @@
  Metadata-Version: 2.1
  Name: opsci-toolbox
- Version: 0.0.2
+ Version: 0.0.5
  Summary: a complete toolbox
  Home-page: UNKNOWN
  Author: Erwan Le Nagard
  Author-email: erwan@opsci.ai
  License: MIT
  Platform: UNKNOWN
- Requires-Dist: Pillow (==10.3.0)
- Requires-Dist: Pillow (>=9.0.1)
+ Requires-Dist: Pillow (<11.0.0,>=9.0.1)
  Requires-Dist: Requests (==2.31.0)
  Requires-Dist: beautifulsoup4 (==4.10.0)
  Requires-Dist: chart-studio (==1.1.0)
  Requires-Dist: eldar (==0.0.8)
  Requires-Dist: emoji (==2.10.1)
  Requires-Dist: google-api-python-client (==2.122.0)
- Requires-Dist: gspread (==6.1.0)
+ Requires-Dist: gspread (==6.1.1)
  Requires-Dist: hdbscan (==0.8.33)
  Requires-Dist: jusText (==3.0.0)
- Requires-Dist: langchain (==0.1.16)
- Requires-Dist: matplotlib (==3.8.3)
+ Requires-Dist: langchain (==0.1.20)
  Requires-Dist: matplotlib (>=3.5.1)
  Requires-Dist: networkx (==3.2.1)
  Requires-Dist: nltk (==3.8.1)
- Requires-Dist: numpy (==1.24.4)
- Requires-Dist: numpy (>=1.21.5)
+ Requires-Dist: numpy (<1.25.0,>=1.21.5)
  Requires-Dist: opencv-python-headless (==4.9.0.80)
  Requires-Dist: pandas (==1.5.3)
  Requires-Dist: plotly (==5.19.0)
- Requires-Dist: protobuf
+ Requires-Dist: protobuf (==5.26.1)
  Requires-Dist: pyarrow (==14.0.2)
  Requires-Dist: python-louvain (==0.16)
  Requires-Dist: scikit-learn (==1.4.1.post1)
- Requires-Dist: scipy
+ Requires-Dist: scipy (<2.0.0,>=1.8.0)
  Requires-Dist: sentence-transformers (==2.5.1)
  Requires-Dist: setuptools (==59.6.0)
  Requires-Dist: spacy (==3.7.4)
  Requires-Dist: spacy-language-detection (==0.2.1)
  Requires-Dist: spacymoji (==3.1.0)
- Requires-Dist: supervision (==0.19.0)
+ Requires-Dist: supervision (==0.20.0)
  Requires-Dist: textacy (==0.13.0)
  Requires-Dist: torch (==2.0.1)
  Requires-Dist: tqdm (==4.66.2)
@@ -1,19 +1,21 @@
  opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/rapidapi_helpers.py,sha256=8qW7efnE-xuyM7IfGcE_VNugEWpaUfBYUU8y7Bq5TAM,18060
+ opsci_toolbox/apis/rapidapi_helpers.py,sha256=5QbF6ehsmmdTrzp7Q8cF5wrf4DmO91v8YexbybczyHA,23183
  opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
  opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
  opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=VEmDvLYrrkDU4HVCpKEMgOMLBK5_6rZz-V7Z1IPpDkI,25474
+ opsci_toolbox/helpers/common.py,sha256=41EsQ2pTwQYnUUM1ggwaPueFVj2Qcm_UG7o_Zj41FU8,26152
  opsci_toolbox/helpers/cv.py,sha256=z0HecreIi-vqiOGpDa4VVnHIX_rvkObngrqwTwkWT44,12403
- opsci_toolbox/helpers/dataviz.py,sha256=N0g14X_inFEiaQEtIVtL5eKC42RU9JyUgSRnKKoMHyg,68844
+ opsci_toolbox/helpers/dataviz.py,sha256=4wFi0wCMgvIEQEL8okiVJOWxz-eJq5cZ7svHoBbZjnk,77393
  opsci_toolbox/helpers/dates.py,sha256=yQm9pUQAeLTFNPcgeumhi8oErustQJhaoL_HqxSxhiA,996
- opsci_toolbox/helpers/nlp.py,sha256=WGpS73yzolBrX4lijm8GdaOEyjwF82ldYfvXkIm8yyk,57597
+ opsci_toolbox/helpers/nlp.py,sha256=LGW8CIjrkQvGLKEnxYu7RNrBNViQ5dUygK67EhkBHZo,57999
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=Mkbtl9ewbv3aa9rFvhH9VOM5Y0G-XIsXtR_6IeYpebY,9450
  opsci_toolbox/helpers/sna.py,sha256=D6nwgUgbuApXGpT2zoIMip8262hynEwfppVdvaZ4Qm0,8053
+ opsci_toolbox/helpers/surreaction.py,sha256=k5hcZZlXnJ-zczRpwfwthggEgFCr9lQsHHKVOPlm7fc,4606
  opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
  opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.2.dist-info/METADATA,sha256=arXHUG-nMzWGAmzDFqjGU_eT8Mi_q2VWLOySMZa-uJQ,1623
- opsci_toolbox-0.0.2.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- opsci_toolbox-0.0.2.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.2.dist-info/RECORD,,
+ opsci_toolbox-0.0.5.dist-info/METADATA,sha256=Nhp2oK-KXD4JVivU37-T_MsN-VJfbPtJsWlUq7Kp5-A,1566
+ opsci_toolbox-0.0.5.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ opsci_toolbox-0.0.5.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.5.dist-info/RECORD,,