opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +81 -0
- opsci_toolbox/helpers/common.py +39 -14
- opsci_toolbox/helpers/dataviz.py +134 -29
- opsci_toolbox/helpers/nlp.py +22 -13
- opsci_toolbox/helpers/nlp_cuml.py +171 -0
- opsci_toolbox/helpers/surreaction.py +114 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/METADATA +8 -11
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/RECORD +10 -8
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/rapidapi_helpers.py
CHANGED

@@ -5,6 +5,7 @@ from tqdm import tqdm
 import re
 from datetime import datetime,timedelta
 from opsci_toolbox.helpers.dates import str_to_datetime
+from opsci_toolbox.helpers.common import write_jsonl
 
 def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "start_date", col_publishedBefore = "end_date", date_format = '%Y-%m-%d', rolling_days = 7 ):
     datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
@@ -278,6 +279,31 @@ def parse_tweet(json_data):
     df = pd.DataFrame.from_records(all_records, columns = all_cols)
     return df
 
+def parse_twitter_list_details(json_data):
+    """
+    Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154
+    """
+    list_id = json_data.get("list_id", "")
+    list_id_str = json_data.get("list_id_str", "")
+    member_count = json_data.get("member_count", 0)
+    name = json_data.get("name", "")
+    suscriber_count = json_data.get("subscriber_count", 0)
+    creation_date = json_data.get("creation_date", 0)
+    mode = json_data.get("mode", "0")
+
+    user_record = parse_user(json_data.get("user", {}))
+    record = (list_id, list_id_str, member_count, name, suscriber_count, creation_date, mode) + user_record
+    cols = ["list_id", "list_id_str", "member_count", "name", "suscriber_count", "creation_date", "mode", "user_creation_date", "user_id", "user_username", "user_name", "user_follower_count", "user_following_count", "user_favourites_count", "user_is_private", "user_is_verified", "user_is_blue_verified", "user_location", "user_profile_pic_url", "user_profile_banner_url", "user_description", "user_external_url", "user_number_of_tweets", "user_bot", "user_timestamp", "user_has_nft_avatar", "user_category", "user_default_profile", "user_default_profile_image", "user_listed_count", "user_verified_type"]
+
+    df = pd.DataFrame.from_records(record, cols)
+    return df
+
+######################################################################################
+# function to parse Instagram data
+# https://rapidapi.com/JoTucker/api/instagram-scraper2
+# https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
+######################################################################################
+
 def instagram_parse_hashtag_data(hashtag_data):
     hashtag_id = hashtag_data.get("id")
     hashtag_name = hashtag_data.get("name")
@@ -324,3 +350,58 @@ def instagram_parse_hashtag_data(hashtag_data):
     return df
 
 
+######################################################################################
+# function to parse Twitter data
+# https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
+######################################################################################
+def compile_list_entries(json_data, path_json, filename):
+    """
+    Function to return next cursor and list details from https://twttrapi.p.rapidapi.com/list-members
+    """
+    results = []
+    entries = json_data.get('data', {}).get('list', {}).get('timeline_response', {}).get("timeline", {}).get("instructions", [{}])[-1].get('entries',[])
+    if len(entries)>0:
+        for entry in entries:
+            content = entry.get("content")
+            if (content.get("__typename") == "TimelineTimelineCursor") & (content.get("cursorType") =="Bottom"):
+                next_cursor = content.get("value", None)
+                if next_cursor:
+                    if next_cursor.split('|')[0]=="0":
+                        next_cursor = None
+            if content.get("__typename") != "TimelineTimelineCursor":
+                legacy = content.get("content", {}).get('userResult', {}).get("result", {}).get("legacy", {})
+                results.append(legacy)
+
+    write_jsonl(results, path_json, filename)
+    return results, next_cursor
+
+
+def parse_list_entries(jsonl_data):
+    """
+    Function to parse list details from https://twttrapi.p.rapidapi.com/list-members
+    """
+    all_records=[]
+    for data in jsonl_data:
+        id_str = data.get("id_str","")
+        name = data.get("name","")
+        screen_name = data.get("screen_name", "")
+        created_at = data.get("created_at")
+        description = data.get("description")
+        statuses_count = data.get("statuses_count", 0)
+        followers_count = data.get("followers_count",0)
+        friends_count = data.get("friends_count",0)
+        favourites_count = data.get("favourites_count",0)
+        media_count = data.get("media_count",0)
+        protected = data.get("protected", False)
+        verified = data.get("verified", False)
+        verified_type = data.get("verified_type", "")
+        entities = data.get("entities")
+        urls = [url.get("expanded_url","") for url in entities.get('url', {}).get("urls",[])]
+        user_mentions = [um.get("screen_name","") for um in entities.get('description', {}).get('user_mentions', [])]
+        user_mentions_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('user_mentions', [])]
+        hashtags = [um.get("text","") for um in entities.get('description', {}).get('hashtags', [])]
+        hashtags_indices = [um.get("indices",[]) for um in entities.get('description', {}).get('hashtags', [])]
+        record = (id_str, name, screen_name, created_at, description, statuses_count, followers_count, friends_count, favourites_count, media_count, protected, verified, verified_type, urls, user_mentions, user_mentions_indices, hashtags, hashtags_indices)
+        all_records.append(record)
+    df = pd.DataFrame.from_records(all_records, columns = ["id_str", "name", "screen_name", "created_at", "description", "statuses_count", "followers_count", "friends_count", "favourites_count", "media_count", "protected", "verified", "verified_type", "urls", "user_mentions", "user_mentions_indices", "hashtags", "hashtags_indices"])
+    return df
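A minimal usage sketch of the new twttrapi helpers (assuming the wheel is installed; `fetch_page` is a hypothetical stand-in for your own HTTP call to the list-members endpoint, and the .jsonl path layout is assumed):

from opsci_toolbox.apis.rapidapi_helpers import compile_list_entries, parse_list_entries
from opsci_toolbox.helpers.common import read_jsonl

cursor = None
while True:
    json_data = fetch_page(cursor)  # hypothetical wrapper around https://twttrapi.p.rapidapi.com/list-members
    members, cursor = compile_list_entries(json_data, "data/json", "list_members")  # writes a .jsonl, returns next cursor
    if not cursor:  # compile_list_entries resets the cursor to None on the last page
        break

df = parse_list_entries(read_jsonl("data/json/list_members.jsonl"))  # output path of write_jsonl assumed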
opsci_toolbox/helpers/common.py
CHANGED
@@ -50,14 +50,26 @@ def load_parquet(path):
         print(e)
     return df
 
-def load_pickle(path: str):
+# def load_pickle(path: str):
+#     """
+#     Load a pickle file into a dataframe
+#     """
+
+#     with open(path, 'rb') as f:
+#         df=pickle.load(f)
+#     return df
+def load_pickle(path):
+    return pd.read_pickle(path)
+
+def write_pickle(data, path, filename):
     """
-    Load a pickle file into a dataframe
+    Write a dataframe into a pickle file
     """
-
-    with open(path, 'rb') as f:
-        df=pickle.load(f)
-    return df
+    file_path=os.path.join(path, filename+'.pickle')
+    with open(file_path, 'wb') as f:
+        pickle.dump(data, f)
+    return file_path
+
 
 def load_json(path: str):
     """
@@ -164,15 +176,15 @@ def read_jsonl(path: str):
 #########################################################################################
 
 
-def write_pickle(df: pd.DataFrame, path: str, name: str):
-    """
-    Write a dataframe into a pickle file
-    """
-    file_path=os.path.join(path, name+'.pickle')
+# def write_pickle(df: pd.DataFrame, path: str, name: str):
+#     """
+#     Write a dataframe into a pickle file
+#     """
+#     file_path=os.path.join(path, name+'.pickle')
 
-    with open(file_path, 'wb') as f:
-        pickle.dump(df, f)
-    return file_path
+#     with open(file_path, 'wb') as f:
+#         pickle.dump(df, f)
+#     return file_path
 
 
 def write_list_to_txt(input_list: list, path: str, name: str):
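A quick round-trip sketch of the reworked pickle helpers (paths are illustrative):

import pandas as pd
from opsci_toolbox.helpers.common import write_pickle, load_pickle

df = pd.DataFrame({"a": [1, 2, 3]})
file_path = write_pickle(df, "data", "sample")  # writes data/sample.pickle and returns the path
assert load_pickle(file_path).equals(df)        # load_pickle is now a thin wrapper around pd.read_pickle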
@@ -842,3 +854,16 @@ def top_rows_per_category(df, col_to_sort, col_to_gb, cols_to_keep, top_rows) :
             .reset_index(drop=True)
             )[cols_to_keep]
     return df_gb
+
+def format_number(number):
+    """
+    Function to format a number in K, M or B
+    """
+    if number < 1000:
+        return str(number)
+    elif number < 1000000:
+        return f"{number / 1000:.1f}K"
+    elif number < 1000000000:
+        return f"{number / 1000000:.1f}M"
+    else:
+        return f"{number / 1000000000:.1f}B"
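The new format_number helper rounds to one decimal and picks a K/M/B suffix; for example:

from opsci_toolbox.helpers.common import format_number

print(format_number(950))            # '950'
print(format_number(12_500))         # '12.5K'
print(format_number(3_400_000))      # '3.4M'
print(format_number(7_200_000_000))  # '7.2B'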
opsci_toolbox/helpers/dataviz.py
CHANGED
@@ -282,7 +282,7 @@ def get_convex_hull_coord(points: np.array, interpolate_curve: bool = True) -> t
 
     # return fig
 
-def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color, col_size, col_text, title="Scatter Plot", x_axis_label="X-axis", y_axis_label="Y-axis", width=1000, height=1000, xaxis_range=None, yaxis_range=None,
+def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color, col_size, col_text, col_legend = [], title="Scatter Plot", x_axis_label="X-axis", y_axis_label="Y-axis", width=1000, height=1000, xaxis_range=None, yaxis_range=None,
                         size_value =4, opacity=0.8, maxdisplayed=0, mode = "markers", textposition="bottom center", plot_bgcolor=None, paper_bgcolor=None, yaxis_showgrid = False, xaxis_showgrid = False, color="indianred", line_width=0.5, line_color="white", colorscale='Viridis', showscale=True, template="plotly"):
     """
     Create a scatter plot :
@@ -327,8 +327,9 @@ def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color
             size = df[df[col_category] == category][col_size]
             hovertemplate += '<br><b>'+col_size+'</b>:'+size.astype(str)
 
-        if
-
+        if len(col_legend)>0:
+            for c in col_legend:
+                hovertemplate +='<br><b>'+str(c)+'</b>:'+ df[df[col_category]==category][c].astype(str).apply(wrap_text)
 
         fig.add_trace(
             go.Scatter(
@@ -365,13 +366,16 @@ def create_scatter_plot(df, col_x, col_y, col_category, color_palette, col_color
     else :
         if color is None:
             color = generate_random_hexadecimal_color()
-        if
-
+        if len(col_legend)>0:
+            for c in col_legend:
+                hovertemplate +='<br><b>'+str(c)+'</b>:'+ df[c].astype(str).apply(wrap_text)
 
         fig = go.Figure( go.Scatter(
             x=df[col_x],
             y=df[col_y],
             mode=mode,
+            text = df[col_text],
+            textposition=textposition,
             marker=dict(color=color, #dots color
                         size=size, #dots size
                         opacity=opacity, #dots opacity
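With the new col_legend parameter, extra columns can be appended to the hover template. A sketch with illustrative column names (the dataframe is hypothetical):

fig = create_scatter_plot(
    df, col_x="x", col_y="y", col_category="topic",
    color_palette={"A": "#1f77b4", "B": "#ff7f0e"},
    col_color=None, col_size="engagement", col_text="title",
    col_legend=["author", "created_time"],  # each listed column is wrapped and added to the hover text
)
fig.show()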
@@ -582,7 +586,8 @@ def scatter3D(df, col_x, col_y, col_z, col_category, color_palette, col_size, co
 
     return fig
 
-def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
+
+def fig_bar_trend(df, col_x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
     """
     Display a graph that combine bar and trend chart to compare 2 metrics :
     - x = x axis data
@@ -597,42 +602,43 @@ def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1"
     - opacity = opacity of bars
     """
 
-    nk = np.empty(shape=(len(x), 3, 1), dtype="object")
-    nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
-    nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
-    nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
+    # nk = np.empty(shape=(len(x), 3, 1), dtype="object")
+    # nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
+    # nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
+    # nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
 
     fig = make_subplots(specs=[[{"secondary_y": True}]])
 
     fig.add_trace(
         go.Scatter(
-            x=x,
-            y=trend_measure,
+            x=df[col_x].apply(wrap_text),
+            y=df[trend_measure],
             name=trend_name,
             mode='lines',
             line_color=line_color,
             line_width=4,
             textfont=dict(size=8),
-            customdata=nk,
-            hovertemplate=("<br>"+x_name+"
+            # customdata=nk,
+            hovertemplate=("<br>"+x_name+" :"+df[col_x].astype(str)+"<br>"+bar_name+" - "+df[bar_measure].astype(str)+"<br>"+trend_name+" : "+df[trend_measure].astype(str)+"<extra></extra>"),
         ),
         secondary_y=True,
     )
     # Add traces
     fig.add_trace(
         go.Bar(
-            x=x,
-            y = bar_measure,
+            x=df[col_x].apply(wrap_text),
+            y = df[bar_measure],
             name=bar_name,
             marker_color=marker_color,
             opacity=opacity,
-            hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+            # customdata=nk,
+            hovertemplate=("<br>"+x_name+" :"+df[col_x].astype(str)+"<br>"+bar_name+" - "+df[bar_measure].astype(str)+"<br>"+trend_name+" : "+df[trend_measure].astype(str)+"<extra></extra>"),
         ),
         secondary_y=False,
 
     )
-    first_axis_range=[-0.5,bar_measure.max()*1.01]
-    secondary_axis_range=[-0.5,trend_measure.max()*1.01]
+    first_axis_range=[-0.5,df[bar_measure].max()*1.01]
+    secondary_axis_range=[-0.5,df[trend_measure].max()*1.01]
 
     # Add figure title
     fig.update_layout(
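The signature change means fig_bar_trend now takes the dataframe plus column names rather than raw series. A sketch with illustrative data:

import pandas as pd

df = pd.DataFrame({
    "week": ["W1", "W2", "W3"],
    "posts": [120, 90, 150],       # bar metric
    "resonance": [3.2, 4.1, 2.7],  # trend metric
})
fig = fig_bar_trend(df, col_x="week", bar_measure="posts", trend_measure="resonance",
                    x_name="Week", bar_name="Posts", trend_name="Resonance")
fig.show()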
@@ -668,6 +674,92 @@ def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1"
     return fig
 
 
+# def fig_bar_trend(x, bar_measure, trend_measure, x_name="X", bar_name ="metric1", trend_name = "metric2", marker_color='lightpink', line_color='indianred', title_text="Couverture & Résonance", width=1500, height=700, xaxis_tickangle=0, opacity=0.8, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
+#     """
+#     Display a graph that combine bar and trend chart to compare 2 metrics :
+#     - x = x axis data
+#     - bar_measure = data represented as bar diagram
+#     - trend_measure = data represented as trend line
+#     - x_name / bar_name / trend_name : axis labels
+#     - marker_color = color code for bars
+#     - line_color = color code for trend line
+#     - title_text = graph title
+#     - width / height = size of plot
+#     - xaxis_tickangle = angle for x ticks
+#     - opacity = opacity of bars
+#     """
+
+#     nk = np.empty(shape=(len(x), 3, 1), dtype="object")
+#     nk[:, 0] = np.array(x.apply(lambda txt: '<br>'.join(textwrap.wrap(str(txt), width=50)))).reshape(-1, 1)
+#     nk[:, 1] = np.array(bar_measure).reshape(-1, 1)
+#     nk[:, 2] = np.array(trend_measure).reshape(-1, 1)
+
+#     fig = make_subplots(specs=[[{"secondary_y": True}]])
+
+#     fig.add_trace(
+#         go.Scatter(
+#             x=x,
+#             y=trend_measure,
+#             name=trend_name,
+#             mode='lines',
+#             line_color=line_color,
+#             line_width=4,
+#             textfont=dict(size=8),
+#             customdata=nk,
+#             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+#         ),
+#         secondary_y=True,
+#     )
+#     # Add traces
+#     fig.add_trace(
+#         go.Bar(
+#             x=x,
+#             y = bar_measure,
+#             name=bar_name,
+#             marker_color=marker_color,
+#             opacity=opacity,
+#             hovertemplate=("<br>"+x_name+" :%{customdata[0]}<br>"+bar_name+" - %{customdata[1]}<br>"+trend_name+":%{customdata[2]}"+"<extra></extra>"),
+#         ),
+#         secondary_y=False,
+
+#     )
+#     first_axis_range=[-0.5,bar_measure.max()*1.01]
+#     secondary_axis_range=[-0.5,trend_measure.max()*1.01]
+
+#     # Add figure title
+#     fig.update_layout(
+
+#         title_text=title_text,
+#         showlegend=True,
+#         width = width,
+#         height= height,
+#         xaxis_tickangle=xaxis_tickangle,
+#         xaxis_showline=False,
+#         xaxis_showgrid=False,
+#         yaxis_showline=False,
+#         yaxis_showgrid=False,
+#         font_family="Segoe UI Semibold",
+#         template=template,
+#         plot_bgcolor=plot_bgcolor, #background color (plot)
+#         paper_bgcolor=paper_bgcolor, #background color (around plot)
+#         margin=dict(
+#             t=width / 15,
+#             b=width / 20,
+#             r=width / 20,
+#             l=width / 20,
+#         ),
+#     )
+
+#     # # Set x-axis title
+#     fig.update_xaxes(title_text=x_name)
+
+#     # Set y-axes titles
+#     fig.update_yaxes(title_text=bar_name, range = first_axis_range, secondary_y=False)
+#     fig.update_yaxes(title_text=trend_name, range = secondary_axis_range, secondary_y=True)
+
+#     return fig
+
+
 def density_map(df_posts,
                 df_dots,
                 df_topics,
@@ -947,16 +1039,16 @@ def bar_subplots(df, col_x, col_y, col_cat, color_palette, n_cols=4, n_top_words
 
     # fine tune parameter according to the text position provided
     if textposition == 'inside':
-        horizontal_spacing = (horizontal_spacing /
+        horizontal_spacing = (horizontal_spacing / n_cols)/2
     else:
-        horizontal_spacing = (horizontal_spacing /
+        horizontal_spacing = (horizontal_spacing / n_cols)
 
     # create subplots
     fig = make_subplots(
         rows = n_rows, # number of rows
         cols = n_cols, # number of columns
         subplot_titles = list(categories), # title for each subplot
-        vertical_spacing = vertical_spacing /
+        vertical_spacing = vertical_spacing / n_rows, # space between subplots
         horizontal_spacing = horizontal_spacing # space between subplots
     )
 
@@ -1040,8 +1132,6 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
 
     # user define a number of columns, we compute the number of rows requires
     n_rows = math.ceil(len(categories) / n_cols)
-
-    horizontal_spacing = (horizontal_spacing / n_rows)
 
     specs = [[{'type':'domain'}] * n_cols] * n_rows
     # create subplots
@@ -1049,8 +1139,8 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
         rows=n_rows,
         cols=n_cols,
         subplot_titles=list(categories),
-        horizontal_spacing=horizontal_spacing,
-        vertical_spacing=vertical_spacing,
+        horizontal_spacing=horizontal_spacing / n_cols,
+        vertical_spacing=vertical_spacing / n_rows,
         specs=specs
     )
 
@@ -1103,7 +1193,7 @@ def pie_subplots(df, col_x, col_y, col_cat, col_color, n_cols=4, horizontal_spac
     return fig
 
 
-def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color, title_text = "Sentiment per topic", width=1200, height=1200, xaxis_tickangle=0, horizontal_spacing = 0
+def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color, title_text = "Sentiment per topic", width=1200, height=1200, xaxis_tickangle=0, horizontal_spacing = 0, vertical_spacing = 0.08, plot_bgcolor=None, paper_bgcolor=None, template = "plotly"):
 
     categories = df[col_cat].unique()
 
@@ -1112,8 +1202,8 @@ def horizontal_stacked_bars(df, col_x, col_y, col_percentage, col_cat, col_color
         rows = 1, # number of rows
         cols = 2, # number of columns
         # subplot_titles = list(categories), # title for each subplot
-        vertical_spacing = vertical_spacing
-        horizontal_spacing =
+        vertical_spacing = vertical_spacing, # space between subplots
+        horizontal_spacing = horizontal_spacing / n_cols # space between subplots
     )
 
     for cat in categories:
@@ -1688,4 +1778,19 @@ def add_shape(fig, shape_type = "rect", x0= -1, y0= -1, x1 = 0, y1=0, fillcolor=
 
         }
     )
+    return fig
+
+def add_image(fig, xref = "paper", yref = "paper", x = 0, y=0, sizex = 0.08, sizey=0.08, xanchor="right", yanchor="bottom", source = "data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNDc1IiBoZWlnaHQ9IjM4OCIgdmlld0JveD0iMCAwIDQ3NSAzODgiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxwYXRoIGQ9Ik0xMDUuNzI3IDI5My4zOTFDMTA1LjcyNyAyNjYuNzc0IDg0LjEyOTMgMjQ1LjE3NyA1Ny42MDEzIDI0NS4xNzdDMzAuOTg0IDI0NS4xNzcgOS4yOTYgMjY2Ljc3NCA5LjI5NiAyOTMuMzkxQzkuMjk2IDMyMC4wMDkgMzAuOTg0IDM0MS42MDcgNTcuNjAxMyAzNDEuNjA3Qzg0LjEyOTMgMzQxLjYwNyAxMDUuNzI3IDMyMC4wMDkgMTA1LjcyNyAyOTMuMzkxWk0wLjg3MDY2NyAyOTMuMzkxQzAuODcwNjY3IDI2Mi4yMDMgMjYuMzI0IDIzNi43NTMgNTcuNjAxMyAyMzYuNzUzQzg4LjY5ODcgMjM2Ljc1MyAxMTQuMTUxIDI2Mi4yMDMgMTE0LjE1MSAyOTMuMzkxQzExNC4xNTEgMzI0LjU3OSA4OC42OTg3IDM1MC4wMyA1Ny42MDEzIDM1MC4wM0MyNi4zMjQgMzUwLjAzIDAuODcwNjY3IDMyNC41NzkgMC44NzA2NjcgMjkzLjM5MVoiIGZpbGw9ImJsYWNrIi8+CjxwYXRoIGQ9Ik0yMzIuNTMxIDI5My40ODFDMjMyLjUzMSAyNjMuNjM3IDIwOS4zMTkgMjQ1LjI2NSAxODYuMjg2IDI0NS4yNjVDMTY2LjU3IDI0NS4yNjUgMTQ3LjQ4MiAyNTguNjIgMTQ1LjI0MSAyODAuMDM4VjMwNi42NTZDMTQ3LjM5MyAzMjguOTcgMTY2LjM5MSAzNDEuNjk2IDE4Ni4yODYgMzQxLjY5NkMyMDkuMzE5IDM0MS42OTYgMjMyLjUzMSAzMjMuMzI1IDIzMi41MzEgMjkzLjQ4MVpNMjQwLjg2NiAyOTMuNDgxQzI0MC44NjYgMzI4LjA3NCAyMTQuNjk3IDM1MC4xMiAxODcuMTgzIDM1MC4xMkMxNjkuOTc3IDM1MC4xMiAxNTMuNTc1IDM0Mi4zMjQgMTQ1LjI0MSAzMjcuNjI1VjM4Ny40OTNIMTM2Ljk5N1YyMzkuNjJIMTQ0Ljg4M0wxNDUuMjQxIDI1Ny41NDRWMjYwLjE0MkMxNTMuNjY2IDI0NS42MjQgMTcwLjE1NSAyMzYuODQyIDE4Ny4yNzMgMjM2Ljg0MkMyMTQuNjA3IDIzNi44NDIgMjQwLjg2NiAyNTguODg4IDI0MC44NjYgMjkzLjQ4MVoiIGZpbGw9ImJsYWNrIi8+CjxwYXRoIGQ9Ik0yNTUuNjQyIDMyOC40MzNMMjYwLjc1MSAzMjIuNzg4QzI2OC4xMDEgMzM1LjUxMyAyODEuMDk1IDM0MS45NjUgMjk0LjE3OCAzNDEuOTY1QzMwOC41MTggMzQxLjk2NSAzMjMuMTI2IDMzMy42MyAzMjMuMTI2IDMxOS41NjFDMzIzLjEyNiAzMDUuNDkgMzA0LjkzNCAyOTkuNjY1IDI4OS43ODcgMjkzLjc0OUMyODAuMzc4IDI4OS45ODYgMjYwLjc1MSAyODMuMzUzIDI2MC43NTEgMjY0LjYyNEMyNjAuNzUxIDI0OS41NjggMjc0LjI4MyAyMzYuNjYyIDI5NC4yNjkgMjM2LjY2MkMzMDkuODYyIDIzNi42NjIgMzIzLjEyNiAyNDUuMzU0IDMyNy41MTggMjU2LjM3OEwzMjEuNjAzIDI2MS4wMzhDMzE2LjMxNSAyNDkuODM3IDMwNC4yMTcgMjQ0LjkwNiAyOTQuMDAxIDI0NC45MDZDMjc5LjEyMiAyNDQuOTA2IDI2OS4xNzQgMjU0LjEzNyAyNjkuMTc0IDI2NC4yNjVDMjY5LjE3NCAyNzcuNDQgMjg0LjIzMSAyODIuOTA1IDI5OS4xMDkgMjg4LjU1MkMzMTEuMDI3IDI5My4yMTIgMzMxLjU1MSAzMDAuNjUgMzMxLjU1MSAzMTkuMDIyQzMzMS41NTEgMzM4LjExMiAzMTMuMjY5IDM1MC4yMSAyOTQuMDAxIDM1MC4yMUMyNzYuNzAzIDM1MC4yMSAyNjEuODI3IDM0MC40NDIgMjU1LjY0MiAzMjguNDMzWiIgZmlsbD0iYmxhY2siLz4KPHBhdGggZD0iTTM0Ni43OCAyOTMuMzkxQzM0Ni43OCAyNTguNTMgMzc1LjAxMSAyMzYuMDM0IDQwMy4yNDEgMjM2LjAzNEM0MTUuNzg4IDIzNi4wMzQgNDMwLjMwNyAyNDAuNTE3IDQzOS45ODUgMjQ4LjU4Mkw0MzUuMzI1IDI1NS40ODJDNDI4Ljc4MyAyNDkuMjk5IDQxNS41MiAyNDQuNDU5IDQwMy4zMzEgMjQ0LjQ1OUMzNzkuMTMzIDI0NC40NTkgMzU1LjIwNCAyNjMuNDU5IDM1NS4yMDQgMjkzLjM5MUMzNTUuMjA0IDMyMy41OTMgMzc5LjQwMyAzNDIuMzIzIDQwMy4yNDEgMzQyLjMyM0M0MTUuNjA4IDM0Mi4zMjMgNDI5LjIzMSAzMzcuMTI2IDQzNi4yMjEgMzMwLjQ5NEw0NDEuMzI5IDMzNy4xMjZDNDMxLjQ3MiAzNDYuMTc4IDQxNi40MTYgMzUwLjc0OSA0MDMuNDIgMzUwLjc0OUMzNzUuMSAzNTAuNzQ5IDM0Ni43OCAzMjguNDMzIDM0Ni43OCAyOTMuMzkxWiIgZmlsbD0iYmxhY2siLz4KPHBhdGggZD0iTTQ2My42MzcgMjM5LjYxOUg0NzIuMDYxVjM0Ny4xNjNINDYzLjYzN1YyMzkuNjE5Wk00NjEuMTI4IDIxMi40NjRDNDYxLjEyOCAyMDguNzAxIDQ2NC4wODUgMjA1Ljc0MyA0NjcuODQ5IDIwNS43NDNDNDcxLjUyNCAyMDUuNzQzIDQ3NC41NzEgMjA4LjcwMSA0NzQuNTcxIDIxMi40NjRDNDc0LjU3MSAyMTYuMjI4IDQ3MS41MjQgMjE5LjE4NSA0NjcuODQ5IDIxOS4xODVDNDY0LjA4NSAyMTkuMTg1IDQ2MS4xMjggMjE2LjIyOCA0NjEuMTI4IDIxMi40NjRaIiBmaWxsPSJibGFjayIvPgo8cGF0aCBkPSJNMjE3Ljg1MyAzMS4zOTE0TDIzNy43MjEgNTEuMjU4TDI1Ny41ODggMzEuMzkxNEwyMzcuNzIxIDExLjUyNDdMMjE3Ljg1MyAzMS4zOTE0Wk0yMzcuNzIxIDYyLjU3MjdMMjA2LjU0IDMxLjM5MTRMMjM3LjcyMSAwLjIwMDE2TDI2OC45MDEgMzEuMzkxNEwyMzcuNzIxIDYyLjU3MjdaTTE1NC4xMDEgNTkuNzU5NEwxNjEuNDM5IDg2Ljk2NDdMMTg4LjY2IDc5LjYyMkwxODEuMzIzIDUyLjU5NTRMMTU0LjEwMSA1OS43NTk0Wk0xNTUuNzk3IDk2Ljc3MTRMMTQ0LjI4IDU0LjA3MTRMMTg2Ljk2MyA0Mi44Mzk0TDE5OC40ODEgODUuMjU4TDE1NS43OTcgOTYuNzcxNFpNMjg2Ljc4MSA3OS42MjJMMzE0LjAwMyA4Ni45NjQ3TDMyMS4zNDEgNTkuNzU5NEwyOTQuMTIgNTIuNTk1NEwyODYuNzgxIDc5LjYyMlpNMzE5LjY0MyA5Ni43NzE0TDI3Ni45NjEgODUuMjU4TDI4OC40NzkgNDIuODM5NEwzMzEuMTYyIDU0LjA3MTRMMzE5LjY0MyA5Ni43NzE0Wk0xNTQuMTAxIDE1Ni4xNjlMMTgxLjMyMyAxNjMuMzMzTDE4OC42NiAxMzYuMzA3TDE2MS40MzkgMTI4Ljk2NUwxNTQuMTAxIDE1Ni4xNjlaTTE4Ni45NjMgMTczLjA4OUwxNDQuMjggMTYxLjg1N0wxNTUuNzk3IDExOS4xNTdMMTk4LjQ4MSAxMzAuNjdMMTg2Ljk2MyAxNzMuMDg5Wk0yODYuNzc1IDEzNi4zMDlMMjk0LjEyIDE2My41MzdMMzIxLjM0OCAxNTYuMTkzTDMxNC4wMDMgMTI4Ljk2NUwyODYuNzc1IDEzNi4zMDlaTTI4OC40NzkgMTczLjM0NUwyNzYuOTY3IDEzMC42NjlMMzE5LjY0MyAxMTkuMTU3TDMzMS4xNTUgMTYxLjgzNEwyODguNDc5IDE3My4zNDVaTTIxNy44NTMgMTg0LjUzN0wyMzcuNzIxIDIwNC40MDVMMjU3LjU4OCAxODQuNTM3TDIzNy43MjEgMTY0LjY3TDIxNy44NTMgMTg0LjUzN1pNMjM3LjcyMSAyMTUuNzE4TDIwNi41NCAxODQuNTM3TDIzNy43MjEgMTUzLjM1N0wyNjguOTAxIDE4NC41MzdMMjM3LjcyMSAyMTUuNzE4WiIgZmlsbD0iYmxhY2siLz4KPC9zdmc+Cg=="):
+    fig.add_layout_image(
+        dict(
+            source=source,
+            xref=xref,
+            yref=yref,
+            x=x, y=y,
+            sizex=sizex,
+            sizey=sizey,
+            xanchor=xanchor,
+            yanchor=yanchor
+        )
+    )
     return fig
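A short sketch of the new add_image helper (the default source is the base64-encoded SVG logo shown above; any image URL or data URI should work):

import plotly.graph_objects as go
from opsci_toolbox.helpers.dataviz import add_image

fig = go.Figure(go.Scatter(x=[0, 1], y=[0, 1]))
fig = add_image(fig, x=1, y=0, sizex=0.1, sizey=0.1)  # paper coordinates; anchored bottom-right by default
fig.show()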
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -613,6 +613,10 @@ def load_spacy_model(model, disable_components=["transformer", "morphologizer",
     will be included in the spaCy pipeline.
 
     """
+    if torch.cuda.is_available():
+
+        spacy.prefer_gpu()
+
     if len(disable_components)>0:
         nlp = spacy.load(model, disable=disable_components)
     else:
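In practice the change is transparent to callers; the loader simply prefers the GPU when one is visible (the model name is illustrative):

from opsci_toolbox.helpers.nlp import load_spacy_model

nlp = load_spacy_model("fr_core_news_lg")  # calls spacy.prefer_gpu() first when torch sees a CUDA device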
@@ -1345,18 +1349,23 @@ def df_transform_column_as_list(column):
 def check_gpu():
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     return device
-
-def
-
-
-
-
-
-
+
+def HF_load_model(model_checkpoint):
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+    if torch.cuda.is_available():
+        model.cuda()
+    return model, tokenizer
+
+def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json):
+    """ Calculate sentiment of a text. `return_type` can be 'label', 'score' or 'proba' """
     file_path= os.path.join(dir_json , str(filename)+'.json')
     if not os.path.exists(file_path):
-
-
-
-
-
+        with torch.no_grad():
+            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
+            proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]
+        label = model.config.id2label[proba.argmax()]
+        results = {"label":label, "score" : float(proba.max()), col_text : text}
+        print(results)
+        write_json(results, dir_json , str(filename))
+        return results
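A minimal sketch of the new Hugging Face helpers (the checkpoint is illustrative; any sequence-classification checkpoint should work):

from opsci_toolbox.helpers.nlp import HF_load_model, HF_sentiment_classifier

model, tokenizer = HF_load_model("cardiffnlp/twitter-xlm-roberta-base-sentiment")
results = HF_sentiment_classifier(tokenizer, model, "great product!", col_text="text",
                                  filename="post_001", dir_json="data/json")
# results -> {"label": ..., "score": ..., "text": "great product!"}, also written to data/json/post_001.json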
opsci_toolbox/helpers/nlp_cuml.py
ADDED

@@ -0,0 +1,171 @@
+from cuml import UMAP
+from cuml.cluster.hdbscan import HDBSCAN, all_points_membership_vectors, approximate_predict, membership_vector
+import numpy as np
+from tqdm import tqdm
+import os
+from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle
+
+def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dist = 0.0, metric = "cosine", spread = 1.0):
+    reducer = UMAP(n_neighbors=n_neighbors,
+                   n_components=n_components,
+                   min_dist=min_dist,
+                   metric=metric,
+                   spread = spread).fit(embeddings)
+
+    reduced_embeddings = reducer.transform(embeddings)
+    return reducer, reduced_embeddings
+
+def transform_with_cuml_UMAP(reducer, new_embeddings):
+    """
+    Transform new data points using a UMAP object
+    """
+    reduced_embeddings = reducer.transform(new_embeddings)
+    return reduced_embeddings
+
+
+def hdbscan_cuml_clustering(embeddings, min_cluster_size=5, min_samples=None, max_cluster_size = 0, metric='euclidean', alpha=1.0, p=2, cluster_selection_epsilon=0.0, cluster_selection_method='eom',
+                            approx_min_span_tree=True, gen_min_span_tree = False, gen_condensed_tree = False, gen_single_linkage_tree_ = False, prediction_data=True):
+
+    """
+    Parameters:
+        embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+            The input data to be clustered.
+        min_cluster_size : int, optional
+            The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
+        min_samples : int or None, optional
+            The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If 'None', it defaults to the min_cluster_size.
+        max_cluster_size : int, optional (default=0)
+            A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
+            Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+        metric : str or callable, optional
+            The metric to use for distance computation. Default is 'euclidean'.
+        alpha : float, optional
+            distance scaling parameter as used in robust single linkage.
+        p : int, optional
+            The Minkowski p-norm distance metric parameter. Default is None.
+        cluster_selection_epsilon : float, optional
+            A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+        cluster_selection_method : {'eom', 'leaf'}, optional
+            The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
+        approx_min_span_tree : bool, optional
+            Whether to compute an approximation of the minimum spanning tree. Default is True.
+        gen_min_span_tree : bool, optional
+            Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed
+        gen_condensed_tree : bool, optional
+            Whether to populate the condensed_tree_ member for utilizing plotting tools.
+        gen_single_linkage_tree_ : bool
+            Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
+        prediction_data : bool, optional
+            Whether the data is prediction data or not. Default is True.
+
+    Returns:
+        clusterer : hdbscan.hdbscan_.HDBSCAN
+            HDBSCAN clusterer object.
+        labels : array, shape (n_samples,)
+            Cluster labels for each point. Noisy samples are given the label -1.
+        probabilities : array, shape (n_samples,)
+            The probability of each sample being an outlier.
+
+    Description:
+        This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
+        It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
+        probability of each sample being an outlier.
+    """
+    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
+                        min_samples=min_samples,
+                        max_cluster_size = max_cluster_size,
+                        metric=metric,
+                        alpha=alpha,
+                        p=p,
+                        cluster_selection_epsilon=cluster_selection_epsilon,
+                        cluster_selection_method=cluster_selection_method,
+                        approx_min_span_tree=approx_min_span_tree,
+                        gen_min_span_tree = gen_min_span_tree,
+                        gen_condensed_tree = gen_condensed_tree,
+                        gen_single_linkage_tree_ = gen_single_linkage_tree_,
+                        prediction_data=prediction_data)
+
+    clusterer.fit_predict(embeddings)
+
+    return clusterer, clusterer.labels_, clusterer.probabilities_
+
+def transform_with_cuml_HDBSCAN(clusterer, new_embeddings):
+    """
+    Transform new data points using a HDBSCAN object
+    """
+    new_data_topic, new_data_proba = approximate_predict(clusterer, new_embeddings)
+    return new_data_topic, new_data_proba
+
+
+def cuml_soft_clustering(clusterer):
+    """
+    HDBSCAN SOFT CLUSTERING
+    """
+    soft_clusters = all_points_membership_vectors(clusterer)
+    soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
+    soft_clusters_proba = [np.max(x) for x in soft_clusters]
+    return soft_clusters_val, soft_clusters_proba
+
+
+def soft_cuml_clustering_new_data(clusterer, embeddings):
+    """
+    PREDICT NEW DATA POINTS HDBSCAN SOFT CLUSTERING
+    """
+    soft_clusters =membership_vector(clusterer, embeddings)
+    soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
+    soft_clusters_proba = [np.max(x) for x in soft_clusters]
+    return soft_clusters_val, soft_clusters_proba
+
+def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, reencode = False):
+
+    new_file_paths=[]
+    for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
+
+        filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
+        new_filename = filename+"_reduce_embeddings.pickle"
+        new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
+
+        if not os.path.exists(new_file_path) or reencode:
+            df = load_pickle(file_path)
+            create_dir(path_reduced_embeddings_id)
+            # embeddings = df["embeddings"].to_list()
+            embeddings = np.vstack(df['embeddings'].values)
+            reduced_embeddings = transform_with_cuml_UMAP(reducer, embeddings)
+            reduced_embeddings_transformed=[list(e) for e in reduced_embeddings]
+            df['reduced_embeddings'] = reduced_embeddings_transformed
+            df.drop(columns=["embeddings"], inplace=True)
+            print(path_reduced_embeddings_id, filename+"_reduce_embeddings")
+            write_pickle(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
+            new_file_paths.append(new_file_path)
+        else:
+            print("REDUCED EMBEDDINGS ALREADY EXISTS", file_path)
+            new_file_paths.append(new_file_path)
+    return new_file_paths
+
+
+
+def process_HDBSCAN(clusterer, reduced_embeddings_paths, path_predictions_dataset_id, run_soft_clustering= False, reencode = False):
+    new_file_paths=[]
+    for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
+
+        filename = os.path.splitext(os.path.basename(file_path))[0][:-18]
+        new_filename = filename+ "_predictions.pickle"
+        new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
+        if not os.path.exists(new_file_path) or reencode:
+            df = load_pickle(file_path)
+            # reduced_embeddings = df["reduced_embeddings"].to_list()
+            reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+            topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)
+            df["topic"]=topics.astype(int).astype(str)
+            df["proba"]=probas
+            if run_soft_clustering:
+                soft_clusters, soft_proba = soft_cuml_clustering_new_data(clusterer, np.array(reduced_embeddings))
+                df["soft_topic"]=soft_clusters
+                df["soft_proba"]=soft_proba
+
+            write_pickle(df, path_predictions_dataset_id, filename+ "_predictions")
+            new_file_paths.append(new_file_path)
+        else:
+            print("CLUSTERING ALREADY EXISTS", file_path)
+            new_file_paths.append(new_file_path)
+    return new_file_paths
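A pipeline sketch for the new module (requires a RAPIDS/cuML environment with a CUDA GPU; the random embeddings are illustrative):

import numpy as np
from opsci_toolbox.helpers.nlp_cuml import (
    reduce_with_cuml_UMAP, transform_with_cuml_UMAP,
    hdbscan_cuml_clustering, transform_with_cuml_HDBSCAN,
)

embeddings = np.random.rand(1000, 384).astype(np.float32)
reducer, reduced = reduce_with_cuml_UMAP(embeddings, n_components=5)
clusterer, labels, probas = hdbscan_cuml_clustering(reduced, min_cluster_size=10)

# project and label new points with the fitted models
new_reduced = transform_with_cuml_UMAP(reducer, np.random.rand(10, 384).astype(np.float32))
new_topics, new_probas = transform_with_cuml_HDBSCAN(clusterer, new_reduced)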
opsci_toolbox/helpers/surreaction.py
ADDED

@@ -0,0 +1,114 @@
+import pandas as pd
+from tqdm import tqdm
+
+def generate_index(df, col_author_id ='author_id', col_date='created_time'):
+    """
+    Generates an index based on user_id and date
+    """
+    res=[]
+    for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="generation des index"):
+        new_index=".".join([ str(i) for i in [ row[col_author_id], row[col_date].year, row[col_date].month, row[col_date].day]])
+        res.append(new_index)
+    df["index"]=res
+
+    return df
+
+def avg_performance(df,
+                    col_date='created_time',
+                    col_author_id='author_id',
+                    col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
+                                    'sad', 'angry','total_engagement', 'replies', 'percentage_replies'],
+                    rolling_period='7D'):
+
+    """
+    Function to compute average performance on a rolling period for a list of metrics
+    """
+
+    # Clean-up, just in case
+    df[col_date] = pd.to_datetime(df[col_date])
+    df = df.sort_values([col_author_id, col_date])
+
+    # The pivot is the created_time column, so we set it as the index.
+    # Then we group by author_id, keeping the value columns.
+    # We apply a rolling mean over the window; pandas automatically uses the index (created_time) as the pivot.
+    # Then we flatten everything back.
+    average = df.set_index(col_date).groupby(col_author_id)[col_engagement].rolling(rolling_period).mean(numeric_only=True).reset_index()
+
+    # From the previous result, we simplify down to one day / author_id pair per row
+    average = average.set_index(col_date).groupby([col_author_id]).resample('1D').last(numeric_only=True).reset_index()
+
+    # We generate our composite indexes
+    df=generate_index(df, col_author_id =col_author_id, col_date=col_date)
+
+    average = generate_index(average, col_author_id = col_author_id, col_date=col_date)
+
+    # We merge
+    df = pd.merge(df, average[['index']+col_engagement], how='left', on=['index'], suffixes=('', '_avg'))
+
+    return df
+
+def kpi_reaction(df, cols):
+    """
+    This function takes a dataframe and a list of columns as input.
+    For each column, we compute the over-reaction rate.
+    """
+    for col in cols:
+        df['tx_'+col]=(df[col]-df[col+'_avg'])/(df[col]+df[col+'_avg'])
+    return df
+
+def get_reactions_type(df, cols, col_dest):
+    """
+    Conditional function to return the reaction type based on a list of metrics
+    """
+    all_val=[]
+
+    for i,row in tqdm(df.iterrows(), total=df.shape[0], desc="qualification des posts"):
+
+        str_val=''
+        count=0
+        for col in cols:
+            if row[col]>0:
+                str_val=str_val+' '+col.replace('tx_', 'sur-')
+                count=count+1
+        if count==0:
+            str_val="sous reaction"
+        if count==len(cols):
+            str_val="sur reaction totale"
+
+        all_val.append(str_val.strip())
+
+    df[col_dest]=all_val
+    return df
+
+def compute_surreaction(df, col_date, col_author_id, cols_sureaction_metrics, cols_typologie_sureaction, rolling_period_sureaction = '7D'):
+    """
+    Helpers to compute surreaction and return a dataframe with reaction rates and typology
+
+    """
+    # temporarily silence warnings
+    pd.options.mode.chained_assignment = None  # default='warn'
+    # compute the average performance for a list of metrics
+    df= avg_performance(
+        df,
+        col_date=col_date,
+        col_author_id=col_author_id,
+        col_engagement= cols_sureaction_metrics,
+        rolling_period=rolling_period_sureaction
+    )
+
+    # compute the over-reaction rates for our list of metrics
+    df=kpi_reaction(df, cols_sureaction_metrics)
+    cols_tx_engagement=['tx_'+c for c in cols_sureaction_metrics]
+    df[cols_tx_engagement]=df[cols_tx_engagement].fillna(-1)
+
+    # drop the columns holding the average performance (we shouldn't need them any more)
+    cols_to_drop = [c for c in df.columns if c.lower()[-4:] == '_avg']
+    df.drop(columns=cols_to_drop, inplace=True)
+
+    # categorise the reaction types
+    cols_typologie = ["tx_"+ col for col in cols_typologie_sureaction]
+    df=get_reactions_type(df, cols_typologie, 'type_engagement')
+
+    # re-enable warnings
+    pd.options.mode.chained_assignment = 'warn'  # default='warn'
+    return df
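A small end-to-end sketch of compute_surreaction (columns and values are illustrative; the listed metrics must exist in the dataframe):

import pandas as pd
from opsci_toolbox.helpers.surreaction import compute_surreaction

df = pd.DataFrame({
    "author_id": ["a", "a", "b"],
    "created_time": pd.to_datetime(["2024-01-01", "2024-01-05", "2024-01-02"]),
    "likes": [10, 50, 3],
    "shares": [1, 12, 0],
})
df = compute_surreaction(df, col_date="created_time", col_author_id="author_id",
                         cols_sureaction_metrics=["likes", "shares"],
                         cols_typologie_sureaction=["likes", "shares"])
# adds tx_likes / tx_shares over-reaction rates plus a type_engagement label per post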
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/METADATA
CHANGED

@@ -1,44 +1,41 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.2
+Version: 0.0.5
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
 Author-email: erwan@opsci.ai
 License: MIT
 Platform: UNKNOWN
-Requires-Dist: Pillow (
-Requires-Dist: Pillow (>=9.0.1)
+Requires-Dist: Pillow (<11.0.0,>=9.0.1)
 Requires-Dist: Requests (==2.31.0)
 Requires-Dist: beautifulsoup4 (==4.10.0)
 Requires-Dist: chart-studio (==1.1.0)
 Requires-Dist: eldar (==0.0.8)
 Requires-Dist: emoji (==2.10.1)
 Requires-Dist: google-api-python-client (==2.122.0)
-Requires-Dist: gspread (==6.1.
+Requires-Dist: gspread (==6.1.1)
 Requires-Dist: hdbscan (==0.8.33)
 Requires-Dist: jusText (==3.0.0)
-Requires-Dist: langchain (==0.1.
-Requires-Dist: matplotlib (==3.8.3)
+Requires-Dist: langchain (==0.1.20)
 Requires-Dist: matplotlib (>=3.5.1)
 Requires-Dist: networkx (==3.2.1)
 Requires-Dist: nltk (==3.8.1)
-Requires-Dist: numpy (
-Requires-Dist: numpy (>=1.21.5)
+Requires-Dist: numpy (<1.25.0,>=1.21.5)
 Requires-Dist: opencv-python-headless (==4.9.0.80)
 Requires-Dist: pandas (==1.5.3)
 Requires-Dist: plotly (==5.19.0)
-Requires-Dist: protobuf
+Requires-Dist: protobuf (==5.26.1)
 Requires-Dist: pyarrow (==14.0.2)
 Requires-Dist: python-louvain (==0.16)
 Requires-Dist: scikit-learn (==1.4.1.post1)
-Requires-Dist: scipy
+Requires-Dist: scipy (<2.0.0,>=1.8.0)
 Requires-Dist: sentence-transformers (==2.5.1)
 Requires-Dist: setuptools (==59.6.0)
 Requires-Dist: spacy (==3.7.4)
 Requires-Dist: spacy-language-detection (==0.2.1)
 Requires-Dist: spacymoji (==3.1.0)
-Requires-Dist: supervision (==0.
+Requires-Dist: supervision (==0.20.0)
 Requires-Dist: textacy (==0.13.0)
 Requires-Dist: torch (==2.0.1)
 Requires-Dist: tqdm (==4.66.2)
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/RECORD
CHANGED

@@ -1,19 +1,21 @@
 opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/rapidapi_helpers.py,sha256=
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=5QbF6ehsmmdTrzp7Q8cF5wrf4DmO91v8YexbybczyHA,23183
 opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
 opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=
+opsci_toolbox/helpers/common.py,sha256=41EsQ2pTwQYnUUM1ggwaPueFVj2Qcm_UG7o_Zj41FU8,26152
 opsci_toolbox/helpers/cv.py,sha256=z0HecreIi-vqiOGpDa4VVnHIX_rvkObngrqwTwkWT44,12403
-opsci_toolbox/helpers/dataviz.py,sha256=
+opsci_toolbox/helpers/dataviz.py,sha256=4wFi0wCMgvIEQEL8okiVJOWxz-eJq5cZ7svHoBbZjnk,77393
 opsci_toolbox/helpers/dates.py,sha256=yQm9pUQAeLTFNPcgeumhi8oErustQJhaoL_HqxSxhiA,996
-opsci_toolbox/helpers/nlp.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=LGW8CIjrkQvGLKEnxYu7RNrBNViQ5dUygK67EhkBHZo,57999
+opsci_toolbox/helpers/nlp_cuml.py,sha256=Mkbtl9ewbv3aa9rFvhH9VOM5Y0G-XIsXtR_6IeYpebY,9450
 opsci_toolbox/helpers/sna.py,sha256=D6nwgUgbuApXGpT2zoIMip8262hynEwfppVdvaZ4Qm0,8053
+opsci_toolbox/helpers/surreaction.py,sha256=k5hcZZlXnJ-zczRpwfwthggEgFCr9lQsHHKVOPlm7fc,4606
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.5.dist-info/METADATA,sha256=Nhp2oK-KXD4JVivU37-T_MsN-VJfbPtJsWlUq7Kp5-A,1566
+opsci_toolbox-0.0.5.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+opsci_toolbox-0.0.5.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.5.dist-info/RECORD,,
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/WHEEL
File without changes

{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.5.dist-info}/top_level.txt
File without changes