opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -2
- opsci_toolbox/apis/reddit.py +407 -0
- opsci_toolbox/apis/telegram.py +1125 -0
- opsci_toolbox/helpers/common.py +177 -5
- opsci_toolbox/helpers/dataviz.py +184 -26
- opsci_toolbox/helpers/dates.py +47 -1
- opsci_toolbox/helpers/gliner.py +88 -0
- opsci_toolbox/helpers/nlp.py +273 -15
- opsci_toolbox/helpers/nlp_cuml.py +44 -3
- opsci_toolbox/helpers/sna.py +1 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/METADATA +5 -2
- opsci_toolbox-0.0.14.dist-info/RECORD +26 -0
- opsci_toolbox-0.0.14.dist-info/dependency_links.txt +1 -0
- opsci_toolbox-0.0.12.dist-info/RECORD +0 -22
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.12.dist-info → opsci_toolbox-0.0.14.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/common.py
CHANGED
@@ -310,6 +310,22 @@ def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
         pickle.dump(data, f)
     return file_path
 
+def save_df_to_pickle(df: pd.DataFrame, path: str, filename: str) -> str:
+    """
+    Write a DataFrame into a pickle file.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to be written to the pickle file.
+        path (str): The directory where the pickle file will be saved.
+        filename (str): The name of the pickle file (without the extension).
+
+    Returns:
+        str: The full path to the saved pickle file.
+    """
+    file_path = os.path.join(path, filename + '.pickle')
+    df.to_pickle(file_path)
+    return file_path
+
 
 def write_list_to_txt(input_list: list, path: str, name: str) -> str:
     """
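A minimal usage sketch of the new helper (the DataFrame and the out/ directory are hypothetical, and the directory is assumed to exist):

    import pandas as pd
    from opsci_toolbox.helpers.common import save_df_to_pickle

    sales = pd.DataFrame({"day": ["2024-01-01"], "orders": [42]})
    file_path = save_df_to_pickle(sales, "out", "sales")  # writes out/sales.pickle
    print(file_path)  # out/sales.pickle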
@@ -587,7 +603,7 @@ def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:
 
     return files
 
-def copy_file(source_path: str, destination_path: str, new_filename: str =
+def copy_file(source_path: str, destination_path: str, new_filename: str = None) -> str:
     """
     Copy a file from a source path to a destination path.
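A hedged usage sketch of copy_file after this change (paths are hypothetical; the str return value is assumed to be the destination path):

    from opsci_toolbox.helpers.common import copy_file

    dest = copy_file("data/report.json", "backup", new_filename="report_v2.json")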
|
@@ -1319,13 +1335,13 @@ def categorize_percentiles(percentile: float) -> str:
     Categorizes a percentile value into a string representing its range.
 
     Args:
-
+        percentile (float): The percentile value (between 0 and 1).
 
     Returns:
-
+        str: The category of the percentile value.
 
     Raises:
-
+        ValueError: If the percentile value is outside the range [0, 1].
     """
     if not (0 <= percentile <= 1):
         raise ValueError("Percentile must be between 0 and 1 inclusive.")
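From the visible branches, the function maps a value in [0, 1] to a decile-range label and rejects anything outside that range:

    categorize_percentiles(0.85)  # -> '80-90%'
    categorize_percentiles(0.95)  # -> '90-100%'
    categorize_percentiles(1.2)   # raises ValueError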
@@ -1349,4 +1365,160 @@ def categorize_percentiles(percentile: float) -> str:
     elif percentile <= 0.9:
         return '80-90%'
     else:
-        return '90-100%'
+        return '90-100%'
+
+
+def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+    """
+    Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        columns_to_combine (list): List of column names to combine.
+        col_date (str): Name of the column containing dates.
+        date_format (str): Format of the dates in col_date.
+        rolling_period (str): Rolling period for grouping.
+        col_id (str): Name of the column containing unique IDs.
+        col_engagement (str): Name of the column containing engagement values.
+    Returns:
+        pd.DataFrame: The prepared DataFrame with combinations of columns.
+    """
+
+    df_wt_combinations = df.copy()
+    df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+
+    # Create all possible combinations of columns indexes
+    # all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+    # If no columns to combine, just use the date for grouping
+    if not columns_to_combine:
+        df_wt_combinations = (df_wt_combinations
+                              .set_index("date")
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+    else:
+        # # Create all possible combinations of columns indexes
+        # all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+        df_wt_combinations = (df_wt_combinations
+                              .set_index(["date"])
+                              .groupby(columns_to_combine)
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+
+    return df_wt_combinations
+
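A hedged usage sketch of the new prepare_data_combinations, resampling hypothetical posts per platform and week:

    import pandas as pd
    from opsci_toolbox.helpers.common import prepare_data_combinations

    posts = pd.DataFrame({
        "created": ["2024-01-01", "2024-01-09", "2024-01-10"],
        "platform": ["x", "x", "reddit"],
        "post_id": ["a", "b", "c"],
        "engagement": [10, 5, 7],
    })
    weekly = prepare_data_combinations(
        posts, ["platform"], col_date="created", date_format="%Y-%m-%d",
        rolling_period="W", col_id="post_id", col_engagement="engagement",
    )
    # one row per (platform, week): unique post_id count and summed engagement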
+# def create_combination_index(df : pd.DataFrame, columns : list, date_column : str, rolling_period : str) -> pd.MultiIndex:
+#     """
+#     Create all possible combinations of unique values from specified columns and date range.
+
+#     Args:
+#         df (pd.DataFrame): The input DataFrame
+#         columns (list): List of column names to create combinations from
+#         date_column (str): Name of the date column
+#         rolling_period (str): Frequency for date range (e.g., '1D', '1W', '1M')
+
+#     Returns:
+#         pd.MultiIndex: MultiIndex with all combinations
+#     """
+#     # Create a list to store unique values for each column
+#     unique_values = []
+
+#     # Get unique values for each specified column
+#     for col in columns:
+#         unique_values.append(df[col].unique())
+
+#     # Create date range
+#     date_range = pd.date_range(start=df[date_column].min(),
+#                                end=df[date_column].max(),
+#                                freq=rolling_period)
+
+#     # Add date range to the list of unique values
+#     unique_values.append(date_range)
+
+#     # Create MultiIndex from product of all unique values
+#     all_combinations = pd.MultiIndex.from_product(unique_values,
+#                                                   names=columns + [date_column])
+
+#     return all_combinations
+
+# def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+#     """
+#     Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+#     Args:
+#         df (pd.DataFrame): The input DataFrame.
+#         columns_to_combine (list): List of column names to combine.
+#         col_date (str): Name of the column containing dates.
+#         date_format (str): Format of the dates in col_date.
+#         rolling_period (str): Rolling period for grouping.
+#         col_id (str): Name of the column containing unique IDs.
+#         col_engagement (str): Name of the column containing engagement values.
+#     Returns:
+#         pd.DataFrame: The prepared DataFrame with combinations of columns.
+#     """
+
+#     df_wt_combinations = df.copy()
+#     df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+
+#     # Create all possible combinations of columns indexes
+#     all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+#     # If no columns to combine, just use the date for grouping
+#     if not columns_to_combine:
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index("date")
+#                               .groupby(pd.Grouper(freq=rolling_period))
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .fillna(0)
+#                               .reset_index())
+#     else:
+#         # # Create all possible combinations of columns indexes
+#         # # all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index(["date"])
+#                               .groupby([*columns_to_combine, pd.Grouper(freq=rolling_period)])
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .reindex(all_combinations, fill_value=0)
+#                               .reset_index())
+
+#     return df_wt_combinations
+
+def custom_ordering(df : pd.DataFrame, col_to_order : str, custom_order : list) -> pd.DataFrame:
+    """
+    Orders the values in a DataFrame column based on a custom order.
+    Args:
+        df (DataFrame): The DataFrame containing the column to be ordered.
+        col_to_order (str): The name of the column to be ordered.
+        custom_order (list): The custom order of values.
+    Returns:
+        DataFrame: The DataFrame with the column values ordered according to the custom order.
+    """
+    df[col_to_order] = pd.Categorical(df[col_to_order], categories=custom_order, ordered=True).to_numpy()
+    return df
+
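Note that custom_ordering builds an ordered Categorical and then converts it back with .to_numpy(), so the stored column is a plain array again. A short sketch of the underlying pandas idiom, keeping the categorical dtype so that sorting actually follows the custom order:

    df["size"] = pd.Categorical(df["size"], categories=["small", "medium", "large"], ordered=True)
    df = df.sort_values("size")  # small < medium < large, not alphabetical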
+def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
+    """
+    Calculates the total and percentage values for the given metrics based on the grouping columns.
+    Args:
+        df (DataFrame): The input DataFrame.
+        col_gb (list): Names of the columns to group by.
+        metrics (dict): A dictionary of metrics to calculate.
+    Returns:
+        DataFrame: The modified DataFrame with total and percentage values added.
+
+    """
+    # key=key binds each metric at definition time (avoids the late-binding closure pitfall)
+    percentage_agregations = {f'per_{key}': lambda x, key=key: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
+
+    df = (df.join(df.groupby(col_gb)
+                    .agg(metrics)
+                    .add_prefix("total_"), on=col_gb
+                  )
+            .assign(**percentage_agregations).fillna(0)
+          )
+
+    return df
+
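A hedged usage sketch of calcul_total_et_pourcentage (column names hypothetical):

    df = pd.DataFrame({"platform": ["x", "x", "reddit"], "engagement": [10, 30, 7]})
    out = calcul_total_et_pourcentage(df, ["platform"], {"engagement": "sum"})
    # adds total_engagement (sum per platform) and
    # per_engagement = engagement / total_engagement (0.25, 0.75, 1.0)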
opsci_toolbox/helpers/dataviz.py
CHANGED
@@ -673,19 +673,14 @@ def create_scatter_plot(
     """
     params = general_kwargs()
     params.update(kwargs)
-
     marker_color = params["marker_color"]
     marker_line_color = params["marker_line_color"]
     marker_size = params["marker_size"]
     col_hover = params["col_hover"]
-
     xaxis_range = params["xaxis_range"]
-
     yaxis_range = params["yaxis_range"]
 
-
     fig = go.Figure()
-
     if marker_line_color is None:
         marker_line_color = marker_color
 
@@ -694,40 +689,41 @@ def create_scatter_plot(
     for i, category in enumerate(df[col_category].unique()):
 
         if color_palette:
-            marker_color = color_palette.get(category, generate_random_hexadecimal_color) # Default to black if category not found
+            marker_color = color_palette.get(category, generate_random_hexadecimal_color()) # Default to a random color if category not found
         else:
             marker_color = generate_random_hexadecimal_color()
 
         # hovertemplate generation
-        hovertemplate = (
-            "<b>"
-            + col_x
-            + "</b>:"
-            + df[df[col_category] == category][col_x].astype(str)
-            + "<br><b>"
-            + col_y
-            + "</b>:"
-            + df[df[col_category] == category][col_y].astype(str)
-            + "<br><b>"
-            + col_category
-            + "</b>:"
-            + str(category)
-        )
+        # hovertemplate = (
+        #     "<b>"
+        #     + col_x
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_x].astype(str)
+        #     + "<br><b>"
+        #     + col_y
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_y].astype(str)
+        #     + "<br><b>"
+        #     + col_category
+        #     + "</b>:"
+        #     + str(category)
+        # )
+        hovertemplate = ""
         if col_size is None:
             size = marker_size
         else:
             size = df[df[col_category] == category][col_size]
-
+            # hovertemplate += "<br><b>" + col_size + "</b>:" + size.astype(str)
 
         if len(col_hover) > 0:
             for c in col_hover:
                 hovertemplate += (
                     "<br><b>"
                     + str(c)
-                    + "</b
+                    + "</b> : "
                     + df[df[col_category] == category][c]
+                    .apply(format_input)
                     .astype(str)
-                    .apply(wrap_text)
                 )
 
         fig.add_trace(
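For reference, the rebuilt hover loop emits one "<br><b>name</b> : value" segment per hover column; for a point with likes=42 and author="alice" (hypothetical columns), the resulting hover text reads:

    <br><b>likes</b> : 42<br><b>author</b> : alice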
@@ -2038,8 +2034,6 @@ def bar_trend_per_cat(df: pd.DataFrame,
     params = general_kwargs()
     params.update(kwargs)
 
-    col_hover = params["col_hover"]
-
     xaxis_title = params["xaxis_title"]
     yaxis_title = params["yaxis_title"]
     zaxis_title = params["zaxis_title"]
@@ -2060,7 +2054,7 @@ def bar_trend_per_cat(df: pd.DataFrame,
         current_df = df[df[col_cat] == cat]
 
 
-        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)
+        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)
         # hovertemplate='<b>Categorie : </b>'+str(cat)+'<br><b>Date : </b>'+ current_df[col_x].astype(str) + '<br><b>'+y1_axis_title+'</b> : '+ current_df[col_metric1].astype(str)+' ('+current_df["per_"+col_metric1].map("{:.1%}".format).astype(str)+')' +'<br><b>'+y2_axis_title+'</b> : '+ current_df[col_metric2].astype(int).astype(str)+' ('+current_df["per_"+col_metric2].map("{:.1%}".format).astype(str)+')'
         for c in col_hover:
             hovertemplate += (
@@ -3946,6 +3940,170 @@ def create_radar(df: pd.DataFrame,
     )
     return fig
 
+def bar_subplots_per_cat(df: pd.DataFrame,
+                         col_x: str,
+                         col_y: str,
+                         col_cat: str,
+                         col_stack: str,
+                         color_palette: dict = None,
+                         n_top_words: int = 20,
+                         **kwargs
+                         ) -> go.Figure:
+    """
+    Create subplots of stacked bar charts.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing data for bar charts.
+        col_x (str): Name of the column containing x-axis values.
+        col_y (str): Name of the column containing y-axis values.
+        col_cat (str): Name of the column containing categories.
+        col_stack (str): Name of the column containing stacking values.
+        color_palette (Optional[Dict[str, str]], optional): Dictionary mapping categories to colors. Defaults to None.
+        n_top_words (int, optional): Number of top words to display in each bar chart. Defaults to 20.
+        **kwargs: Additional keyword arguments to update default plotting parameters.
+
+    Returns:
+        go.Figure: Plotly Figure object representing the subplots of stacked bar charts.
+    """
+
+    params = general_kwargs()
+    params.update(kwargs)
+
+    marker_color = params['marker_color']
+    textposition = params["textposition"]
+    vertical_spacing = params['vertical_spacing']
+    horizontal_spacing = params["horizontal_spacing"]
+    col_hover = params['col_hover']
+    n_cols = params['n_cols']
+    categories = df[col_cat].unique()
+
+    # the user defines a number of columns; we compute the number of rows required
+    n_rows = math.ceil(len(categories) / n_cols)
+
+    # fine-tune parameters according to the text position provided
+    if textposition == 'inside':
+        horizontal_spacing = (horizontal_spacing / n_cols) / 2
+    else:
+        horizontal_spacing = (horizontal_spacing / n_cols)
+
+    # create subplots
+    fig = make_subplots(
+        rows=n_rows,                                   # number of rows
+        cols=n_cols,                                   # number of columns
+        subplot_titles=list(categories),               # title for each subplot
+        vertical_spacing=vertical_spacing / n_rows,    # space between subplots
+        horizontal_spacing=horizontal_spacing,         # space between subplots
+        shared_xaxes=params["shared_xaxes"],
+        shared_yaxes=params["shared_yaxes"]
+    )
+
+    # create stacked bar traces for each subplot
+    row_id = 0
+    col_id = 0
+    for i, category in enumerate(categories):
+        # define row and column position
+        col_id += 1
+        if i % n_cols == 0:
+            row_id += 1
+        if col_id > n_cols:
+            col_id = 1
+
+        # select data
+        current_df = df[df[col_cat] == category].sort_values(by=col_x, ascending=True)
+        unique_stacks = current_df[col_stack].unique()
+
+        if textposition == 'inside':
+            text = current_df[col_y].head(n_top_words)
+        else:
+            textposition = "auto"
+            text = None
+
+        for stack in unique_stacks:
+            # define bar color or create a random color
+            if color_palette:
+                marker_color = color_palette.get(stack, generate_random_hexadecimal_color())
+            else:
+                marker_color = generate_random_hexadecimal_color()
+
+            stack_df = current_df[current_df[col_stack] == stack]
+            hovertemplate = '<b>'+col_cat+" : "+ stack_df[col_cat].astype(str)+ '</b><br>' + col_stack+" : "+ stack_df[col_stack].astype(str)
+
+            for col in col_hover:
+                hovertemplate += '<br><b>' + col + ': ' + current_df[current_df[col_cat] == category][col].astype(str) + '</b>'
+
+            fig.add_trace(
+                go.Bar(
+                    x=stack_df[col_x].tail(n_top_words),
+                    y=stack_df[col_y].tail(n_top_words),
+                    opacity=params["marker_opacity"],
+                    orientation=params["orientation"],    # horizontal bars
+                    name=stack,                           # trace name for legend
+                    text=text,                            # text to display
+                    textposition=textposition,            # text position
+                    textangle=params["xaxis_tickangle"],  # text angle
+                    marker_color=marker_color,            # bar color
+                    hovertemplate=hovertemplate + "<extra></extra>"  # hover info
+                ),
+                row=row_id,
+                col=col_id
+            )
+
+    for row_id in range(1, n_rows+1):
+        for col_id in range(1, n_cols+1):
+            fig.update_yaxes(title=params["yaxis_title"], row=row_id, col=1)
+            fig.update_xaxes(title=params["xaxis_title"], row=row_id, col=col_id)
+
+    fig.update_layout(
+        margin=dict(l=75, r=75, t=75, b=50),
+        title_text=params["title_text"],
+        width=n_cols * params["width"],                  # plot size
+        height=n_rows * n_top_words * params["height"],  # plot size
+        showlegend=params["showlegend"],
+        font_family=params["font_family"],
+        font_size=params["font_size"],
+        template=params["template"],
+        plot_bgcolor=params["plot_bgcolor"],    # background color (plot)
+        paper_bgcolor=params["paper_bgcolor"],  # background color (around plot)
+        uniformtext_minsize=params["uniformtext_minsize"],
+        barmode=params['barmode']
+    )
+
+    fig.update_yaxes(
+        # title=params["yaxis_title"],
+        title_font_size=params["yaxis_title_font_size"],
+        tickangle=params["yaxis_tickangle"],
+        tickfont_size=params["yaxis_tickfont_size"],
+        range=params["yaxis_range"],
+        showgrid=params["yaxis_showgrid"],
+        showline=params["yaxis_showline"],
+        zeroline=params["yaxis_zeroline"],
+        gridwidth=params["yaxis_gridwidth"],
+        gridcolor=params["yaxis_gridcolor"],
+        linewidth=params["yaxis_linewidth"],
+        linecolor=params["yaxis_linecolor"],
+        mirror=params["yaxis_mirror"],
+        layer="below traces",
+    )
+
+    fig.update_xaxes(
+        # title=params["xaxis_title"],
+        title_font_size=params["xaxis_title_font_size"],
+        tickangle=params["xaxis_tickangle"],
+        tickfont_size=params["xaxis_tickfont_size"],
+        range=params["xaxis_range"],
+        showgrid=params["xaxis_showgrid"],
+        showline=params["xaxis_showline"],
+        zeroline=params["xaxis_zeroline"],
+        gridwidth=params["xaxis_gridwidth"],
+        gridcolor=params["xaxis_gridcolor"],
+        linewidth=params["xaxis_linewidth"],
+        linecolor=params["xaxis_linecolor"],
+        mirror=params["xaxis_mirror"],
+        layer="below traces"
+    )
+    return fig
+
 # def bar_subplots(df: pd.DataFrame,
 #                  col_x: str,
 #                  col_y: str,
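A hedged usage sketch of the new bar_subplots_per_cat (the DataFrame, column names, and palette are hypothetical; remaining parameters fall back to general_kwargs defaults via **kwargs):

    fig = bar_subplots_per_cat(
        df_words,
        col_x="frequency",
        col_y="word",
        col_cat="topic",
        col_stack="sentiment",
        color_palette={"positive": "#2ca02c", "negative": "#d62728"},
        n_top_words=10,
    )
    fig.show()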
opsci_toolbox/helpers/dates.py
CHANGED
@@ -58,7 +58,7 @@ def number_of_days(start_date: datetime, end_date: datetime) -> int:
         days_difference (int): The number of days between the start and end dates.
     """
     # Calculate the difference
-    time_difference =
+    time_difference = end_date - start_date
     # Extract the number of days from the timedelta object
     days_difference = time_difference.days
     return days_difference
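With the subtraction restored, the helper behaves as documented:

    from datetime import datetime
    number_of_days(datetime(2024, 1, 1), datetime(2024, 1, 31))  # -> 30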
@@ -77,3 +77,49 @@ def df_col_to_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
     df[col] = pd.to_datetime(df[col])
     return df
 
+
+# from dateutil import parser
+# from datetime import datetime
+
+# def detect_date_format(date_string):
+#     formats = [
+#         # Date formats
+#         "%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%m-%d-%Y",
+#         "%Y/%m/%d", "%d/%m/%Y", "%Y.%m.%d", "%d.%m.%Y",
+#         "%d %b %Y", "%d %B %Y", "%b %d, %Y", "%B %d, %Y",
+#         "%d-%b-%Y", "%d-%B-%Y", "%b-%d-%Y", "%B-%d-%Y",
+#         # Date and time formats
+#         "%Y-%m-%d %H:%M:%S", "%d-%m-%Y %H:%M:%S", "%m/%d/%Y %H:%M:%S", "%m-%d-%Y %H:%M:%S",
+#         "%Y/%m/%d %H:%M:%S", "%d/%m/%Y %H:%M:%S", "%Y.%m.%d %H:%M:%S", "%d.%m.%Y %H:%M:%S",
+#         "%d %b %Y %H:%M:%S", "%d %B %Y %H:%M:%S", "%b %d, %Y %H:%M:%S", "%B %d, %Y %H:%M:%S",
+#         "%d-%b-%Y %H:%M:%S", "%d-%B-%Y %H:%M:%S", "%b-%d-%Y %H:%M:%S", "%B-%d-%Y %H:%M:%S",
+#         # Time formats with milliseconds
+#         "%Y-%m-%d %H:%M:%S.%f", "%d-%m-%Y %H:%M:%S.%f", "%m/%d/%Y %H:%M:%S.%f", "%m-%d-%Y %H:%M:%S.%f",
+#         "%Y/%m/%d %H:%M:%S.%f", "%d/%m/%Y %H:%M:%S.%f", "%Y.%m.%d %H:%M:%S.%f", "%d.%m.%Y %H:%M:%S.%f",
+#         "%d %b %Y %H:%M:%S.%f", "%d %B %Y %H:%M:%S.%f", "%b %d, %Y %H:%M:%S.%f", "%B %d, %Y %H:%M:%S.%f",
+#         "%d-%b-%Y %H:%M:%S.%f", "%d-%B-%Y %H:%M:%S.%f", "%b-%d-%Y %H:%M:%S.%f", "%B-%d-%Y %H:%M:%S.%f",
+#         # ISO format
+#         "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f",
+#         "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f",
+#         "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z",
+#         # Additional formats
+#         "%y/%m/%d %H:%M:%S", "%d/%m/%y %H:%M:%S", "%y-%m-%d %H:%M:%S", "%d-%m-%y %H:%M:%S",
+#     ]
+
+#     for date_format in formats:
+#         try:
+#             # Try to parse the date string with each format
+#             parsed_date = datetime.strptime(date_string, date_format)
+#             return date_format
+#         except ValueError:
+#             continue
+
+#     return None
+
+# def detect_date_format(date_string):
+#     try:
+#         # Use dateutil parser to parse the date string
+#         parsed_date = parser.parse(date_string, fuzzy=False)
+#         return parsed_date
+#     except ValueError:
+#         return None
opsci_toolbox/helpers/gliner.py
ADDED
@@ -0,0 +1,88 @@
+from gliner import GLiNER
+
+
+def load_gliner_model(model_name : str, map_location="cpu") -> GLiNER:
+    """
+    Load a GLiNER named entity recognition (NER) model.
+
+    Args:
+        model_name: The model name to load.
+        map_location: The device to load the model on. Possible values are "cpu" or "cuda".
+
+    Returns:
+        GLiNER: The loaded model.
+
+    """
+    model = GLiNER.from_pretrained(model_name, map_location=map_location)
+    return model
+
+def gliner_predict(model : GLiNER, text : str, labels : list, threshold : float = 0.5) -> list:
+    """
+    Predicts entities using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: The text to predict entities from.
+        labels: A list of entity labels to detect.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities.
+
+    """
+    entities = model.predict_entities(text, labels, threshold=threshold)
+    return entities
+
+def gliner_batch_predict(model : GLiNER, text : list, labels : list, threshold : float = 0.5) -> list:
+    """
+    Batch inference. Predicts entities using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: A list of texts to predict entities from.
+        labels: A list of entity labels to detect.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities, one list per input text.
+
+    """
+    entities = model.batch_predict_entities(text, labels, threshold=threshold)
+    return entities
+
+
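A hedged end-to-end sketch of the new helpers; the checkpoint name is one published GLiNER model, and any compatible name should work:

    from opsci_toolbox.helpers.gliner import load_gliner_model, gliner_predict

    model = load_gliner_model("urchade/gliner_multi-v2.1", map_location="cpu")
    entities = gliner_predict(model, "Emmanuel Macron visited Berlin.", ["person", "location"])
    # entities: list of dicts with start, end, text, label, score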
+def parse_predictions(predictions : list) -> tuple:
+    """
+    Parse the predictions generated by a GLiNER named entity recognition (NER) model for batch processing.
+
+    Args:
+        predictions (list): A list of prediction lists, one per input text. Each prediction is a dictionary containing the following keys:
+            - "start" (int): The starting index of the predicted entity in the input text.
+            - "end" (int): The ending index of the predicted entity in the input text.
+            - "text" (str): The predicted entity text.
+            - "label" (str): The predicted entity label.
+            - "score" (float): The confidence score of the prediction.
+
+    Returns:
+        tuple: A tuple containing lists of the extracted information from the predictions. The tuple contains the following lists:
+            - starts (list): A list of lists, where each inner list contains the starting indices of the predicted entities.
+            - ends (list): A list of lists, where each inner list contains the ending indices of the predicted entities.
+            - texts (list): A list of lists, where each inner list contains the predicted entity texts.
+            - labels (list): A list of lists, where each inner list contains the predicted entity labels.
+            - scores (list): A list of lists, where each inner list contains the confidence scores of the predictions.
+    """
+    starts, ends, texts, labels, scores = [], [], [], [], []
+    for prediction in predictions:
+        start, end, text, label, score = [], [], [], [], []
+        for item in prediction:
+            start.append(item.get("start"))
+            end.append(item.get("end"))
+            text.append(item.get("text"))
+            label.append(item.get("label"))
+            score.append(item.get("score"))
+        starts.append(start)
+        ends.append(end)
+        texts.append(text)
+        labels.append(label)
+        scores.append(score)
+    return starts, ends, texts, labels, scores
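Continuing the sketch above, a short example of fanning batch output into DataFrame columns (df and its text column are hypothetical):

    preds = gliner_batch_predict(model, df["text"].tolist(), ["person", "location"])
    starts, ends, texts, labels, scores = parse_predictions(preds)
    df["ner_texts"], df["ner_labels"], df["ner_scores"] = texts, labels, scores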