opsci-toolbox 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -310,6 +310,22 @@ def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
         pickle.dump(data, f)
     return file_path
 
+def save_df_to_pickle(df: pd.DataFrame, path: str, filename: str) -> str:
+    """
+    Write a DataFrame into a pickle file.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to be written to the pickle file.
+        path (str): The directory where the pickle file will be saved.
+        filename (str): The name of the pickle file (without the extension).
+
+    Returns:
+        str: The full path to the saved pickle file.
+    """
+    file_path = os.path.join(path, filename + '.pickle')
+    df.to_pickle(file_path)
+    return file_path
+
 
 def write_list_to_txt(input_list: list, path: str, name: str) -> str:
     """
@@ -587,7 +603,7 @@ def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:
 
     return files
 
-def copy_file(source_path: str, destination_path: str, new_filename: str = '') -> str:
+def copy_file(source_path: str, destination_path: str, new_filename: str = None) -> str:
     """
     Copy a file from a source path to a destination path.
 
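
Switching the `new_filename` default from `''` to `None` makes "no rename requested" explicit. A sketch of the branch this enables (the body below is an assumption; the package's actual implementation is not shown in this hunk):

```python
import os
import shutil

def copy_file(source_path: str, destination_path: str, new_filename: str = None) -> str:
    # keep the original basename unless a new filename is explicitly provided
    filename = new_filename if new_filename is not None else os.path.basename(source_path)
    target = os.path.join(destination_path, filename)
    shutil.copy(source_path, target)
    return target
```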
@@ -1319,13 +1335,13 @@ def categorize_percentiles(percentile: float) -> str:
     Categorizes a percentile value into a string representing its range.
 
     Args:
-    - percentile (float): The percentile value (between 0 and 1).
+        percentile (float): The percentile value (between 0 and 1).
 
     Returns:
-    - str: The category of the percentile value.
+        str: The category of the percentile value.
 
     Raises:
-    - ValueError: If the percentile value is outside the range [0, 1].
+        ValueError: If the percentile value is outside the range [0, 1].
     """
     if not (0 <= percentile <= 1):
         raise ValueError("Percentile must be between 0 and 1 inclusive.")
@@ -1349,4 +1365,160 @@ def categorize_percentiles(percentile: float) -> str:
     elif percentile <= 0.9:
         return '80-90%'
     else:
-        return '90-100%'
+        return '90-100%'
+
+
+def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+    """
+    Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        columns_to_combine (list): List of column names to combine.
+        col_date (str): Name of the column containing dates.
+        date_format (str): Format of the dates in col_date.
+        rolling_period (str): Rolling period for grouping.
+        col_id (str): Name of the column containing unique IDs.
+        col_engagement (str): Name of the column containing engagement values.
+
+    Returns:
+        pd.DataFrame: The prepared DataFrame with combinations of columns.
+    """
+    df_wt_combinations = df.copy()
+    df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+
+    # If there are no columns to combine, just use the date for grouping
+    if not columns_to_combine:
+        df_wt_combinations = (df_wt_combinations
+                              .set_index("date")
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+    else:
+        df_wt_combinations = (df_wt_combinations
+                              .set_index(["date"])
+                              .groupby(columns_to_combine)
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+
+    return df_wt_combinations
+
+# def create_combination_index(df : pd.DataFrame, columns : list, date_column : str, rolling_period : str) -> pd.MultiIndex:
+#     """
+#     Create all possible combinations of unique values from specified columns and date range.
+#
+#     Args:
+#         df (pd.DataFrame): The input DataFrame
+#         columns (list): List of column names to create combinations from
+#         date_column (str): Name of the date column
+#         rolling_period (str): Frequency for date range (e.g., '1D', '1W', '1M')
+#
+#     Returns:
+#         pd.MultiIndex: MultiIndex with all combinations
+#     """
+#     # Create a list to store unique values for each column
+#     unique_values = []
+#
+#     # Get unique values for each specified column
+#     for col in columns:
+#         unique_values.append(df[col].unique())
+#
+#     # Create date range
+#     date_range = pd.date_range(start=df[date_column].min(),
+#                                end=df[date_column].max(),
+#                                freq=rolling_period)
+#
+#     # Add date range to the list of unique values
+#     unique_values.append(date_range)
+#
+#     # Create MultiIndex from product of all unique values
+#     all_combinations = pd.MultiIndex.from_product(unique_values,
+#                                                   names=columns + [date_column])
+#
+#     return all_combinations
+
+# def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+#     """
+#     Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+#     Args:
+#         df (pd.DataFrame): The input DataFrame.
+#         columns_to_combine (list): List of column names to combine.
+#         col_date (str): Name of the column containing dates.
+#         date_format (str): Format of the dates in col_date.
+#         rolling_period (str): Rolling period for grouping.
+#         col_id (str): Name of the column containing unique IDs.
+#         col_engagement (str): Name of the column containing engagement values.
+#     Returns:
+#         pd.DataFrame: The prepared DataFrame with combinations of columns.
+#     """
+#     df_wt_combinations = df.copy()
+#     df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+#
+#     # Create all possible combinations of columns indexes
+#     all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+#
+#     # If no columns to combine, just use the date for grouping
+#     if not columns_to_combine:
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index("date")
+#                               .groupby(pd.Grouper(freq=rolling_period))
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .fillna(0)
+#                               .reset_index())
+#     else:
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index(["date"])
+#                               .groupby([*columns_to_combine, pd.Grouper(freq=rolling_period)])
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .reindex(all_combinations, fill_value=0)
+#                               .reset_index())
+#
+#     return df_wt_combinations
+
+def custom_ordering(df : pd.DataFrame, col_to_order : str, custom_order : list) -> pd.DataFrame:
+    """
+    Orders the values in a DataFrame column based on a custom order.
+
+    Args:
+        df (DataFrame): The DataFrame containing the column to be ordered.
+        col_to_order (str): The name of the column to be ordered.
+        custom_order (list): The custom order of values.
+
+    Returns:
+        DataFrame: The DataFrame with the column values ordered according to the custom order.
+    """
+    # keep the ordered categorical dtype so that subsequent sorts respect custom_order
+    df[col_to_order] = pd.Categorical(df[col_to_order], categories=custom_order, ordered=True)
+    return df
+
+def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
+    """
+    Calculates the total and percentage values for the given metrics based on the grouping columns.
+
+    Args:
+        df (DataFrame): The input DataFrame.
+        col_gb (list): Names of the columns to group by.
+        metrics (dict): A dictionary of metrics to calculate.
+
+    Returns:
+        DataFrame: The modified DataFrame with total and percentage values added.
+    """
+    # bind key as a default argument so each lambda keeps its own metric
+    percentage_aggregations = {f'per_{key}': (lambda x, key=key: x[key] / x[f"total_{key}"]) for key in metrics.keys()}
+
+    df = (df.join(df.groupby(col_gb)
+                    .agg(metrics)
+                    .add_prefix("total_"), on=col_gb
+                  )
+            .assign(**percentage_aggregations).fillna(0)
+         )
+
+    return df
+
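
A usage sketch for the new `prepare_data_combinations` helper (data and column names are illustrative): with a grouping column, each (group, period) bucket gets a unique-ID count and an engagement sum.

```python
import pandas as pd

posts = pd.DataFrame({
    "created_at": ["2024-01-01", "2024-01-02", "2024-01-09"],
    "post_id": ["a", "b", "c"],
    "likes": [5, 3, 7],
    "channel": ["x", "x", "y"],
})

weekly = prepare_data_combinations(
    posts,
    columns_to_combine=["channel"],
    col_date="created_at",
    date_format="%Y-%m-%d",
    rolling_period="W",   # weekly buckets
    col_id="post_id",
    col_engagement="likes",
)
# resulting columns: channel, date, post_id (unique count), likes (sum)
```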
@@ -673,19 +673,14 @@ def create_scatter_plot(
     """
     params = general_kwargs()
     params.update(kwargs)
-
     marker_color = params["marker_color"]
     marker_line_color = params["marker_line_color"]
     marker_size = params["marker_size"]
     col_hover = params["col_hover"]
-
     xaxis_range = params["xaxis_range"]
-
     yaxis_range = params["yaxis_range"]
 
-
     fig = go.Figure()
-
     if marker_line_color is None:
         marker_line_color = marker_color
 
@@ -694,40 +689,41 @@ def create_scatter_plot(
     for i, category in enumerate(df[col_category].unique()):
 
         if color_palette:
-            marker_color = color_palette.get(category, generate_random_hexadecimal_color) # Default to black if category not found
+            marker_color = color_palette.get(category, generate_random_hexadecimal_color())  # Default to a random color if the category is not found
         else:
             marker_color = generate_random_hexadecimal_color()
 
         # hovertemplate generation
-        hovertemplate = (
-            "<b>"
-            + col_x
-            + "</b>:"
-            + df[df[col_category] == category][col_x].astype(str)
-            + "<br><b>"
-            + col_y
-            + "</b>:"
-            + df[df[col_category] == category][col_y].astype(str)
-            + "<br><b>"
-            + col_category
-            + "</b>:"
-            + str(category)
-        )
+        # hovertemplate = (
+        #     "<b>"
+        #     + col_x
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_x].astype(str)
+        #     + "<br><b>"
+        #     + col_y
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_y].astype(str)
+        #     + "<br><b>"
+        #     + col_category
+        #     + "</b>:"
+        #     + str(category)
+        # )
+        hovertemplate = ""
         if col_size is None:
             size = marker_size
         else:
             size = df[df[col_category] == category][col_size]
-            hovertemplate += "<br><b>" + col_size + "</b>:" + size.astype(str)
+            # hovertemplate += "<br><b>" + col_size + "</b>:" + size.astype(str)
 
         if len(col_hover) > 0:
             for c in col_hover:
                 hovertemplate += (
                     "<br><b>"
                     + str(c)
-                    + "</b>:"
+                    + "</b> : "
                     + df[df[col_category] == category][c]
+                    .apply(format_input)
                     .astype(str)
-                    .apply(wrap_text)
                 )
 
         fig.add_trace(
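
For context on the hover changes above: Plotly accepts a per-point array for `hovertemplate`, so building it as a pandas Series of strings gives each marker its own hover text. A minimal standalone sketch of that pattern (names are illustrative):

```python
import pandas as pd
import plotly.graph_objects as go

data = pd.DataFrame({"x": [1, 2], "y": [3, 4], "source": ["twitter", "blog"]})

# one hover string per point, mirroring the col_hover loop above
hovertemplate = "<br><b>source</b> : " + data["source"].astype(str)

fig = go.Figure(go.Scatter(x=data["x"], y=data["y"], hovertemplate=hovertemplate))
```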
@@ -2038,8 +2034,6 @@ def bar_trend_per_cat(df: pd.DataFrame,
     params = general_kwargs()
     params.update(kwargs)
 
-    col_hover = params["col_hover"]
-
     xaxis_title = params["xaxis_title"]
     yaxis_title = params["yaxis_title"]
     zaxis_title = params["zaxis_title"]
@@ -2060,7 +2054,7 @@ def bar_trend_per_cat(df: pd.DataFrame,
         current_df = df[df[col_cat] == cat]
 
 
-        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)+"<extra></extra>"
+        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)
         # hovertemplate='<b>Categorie : </b>'+str(cat)+'<br><b>Date : </b>'+ current_df[col_x].astype(str) + '<br><b>'+y1_axis_title+'</b> : '+ current_df[col_metric1].astype(str)+' ('+current_df["per_"+col_metric1].map("{:.1%}".format).astype(str)+')' +'<br><b>'+y2_axis_title+'</b> : '+ current_df[col_metric2].astype(int).astype(str)+' ('+current_df["per_"+col_metric2].map("{:.1%}".format).astype(str)+')'
         for c in col_hover:
             hovertemplate += (
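
Removing the trailing `<extra></extra>` is consistent with the `col_hover` loop that keeps appending fields to `hovertemplate` afterwards: in Plotly, that tag controls the secondary (trace-name) hover box and is normally appended once, after all fields are assembled. A sketch of the assumed intent:

```python
# assemble all hover fields first...
hovertemplate = "<b>x</b> : %{x}"
hovertemplate += "<br><b>extra field</b> : foo"
# ...then hide the secondary hover box once, at the very end
hovertemplate += "<extra></extra>"
```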
@@ -3946,6 +3940,170 @@ def create_radar(df: pd.DataFrame,
     )
     return fig
 
+def bar_subplots_per_cat(df: pd.DataFrame,
+                         col_x: str,
+                         col_y: str,
+                         col_cat: str,
+                         col_stack: str,
+                         color_palette: dict = None,
+                         n_top_words: int = 20,
+                         **kwargs
+                         ) -> go.Figure:
+    """
+    Create subplots of stacked bar charts.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing data for bar charts.
+        col_x (str): Name of the column containing x-axis values.
+        col_y (str): Name of the column containing y-axis values.
+        col_cat (str): Name of the column containing categories.
+        col_stack (str): Name of the column containing stacking values.
+        color_palette (dict, optional): Dictionary mapping stack values to colors. Defaults to None.
+        n_top_words (int, optional): Number of top words to display in each bar chart. Defaults to 20.
+        **kwargs: Additional keyword arguments to update default plotting parameters.
+
+    Returns:
+        go.Figure: Plotly Figure object representing the subplots of stacked bar charts.
+    """
+    params = general_kwargs()
+    params.update(kwargs)
+
+    marker_color = params['marker_color']
+    textposition = params["textposition"]
+    vertical_spacing = params['vertical_spacing']
+    horizontal_spacing = params["horizontal_spacing"]
+    col_hover = params['col_hover']
+    n_cols = params['n_cols']
+    categories = df[col_cat].unique()
+
+    # the user defines the number of columns; we compute the number of rows required
+    n_rows = math.ceil(len(categories) / n_cols)
+
+    # fine-tune the spacing according to the text position provided
+    if textposition == 'inside':
+        horizontal_spacing = (horizontal_spacing / n_cols) / 2
+    else:
+        horizontal_spacing = (horizontal_spacing / n_cols)
+
+    # create subplots
+    fig = make_subplots(
+        rows=n_rows,                                   # number of rows
+        cols=n_cols,                                   # number of columns
+        subplot_titles=list(categories),               # title for each subplot
+        vertical_spacing=vertical_spacing / n_rows,    # space between subplots
+        horizontal_spacing=horizontal_spacing,         # space between subplots
+        shared_xaxes=params["shared_xaxes"],
+        shared_yaxes=params["shared_yaxes"]
+    )
+
+    # create stacked bar traces for each subplot
+    row_id = 0
+    col_id = 0
+    for i, category in enumerate(categories):
+        # define row and column position
+        col_id += 1
+        if i % n_cols == 0:
+            row_id += 1
+        if col_id > n_cols:
+            col_id = 1
+
+        # select data
+        current_df = df[df[col_cat] == category].sort_values(by=col_x, ascending=True)
+        unique_stacks = current_df[col_stack].unique()
+
+        if textposition == 'inside':
+            text = current_df[col_y].head(n_top_words)
+        else:
+            textposition = "auto"
+            text = None
+
+        for stack in unique_stacks:
+            # use the palette color for the stack value or create a random color
+            if color_palette:
+                marker_color = color_palette.get(stack, generate_random_hexadecimal_color())
+            else:
+                marker_color = generate_random_hexadecimal_color()
+
+            stack_df = current_df[current_df[col_stack] == stack]
+            hovertemplate = '<b>'+col_cat+" : "+ stack_df[col_cat].astype(str)+ '</b><br>' + col_stack+" : "+ stack_df[col_stack].astype(str)
+
+            for col in col_hover:
+                # build hover fields from stack_df so they stay aligned with the trace points
+                hovertemplate += '<br><b>' + col + ': ' + stack_df[col].astype(str) + '</b>'
+
+            fig.add_trace(
+                go.Bar(
+                    x=stack_df[col_x].tail(n_top_words),
+                    y=stack_df[col_y].tail(n_top_words),
+                    opacity=params["marker_opacity"],
+                    orientation=params["orientation"],    # horizontal bars
+                    name=stack,                           # trace name for legend
+                    text=text,                            # text to display
+                    textposition=textposition,            # text position
+                    textangle=params["xaxis_tickangle"],  # text angle
+                    marker_color=marker_color,            # bar color
+                    hovertemplate=hovertemplate + "<extra></extra>"  # hover info
+                ),
+                row=row_id,
+                col=col_id
+            )
+
+    for row_id in range(1, n_rows+1):
+        for col_id in range(1, n_cols+1):
+            fig.update_yaxes(title=params["yaxis_title"], row=row_id, col=1)
+            fig.update_xaxes(title=params["xaxis_title"], row=row_id, col=col_id)
+
+    fig.update_layout(
+        margin=dict(l=75, r=75, t=75, b=50),
+        title_text=params["title_text"],
+        width=n_cols * params["width"],                  # plot size
+        height=n_rows * n_top_words * params["height"],  # plot size
+        showlegend=params["showlegend"],
+        font_family=params["font_family"],
+        font_size=params["font_size"],
+        template=params["template"],
+        plot_bgcolor=params["plot_bgcolor"],    # background color (plot)
+        paper_bgcolor=params["paper_bgcolor"],  # background color (around plot)
+        uniformtext_minsize=params["uniformtext_minsize"],
+        barmode=params['barmode']
+    )
+
+    fig.update_yaxes(
+        # title=params["yaxis_title"],
+        title_font_size=params["yaxis_title_font_size"],
+        tickangle=params["yaxis_tickangle"],
+        tickfont_size=params["yaxis_tickfont_size"],
+        range=params["yaxis_range"],
+        showgrid=params["yaxis_showgrid"],
+        showline=params["yaxis_showline"],
+        zeroline=params["yaxis_zeroline"],
+        gridwidth=params["yaxis_gridwidth"],
+        gridcolor=params["yaxis_gridcolor"],
+        linewidth=params["yaxis_linewidth"],
+        linecolor=params["yaxis_linecolor"],
+        mirror=params["yaxis_mirror"],
+        layer="below traces",
+    )
+
+    fig.update_xaxes(
+        # title=params["xaxis_title"],
+        title_font_size=params["xaxis_title_font_size"],
+        tickangle=params["xaxis_tickangle"],
+        tickfont_size=params["xaxis_tickfont_size"],
+        range=params["xaxis_range"],
+        showgrid=params["xaxis_showgrid"],
+        showline=params["xaxis_showline"],
+        zeroline=params["xaxis_zeroline"],
+        gridwidth=params["xaxis_gridwidth"],
+        gridcolor=params["xaxis_gridcolor"],
+        linewidth=params["xaxis_linewidth"],
+        linecolor=params["xaxis_linecolor"],
+        mirror=params["xaxis_mirror"],
+        layer="below traces"
+    )
+    return fig
+
 
 # def bar_subplots(df: pd.DataFrame,
 #                  col_x: str,
 #                  col_y: str,
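
A sketch of how the new `bar_subplots_per_cat` helper might be called (data, palette, and the forwarded `n_cols` kwarg are illustrative; the remaining defaults come from `general_kwargs`):

```python
import pandas as pd

terms = pd.DataFrame({
    "freq": [10, 7, 9, 4],
    "word": ["ai", "data", "ml", "nlp"],
    "topic": ["tech", "tech", "science", "science"],
    "sentiment": ["pos", "neg", "pos", "neg"],
})

fig = bar_subplots_per_cat(
    terms,
    col_x="freq",
    col_y="word",
    col_cat="topic",
    col_stack="sentiment",
    color_palette={"pos": "#2a9d8f", "neg": "#e76f51"},
    n_top_words=10,
    n_cols=2,  # picked up through **kwargs via general_kwargs
)
fig.show()
```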
@@ -58,7 +58,7 @@ def number_of_days(start_date: datetime, end_date: datetime) -> int:
         days_difference (int): The number of days between the start and end dates.
     """
     # Calculate the difference
-    time_difference = start_date - end_date
+    time_difference = end_date - start_date
     # Extract the number of days from the timedelta object
     days_difference = time_difference.days
     return days_difference
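
With the operand order fixed, a start date before the end date now yields a positive count:

```python
from datetime import datetime

start, end = datetime(2024, 1, 1), datetime(2024, 1, 31)
(end - start).days   # 30  (new behaviour)
(start - end).days   # -30 (old behaviour)
```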
@@ -77,3 +77,49 @@ def df_col_to_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
     df[col] = pd.to_datetime(df[col])
     return df
 
+
+# from dateutil import parser
+# from datetime import datetime
+
+# def detect_date_format(date_string):
+#     formats = [
+#         # Date formats
+#         "%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%m-%d-%Y",
+#         "%Y/%m/%d", "%d/%m/%Y", "%Y.%m.%d", "%d.%m.%Y",
+#         "%d %b %Y", "%d %B %Y", "%b %d, %Y", "%B %d, %Y",
+#         "%d-%b-%Y", "%d-%B-%Y", "%b-%d-%Y", "%B-%d-%Y",
+#         # Date and time formats
+#         "%Y-%m-%d %H:%M:%S", "%d-%m-%Y %H:%M:%S", "%m/%d/%Y %H:%M:%S", "%m-%d-%Y %H:%M:%S",
+#         "%Y/%m/%d %H:%M:%S", "%d/%m/%Y %H:%M:%S", "%Y.%m.%d %H:%M:%S", "%d.%m.%Y %H:%M:%S",
+#         "%d %b %Y %H:%M:%S", "%d %B %Y %H:%M:%S", "%b %d, %Y %H:%M:%S", "%B %d, %Y %H:%M:%S",
+#         "%d-%b-%Y %H:%M:%S", "%d-%B-%Y %H:%M:%S", "%b-%d-%Y %H:%M:%S", "%B-%d-%Y %H:%M:%S",
+#         # Time formats with milliseconds
+#         "%Y-%m-%d %H:%M:%S.%f", "%d-%m-%Y %H:%M:%S.%f", "%m/%d/%Y %H:%M:%S.%f", "%m-%d-%Y %H:%M:%S.%f",
+#         "%Y/%m/%d %H:%M:%S.%f", "%d/%m/%Y %H:%M:%S.%f", "%Y.%m.%d %H:%M:%S.%f", "%d.%m.%Y %H:%M:%S.%f",
+#         "%d %b %Y %H:%M:%S.%f", "%d %B %Y %H:%M:%S.%f", "%b %d, %Y %H:%M:%S.%f", "%B %d, %Y %H:%M:%S.%f",
+#         "%d-%b-%Y %H:%M:%S.%f", "%d-%B-%Y %H:%M:%S.%f", "%b-%d-%Y %H:%M:%S.%f", "%B-%d-%Y %H:%M:%S.%f",
+#         # ISO format
+#         "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f",
+#         "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f",
+#         "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z",
+#         # Additional formats
+#         "%y/%m/%d %H:%M:%S", "%d/%m/%y %H:%M:%S", "%y-%m-%d %H:%M:%S", "%d-%m-%y %H:%M:%S",
+#     ]
+
+#     for date_format in formats:
+#         try:
+#             # Try to parse the date string with each format
+#             parsed_date = datetime.strptime(date_string, date_format)
+#             return date_format
+#         except ValueError:
+#             continue
+
+#     return None
+
+# def detect_date_format(date_string):
+#     try:
+#         # Use dateutil parser to parse the date string
+#         parsed_date = parser.parse(date_string, fuzzy=False)
+#         return parsed_date
+#     except ValueError:
+#         return None
@@ -0,0 +1,88 @@
+from gliner import GLiNER
+
+
+def load_gliner_model(model_name : str, map_location="cpu") -> GLiNER:
+    """
+    Load a GLiNER named entity recognition (NER) model.
+
+    Args:
+        model_name: The name of the model to load.
+        map_location: The device to load the model on. Possible values are "cpu" or "cuda".
+
+    Returns:
+        The loaded GLiNER model.
+
+    """
+    model = GLiNER.from_pretrained(model_name, map_location=map_location)
+    return model
+
+def gliner_predict(model : GLiNER, text : str, labels : list, threshold : float = 0.5) -> list:
+    """
+    Predicts entities using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: The text to predict entities from.
+        labels: A list of entity labels to detect.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities.
+
+    """
+    entities = model.predict_entities(text, labels, threshold=threshold)
+    return entities
+
+def gliner_batch_predict(model : GLiNER, text : list, labels : list, threshold : float = 0.5) -> list:
+    """
+    Batch inference. Predicts entities using the given model.
+
+    Args:
+        model: The model used for prediction.
+        text: A list of texts to predict entities from.
+        labels: A list of entity labels to detect.
+        threshold: The threshold value for entity prediction (default: 0.5).
+
+    Returns:
+        A list of predicted entities, one list per input text.
+
+    """
+    entities = model.batch_predict_entities(text, labels, threshold=threshold)
+    return entities
+
+
+def parse_predictions(predictions : list) -> tuple:
+    """
+    Parse the predictions generated by a GLiNER named entity recognition (NER) model for batch processing.
+
+    Args:
+        predictions (list): A list of lists of dictionaries representing the predictions. Each dictionary contains the following keys:
+            - "start" (int): The starting index of the predicted entity in the input text.
+            - "end" (int): The ending index of the predicted entity in the input text.
+            - "text" (str): The predicted entity text.
+            - "label" (str): The predicted entity label.
+            - "score" (float): The confidence score of the prediction.
+
+    Returns:
+        tuple: A tuple containing lists of the extracted information from the predictions:
+            - starts (list): A list of lists, where each inner list contains the starting indices of the predicted entities.
+            - ends (list): A list of lists, where each inner list contains the ending indices of the predicted entities.
+            - texts (list): A list of lists, where each inner list contains the predicted entity texts.
+            - labels (list): A list of lists, where each inner list contains the predicted entity labels.
+            - scores (list): A list of lists, where each inner list contains the confidence scores of the predictions.
+    """
+    starts, ends, texts, labels, scores = [], [], [], [], []
+    for prediction in predictions:
+        start, end, text, label, score = [], [], [], [], []
+        for item in prediction:
+            start.append(item.get("start"))
+            end.append(item.get("end"))
+            text.append(item.get("text"))
+            label.append(item.get("label"))
+            score.append(item.get("score"))
+        starts.append(start)
+        ends.append(end)
+        texts.append(text)
+        labels.append(label)
+        scores.append(score)
+    return starts, ends, texts, labels, scores
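
An end-to-end sketch of the new GLiNER helpers (the checkpoint name and labels are illustrative assumptions):

```python
texts = ["Emmanuel Macron visited Berlin.", "OpenAI released a new model."]
labels = ["person", "location", "organization"]

model = load_gliner_model("urchade/gliner_multi-v2.1")  # checkpoint name is an assumption
predictions = gliner_batch_predict(model, texts, labels, threshold=0.5)
starts, ends, entity_texts, entity_labels, scores = parse_predictions(predictions)

# entity_texts[0]  -> e.g. ['Emmanuel Macron', 'Berlin']
# entity_labels[0] -> e.g. ['person', 'location']
```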