opsci-toolbox 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- opsci_toolbox/apis/reddit.py +399 -0
- opsci_toolbox/apis/telegram.py +1035 -0
- opsci_toolbox/apis/webscraping.py +75 -0
- opsci_toolbox/helpers/common.py +176 -4
- opsci_toolbox/helpers/dataviz.py +184 -26
- opsci_toolbox/helpers/dates.py +46 -0
- opsci_toolbox/helpers/gliner.py +88 -0
- opsci_toolbox/helpers/nlp.py +256 -8
- opsci_toolbox/helpers/nlp_cuml.py +3 -3
- opsci_toolbox/helpers/sna.py +1 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/METADATA +4 -1
- opsci_toolbox-0.0.13.dist-info/RECORD +25 -0
- opsci_toolbox-0.0.11.dist-info/RECORD +0 -22
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.13.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/webscraping.py
CHANGED
@@ -11,6 +11,81 @@ import concurrent.futures
 import pandas as pd
 from tqdm import tqdm
 
+def get_tweet_html(username: str, tweet_id: str, **kwargs) -> tuple:
+    """
+    Retrieves the HTML code of a tweet given the username and tweet ID.
+
+    Args:
+        username (str): The username of the Twitter account.
+        tweet_id (str): The ID of the tweet.
+        kwargs: additional parameters to pass to the Twitter oEmbed API.
+
+    Returns:
+        tuple: The tweet HTML (None on failure), the username and the tweet ID.
+
+
+    """
+    params = {'lang': "en",         # language of the features around the tweet
+              "maxwidth": 550,      # size of the tweet
+              "hide_media": False,  # hide photos / videos
+              "hide_thread": False, # hide the original message on replies
+              "omit_script": True,  # whether to omit the JS script: <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
+              "align": None,        # alignment of the tweet {left,right,center,none}
+              "theme": "light",     # theme of the tweet {light,dark}
+              "dnt": True           # when True, the tweet and its embedded page are not used for personalized suggestions or ads
+              }
+
+    params.update(kwargs)
+
+    url = f'https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{tweet_id}'
+    response = requests.get(url, params=params)
+
+    if response.status_code == 200:
+        data = response.json()
+        html = data.get('html')
+        return html, username, tweet_id
+    else:
+        print(response.url, "Failed to fetch data from Twitter.")
+        return None, username, tweet_id
+
+
+def parallel_twitter_oembed(usernames, tweet_ids, **kwargs):
+    """
+    Scrapes Twitter oEmbed data for multiple tweets in parallel.
+
+    Args:
+        usernames (list): A list of Twitter usernames.
+        tweet_ids (list): A list of tweet IDs corresponding to the tweets of the given usernames.
+        **kwargs: Additional keyword arguments to be passed to the `get_tweet_html` function.
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing the scraped tweet HTML, username, and message ID.
+
+    Note:
+        Errors raised while downloading a tweet are caught and printed; the tweet is skipped.
+
+    """
+    all_data = []
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Submit one scraping task per (username, tweet_id) pair and track progress with tqdm
+        futures = [
+            executor.submit(get_tweet_html, username, tweet_id, **kwargs)
+            for username, tweet_id in zip(usernames, tweet_ids)
+        ]
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            total=len(usernames),
+            desc="Scraping Progress",
+        ):
+            try:
+                data, username, tweet_id = future.result()
+                all_data.append((data, username, tweet_id))
+            except Exception as e:
+                print(f"Error downloading: {e}")
+
+    df = pd.DataFrame(all_data, columns=["tweet_html", "user_name", "message_id"])
+    return df
+
 
 def url_get_domain(url: str) -> str:
     """
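For readers who want to try the new oEmbed helpers, here is a minimal usage sketch based only on what the hunk above shows; the handles and tweet IDs are hypothetical placeholders.

# Hypothetical usage sketch of the new webscraping helpers (handles/IDs are placeholders)
from opsci_toolbox.apis.webscraping import parallel_twitter_oembed

usernames = ["some_account", "another_account"]   # hypothetical handles
tweet_ids = ["1234567890", "9876543210"]          # hypothetical tweet IDs

# kwargs are forwarded to get_tweet_html and override the oEmbed defaults
df = parallel_twitter_oembed(usernames, tweet_ids, theme="dark", maxwidth=400)
print(df.columns.tolist())   # ['tweet_html', 'user_name', 'message_id']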
opsci_toolbox/helpers/common.py
CHANGED
@@ -310,6 +310,22 @@ def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
         pickle.dump(data, f)
     return file_path
 
+def save_df_to_pickle(df: pd.DataFrame, path: str, filename: str) -> str:
+    """
+    Write a DataFrame into a pickle file.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to be written to the pickle file.
+        path (str): The directory where the pickle file will be saved.
+        filename (str): The name of the pickle file (without the extension).
+
+    Returns:
+        str: The full path to the saved pickle file.
+    """
+    file_path = os.path.join(path, filename + '.pickle')
+    df.to_pickle(file_path)
+    return file_path
+
 
 def write_list_to_txt(input_list: list, path: str, name: str) -> str:
     """
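A quick round-trip sketch of the new helper; the target directory is a placeholder.

import pandas as pd
from opsci_toolbox.helpers.common import save_df_to_pickle

df = pd.DataFrame({"user": ["a", "b"], "engagements": [3, 5]})
file_path = save_df_to_pickle(df, "/tmp", "users")   # writes /tmp/users.pickle
restored = pd.read_pickle(file_path)                 # standard pandas round-trip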
@@ -1319,13 +1335,13 @@ def categorize_percentiles(percentile: float) -> str:
     Categorizes a percentile value into a string representing its range.
 
     Args:
-
+        percentile (float): The percentile value (between 0 and 1).
 
     Returns:
-
+        str: The category of the percentile value.
 
     Raises:
-
+        ValueError: If the percentile value is outside the range [0, 1].
     """
     if not (0 <= percentile <= 1):
         raise ValueError("Percentile must be between 0 and 1 inclusive.")
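The filled-in docstring now matches the behavior visible in the code: values in [0, 1] map to decile labels, anything outside raises. For example:

from opsci_toolbox.helpers.common import categorize_percentiles

categorize_percentiles(0.87)   # '80-90%'
categorize_percentiles(0.95)   # '90-100%'
categorize_percentiles(1.2)    # ValueError: Percentile must be between 0 and 1 inclusive.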
@@ -1349,4 +1365,160 @@ def categorize_percentiles(percentile: float) -> str:
     elif percentile <= 0.9:
         return '80-90%'
     else:
-        return '90-100%'
+        return '90-100%'
+
+
+def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+    """
+    Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        columns_to_combine (list): List of column names to combine.
+        col_date (str): Name of the column containing dates.
+        date_format (str): Format of the dates in col_date.
+        rolling_period (str): Rolling period for grouping.
+        col_id (str): Name of the column containing unique IDs.
+        col_engagement (str): Name of the column containing engagement values.
+    Returns:
+        pd.DataFrame: The prepared DataFrame with combinations of columns.
+    """
+
+    df_wt_combinations = df.copy()
+    df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+
+    # Create all possible combinations of columns indexes
+    # all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+    # If no columns to combine, just use the date for grouping
+    if not columns_to_combine:
+        df_wt_combinations = (df_wt_combinations
+                              .set_index("date")
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+    else:
+        # # Create all possible combinations of columns indexes
+        # all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+        df_wt_combinations = (df_wt_combinations
+                              .set_index(["date"])
+                              .groupby(columns_to_combine)
+                              .resample(rolling_period)
+                              .agg({col_id: "nunique", col_engagement: "sum"})
+                              .fillna(0)
+                              .reset_index())
+
+    return df_wt_combinations
+
+# def create_combination_index(df : pd.DataFrame, columns : list, date_column : str, rolling_period : str) -> pd.MultiIndex:
+#     """
+#     Create all possible combinations of unique values from specified columns and date range.
+
+#     Args:
+#         df (pd.DataFrame): The input DataFrame
+#         columns (list): List of column names to create combinations from
+#         date_column (str): Name of the date column
+#         rolling_period (str): Frequency for date range (e.g., '1D', '1W', '1M')
+
+#     Returns:
+#         pd.MultiIndex: MultiIndex with all combinations
+#     """
+#     # Create a list to store unique values for each column
+#     unique_values = []
+
+#     # Get unique values for each specified column
+#     for col in columns:
+#         unique_values.append(df[col].unique())
+
+#     # Create date range
+#     date_range = pd.date_range(start=df[date_column].min(),
+#                                end=df[date_column].max(),
+#                                freq=rolling_period)
+
+#     # Add date range to the list of unique values
+#     unique_values.append(date_range)
+
+#     # Create MultiIndex from product of all unique values
+#     all_combinations = pd.MultiIndex.from_product(unique_values,
+#                                                   names=columns + [date_column])
+
+#     return all_combinations
+
+# def prepare_data_combinations(df: pd.DataFrame, columns_to_combine : list, col_date : str, date_format : str, rolling_period : str, col_id : str, col_engagement : str) -> pd.DataFrame:
+#     """
+#     Prepare data for combinations of columns. Useful for data preparation before dataviz of time series. It adds missing rows for each combination of columns and date.
+#     Args:
+#         df (pd.DataFrame): The input DataFrame.
+#         columns_to_combine (list): List of column names to combine.
+#         col_date (str): Name of the column containing dates.
+#         date_format (str): Format of the dates in col_date.
+#         rolling_period (str): Rolling period for grouping.
+#         col_id (str): Name of the column containing unique IDs.
+#         col_engagement (str): Name of the column containing engagement values.
+#     Returns:
+#         pd.DataFrame: The prepared DataFrame with combinations of columns.
+#     """
+
+#     df_wt_combinations = df.copy()
+#     df_wt_combinations["date"] = pd.to_datetime(df_wt_combinations[col_date], format=date_format).to_numpy()
+
+#     # Create all possible combinations of columns indexes
+#     all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+#     # If no columns to combine, just use the date for grouping
+#     if not columns_to_combine:
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index("date")
+#                               .groupby(pd.Grouper(freq=rolling_period))
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .fillna(0)
+#                               .reset_index())
+#     else:
+#         # # Create all possible combinations of columns indexes
+#         # all_combinations = create_combination_index(df_wt_combinations, columns_to_combine, "date", rolling_period)
+
+#         df_wt_combinations = (df_wt_combinations
+#                               .set_index(["date"])
+#                               .groupby([*columns_to_combine, pd.Grouper(freq=rolling_period)])
+#                               .agg({col_id: "nunique", col_engagement: "sum"})
+#                               .reindex(all_combinations, fill_value=0)
+#                               .reset_index())
+
+#     return df_wt_combinations
+
+def custom_ordering(df : pd.DataFrame, col_to_order : str, custom_order : list) -> pd.DataFrame:
+    """
+    Orders the values in a DataFrame column based on a custom order.
+    Args:
+        df (DataFrame): The DataFrame containing the column to be ordered.
+        col_to_order (str): The name of the column to be ordered.
+        custom_order (list): The custom order of values.
+    Returns:
+        DataFrame: The DataFrame with the column values ordered according to the custom order.
+    """
+    df[col_to_order] = pd.Categorical(df[col_to_order], categories=custom_order, ordered=True).to_numpy()
+    return df
+
+def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
+    """
+    Calculates the total and percentage values for the given metrics based on grouping columns.
+    Args:
+        df (DataFrame): The input DataFrame.
+        col_gb (list): Names of the columns to group by.
+        metrics (dict): A dictionary of metrics to calculate.
+    Returns:
+        DataFrame: The modified DataFrame with total and percentage values added.
+
+    """
+    percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
+
+    df = (df.join(df.groupby(col_gb)
+                  .agg(metrics)
+                  .add_prefix("total_"), on=col_gb
+                  )
+          .assign(**percentage_agregations).fillna(0)
+          )
+
+    return df
+
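To see what prepare_data_combinations produces, here is a small sketch; the column names and data are hypothetical.

import pandas as pd
from opsci_toolbox.helpers.common import prepare_data_combinations

df = pd.DataFrame({
    "channel":     ["tg", "tg", "rd"],
    "created_at":  ["2024-01-01", "2024-01-03", "2024-01-01"],
    "post_id":     ["p1", "p2", "p3"],
    "engagements": [10, 4, 7],
})

weekly = prepare_data_combinations(
    df,
    columns_to_combine=["channel"],   # one group per channel
    col_date="created_at",
    date_format="%Y-%m-%d",
    rolling_period="W",               # weekly resampling
    col_id="post_id",                 # counted with nunique
    col_engagement="engagements",     # summed
)
# -> one row per (channel, week): unique post count and total engagement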
opsci_toolbox/helpers/dataviz.py
CHANGED
@@ -673,19 +673,14 @@ def create_scatter_plot(
     """
     params = general_kwargs()
     params.update(kwargs)
-
     marker_color = params["marker_color"]
     marker_line_color = params["marker_line_color"]
     marker_size = params["marker_size"]
     col_hover = params["col_hover"]
-
     xaxis_range = params["xaxis_range"]
-
     yaxis_range = params["yaxis_range"]
 
-
     fig = go.Figure()
-
     if marker_line_color is None:
         marker_line_color = marker_color
 
@@ -694,40 +689,41 @@ def create_scatter_plot(
     for i, category in enumerate(df[col_category].unique()):
 
         if color_palette:
-            marker_color = color_palette.get(category, generate_random_hexadecimal_color) # Default to a random color if category not found
+            marker_color = color_palette.get(category, generate_random_hexadecimal_color()) # Default to a random color if category not found
         else:
             marker_color = generate_random_hexadecimal_color()
 
         # hovertemplate generation
-        hovertemplate = (
-            "<b>"
-            + col_x
-            + "</b>:"
-            + df[df[col_category] == category][col_x].astype(str)
-            + "<br><b>"
-            + col_y
-            + "</b>:"
-            + df[df[col_category] == category][col_y].astype(str)
-            + "<br><b>"
-            + col_category
-            + "</b>:"
-            + str(category)
-        )
+        # hovertemplate = (
+        #     "<b>"
+        #     + col_x
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_x].astype(str)
+        #     + "<br><b>"
+        #     + col_y
+        #     + "</b>:"
+        #     + df[df[col_category] == category][col_y].astype(str)
+        #     + "<br><b>"
+        #     + col_category
+        #     + "</b>:"
+        #     + str(category)
+        # )
+        hovertemplate = ""
         if col_size is None:
             size = marker_size
         else:
             size = df[df[col_category] == category][col_size]
-
+        # hovertemplate += "<br><b>" + col_size + "</b>:" + size.astype(str)
 
         if len(col_hover) > 0:
             for c in col_hover:
                 hovertemplate += (
                     "<br><b>"
                     + str(c)
-                    + "</b>:"
+                    + "</b> : "
                     + df[df[col_category] == category][c]
+                    .apply(format_input)
                     .astype(str)
-                    .apply(wrap_text)
                 )
 
         fig.add_trace(
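The one-character fix in this hunk is worth spelling out: passing `generate_random_hexadecimal_color` (without parentheses) as the `dict.get` fallback returned the function object itself instead of a color string. A minimal sketch of the difference, with a stand-in implementation of the helper (an assumption, not the library's code):

import random

def generate_random_hexadecimal_color() -> str:
    # stand-in for the library helper; assumed to return a '#RRGGBB' string
    return "#{:06x}".format(random.randint(0, 0xFFFFFF))

palette = {"cats": "#ff0000"}
bad = palette.get("dogs", generate_random_hexadecimal_color)     # a function object, not a color
good = palette.get("dogs", generate_random_hexadecimal_color())  # e.g. '#3fa2c4'

Note that the called form evaluates the fallback even when the key exists, which is harmless here since the helper has no side effects.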
@@ -2038,8 +2034,6 @@ def bar_trend_per_cat(df: pd.DataFrame,
     params = general_kwargs()
     params.update(kwargs)
 
-    col_hover = params["col_hover"]
-
     xaxis_title = params["xaxis_title"]
     yaxis_title = params["yaxis_title"]
     zaxis_title = params["zaxis_title"]
@@ -2060,7 +2054,7 @@ def bar_trend_per_cat(df: pd.DataFrame,
         current_df = df[df[col_cat] == cat]
 
 
-        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)
+        hovertemplate="<br><b>"+xaxis_title+"</b> :"+current_df[col_x].astype(str)+"<br><b>"+yaxis_title+"</b> - "+current_df[col_y].astype(str)+"<br><b>"+zaxis_title+"</b> : "+current_df[col_z].astype(str)
         # hovertemplate='<b>Categorie : </b>'+str(cat)+'<br><b>Date : </b>'+ current_df[col_x].astype(str) + '<br><b>'+y1_axis_title+'</b> : '+ current_df[col_metric1].astype(str)+' ('+current_df["per_"+col_metric1].map("{:.1%}".format).astype(str)+')' +'<br><b>'+y2_axis_title+'</b> : '+ current_df[col_metric2].astype(int).astype(str)+' ('+current_df["per_"+col_metric2].map("{:.1%}".format).astype(str)+')'
         for c in col_hover:
             hovertemplate += (
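The hovertemplate lines in this function rely on a vectorized pattern: concatenating pandas Series of strings yields one template per data point, which Plotly accepts as an array-like hovertemplate. A small self-contained sketch (column names hypothetical):

import pandas as pd
import plotly.graph_objects as go

df = pd.DataFrame({"x": ["Mon", "Tue"], "y": [3, 5], "z": [0.4, 0.9]})

# one template string per row, built by Series concatenation
hovertemplate = (
    "<br><b>Day</b> :" + df["x"].astype(str)
    + "<br><b>Count</b> - " + df["y"].astype(str)
    + "<br><b>Score</b> : " + df["z"].astype(str)
)

fig = go.Figure(go.Bar(x=df["x"], y=df["y"], hovertemplate=hovertemplate))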
@@ -3946,6 +3940,170 @@ def create_radar(df: pd.DataFrame,
     )
     return fig
 
+def bar_subplots_per_cat(df: pd.DataFrame,
+                         col_x: str,
+                         col_y: str,
+                         col_cat: str,
+                         col_stack: str,
+                         color_palette: dict = None,
+                         n_top_words: int = 20,
+                         **kwargs
+                         ) -> go.Figure:
+    """
+    Create subplots of stacked bar charts.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing data for bar charts.
+        col_x (str): Name of the column containing x-axis values.
+        col_y (str): Name of the column containing y-axis values.
+        col_cat (str): Name of the column containing categories.
+        col_stack (str): Name of the column containing stacking values.
+        color_palette (Optional[Dict[str, str]], optional): Dictionary mapping categories to colors. Defaults to None.
+        n_top_words (int, optional): Number of top words to display in each bar chart. Defaults to 20.
+        **kwargs: Additional keyword arguments to update default plotting parameters.
+
+    Returns:
+        go.Figure: Plotly Figure object representing the subplots of stacked bar charts.
+    """
+
+    params = general_kwargs()
+    params.update(kwargs)
+
+    marker_color = params['marker_color']
+    textposition = params["textposition"]
+    vertical_spacing = params['vertical_spacing']
+    horizontal_spacing = params["horizontal_spacing"]
+    col_hover = params['col_hover']
+    n_cols = params['n_cols']
+    categories = df[col_cat].unique()
+
+    # the user defines the number of columns; we compute the required number of rows
+    n_rows = math.ceil(len(categories) / n_cols)
+
+    # fine-tune spacing according to the text position provided
+    if textposition == 'inside':
+        horizontal_spacing = (horizontal_spacing / n_cols) / 2
+    else:
+        horizontal_spacing = (horizontal_spacing / n_cols)
+
+    # create subplots
+    fig = make_subplots(
+        rows=n_rows,                                  # number of rows
+        cols=n_cols,                                  # number of columns
+        subplot_titles=list(categories),              # title for each subplot
+        vertical_spacing=vertical_spacing / n_rows,   # space between subplots
+        horizontal_spacing=horizontal_spacing,        # space between subplots
+        shared_xaxes=params["shared_xaxes"],
+        shared_yaxes=params["shared_yaxes"]
+    )
+
+    # create stacked bar traces for each subplot
+    row_id = 0
+    col_id = 0
+    for i, category in enumerate(categories):
+        # define row and column position
+        col_id += 1
+        if i % n_cols == 0:
+            row_id += 1
+        if col_id > n_cols:
+            col_id = 1
+
+        # select data
+        current_df = df[df[col_cat] == category].sort_values(by=col_x, ascending=True)
+        unique_stacks = current_df[col_stack].unique()
+
+        if textposition == 'inside':
+            text = current_df[col_y].head(n_top_words)
+        else:
+            textposition = "auto"
+            text = None
+
+        for stack in unique_stacks:
+            # use the palette color for this stack or create a random color
+            if color_palette:
+                marker_color = color_palette.get(stack, generate_random_hexadecimal_color())
+            else:
+                marker_color = generate_random_hexadecimal_color()
+
+            stack_df = current_df[current_df[col_stack] == stack]
+            hovertemplate = '<b>'+col_cat+" : "+ stack_df[col_cat].astype(str)+ '</b><br>' + col_stack+" : "+ stack_df[col_stack].astype(str)
+
+            for col in col_hover:
+                hovertemplate += '<br><b>' + col + ': ' + current_df[current_df[col_cat] == category][col].astype(str) + '</b>'
+
+
+            fig.add_trace(
+                go.Bar(
+                    x=stack_df[col_x].tail(n_top_words),
+                    y=stack_df[col_y].tail(n_top_words),
+                    opacity=params["marker_opacity"],
+                    orientation=params["orientation"],   # horizontal bars
+                    name=stack,                          # trace name for legend
+                    text=text,                           # text to display
+                    textposition=textposition,           # text position
+                    textangle=params["xaxis_tickangle"], # text angle
+                    marker_color=marker_color,           # bar color
+                    hovertemplate=hovertemplate + "<extra></extra>" # hover info
+                ),
+                row=row_id,
+                col=col_id
+            )
+
+    for row_id in range(1, n_rows+1):
+        for col_id in range(1, n_cols+1):
+            fig.update_yaxes(title=params["yaxis_title"], row=row_id, col=1)
+            fig.update_xaxes(title=params["xaxis_title"], row=row_id, col=col_id)
+
+    fig.update_layout(
+        margin=dict(l=75, r=75, t=75, b=50),
+        title_text=params["title_text"],
+        width=n_cols * params["width"],                 # plot size
+        height=n_rows * n_top_words * params["height"], # plot size
+        showlegend=params["showlegend"],
+        font_family=params["font_family"],
+        font_size=params["font_size"],
+        template=params["template"],
+        plot_bgcolor=params["plot_bgcolor"],   # background color (plot)
+        paper_bgcolor=params["paper_bgcolor"], # background color (around plot)
+        uniformtext_minsize=params["uniformtext_minsize"],
+        barmode=params['barmode']
+    )
+
+    fig.update_yaxes(
+        # title=params["yaxis_title"],
+        title_font_size=params["yaxis_title_font_size"],
+        tickangle=params["yaxis_tickangle"],
+        tickfont_size=params["yaxis_tickfont_size"],
+        range=params["yaxis_range"],
+        showgrid=params["yaxis_showgrid"],
+        showline=params["yaxis_showline"],
+        zeroline=params["yaxis_zeroline"],
+        gridwidth=params["yaxis_gridwidth"],
+        gridcolor=params["yaxis_gridcolor"],
+        linewidth=params["yaxis_linewidth"],
+        linecolor=params["yaxis_linecolor"],
+        mirror=params["yaxis_mirror"],
+        layer="below traces",
+    )
+
+    fig.update_xaxes(
+        # title=params["xaxis_title"],
+        title_font_size=params["xaxis_title_font_size"],
+        tickangle=params["xaxis_tickangle"],
+        tickfont_size=params["xaxis_tickfont_size"],
+        range=params["xaxis_range"],
+        showgrid=params["xaxis_showgrid"],
+        showline=params["xaxis_showline"],
+        zeroline=params["xaxis_zeroline"],
+        gridwidth=params["xaxis_gridwidth"],
+        gridcolor=params["xaxis_gridcolor"],
+        linewidth=params["xaxis_linewidth"],
+        linecolor=params["xaxis_linecolor"],
+        mirror=params["xaxis_mirror"],
+        layer="below traces"
+    )
+    return fig
+
 # def bar_subplots(df: pd.DataFrame,
 #                  col_x: str,
 #                  col_y: str,
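A usage sketch for the new bar_subplots_per_cat; all column names, the data, and the extra n_cols keyword (assumed to be one of the general_kwargs defaults that params.update(kwargs) overrides) are illustrative.

import pandas as pd
from opsci_toolbox.helpers.dataviz import bar_subplots_per_cat

df = pd.DataFrame({
    "freq":      [12, 9, 7, 15, 4],
    "word":      ["good", "bad", "ok", "fast", "slow"],
    "topic":     ["service", "service", "service", "delivery", "delivery"],
    "sentiment": ["pos", "neg", "neu", "pos", "neg"],
})

fig = bar_subplots_per_cat(
    df,
    col_x="freq",            # bar length
    col_y="word",            # bar label
    col_cat="topic",         # one subplot per topic
    col_stack="sentiment",   # stacked traces within each subplot
    color_palette={"pos": "#2ca02c", "neg": "#d62728", "neu": "#7f7f7f"},
    n_top_words=10,
    n_cols=2,                # forwarded through **kwargs
)
fig.show()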
opsci_toolbox/helpers/dates.py
CHANGED
@@ -77,3 +77,49 @@ def df_col_to_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
     df[col] = pd.to_datetime(df[col])
     return df
 
+
+# from dateutil import parser
+# from datetime import datetime
+
+# def detect_date_format(date_string):
+#     formats = [
+#         # Date formats
+#         "%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%m-%d-%Y",
+#         "%Y/%m/%d", "%d/%m/%Y", "%Y.%m.%d", "%d.%m.%Y",
+#         "%d %b %Y", "%d %B %Y", "%b %d, %Y", "%B %d, %Y",
+#         "%d-%b-%Y", "%d-%B-%Y", "%b-%d-%Y", "%B-%d-%Y",
+#         # Date and time formats
+#         "%Y-%m-%d %H:%M:%S", "%d-%m-%Y %H:%M:%S", "%m/%d/%Y %H:%M:%S", "%m-%d-%Y %H:%M:%S",
+#         "%Y/%m/%d %H:%M:%S", "%d/%m/%Y %H:%M:%S", "%Y.%m.%d %H:%M:%S", "%d.%m.%Y %H:%M:%S",
+#         "%d %b %Y %H:%M:%S", "%d %B %Y %H:%M:%S", "%b %d, %Y %H:%M:%S", "%B %d, %Y %H:%M:%S",
+#         "%d-%b-%Y %H:%M:%S", "%d-%B-%Y %H:%M:%S", "%b-%d-%Y %H:%M:%S", "%B-%d-%Y %H:%M:%S",
+#         # Time formats with milliseconds
+#         "%Y-%m-%d %H:%M:%S.%f", "%d-%m-%Y %H:%M:%S.%f", "%m/%d/%Y %H:%M:%S.%f", "%m-%d-%Y %H:%M:%S.%f",
+#         "%Y/%m/%d %H:%M:%S.%f", "%d/%m/%Y %H:%M:%S.%f", "%Y.%m.%d %H:%M:%S.%f", "%d.%m.%Y %H:%M:%S.%f",
+#         "%d %b %Y %H:%M:%S.%f", "%d %B %Y %H:%M:%S.%f", "%b %d, %Y %H:%M:%S.%f", "%B %d, %Y %H:%M:%S.%f",
+#         "%d-%b-%Y %H:%M:%S.%f", "%d-%B-%Y %H:%M:%S.%f", "%b-%d-%Y %H:%M:%S.%f", "%B-%d-%Y %H:%M:%S.%f",
+#         # ISO format
+#         "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f",
+#         "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f",
+#         "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z",
+#         # Additional formats
+#         "%y/%m/%d %H:%M:%S", "%d/%m/%y %H:%M:%S", "%y-%m-%d %H:%M:%S", "%d-%m-%y %H:%M:%S",
+#     ]
+
+#     for date_format in formats:
+#         try:
+#             # Try to parse the date string with each format
+#             parsed_date = datetime.strptime(date_string, date_format)
+#             return date_format
+#         except ValueError:
+#             continue
+
+#     return None
+
+# def detect_date_format(date_string):
+#     try:
+#         # Use dateutil parser to parse the date string
+#         parsed_date = parser.parse(date_string, fuzzy=False)
+#         return parsed_date
+#     except ValueError:
+#         return None
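The dates.py addition is entirely commented out, but the idea it sketches (probe a list of candidate strptime formats and return the first that parses) is easy to exercise. A trimmed, working sketch of that approach, with a shortened format list; this is not the package's code:

from datetime import datetime
from typing import Optional

def detect_date_format(date_string: str) -> Optional[str]:
    formats = ["%Y-%m-%d", "%d/%m/%Y", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S"]
    for date_format in formats:
        try:
            datetime.strptime(date_string, date_format)
            return date_format
        except ValueError:
            continue
    return None  # no candidate matched

print(detect_date_format("2024-06-01"))        # %Y-%m-%d
print(detect_date_format("01/06/2024 10:00"))  # None with this short list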