halib 0.1.50__tar.gz → 0.1.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. halib-0.1.52/MANIFEST.in +3 -0
  2. {halib-0.1.50/halib.egg-info → halib-0.1.52}/PKG-INFO +5 -1
  3. {halib-0.1.50 → halib-0.1.52}/README.md +4 -0
  4. halib-0.1.52/halib/research/perftb.py +756 -0
  5. {halib-0.1.50 → halib-0.1.52/halib.egg-info}/PKG-INFO +5 -1
  6. {halib-0.1.50 → halib-0.1.52}/halib.egg-info/SOURCES.txt +3 -4
  7. {halib-0.1.50 → halib-0.1.52}/setup.py +1 -1
  8. halib-0.1.50/halib/research/benchquery.py +0 -131
  9. halib-0.1.50/test/test15.py +0 -14
  10. halib-0.1.50/test/test_df_creator.py +0 -60
  11. {halib-0.1.50 → halib-0.1.52}/.gitignore +0 -0
  12. {halib-0.1.50 → halib-0.1.52}/GDriveFolder.txt +0 -0
  13. {halib-0.1.50 → halib-0.1.52}/LICENSE.txt +0 -0
  14. {halib-0.1.50 → halib-0.1.52}/guide_publish_pip.pdf +0 -0
  15. {halib-0.1.50 → halib-0.1.52}/halib/__init__.py +0 -0
  16. {halib-0.1.50 → halib-0.1.52}/halib/common.py +0 -0
  17. {halib-0.1.50 → halib-0.1.52}/halib/cuda.py +0 -0
  18. {halib-0.1.50 → halib-0.1.52}/halib/filetype/__init__.py +0 -0
  19. {halib-0.1.50 → halib-0.1.52}/halib/filetype/csvfile.py +0 -0
  20. {halib-0.1.50 → halib-0.1.52}/halib/filetype/jsonfile.py +0 -0
  21. {halib-0.1.50 → halib-0.1.52}/halib/filetype/textfile.py +0 -0
  22. {halib-0.1.50 → halib-0.1.52}/halib/filetype/videofile.py +0 -0
  23. {halib-0.1.50 → halib-0.1.52}/halib/filetype/yamlfile.py +0 -0
  24. {halib-0.1.50 → halib-0.1.52}/halib/online/__init__.py +0 -0
  25. {halib-0.1.50 → halib-0.1.52}/halib/online/gdrive.py +0 -0
  26. {halib-0.1.50 → halib-0.1.52}/halib/online/gdrive_mkdir.py +0 -0
  27. {halib-0.1.50 → halib-0.1.52}/halib/online/gdrive_test.py +0 -0
  28. {halib-0.1.50 → halib-0.1.52}/halib/online/projectmake.py +0 -0
  29. {halib-0.1.50 → halib-0.1.52}/halib/research/__init__.py +0 -0
  30. {halib-0.1.50 → halib-0.1.52}/halib/research/dataset.py +0 -0
  31. {halib-0.1.50 → halib-0.1.52}/halib/research/plot.py +0 -0
  32. {halib-0.1.50 → halib-0.1.52}/halib/research/torchloader.py +0 -0
  33. {halib-0.1.50 → halib-0.1.52}/halib/research/wandb_op.py +0 -0
  34. {halib-0.1.50 → halib-0.1.52}/halib/rich_color.py +0 -0
  35. {halib-0.1.50 → halib-0.1.52}/halib/system/__init__.py +0 -0
  36. {halib-0.1.50 → halib-0.1.52}/halib/system/cmd.py +0 -0
  37. {halib-0.1.50 → halib-0.1.52}/halib/system/filesys.py +0 -0
  38. {halib-0.1.50 → halib-0.1.52}/halib/utils/__init__.py +0 -0
  39. {halib-0.1.50 → halib-0.1.52}/halib/utils/listop.py +0 -0
  40. {halib-0.1.50 → halib-0.1.52}/halib/utils/tele_noti.py +0 -0
  41. {halib-0.1.50 → halib-0.1.52}/halib.egg-info/dependency_links.txt +0 -0
  42. {halib-0.1.50 → halib-0.1.52}/halib.egg-info/requires.txt +0 -0
  43. {halib-0.1.50 → halib-0.1.52}/halib.egg-info/top_level.txt +0 -0
  44. {halib-0.1.50 → halib-0.1.52}/setup.cfg +0 -0
@@ -0,0 +1,3 @@
+ prune _archived
+ prune test
+ prune zout
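For context, each `prune DIR` directive above excludes that directory tree from the generated source distribution, which is why the `test/` and `zout/` folders (including the test scripts removed further down in this diff) no longer ship inside the 0.1.52 sdist.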
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: halib
- Version: 0.1.50
+ Version: 0.1.52
  Summary: Small library for common tasks
  Author: Hoang Van Ha
  Author-email: hoangvanhauit@gmail.com
@@ -15,6 +15,10 @@ License-File: LICENSE.txt
 
  Helper package for coding and automation
 
+ **Version 0.1.52**
+
+ + add `research/perftb` module to allow creating and managing performance tables for experiments, including filtering by datasets, metrics, and experiments.
+
  **Version 0.1.50**
 
  + add `pprint_local_path` to print local path (file/directory) in clickable link (as file URI)
@@ -1,5 +1,9 @@
  Helper package for coding and automation
 
+ **Version 0.1.52**
+
+ + add `research/perftb` module to allow creating and managing performance tables for experiments, including filtering by datasets, metrics, and experiments.
+
  **Version 0.1.50**
 
  + add `pprint_local_path` to print local path (file/directory) in clickable link (as file URI)
@@ -0,0 +1,756 @@
1
+ import warnings
2
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
3
+ import os
4
+ import random
5
+ import itertools
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from collections import defaultdict
9
+ from plotly.subplots import make_subplots
10
+ from typing import Dict, List, Union, Optional
11
+
12
+ from rich.pretty import pprint
13
+ import pandas as pd
14
+
15
+ # from halib import *
16
+ # internal imports
17
+ from ..filetype import csvfile
18
+ from ..common import ConsoleLog
19
+
20
+
21
+ class DatasetMetrics:
22
+ """Class to store metrics definitions for a specific dataset."""
23
+
24
+ def __init__(self, dataset_name: str, metric_names: List[str]):
25
+ self.dataset_name = dataset_name
26
+ self.metric_names = set(metric_names) # Unique metric names
27
+ self.experiment_results: Dict[str, Dict[str, Union[float, int, None]]] = (
28
+ defaultdict(dict)
29
+ )
30
+
31
+ def add_experiment_result(
32
+ self, experiment_name: str, metrics: Dict[str, Union[float, int]]
33
+ ) -> None:
34
+ """Add experiment results for this dataset, only for defined metrics."""
35
+ # normalize metric names to lowercase
36
+ metrics = {k.lower(): v for k, v in metrics.items()}
37
+ # make sure every metric in metrics is defined for this dataset
38
+ for metric in metrics:
39
+ assert metric in self.metric_names, (
40
+ f"Metric <<{metric}>> not defined for dataset <<{self.dataset_name}>>. "
41
+ f"Available metrics: {self.metric_names}"
42
+ )
43
+ for metric in self.metric_names:
44
+ self.experiment_results[experiment_name][metric] = metrics.get(metric)
45
+
46
+ def get_metrics(self, experiment_name: str) -> Dict[str, Union[float, int, None]]:
47
+ """Retrieve metrics for a specific experiment."""
48
+ return self.experiment_results.get(
49
+ experiment_name, {metric: None for metric in self.metric_names}
50
+ )
51
+
52
+ def __str__(self) -> str:
53
+ return f"Dataset: {self.dataset_name}, Metrics: {', '.join(self.metric_names)}"
54
+
55
+
56
+ class PerfTB:
57
+ """Class to manage performance table data with datasets as primary structure."""
58
+
59
+ def __init__(self):
60
+ # Dictionary of dataset_name -> DatasetMetrics
61
+ self.datasets: Dict[str, DatasetMetrics] = {}
62
+ self.experiments: set = set()
63
+
64
+ def add_dataset(self, dataset_name: str, metric_names: List[str]) -> None:
65
+ """
66
+ Add a new dataset with its associated metrics.
67
+
68
+ Args:
69
+ dataset_name: Name of the dataset
70
+ metric_names: List of metric names for this dataset
71
+ """
72
+ # normalize metric names to lowercase
73
+ metric_names = [metric.lower() for metric in metric_names]
74
+ self.datasets[dataset_name] = DatasetMetrics(dataset_name, metric_names)
75
+
76
+ def table_meta(self):
77
+ """
78
+ Return metadata about the performance table.
79
+ """
80
+ return {
81
+ "num_datasets": len(self.datasets),
82
+ "num_experiments": len(self.experiments),
83
+ "datasets_metrics": {
84
+ dataset_name: dataset.metric_names
85
+ for dataset_name, dataset in self.datasets.items()
86
+ }
87
+ }
88
+
89
+ def add_experiment(
90
+ self,
91
+ experiment_name: str,
92
+ dataset_name: str,
93
+ metrics: Dict[str, Union[float, int]],
94
+ ) -> None:
95
+ """
96
+ Add experiment results for a specific dataset.
97
+
98
+ Args:
99
+ experiment_name: Name or identifier of the experiment
100
+ dataset_name: Name of the dataset
101
+ metrics: Dictionary of metric names and their values
102
+ """
103
+ # normalize metric names to lowercase
104
+ metrics = {k.lower(): v for k, v in metrics.items()}
105
+ if dataset_name not in self.datasets:
106
+ raise ValueError(
107
+ f"Dataset <<{dataset_name}>> not defined. Add dataset first."
108
+ )
109
+ self.experiments.add(experiment_name)
110
+ self.datasets[dataset_name].add_experiment_result(experiment_name, metrics)
111
+
112
+ def get_metrics_for_dataset(
113
+ self, experiment_name: str, dataset_name: str
114
+ ) -> Optional[Dict[str, Union[float, int, None]]]:
115
+ """
116
+ Retrieve performance metrics for a specific dataset and experiment.
117
+
118
+ Args:
119
+ experiment_name: Name or identifier of the experiment
120
+ dataset_name: Name of the dataset
121
+
122
+ Returns:
123
+ Dictionary of metrics or None if dataset not found
124
+ """
125
+ dataset = self.datasets.get(dataset_name)
126
+ if dataset:
127
+ return dataset.get_metrics(experiment_name)
128
+ return None
129
+
130
+ def get_all_experiments(self) -> List[str]:
131
+ """Return list of all experiment names."""
132
+ return sorted(self.experiments)
133
+
134
+ def get_all_datasets(self) -> List[str]:
135
+ """Return list of all dataset names."""
136
+ return sorted(self.datasets.keys())
137
+
138
+ def to_dataframe(self) -> pd.DataFrame:
139
+ """
140
+ Convert the performance table to a pandas DataFrame with MultiIndex columns.
141
+ Level 1: Datasets
142
+ Level 2: Metrics
143
+
144
+ Returns:
145
+ pandas DataFrame with experiments as rows and (dataset, metric) as columns
146
+ """
147
+ # Create MultiIndex for columns (dataset, metric)
148
+ columns = []
149
+ for dataset_name in self.get_all_datasets():
150
+ for metric in sorted(self.datasets[dataset_name].metric_names):
151
+ columns.append((dataset_name, metric))
152
+ columns = pd.MultiIndex.from_tuples(columns, names=["Dataset", "Metric"])
153
+
154
+ # Initialize DataFrame with experiments as index
155
+ df = pd.DataFrame(index=sorted(self.experiments), columns=columns)
156
+
157
+ # Populate DataFrame
158
+ for exp in self.experiments:
159
+ for dataset_name in self.datasets:
160
+ metrics = self.datasets[dataset_name].get_metrics(exp)
161
+ for metric, value in metrics.items():
162
+ df.loc[exp, (dataset_name, metric)] = value
163
+
164
+ return df
165
+
166
+ def plot(
167
+ self,
168
+ save_path: str,
169
+ title: Optional[str] = None,
170
+ custom_highlight_method_fn: Optional[callable] = None,
171
+ custom_sort_exp_fn: Optional[
172
+ callable
173
+ ] = None, # Function to sort experiments; should accept a list of experiment names and return a sorted list
174
+ open_plot: bool = False,
175
+ show_raw_df: bool = False,
176
+ experiment_names: Optional[List[str]] = None,
177
+ datasets: Optional[List[str]] = None,
178
+ height: int = 400,
179
+ width: int = 700,
180
+ ) -> None:
181
+ """
182
+ Plot comparison of experiments across datasets and their metrics using Plotly.
183
+ Splits plots if metrics have significantly different value ranges.
184
+
185
+ Args:
186
+ save_path: Base file path to save the figure(s) (extension optional)
187
+ open_plot: If True, attempts to open the saved image file(s) (Windows only)
188
+ experiment_names: List of experiments to compare (default: all)
189
+ datasets: List of datasets to include (default: all)
190
+ height: Base height of the plot (scaled by # of facet rows)
191
+ width: Width of the plot
192
+ range_diff_threshold: Range threshold to split metrics across different axes
193
+ """
194
+ experiment_names = experiment_names or self.get_all_experiments()
195
+ datasets = datasets or self.get_all_datasets()
196
+
197
+ records = []
198
+
199
+ for dataset in datasets:
200
+ if dataset not in self.datasets:
201
+ print(f"Warning: Dataset '{dataset}' not found. Skipping...")
202
+ continue
203
+
204
+ metric_names = sorted(self.datasets[dataset].metric_names)
205
+ for exp in experiment_names:
206
+ metric_values = self.get_metrics_for_dataset(exp, dataset)
207
+ if not metric_values:
208
+ continue
209
+ for metric in metric_names:
210
+ value = metric_values.get(metric)
211
+ if value is not None:
212
+ records.append(
213
+ {
214
+ "Experiment": exp,
215
+ "Dataset": dataset,
216
+ "Metric": metric,
217
+ "Value": value,
218
+ }
219
+ )
220
+
221
+ if not records:
222
+ print("No data found to plot.")
223
+ return
224
+
225
+ df = pd.DataFrame(records)
226
+ if show_raw_df:
227
+ with ConsoleLog("PerfTB DF"):
228
+ csvfile.fn_display_df(df)
229
+
230
+ metric_list = df["Metric"].unique()
231
+ fig = make_subplots(
232
+ rows=len(metric_list),
233
+ cols=1,
234
+ shared_xaxes=False,
235
+ subplot_titles=metric_list,
236
+ vertical_spacing=0.1,
237
+ )
238
+
239
+ unique_experiments = df["Experiment"].unique()
240
+ color_cycle = itertools.cycle(px.colors.qualitative.Plotly)
241
+
242
+ color_map = {
243
+ exp: color
244
+ for exp, color in zip(unique_experiments, color_cycle)
245
+ }
246
+
247
+ pattern_shapes = ["x", "-", "/", "\\", "|", "+", "."]
248
+ pattern_color = "black" # Color for patterns
249
+
250
+ current_our_method = -1 # Start with -1 to avoid index error
251
+ exp_pattern_dict = {}
252
+ for row_idx, metric in enumerate(metric_list, start=1):
253
+ metric_df = df[df["Metric"] == metric]
254
+ list_exp = list(metric_df["Experiment"].unique())
255
+ if custom_sort_exp_fn:
256
+ list_exp = custom_sort_exp_fn(list_exp)
257
+ for exp in list_exp:
258
+ should_highlight = (
259
+ custom_highlight_method_fn is not None and custom_highlight_method_fn(exp)
260
+ )
261
+ pattern_shape = "" # default no pattern
262
+ if should_highlight and exp not in exp_pattern_dict:
263
+ current_our_method += 1
264
+ pattern_shape = pattern_shapes[
265
+ current_our_method % len(pattern_shapes)
266
+ ]
267
+ exp_pattern_dict[exp] = pattern_shape
268
+ elif exp in exp_pattern_dict:
269
+ pattern_shape = exp_pattern_dict[exp]
270
+ exp_df = metric_df[metric_df["Experiment"] == exp]
271
+ fig.add_trace(
272
+ go.Bar(
273
+ x=exp_df["Dataset"],
274
+ y=exp_df["Value"],
275
+ name=f"{exp}",
276
+ legendgroup=exp,
277
+ showlegend=(row_idx == 1), # Show legend only for the first row
278
+ marker=dict(
279
+ color=color_map[exp],
280
+ pattern=(
281
+ dict(shape=pattern_shape, fgcolor=pattern_color)
282
+ if pattern_shape
283
+ else None
284
+ ),
285
+ ),
286
+ text=[f"{v:.5f}" for v in exp_df["Value"]],
287
+ textposition="auto", # <- position them automatically
288
+ ),
289
+ row=row_idx,
290
+ col=1,
291
+ )
292
+
293
+ # Manage layout
294
+ if title is None:
295
+ title = "Experiment Comparison by Metric Groups"
296
+ fig.update_layout(
297
+ height=height * len(metric_list),
298
+ width=width,
299
+ title_text=title,
300
+ barmode="group",
301
+ showlegend=True,
302
+ )
303
+
304
+ # Save and open plot
305
+ if save_path:
306
+ fig.write_image(save_path, engine="kaleido")
307
+ # pprint(f"Saved: {os.path.abspath(save_path)}")
308
+ if open_plot and os.name == "nt": # Windows
309
+ os.system(f'start "" "{os.path.abspath(save_path)}"')
310
+ return fig
311
+
312
+ def to_csv(self, outfile: str, sep=";", condensed_multiindex: bool = True) -> None:
313
+ """
314
+ Save the performance table to a CSV file.
315
+
316
+ Args:
317
+ outfile: Path to the output CSV file
318
+ """
319
+ df = self.to_dataframe()
320
+ if condensed_multiindex:
321
+ # Extract levels
322
+ level0 = df.columns.get_level_values(0)
323
+ level1 = df.columns.get_level_values(1)
324
+
325
+ # Build new level0 with blanks after first appearance
326
+ new_level0 = []
327
+ prev = None
328
+ for val in level0:
329
+ if val == prev:
330
+ new_level0.append("")
331
+ else:
332
+ new_level0.append(val)
333
+ prev = val
334
+
335
+ # Write to CSV
336
+ df.columns = pd.MultiIndex.from_arrays([new_level0, level1])
337
+ df.to_csv(outfile, index=True, sep=sep)
338
+
339
+ def display(self) -> None:
340
+ """
341
+ Display the performance table as a DataFrame.
342
+ """
343
+ df = self.to_dataframe()
344
+ csvfile.fn_display_df(df)
345
+
346
+ @classmethod
347
+ def _read_condensed_multiindex_csv(cls, filepath: str, sep=";", col_exclude_fn: Optional[callable] = None) -> pd.DataFrame:
348
+ # Read first two header rows
349
+ df = pd.read_csv(filepath, header=[0, 1], sep=sep)
350
+ # Extract levels
351
+ level0 = df.columns.get_level_values(0)
352
+ level1 = df.columns.get_level_values(1)
353
+ # pprint(f'{level0=}')
354
+ # pprint(f'{level1=}')
355
+ # if blank values in level0, fill them after first appearance
356
+ # for level0, we need to fill in blanks after first appearance
357
+ new_level0 = []
358
+ last_non_blank = level0[0] # Start with the first value
359
+ assert last_non_blank != "", (
360
+ "First level0 value should not be blank. "
361
+ "Check the CSV file format."
362
+ )
363
+ for val in level0:
364
+ if val == "" or "Unnamed: " in val:
365
+ new_level0.append(last_non_blank)
366
+ else:
367
+ new_level0.append(val)
368
+ last_non_blank = val
369
+ # pprint(new_level0)
370
+ # Rebuild MultiIndex
371
+ excluded_indices = []
372
+ if col_exclude_fn:
373
+ excluded_indices = []
374
+ for idx, val in enumerate(new_level0):
375
+ if col_exclude_fn(val):
376
+ excluded_indices.append(idx)
377
+ for idx, val in enumerate(level1):
378
+ if col_exclude_fn(val):
379
+ excluded_indices.append(idx)
380
+ excluded_indices = list(set(excluded_indices))
381
+
382
+ num_prev_cols = len(new_level0)
383
+ # Remove excluded indices from both levels
384
+ new_level0 = [
385
+ val for idx, val in enumerate(new_level0) if idx not in excluded_indices
386
+ ]
387
+ new_level1 = [
388
+ val for idx, val in enumerate(level1) if idx not in excluded_indices
389
+ ]
390
+ num_after_cols = len(new_level0)
391
+ if num_prev_cols != num_after_cols:
392
+ # get df with only the new level0 index
393
+ df = df.iloc[:, [i for i in range(len(df.columns)) if i not in excluded_indices]]
394
+
395
+ df.columns = pd.MultiIndex.from_arrays([new_level0, new_level1])
396
+ return df
397
+
398
+ @classmethod
399
+ def from_dataframe(
400
+ cls,
401
+ df: pd.DataFrame
402
+ ) -> "PerfTB":
403
+ """
404
+ Load performance table from a DataFrame.
405
+
406
+ Args:
407
+ df: Input DataFrame
408
+ Note: the first column of df is treated as the experiment (method) name.
409
+ """
410
+ # console.log('--- PerfTB.from_dataframe ---')
411
+ # csvfile.fn_display_df(df)
412
+ cls_instance = cls()
413
+ # first loop through MultiIndex columns and extract datasets with their metrics
414
+ dataset_metrics = {}
415
+ for (dataset_name, metric_name) in df.columns[1:]:
416
+ if dataset_name not in dataset_metrics:
417
+ dataset_metrics[dataset_name] = []
418
+ dataset_metrics[dataset_name].append(metric_name)
419
+ for dataset_name, metric_names in dataset_metrics.items():
420
+ cls_instance.add_dataset(dataset_name, metric_names)
421
+
422
+ def safe_cast(val):
423
+ try:
424
+ return float(val)
425
+ except (ValueError, TypeError):
426
+ return None
427
+ for _, row in df.iterrows():
428
+ # Extract experiment name by first column
429
+ experiment_name = row.iloc[0]
430
+ # Iterate over MultiIndex columns (except first column)
431
+ metrics = {}
432
+ for dataset_name in dataset_metrics.keys():
433
+ for metric_name in dataset_metrics[dataset_name]:
434
+ # Get the value for this dataset and metric
435
+ value = row[(dataset_name, metric_name)]
436
+ # Cast to float or None if not applicable
437
+ metrics[metric_name] = safe_cast(value)
438
+
439
+ cls_instance.add_experiment(
440
+ experiment_name=experiment_name,
441
+ dataset_name=dataset_name,
442
+ metrics=metrics,
443
+ )
444
+
445
+ return cls_instance
446
+
447
+ @classmethod
448
+ def from_csv(
449
+ cls,
450
+ csv_file: str,
451
+ sep: str = ";",
452
+ col_exclude_fn: Optional[callable] = None,
453
+ ) -> "PerfTB":
454
+ """
455
+ Load performance table from a CSV file.
456
+
457
+ Args:
458
+ csv_file: Path to the CSV file
459
+ sep: Separator used in the CSV file
460
+ """
461
+ df = cls._read_condensed_multiindex_csv(csv_file, sep=sep, col_exclude_fn=col_exclude_fn)
462
+ return cls.from_dataframe(df)
463
+
464
+ def filter_index_info(
465
+ self):
466
+ """
467
+ Filter the index information of the performance table.
468
+ """
469
+ datasets_metrics = {
470
+ dataset_name: dataset.metric_names
471
+ for dataset_name, dataset in self.datasets.items()
472
+ }
473
+ meta_dict = {}
474
+ for i, (dataset_name, metrics) in enumerate(datasets_metrics.items()):
475
+ sorted_metrics = sorted(metrics) # make sure output should be same
476
+ meta_dict[dataset_name] = {
477
+ "index": i,
478
+ "metrics": sorted(
479
+ list(zip(sorted_metrics, range(len(sorted_metrics))))
480
+ ), # (metric_name, index)
481
+ }
482
+ return meta_dict
483
+
484
+ def filter(
485
+ self,
486
+ dataset_list: List[Union[str, int]] = None, # list of strings or integers
487
+ metrics_list: List[Union[list, str]] = None,
488
+ experiment_list: List[str] = None,
489
+ ) -> "PerfTB":
490
+ """
491
+ Filter the performance table by datasets and experiments.
492
+ Returns a new PerfTB instance with filtered data.
493
+ Args:
494
+ dataset_list: List of dataset names or indices to filter (optional)
495
+ metrics_list: List of metric names to filter (optional). You can also pass a list of lists (one list of metric names per dataset) to filter each dataset by a different set of metrics; a single flat list is applied to all datasets.
496
+ experiment_list: List of experiment NAMES (string) to filter (optional). Indices are not supported.
497
+ """
498
+ meta_filter_dict = self.filter_index_info()
499
+
500
+ if experiment_list is None:
501
+ experiment_list = self.get_all_experiments()
502
+ else:
503
+ # make sure all experiments are found in the performance table
504
+ for exp in experiment_list:
505
+ if exp not in self.experiments:
506
+ raise ValueError(
507
+ f"Experiment <<{exp}>> not found in the performance table. Available experiments: {self.get_all_experiments()}"
508
+ )
509
+ # pprint(f"Filtering experiments: {experiment_list}")
510
+ # get dataset list
511
+ if dataset_list is not None:
512
+ # if all item in dataset_list are integers, convert them to dataset names
513
+ if all(isinstance(item, int) and 0 <= item < len(meta_filter_dict) for item in dataset_list):
514
+ dataset_list = [
515
+ list(meta_filter_dict.keys())[item] for item in dataset_list
516
+ ]
517
+ elif all(isinstance(item, str) for item in dataset_list):
518
+ # if all items are strings, use them as dataset names
519
+ dataset_list = [
520
+ item for item in dataset_list if item in meta_filter_dict
521
+ ]
522
+ else:
523
+ raise ValueError(
524
+ f"dataset_list should be a list of strings (dataset names) or integers (indices, should be <= {len(meta_filter_dict) - 1}). Got: {dataset_list}"
525
+ )
526
+ else:
527
+ dataset_list = self.get_all_datasets()
528
+
529
+ filter_metrics_ls = [] # [list_metric_db_A, list_metric_db_B, ...]
530
+ all_ds_metrics = []
531
+ for dataset_name in dataset_list:
532
+ ds_meta = meta_filter_dict.get(dataset_name, None)
533
+ if ds_meta:
534
+ ds_metrics = ds_meta["metrics"]
535
+ all_ds_metrics.append([metric[0] for metric in ds_metrics])
536
+
537
+ if metrics_list is None:
538
+ filter_metrics_ls = all_ds_metrics
539
+ elif isinstance(metrics_list, list):
540
+ all_string = all(isinstance(item, str) for item in metrics_list)
541
+ if all_string:
542
+ # normalize metrics_list to lowercase
543
+ metrics_list = [metric.lower() for metric in metrics_list]
544
+ filter_metrics_ls = [metrics_list] * len(dataset_list)
545
+ else:
546
+ all_list = all(isinstance(item, list) for item in metrics_list)
547
+ if all_list:
550
+ if len(metrics_list) != len(dataset_list):
551
+ raise ValueError(
552
+ f"metrics_list should be a list of strings (metric names) or a list of lists of metric names for each dataset. Got: {len(metrics_list)} metrics for {len(dataset_list)} datasets."
553
+ )
554
+ # normalize each list of metrics to lowercase
555
+ filter_metrics_ls = [
556
+ [metric.lower() for metric in item] for item in metrics_list
557
+ ]
558
+
559
+ else:
560
+ raise ValueError(
561
+ f"metrics_list should be a list of strings (metric names) or a list of lists of metric names for each dataset. Got: {metrics_list}"
562
+ )
563
+
564
+ # make sure that all metrics in filtered_metrics_list are valid for the datasets
565
+ final_metrics_list = []
566
+ for idx, dataset_name in enumerate(dataset_list):
567
+ valid_metrics_list = all_ds_metrics[idx]
568
+ current_metrics = filter_metrics_ls[idx]
569
+ new_valid_ds_metrics = []
570
+ for metric in current_metrics:
571
+ if metric in valid_metrics_list:
572
+ new_valid_ds_metrics.append(metric)
573
+ assert len(new_valid_ds_metrics) > 0, (
574
+ f"No valid metrics found for dataset <<{dataset_name}>>. "
575
+ f"Available metrics: {valid_metrics_list}. "
576
+ f"Filtered metrics: {current_metrics}"
577
+ )
578
+ final_metrics_list.append(new_valid_ds_metrics)
579
+
580
+ assert len(experiment_list) > 0, "No experiments to filter."
581
+ assert len(dataset_list) > 0, "No datasets to filter."
582
+ assert len(final_metrics_list) > 0, "No metrics to filter."
583
+ filtered_tb = PerfTB()
584
+ for db, metrics in zip(dataset_list, final_metrics_list):
585
+ # add dataset with its metrics
586
+ filtered_tb.add_dataset(db, metrics)
587
+
588
+ # now add experiments with their metrics
589
+ for exp in experiment_list:
590
+ for db, metrics in zip(dataset_list, final_metrics_list):
591
+ # get metrics for this experiment and dataset
592
+ metrics_dict = self.get_metrics_for_dataset(exp, db)
593
+ if metrics_dict:
594
+ # filter metrics to only those defined for this dataset
595
+ filtered_metrics = {k: v for k, v in metrics_dict.items() if k in metrics}
596
+ if filtered_metrics:
597
+ filtered_tb.add_experiment(exp, db, filtered_metrics)
598
+
599
+ return filtered_tb
600
+
601
+
602
+ def test_perftb_create() -> PerfTB:
603
+ # Create a performance table
604
+ perf_table = PerfTB()
605
+
606
+ # Define datasets and their metrics first
607
+ perf_table.add_dataset("dataset1", ["accuracy", "f1_score"])
608
+ perf_table.add_dataset("dataset2", ["accuracy", "f1_score", "precision"])
609
+
610
+ # Add experiment results
611
+ perf_table.add_experiment(
612
+ experiment_name="our_method1",
613
+ dataset_name="dataset1",
614
+ metrics={"accuracy": 100, "f1_score": 0.93},
615
+ )
616
+ perf_table.add_experiment(
617
+ experiment_name="our_method2",
618
+ dataset_name="dataset2",
619
+ metrics={"accuracy": 100, "precision": 0.87}, # Missing precision will be None
620
+ )
621
+ perf_table.add_experiment(
622
+ experiment_name="our_method2",
623
+ dataset_name="dataset1",
624
+ metrics={"accuracy": 90, "f1_score": 0.85},
625
+ )
626
+ method_list = [f"method{idx}" for idx in range(3, 7)]
627
+ # add random values for methods 3-6
628
+ for method in method_list:
629
+ perf_table.add_experiment(
630
+ experiment_name=method,
631
+ dataset_name="dataset1",
632
+ metrics={
633
+ "accuracy": random.randint(80, 100),
634
+ "f1_score": random.uniform(0.7, 0.95),
635
+ },
636
+ )
637
+ perf_table.add_experiment(
638
+ experiment_name=method,
639
+ dataset_name="dataset2",
640
+ metrics={
641
+ "accuracy": random.randint(80, 100),
642
+ "precision": random.uniform(0.7, 0.95),
643
+ "f1_score": random.uniform(0.7, 0.95),
644
+ },
645
+ )
646
+
647
+ # Get metrics for a specific dataset
648
+ metrics = perf_table.get_metrics_for_dataset("our_method1", "dataset1")
649
+ if metrics:
650
+ print(f"\nMetrics for model1 on dataset1: {metrics}")
651
+
652
+ return perf_table
653
+
654
+ def test_perftb_dataframe() -> None:
655
+ # Create a performance table
656
+ perf_table = test_perftb_create()
657
+
658
+ # Convert to DataFrame
659
+ df = perf_table.to_dataframe()
660
+ print("\nPerformance Table as DataFrame:")
661
+ csvfile.fn_display_df(df)
662
+
663
+ # Save to CSV
664
+ perf_table.to_csv("zout/perf_tb.csv", sep=";")
665
+
666
+ def test_perftb_plot() -> None:
667
+ # Create a performance table
668
+ perf_table = test_perftb_create()
669
+
670
+ # Plot the performance table
671
+ perf_table.plot(
672
+ save_path="zout/perf_tb.svg",
673
+ title="Performance Comparison",
674
+ custom_highlight_method_fn=lambda exp: exp.startswith("our_method"),
675
+ custom_sort_exp_fn=lambda exps: sorted(exps, reverse=True),
676
+ open_plot=False,
677
+ show_raw_df=False,
678
+ )
679
+
680
+ def test_load_perftb() -> None:
681
+ # Load performance table from CSV
682
+ def col_exclude_fn(col_name: str) -> bool:
683
+ # Exclude columns that are not metrics (e.g., "Unnamed" columns)
684
+ return col_name in ["Year", "data split", "test procedure", "code?"]
685
+
686
+ perf_table = PerfTB.from_csv("test/bench.csv", sep=";", col_exclude_fn=col_exclude_fn)
687
+ # print("\nLoaded Performance Table:")
688
+ # perf_table.display()
689
+ perf_table.to_csv("zout/loaded_perf_tb.csv", sep=";")
690
+
691
+ # Plot loaded performance table
692
+ perf_table.plot(
693
+ save_path="zout/loaded_perf_plot.svg",
694
+ title="Loaded Performance Comparison",
695
+ custom_highlight_method_fn=lambda exp: exp.startswith("Ours"),
696
+ custom_sort_exp_fn=lambda exps: sorted(exps, reverse=True),
697
+ open_plot=False,
698
+ show_raw_df=False,
699
+ )
700
+ return perf_table
701
+
702
+ def test_filtered_perftb() -> None:
703
+ perf_table_item = test_load_perftb()
704
+ # pprint(perf_table_item.meta())
705
+ pprint(perf_table_item.filter_index_info())
706
+ perf_table_item.filter(
707
+ dataset_list=[0, 2], # Use indices of datasets
708
+ # dataset_list=[
709
+ # "BOWFire_dataset_chino2015bowfire (small)",
710
+ # "FD-Dataset_li2020efficient (large)",
711
+ # ],
712
+ metrics_list=[
713
+ "acc",
714
+ "f1",
715
+ ], # [["acc"], ["f1"]], # Use a single list of metrics for all datasets or a list of lists for different metrics per dataset
716
+ # experiment_list=["ADFireNet_yar2023effective"],
717
+ ).plot(
718
+ save_path="zout/filtered_perf_tb.svg",
719
+ custom_highlight_method_fn=lambda exp: exp.startswith("Ours"),
720
+ custom_sort_exp_fn=lambda exps: sorted(exps, reverse=True),
721
+ title="Filtered Performance Comparison",
722
+ )
723
+
724
+ def test_mics() -> None:
725
+ # Test reading a CSV with MultiIndex columns
726
+ perf_table = test_perftb_create()
727
+ perf_table.display()
728
+ perf_table.plot(
729
+ save_path="zout/test1.svg",
730
+ title="Performance Comparison",
731
+ custom_highlight_method_fn=lambda exp: exp.startswith("our_"),
732
+ custom_sort_exp_fn=lambda exps: sorted(exps, reverse=True),
733
+ open_plot=False,
734
+ )
735
+ perf_table.to_csv("zout/perf_tb1.csv", sep=";")
736
+ tb = PerfTB.from_csv("./zout/perf_tb1.csv", sep=";")
737
+ tb.display()
738
+ ftb = tb.filter(
739
+ dataset_list=[1],
740
+ metrics_list=["precision"],
741
+ experiment_list=["method3", "method6"],
742
+ )
743
+ ftb.display()
744
+
745
+ ftb.plot(
746
+ save_path="zout/perf_tb11_plot.svg",
747
+ title="Performance Comparison",
748
+ custom_highlight_method_fn=lambda exp: exp.startswith("our_"),
749
+ custom_sort_exp_fn=lambda exps: sorted(exps, reverse=True),
750
+ open_plot=True,
751
+ )
752
+
753
+
754
+ # Example usage
755
+ if __name__ == "__main__":
756
+ test_mics()
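For orientation, here is a minimal sketch of how the new `PerfTB` API added above might be used, modeled on the module's own test helpers; the experiment names, metric values, and output paths below are illustrative, not taken from the package:

    from halib.research.perftb import PerfTB

    tb = PerfTB()
    # Declare datasets and the metrics tracked for each (metric names are lowercased internally).
    tb.add_dataset("dataset1", ["accuracy", "f1_score"])
    # Record per-experiment results; metrics not supplied are stored as None.
    tb.add_experiment("baseline", "dataset1", {"accuracy": 91.2, "f1_score": 0.88})
    tb.add_experiment("our_method", "dataset1", {"accuracy": 95.0})

    df = tb.to_dataframe()             # experiments as rows, (Dataset, Metric) MultiIndex columns
    tb.to_csv("perf_tb.csv", sep=";")  # condensed two-row header, reloadable via PerfTB.from_csv
    subset = tb.filter(dataset_list=["dataset1"], metrics_list=["accuracy"])
    subset.display()

Filtering returns a new `PerfTB`, so the result can be plotted or exported the same way as the full table.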
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: halib
- Version: 0.1.50
+ Version: 0.1.52
  Summary: Small library for common tasks
  Author: Hoang Van Ha
  Author-email: hoangvanhauit@gmail.com
@@ -15,6 +15,10 @@ License-File: LICENSE.txt
 
  Helper package for coding and automation
 
+ **Version 0.1.52**
+
+ + add `research/perftb` module to allow creating and managing performance tables for experiments, including filtering by datasets, metrics, and experiments.
+
  **Version 0.1.50**
 
  + add `pprint_local_path` to print local path (file/directory) in clickable link (as file URI)
@@ -1,6 +1,7 @@
  .gitignore
  GDriveFolder.txt
  LICENSE.txt
+ MANIFEST.in
  README.md
  guide_publish_pip.pdf
  setup.py
@@ -25,8 +26,8 @@ halib/online/gdrive_mkdir.py
  halib/online/gdrive_test.py
  halib/online/projectmake.py
  halib/research/__init__.py
- halib/research/benchquery.py
  halib/research/dataset.py
+ halib/research/perftb.py
  halib/research/plot.py
  halib/research/torchloader.py
  halib/research/wandb_op.py
@@ -35,6 +36,4 @@ halib/system/cmd.py
  halib/system/filesys.py
  halib/utils/__init__.py
  halib/utils/listop.py
- halib/utils/tele_noti.py
- test/test15.py
- test/test_df_creator.py
+ halib/utils/tele_noti.py
@@ -8,7 +8,7 @@ with open("requirements.txt") as f:
 
  setuptools.setup(
  name="halib",
- version="0.1.50",
+ version="0.1.52",
  author="Hoang Van Ha",
  author_email="hoangvanhauit@gmail.com",
  description="Small library for common tasks",
@@ -1,131 +0,0 @@
1
- import pandas as pd
2
- from rich.pretty import pprint
3
- from argparse import ArgumentParser
4
-
5
- def cols_to_col_groups(df):
6
- columns = list(df.columns)
7
- # pprint(columns)
8
-
9
- col_groups = []
10
- current_group = []
11
-
12
- def have_unnamed(col_group):
13
- return any("unnamed" in col.lower() for col in col_group)
14
-
15
- for i, col in enumerate(columns):
16
- # Add the first column to the current group
17
- if not current_group:
18
- current_group.append(col)
19
- continue
20
-
21
- prev_col = columns[i - 1]
22
- # Check if current column is "unnamed" or shares base name with previous
23
- # Assuming "equal" means same base name (before any suffix like '_1')
24
- base_prev = (
25
- prev_col.split("_")[0].lower() if "_" in prev_col else prev_col.lower()
26
- )
27
- base_col = col.split("_")[0].lower() if "_" in col else col.lower()
28
- is_unnamed = "unnamed" in col.lower()
29
- is_equal = base_col == base_prev
30
-
31
- if is_unnamed or is_equal:
32
- # Add to current group
33
- current_group.append(col)
34
- else:
35
- # Start a new group
36
- col_groups.append(current_group)
37
- current_group = [col]
38
- # Append the last group
39
- if current_group:
40
- col_groups.append(current_group)
41
- meta_dict = {"common_cols": [], "db_cols": []}
42
- for group in col_groups:
43
- if not have_unnamed(group):
44
- meta_dict["common_cols"].extend(group)
45
- else:
46
- # find the first unnamed column
47
- named_col = next(
48
- (col for col in group if "unnamed" not in col.lower()), None
49
- )
50
- group_cols = [f"{named_col}_{i}" for i in range(len(group))]
51
- meta_dict["db_cols"].extend(group_cols)
52
- return meta_dict
53
-
54
- # def bech_by_db_name(df, db_list="db1, db2", key_metrics="p, r, f1, acc"):
55
-
56
-
57
- def str_2_list(input_str, sep=","):
58
- out_ls = []
59
- if len(input_str.strip()) == 0:
60
- return out_ls
61
- if sep not in input_str:
62
- out_ls.append(input_str.strip())
63
- return out_ls
64
- else:
65
- out_ls = [item.strip() for item in input_str.split(sep) if item.strip()]
66
- return out_ls
67
-
68
-
69
- def filter_bech_df_by_db_and_metrics(df, db_list="", key_metrics=""):
70
- meta_cols_dict = cols_to_col_groups(df)
71
- op_df = df.copy()
72
- op_df.columns = (
73
- meta_cols_dict["common_cols"].copy() + meta_cols_dict["db_cols"].copy()
74
- )
75
- filterd_cols = []
76
- filterd_cols.extend(meta_cols_dict["common_cols"])
77
-
78
- selected_db_list = str_2_list(db_list)
79
- db_filted_cols = []
80
- if len(selected_db_list) > 0:
81
- for db_name in db_list.split(","):
82
- db_name = db_name.strip()
83
- for col_name in meta_cols_dict["db_cols"]:
84
- if db_name.lower() in col_name.lower():
85
- db_filted_cols.append(col_name)
86
- else:
87
- db_filted_cols = meta_cols_dict["db_cols"]
88
-
89
- filterd_cols.extend(db_filted_cols)
90
- df_filtered = op_df[filterd_cols].copy()
91
- df_filtered
92
-
93
- selected_metrics_ls = str_2_list(key_metrics)
94
- if len(selected_metrics_ls) > 0:
95
- # get the second row as metrics row (header)
96
- metrics_row = df_filtered.iloc[0].copy()
97
- # only get the values in columns in (db_filterd_cols)
98
- metrics_values = metrics_row[db_filted_cols].values
99
- keep_metrics_cols = []
100
- # create a zip of db_filted_cols and metrics_values (in that metrics_row)
101
- metrics_list = list(zip(metrics_values, db_filted_cols))
102
- selected_metrics_ls = [metric.strip().lower() for metric in selected_metrics_ls]
103
- for metric, col_name in metrics_list:
104
- if metric.lower() in selected_metrics_ls:
105
- keep_metrics_cols.append(col_name)
106
-
107
- else:
108
- pprint("No metrics selected, keeping all db columns")
109
- keep_metrics_cols = db_filted_cols
110
-
111
- final_filterd_cols = meta_cols_dict["common_cols"].copy() + keep_metrics_cols
112
- df_final = df_filtered[final_filterd_cols].copy()
113
- return df_final
114
-
115
-
116
- def parse_args():
117
- parser = ArgumentParser(
118
- description="desc text")
119
- parser.add_argument('-csv', '--csv', type=str, help='CSV file path', default=r"E:\Dev\__halib\test\bench.csv")
120
- return parser.parse_args()
121
-
122
-
123
- def main():
124
- args = parse_args()
125
- csv_file = args.csv
126
- df = pd.read_csv(csv_file, sep=";", encoding="utf-8")
127
- filtered_df = filter_bech_df_by_db_and_metrics(df, "bowfire", "acc")
128
- print(filtered_df)
129
-
130
- if __name__ == "__main__":
131
- main()
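For readers who relied on the removed `benchquery` helper, roughly the same CSV filtering can be expressed with the new `perftb` module; the sketch below mirrors the module's own `test_load_perftb` example and assumes `test/bench.csv` uses the condensed two-row (dataset/metric) header format:

    from halib.research.perftb import PerfTB

    def exclude(col):
        # Non-metric bookkeeping columns present in the benchmark sheet (per the module's test).
        return col in ["Year", "data split", "test procedure", "code?"]

    tb = PerfTB.from_csv("test/bench.csv", sep=";", col_exclude_fn=exclude)
    # Keep one dataset (by index or name) and one metric; names must exist in the loaded table.
    filtered = tb.filter(dataset_list=[0], metrics_list=["acc"])
    filtered.display()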
@@ -1,14 +0,0 @@
1
- from halib import *
2
-
3
-
4
- @console_log
5
- def this_function():
6
- pprint(np.random.rand(3, 3))
7
- print("Hello, World!")
8
- inspect(np.random.rand(3, 3))
9
-
10
-
11
- this_function()
12
-
13
- # with ConsoleLog('custom msg'):
14
- # pprint(np.random.rand(3, 3))
@@ -1,60 +0,0 @@
1
- from argparse import ArgumentParser
2
-
3
- # from line_profiler import LineProfiler
4
- # profile = LineProfiler()
5
- from tqdm import tqdm
6
- from rich.console import Console
7
- from rich import print as rprint
8
- from rich import inspect
9
- from rich.pretty import pprint
10
- from tqdm import tqdm
11
- from loguru import logger
12
-
13
-
14
- def parse_args():
15
- parser = ArgumentParser(description="desc text")
16
- parser.add_argument(
17
- "-arg1", "--argument1", type=str, help="arg1 description", default="some_string"
18
- )
19
- parser.add_argument(
20
- "-arg2", "--argument2", type=int, help="arg2 description", default=99
21
- )
22
- return parser.parse_args()
23
-
24
-
25
- # @profile
26
- def main():
27
- args = parse_args()
28
- arg1 = args.argument1
29
- arg2 = args.argument2
30
-
31
- from halib.filetype.csvfile import DFCreator
32
-
33
- dfCreator = DFCreator()
34
- dfCreator.create_table("table1", ["col1", "col2"])
35
- dfCreator.create_table("table2", ["col3", "col4", "col5"])
36
-
37
- limit = 5
38
- mil_rows = [["a", "b"] for i in range(limit)]
39
-
40
- dfCreator.insert_rows("table1", [["a", "b"], ["d", "e"]])
41
- dfCreator.insert_rows("table1", mil_rows)
42
-
43
- dfCreator.insert_rows("table2", ["c", "d", "e"])
44
-
45
- for i in tqdm(range(limit)):
46
- dfCreator.insert_rows("table1", ["w", "z"])
47
-
48
- # dfCreator.display_all_table_schema()
49
- pprint(dfCreator.row_pool_dict)
50
- dfCreator.display_all_table()
51
- dfCreator.insert_rows("table1", ["k", "k"])
52
- pprint(dfCreator.row_pool_dict)
53
- dfCreator.display_all_table()
54
- pprint(dfCreator.row_pool_dict)
55
-
56
- dfCreator.write_all_table(".")
57
-
58
-
59
- if __name__ == "__main__":
60
- main()