halib 0.1.66__tar.gz → 0.1.70__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {halib-0.1.66 → halib-0.1.70}/PKG-INFO +7 -2
  2. {halib-0.1.66 → halib-0.1.70}/README.md +6 -1
  3. halib-0.1.70/halib/research/base_config.py +100 -0
  4. halib-0.1.70/halib/research/base_exp.py +99 -0
  5. {halib-0.1.66 → halib-0.1.70}/halib/research/metrics.py +13 -0
  6. {halib-0.1.66 → halib-0.1.70}/halib/research/perfcalc.py +144 -103
  7. {halib-0.1.66 → halib-0.1.70}/halib.egg-info/PKG-INFO +7 -2
  8. {halib-0.1.66 → halib-0.1.70}/halib.egg-info/SOURCES.txt +2 -0
  9. {halib-0.1.66 → halib-0.1.70}/setup.py +1 -1
  10. {halib-0.1.66 → halib-0.1.70}/.gitignore +0 -0
  11. {halib-0.1.66 → halib-0.1.70}/GDriveFolder.txt +0 -0
  12. {halib-0.1.66 → halib-0.1.70}/LICENSE.txt +0 -0
  13. {halib-0.1.66 → halib-0.1.70}/MANIFEST.in +0 -0
  14. {halib-0.1.66 → halib-0.1.70}/guide_publish_pip.pdf +0 -0
  15. {halib-0.1.66 → halib-0.1.70}/halib/__init__.py +0 -0
  16. {halib-0.1.66 → halib-0.1.70}/halib/common.py +0 -0
  17. {halib-0.1.66 → halib-0.1.70}/halib/cuda.py +0 -0
  18. {halib-0.1.66 → halib-0.1.70}/halib/filetype/__init__.py +0 -0
  19. {halib-0.1.66 → halib-0.1.70}/halib/filetype/csvfile.py +0 -0
  20. {halib-0.1.66 → halib-0.1.70}/halib/filetype/jsonfile.py +0 -0
  21. {halib-0.1.66 → halib-0.1.70}/halib/filetype/textfile.py +0 -0
  22. {halib-0.1.66 → halib-0.1.70}/halib/filetype/videofile.py +0 -0
  23. {halib-0.1.66 → halib-0.1.70}/halib/filetype/yamlfile.py +0 -0
  24. {halib-0.1.66 → halib-0.1.70}/halib/online/__init__.py +0 -0
  25. {halib-0.1.66 → halib-0.1.70}/halib/online/gdrive.py +0 -0
  26. {halib-0.1.66 → halib-0.1.70}/halib/online/gdrive_mkdir.py +0 -0
  27. {halib-0.1.66 → halib-0.1.70}/halib/online/gdrive_test.py +0 -0
  28. {halib-0.1.66 → halib-0.1.70}/halib/online/projectmake.py +0 -0
  29. {halib-0.1.66 → halib-0.1.70}/halib/research/__init__.py +0 -0
  30. {halib-0.1.66 → halib-0.1.70}/halib/research/dataset.py +0 -0
  31. {halib-0.1.66 → halib-0.1.70}/halib/research/perftb.py +0 -0
  32. {halib-0.1.66 → halib-0.1.70}/halib/research/plot.py +0 -0
  33. {halib-0.1.66 → halib-0.1.70}/halib/research/torchloader.py +0 -0
  34. {halib-0.1.66 → halib-0.1.70}/halib/research/wandb_op.py +0 -0
  35. {halib-0.1.66 → halib-0.1.70}/halib/rich_color.py +0 -0
  36. {halib-0.1.66 → halib-0.1.70}/halib/system/__init__.py +0 -0
  37. {halib-0.1.66 → halib-0.1.70}/halib/system/cmd.py +0 -0
  38. {halib-0.1.66 → halib-0.1.70}/halib/system/filesys.py +0 -0
  39. {halib-0.1.66 → halib-0.1.70}/halib/utils/__init__.py +0 -0
  40. {halib-0.1.66 → halib-0.1.70}/halib/utils/dataclass_util.py +0 -0
  41. {halib-0.1.66 → halib-0.1.70}/halib/utils/dict_op.py +0 -0
  42. {halib-0.1.66 → halib-0.1.70}/halib/utils/gpu_mon.py +0 -0
  43. {halib-0.1.66 → halib-0.1.70}/halib/utils/listop.py +0 -0
  44. {halib-0.1.66 → halib-0.1.70}/halib/utils/tele_noti.py +0 -0
  45. {halib-0.1.66 → halib-0.1.70}/halib/utils/video.py +0 -0
  46. {halib-0.1.66 → halib-0.1.70}/halib.egg-info/dependency_links.txt +0 -0
  47. {halib-0.1.66 → halib-0.1.70}/halib.egg-info/requires.txt +0 -0
  48. {halib-0.1.66 → halib-0.1.70}/halib.egg-info/top_level.txt +0 -0
  49. {halib-0.1.66 → halib-0.1.70}/setup.cfg +0 -0
{halib-0.1.66 → halib-0.1.70}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: halib
- Version: 0.1.66
+ Version: 0.1.70
  Summary: Small library for common tasks
  Author: Hoang Van Ha
  Author-email: hoangvanhauit@gmail.com
@@ -52,7 +52,12 @@ Dynamic: summary

  Helper package for coding and automation

- **Version 0.1.66**
+ **Version 0.1.70**
+
+ + `research/base_exp`: add base experiment class to handle common experiment tasks, including performance calculation and saving results.
+
+
+ **Version 0.1.67**

  + now use `uv` for venv management
  + `research/perfcalc`: support both torchmetrics and custom metrics for performance calculation
{halib-0.1.66 → halib-0.1.70}/README.md
@@ -1,6 +1,11 @@
  Helper package for coding and automation

- **Version 0.1.66**
+ **Version 0.1.70**
+
+ + `research/base_exp`: add base experiment class to handle common experiment tasks, including performance calculation and saving results.
+
+
+ **Version 0.1.67**

  + now use `uv` for venv management
  + `research/perfcalc`: support both torchmetrics and custom metrics for performance calculation
halib-0.1.70/halib/research/base_config.py
@@ -0,0 +1,100 @@
+ import os
+ from rich.pretty import pprint
+ from abc import ABC, abstractmethod
+ from dataclass_wizard import YAMLWizard
+
+
+ class NamedConfig(ABC):
+     """
+     Base class for named configurations.
+     All configurations should have a name.
+     """
+
+     @abstractmethod
+     def get_name(self):
+         """
+         Get the name of the configuration.
+         This method should be implemented in subclasses.
+         """
+         pass
+
+
+ class ExpBaseConfig(ABC, YAMLWizard):
+     """
+     Base class for configuration objects.
+     What a cfg class must have:
+     1 - a dataset cfg
+     2 - a metric cfg
+     3 - a method cfg
+     """
+
+     # Save to yaml fil
+     def save_to_outdir(
+         self, filename: str = "config.yaml", outdir=None, override: bool = False
+     ) -> None:
+         """
+         Save the configuration to the output directory.
+         """
+         if outdir is not None:
+             output_dir = outdir
+         else:
+             output_dir = self.get_outdir()
+         os.makedirs(output_dir, exist_ok=True)
+         assert (output_dir is not None) and (
+             os.path.isdir(output_dir)
+         ), f"Output directory '{output_dir}' does not exist or is not a directory."
+         file_path = os.path.join(output_dir, filename)
+         if os.path.exists(file_path) and not override:
+             pprint(
+                 f"File '{file_path}' already exists. Use 'override=True' to overwrite."
+             )
+         else:
+             # method of YAMLWizard to_yaml_file
+             self.to_yaml_file(file_path)
+
+     @classmethod
+     @abstractmethod
+     # load from a custom YAML file
+     def from_custom_yaml_file(cls, yaml_file: str):
+         """Load a configuration from a custom YAML file."""
+         pass
+
+     @abstractmethod
+     def get_cfg_name(self):
+         """
+         Get the name of the configuration.
+         This method should be implemented in subclasses.
+         """
+         pass
+
+     @abstractmethod
+     def get_outdir(self):
+         """
+         Get the output directory for the configuration.
+         This method should be implemented in subclasses.
+         """
+         return None
+
+     @abstractmethod
+     def get_general_cfg(self):
+         """
+         Get the general configuration like output directory, log settings, SEED, etc.
+         This method should be implemented in subclasses.
+         """
+         pass
+
+     @abstractmethod
+     def get_dataset_cfg(self) -> NamedConfig:
+         """
+         Get the dataset configuration.
+         This method should be implemented in subclasses.
+         """
+         pass
+
+     @abstractmethod
+     def get_metric_cfg(self) -> NamedConfig:
+         """
+         Get the metric configuration.
+         This method should be implemented in subclasses.
+         """
+         pass
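
The new `halib/research/base_config.py` pins down what a config object must expose (name, output directory, general/dataset/metric sub-configs) and handles YAML round-tripping via `dataclass_wizard.YAMLWizard`. Below is a minimal, illustrative sketch of a concrete config; `MyExpConfig`, `DatasetCfg`, `MetricCfg` and their fields are hypothetical names, not part of halib, and the sketch assumes `YAMLWizard.from_yaml_file` can serve as the custom loader.

```python
from dataclasses import dataclass, field

from halib.research.base_config import ExpBaseConfig, NamedConfig


@dataclass
class DatasetCfg(NamedConfig):
    name: str = "cifar10"

    def get_name(self):
        return self.name


@dataclass
class MetricCfg(NamedConfig):
    name: str = "accuracy"

    def get_name(self):
        return self.name


@dataclass
class MyExpConfig(ExpBaseConfig):
    cfg_name: str = "baseline"
    outdir: str = "./out/baseline"
    seed: int = 42
    dataset: DatasetCfg = field(default_factory=DatasetCfg)
    metric: MetricCfg = field(default_factory=MetricCfg)

    @classmethod
    def from_custom_yaml_file(cls, yaml_file: str):
        # reuse YAMLWizard's from_yaml_file as the "custom" loader in this sketch
        return cls.from_yaml_file(yaml_file)

    def get_cfg_name(self):
        return self.cfg_name

    def get_outdir(self):
        return self.outdir

    def get_general_cfg(self):
        return {"seed": self.seed, "outdir": self.outdir}

    def get_dataset_cfg(self) -> NamedConfig:
        return self.dataset

    def get_metric_cfg(self) -> NamedConfig:
        return self.metric
```

With such a subclass, `save_to_outdir()` writes `config.yaml` under `get_outdir()` and refuses to overwrite an existing file unless `override=True` is passed.
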
halib-0.1.70/halib/research/base_exp.py
@@ -0,0 +1,99 @@
+ from abc import ABC, abstractmethod
+
+ from base_config import ExpBaseConfig
+ from perfcalc import PerfCalc
+ from metrics import MetricsBackend
+
+ # ! SEE https://github.com/hahv/base_exp for sample usage
+ class BaseExperiment(PerfCalc, ABC):
+     """
+     Base class for experiments.
+     Orchestrates the experiment pipeline using a pluggable metrics backend.
+     """
+
+     def __init__(self, config: ExpBaseConfig):
+         self.config = config
+         self.metric_backend = None
+
+     # -----------------------
+     # PerfCalc Required Methods
+     # -----------------------
+     def get_dataset_name(self):
+         return self.config.get_dataset_cfg().get_name()
+
+     def get_experiment_name(self):
+         return self.config.get_cfg_name()
+
+     def get_metric_backend(self):
+         if not self.metric_backend:
+             self.metric_backend = self.prepare_metrics(self.config.get_metric_cfg())
+         return self.metric_backend
+
+     # -----------------------
+     # Abstract Experiment Steps
+     # -----------------------
+     @abstractmethod
+     def init_general(self, general_cfg):
+         """Setup general settings like SEED, logging, env variables."""
+         pass
+
+     @abstractmethod
+     def prepare_dataset(self, dataset_cfg):
+         """Load/prepare dataset."""
+         pass
+
+     @abstractmethod
+     def prepare_metrics(self, metric_cfg) -> MetricsBackend:
+         """
+         Prepare the metrics for the experiment.
+         This method should be implemented in subclasses.
+         """
+         pass
+
+     @abstractmethod
+     def exec_exp(self, *args, **kwargs):
+         """Run experiment process, e.g.: training/evaluation loop.
+         Return: raw_metrics_data, and extra_data as input for calc_and_save_exp_perfs
+         """
+         pass
+
+     def eval_exp(self):
+         """Optional: re-run evaluation from saved results."""
+         pass
+
+     # -----------------------
+     # Main Experiment Runner
+     # -----------------------
+     def run_exp(self, do_calc_metrics=True, *args, **kwargs):
+         """
+         Run the whole experiment pipeline.
+         Params:
+         + 'outfile' to save csv file results,
+         + 'outdir' to set output directory for experiment results.
+         + 'return_df' to return a DataFrame of results instead of a dictionary.
+
+         Full pipeline:
+         1. Init
+         2. Dataset
+         3. Metrics Preparation
+         4. Save Config
+         5. Execute
+         6. Calculate & Save Metrics
+         """
+         self.init_general(self.config.get_general_cfg())
+         self.prepare_dataset(self.config.get_dataset_cfg())
+
+         # Save config before running
+         self.config.save_to_outdir()
+
+         # Execute experiment
+         results = self.exec_exp(*args, **kwargs)
+         if do_calc_metrics:
+             metrics_data, extra_data = results
+             # Calculate & Save metrics
+             perf_results = self.calc_and_save_exp_perfs(
+                 raw_metrics_data=metrics_data, extra_data=extra_data, *args, **kwargs
+             )
+             return perf_results
+         else:
+             return results
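
The new `BaseExperiment` wires a config and a metrics backend into one `run_exp()` pipeline (init → dataset → save config → execute → calculate/save metrics). Here is an illustrative sketch of a subclass, reusing the hypothetical `MyExpConfig` above and a hypothetical `MyMetricsBackend` (sketched after the `metrics.py` diff below); the training step and metric values are placeholders, and the import path assumes the new module resolves as `halib.research.base_exp`.

```python
from halib.research.base_exp import BaseExperiment


class MyExperiment(BaseExperiment):
    def init_general(self, general_cfg):
        # e.g. fix random seeds, set up logging; placeholder here
        print(f"general cfg: {general_cfg}")

    def prepare_dataset(self, dataset_cfg):
        print(f"loading dataset: {dataset_cfg.get_name()}")

    def prepare_metrics(self, metric_cfg):
        # MyMetricsBackend is the hypothetical backend sketched further below
        return MyMetricsBackend(metric_names=[metric_cfg.get_name()])

    def exec_exp(self, *args, **kwargs):
        # run training/evaluation; return raw metric data plus extra columns
        raw_metrics_data = {"accuracy": {"preds": [0, 1, 1, 0], "target": [0, 1, 0, 0]}}
        extra_data = {"epoch": 10}
        return raw_metrics_data, extra_data


if __name__ == "__main__":
    cfg = MyExpConfig()
    exp = MyExperiment(cfg)
    # kwargs such as outdir flow into both exec_exp and calc_and_save_exp_perfs
    rows, csv_path = exp.run_exp(outdir=cfg.get_outdir())
```

Because `exec_exp` returns `(raw_metrics_data, extra_data)`, `run_exp` can forward them straight into `calc_and_save_exp_perfs` when `do_calc_metrics=True`.
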
{halib-0.1.66 → halib-0.1.70}/halib/research/metrics.py
@@ -44,6 +44,15 @@ class MetricsBackend(ABC):
      ) -> Dict[str, Any]:
          pass

+     def prepare_metrics_backend_data(
+         self, raw_metric_data, *args, **kwargs
+     ):
+         """
+         Prepare the data for the metrics backend.
+         This method can be overridden by subclasses to customize data preparation.
+         """
+         return raw_metric_data
+
      def calc_metrics(
          self, metrics_data_dict: Dict[str, Any], *args, **kwargs
      ) -> Dict[str, Any]:
@@ -55,6 +64,10 @@ class MetricsBackend(ABC):
          for metric in self.metric_names:
              if metric not in metrics_data_dict:
                  raise ValueError(f"Metric '{metric}' not found in provided data.")
+         # Prepare the data for the backend
+         metrics_data_dict = self.prepare_metrics_backend_data(
+             metrics_data_dict, *args, **kwargs
+         )
          # Call the abstract method to compute metrics
          return self.compute_metrics(self.metric_info, metrics_data_dict, *args, **kwargs)

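
`prepare_metrics_backend_data` is the new hook in `MetricsBackend.calc_metrics`: it runs after the metric-name check and before `compute_metrics`, so a subclass can reshape the raw data once, in one place. A hedged sketch of the hypothetical `MyMetricsBackend` used in the experiment sketch above; it assumes the backend only needs `metric_names`/`metric_info` attributes and that `compute_metrics(metric_info, metrics_data_dict, ...)` is the abstract method, since the real base-class constructor is not shown in this diff.

```python
from typing import Any, Dict

import numpy as np

from halib.research.metrics import MetricsBackend


class MyMetricsBackend(MetricsBackend):
    def __init__(self, metric_names):
        # assumption: set the attributes calc_metrics/compute_metrics rely on directly;
        # the real base-class constructor may differ
        self.metric_names = metric_names
        self.metric_info = {name: None for name in metric_names}

    def prepare_metrics_backend_data(self, raw_metric_data, *args, **kwargs):
        # new hook: normalize raw python lists to numpy arrays before compute_metrics
        return {
            name: {k: np.asarray(v) for k, v in data.items()}
            for name, data in raw_metric_data.items()
        }

    def compute_metrics(
        self, metric_info, metrics_data_dict: Dict[str, Any], *args, **kwargs
    ) -> Dict[str, Any]:
        # toy accuracy over the prepared arrays
        preds = metrics_data_dict["accuracy"]["preds"]
        target = metrics_data_dict["accuracy"]["target"]
        return {"accuracy": float((preds == target).mean())}
```
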
{halib-0.1.66 → halib-0.1.70}/halib/research/perfcalc.py
@@ -1,35 +1,27 @@
  import os
  import glob
- import inspect
+ from typing import Optional, Tuple
  import pandas as pd

- from typing import Dict
- from functools import wraps
  from rich.pretty import pprint

  from abc import ABC, abstractmethod
  from collections import OrderedDict

  from ..filetype import csvfile
+ from ..system import filesys as fs
  from ..common import now_str
  from ..research.perftb import PerfTB
  from ..research.metrics import *

- # # try to import torch, and torchmetrics
- # try:
- # import torch
- # import torchmetrics
- # from torchmetrics import Metric
- # except ImportError:
- # raise ImportError("Please install torch and torchmetrics to use this module.")
-

  REQUIRED_COLS = ["experiment", "dataset"]
  CSV_FILE_POSTFIX = "__perf"
+ METRIC_PREFIX = "metric_"

- class PerfCalc(ABC): # Abstract base class for performance calculation
+ class PerfCalc(ABC):  # Abstract base class for performance calculation
      @abstractmethod
-     def get_experiment_name(self):
+     def get_experiment_name(self) -> str:
          """
          Return the name of the experiment.
          This function should be overridden by the subclass if needed.
@@ -37,7 +29,7 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
          pass

      @abstractmethod
-     def get_dataset_name(self):
+     def get_dataset_name(self) -> str:
          """
          Return the name of the dataset.
          This function should be overridden by the subclass if needed.
@@ -52,96 +44,128 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
          """
          pass

-     # ! can be override, but ONLY if torchmetrics are used
-     # Prepare the exp data for torch metrics.
-     @abstractmethod
-     def prepare_metrics_data_dict(self, metric_names, *args, **kwargs):
-         """
-         Prepare the data for metrics.
-         This function should be overridden by the subclass if needed.
-         Must return a dictionary with keys as metric names and values as the data to be used for those metrics.
-         NOTE: that the data (for each metric) must be in the format expected by the torchmetrics instance (for that metric). E.g: {"accuracy": {"preds": [...], "target": [...]}, ...} since torchmetrics expects the data in a specific format.
-         """
-         pass
+     def valid_proc_extra_data(
+         self, proc_extra_data
+     ):
+         # make sure that all items in proc_extra_data are dictionaries, with same keys
+         if proc_extra_data is None or len(proc_extra_data) == 0:
+             return
+         if not all(isinstance(item, dict) for item in proc_extra_data):
+             raise TypeError("All items in proc_extra_data must be dictionaries")

-     def calc_exp_outdict_custom_fields(self, outdict, *args, **kwargs):
-         """Can be overridden by the subclass to add custom fields to the output dictionary.
-         ! must return the modified outdict, and a ordered list of custom fields to be added to the output dictionary.
-         """
-         return outdict, []
-
-     def __valid_calc_custom_fields(self, fun_results):
-         if not isinstance(fun_results, tuple) or len(fun_results) != 2:
-             raise ValueError("Function must return a tuple (outdict, custom_fields)")
-         outdict, custom_fields = fun_results
-         if not isinstance(outdict, dict):
-             raise TypeError("Output must be a dictionary")
-         if not isinstance(custom_fields, list):
-             raise TypeError("Custom fields must be a list")
-         for field in custom_fields:
-             if not isinstance(field, str):
-                 raise TypeError(f"Custom field '{field}' is not a string")
-         return outdict, custom_fields
+         if not all(item.keys() == proc_extra_data[0].keys() for item in proc_extra_data):
+             raise ValueError("All dictionaries in proc_extra_data must have the same keys")
+
+     def valid_proc_metric_raw_data(
+         self, metric_names, proc_metric_raw_data
+     ):
+         # make sure that all items in proc_metric_raw_data are dictionaries, with same keys as metric_names
+         assert isinstance(proc_metric_raw_data, list) and len(proc_metric_raw_data) > 0, \
+             "raw_data_for_metrics must be a non-empty list of dictionaries"
+
+         # make sure that all items in proc_metric_raw_data are dictionaries with keys as metric_names
+         if not all(isinstance(item, dict) for item in proc_metric_raw_data):
+             raise TypeError("All items in raw_data_for_metrics must be dictionaries")
+         if not all( set(item.keys()) == set(metric_names) for item in proc_metric_raw_data):
+             raise ValueError(
+                 "All dictionaries in raw_data_for_metrics must have the same keys as metric_names"
+             )

      # ! only need to override this method if torchmetrics are not used
-     def calc_exp_perf_metrics(self, metric_names, *args, **kwargs):
+     def calc_exp_perf_metrics(
+         self, metric_names, raw_metrics_data, extra_data=None, *args, **kwargs
+     ):
+         assert isinstance(raw_metrics_data, dict) or isinstance(raw_metrics_data, list), \
+             "raw_data_for_metrics must be a dictionary or a list"
+
+         if extra_data is not None:
+             assert isinstance(extra_data, type(raw_metrics_data)), \
+                 "extra_data must be of the same type as raw_data_for_metrics (dict or list)"
+         # prepare raw_metric data for processing
+         proc_metric_raw_data_ls = raw_metrics_data if isinstance(raw_metrics_data, list) else [raw_metrics_data.copy()]
+         self.valid_proc_metric_raw_data(metric_names, proc_metric_raw_data_ls)
+         # prepare extra data for processing
+         proc_extra_data_ls = []
+         if extra_data is not None:
+             proc_extra_data_ls = extra_data if isinstance(extra_data, list) else [extra_data.copy()]
+             assert len(proc_extra_data_ls) == len(proc_metric_raw_data_ls), \
+                 "extra_data must have the same length as raw_data_for_metrics if it is a list"
+             # validate the extra_data
+             self.valid_proc_extra_data(proc_extra_data_ls)
+
+         # calculate the metrics output results
          metrics_backend = self.get_metric_backend()
-         out_dict = {"dataset": self.get_dataset_name(), "experiment": self.get_experiment_name()}
-         out_dict, custom_fields = self.__valid_calc_custom_fields(self.calc_exp_outdict_custom_fields(
-             outdict=out_dict, *args, **kwargs
-         ))
-         metrics_data_dict = self.prepare_metrics_data_dict(
-             metric_names, *args, **kwargs
-         )
-         metric_results = metrics_backend.calc_metrics(
-             metrics_data_dict=metrics_data_dict, *args, **kwargs
-         )
-         metric_results_prefix = {
-             f"metric_{k}": v for k, v in metric_results.items()
-         }
-         out_dict.update(metric_results_prefix)
-         ordered_cols = REQUIRED_COLS + custom_fields + list(metric_results_prefix.keys())
-         out_dict = OrderedDict(
-             (col, out_dict[col]) for col in ordered_cols if col in out_dict
-         )
-         return out_dict
+         proc_outdict_list = []
+         for idx, raw_metrics_data in enumerate(proc_metric_raw_data_ls):
+             out_dict = {
+                 "dataset": self.get_dataset_name(),
+                 "experiment": self.get_experiment_name(),
+             }
+             custom_fields = []
+             if len(proc_extra_data_ls)> 0:
+                 # add extra data to the output dictionary
+                 extra_data_item = proc_extra_data_ls[idx]
+                 out_dict.update(extra_data_item)
+                 custom_fields = list(extra_data_item.keys())
+             metric_results = metrics_backend.calc_metrics(
+                 metrics_data_dict=raw_metrics_data, *args, **kwargs
+             )
+             metric_results_prefix = {f"metric_{k}": v for k, v in metric_results.items()}
+             out_dict.update(metric_results_prefix)
+             ordered_cols = (
+                 REQUIRED_COLS + custom_fields + list(metric_results_prefix.keys())
+             )
+             out_dict = OrderedDict(
+                 (col, out_dict[col]) for col in ordered_cols if col in out_dict
+             )
+             proc_outdict_list.append(out_dict)
+
+         return proc_outdict_list

      #! custom kwargs:
      #! outfile - if provided, will save the output to a CSV file with the given path
      #! outdir - if provided, will save the output to a CSV file in the given directory with a generated filename
      #! return_df - if True, will return a DataFrame instead of a dictionary
-
-     def calc_save_exp_perfs(self, *args, **kwargs):
+     def calc_and_save_exp_perfs(
+         self,
+         raw_metrics_data: Union[List[dict], dict],
+         extra_data: Optional[Union[List[dict], dict]] = None,
+         *args,
+         **kwargs,
+     ) -> Tuple[Union[List[OrderedDict], pd.DataFrame], Optional[str]]:
          """
          Calculate the metrics.
          This function should be overridden by the subclass if needed.
          Must return a dictionary with keys as metric names and values as the calculated metrics.
          """
          metric_names = self.get_metric_backend().metric_names
-         out_dict = self.calc_exp_perf_metrics(metric_names=metric_names, *args, **kwargs)
+         out_dict_list = self.calc_exp_perf_metrics(
+             metric_names=metric_names, raw_metrics_data=raw_metrics_data,
+             extra_data=extra_data,
+             *args, **kwargs
+         )
          csv_outfile = kwargs.get("outfile", None)
          if csv_outfile is not None:
              filePathNoExt, _ = os.path.splitext(csv_outfile)
              # pprint(f"CSV Outfile Path (No Ext): {filePathNoExt}")
-             csv_outfile = f'{filePathNoExt}{CSV_FILE_POSTFIX}.csv'
+             csv_outfile = f"{filePathNoExt}{CSV_FILE_POSTFIX}.csv"
          elif "outdir" in kwargs:
              csvoutdir = kwargs["outdir"]
              csvfilename = f"{now_str()}_{self.get_dataset_name()}_{self.get_experiment_name()}_{CSV_FILE_POSTFIX}.csv"
              csv_outfile = os.path.join(csvoutdir, csvfilename)

          # convert out_dict to a DataFrame
-         df = pd.DataFrame([out_dict])
+         df = pd.DataFrame(out_dict_list)
          # get the orders of the columns as the orders or the keys in out_dict
-         ordered_cols = list(out_dict.keys())
+         ordered_cols = list(out_dict_list[0].keys())
          df = df[ordered_cols] # reorder columns
-
          if csv_outfile:
              df.to_csv(csv_outfile, index=False, sep=";", encoding="utf-8")
          return_df = kwargs.get("return_df", False)
-         if return_df: # return DataFrame instead of dict if requested
+         if return_df:  # return DataFrame instead of dict if requested
              return df, csv_outfile
          else:
-             return out_dict, csv_outfile
+             return out_dict_list, csv_outfile

      @staticmethod
      def default_exp_csv_filter_fn(exp_file_name: str) -> bool:
@@ -153,29 +177,37 @@ class PerfCalc(ABC): # Abstract base class for performance calculation

      @classmethod
      def gen_perf_report_for_multip_exps(
-         cls, indir: str, exp_csv_filter_fn=default_exp_csv_filter_fn, csv_sep=";"
+         cls, indir: str, exp_csv_filter_fn=default_exp_csv_filter_fn, include_file_name=False, csv_sep=";"
      ) -> PerfTB:
          """
          Generate a performance report by scanning experiment subdirectories.
          Must return a dictionary with keys as metric names and values as performance tables.
          """
-         def get_df_for_all_exp_perf(csv_perf_files, csv_sep=';'):
+         def get_df_for_all_exp_perf(csv_perf_files, csv_sep=";"):
              """
              Create a single DataFrame from all CSV files.
              Assumes all CSV files MAY have different metrics
              """
              cols = []
+             FILE_NAME_COL = "file_name" if include_file_name else None
+
              for csv_file in csv_perf_files:
                  temp_df = pd.read_csv(csv_file, sep=csv_sep)
+                 if FILE_NAME_COL:
+                     temp_df[FILE_NAME_COL] = fs.get_file_name(csv_file, split_file_ext=False)
+                 # csvfile.fn_display_df(temp_df)
                  temp_df_cols = temp_df.columns.tolist()
                  for col in temp_df_cols:
                      if col not in cols:
                          cols.append(col)
+
              df = pd.DataFrame(columns=cols)
              for csv_file in csv_perf_files:
                  temp_df = pd.read_csv(csv_file, sep=csv_sep)
+                 if FILE_NAME_COL:
+                     temp_df[FILE_NAME_COL] = fs.get_file_name(csv_file, split_file_ext=False)
                  # Drop all-NA columns to avoid dtype inconsistency
-                 temp_df = temp_df.dropna(axis=1, how='all')
+                 temp_df = temp_df.dropna(axis=1, how="all")
                  # ensure all columns are present in the final DataFrame
                  for col in cols:
                      if col not in temp_df.columns:
@@ -183,27 +215,36 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
                  df = pd.concat([df, temp_df], ignore_index=True)
              # assert that REQUIRED_COLS are present in the DataFrame
              # pprint(df.columns.tolist())
-             for col in REQUIRED_COLS:
+             sticky_cols = REQUIRED_COLS + ([FILE_NAME_COL] if include_file_name else []) # columns that must always be present
+             for col in sticky_cols:
                  if col not in df.columns:
-                     raise ValueError(f"Required column '{col}' is missing from the DataFrame. REQUIRED_COLS = {REQUIRED_COLS}")
-             metric_cols = [col for col in df.columns if col.startswith('metric_')]
-             assert len(metric_cols) > 0, "No metric columns found in the DataFrame. Ensure that the CSV files contain metric columns starting with 'metric_'."
-             final_cols = REQUIRED_COLS + metric_cols
+                     raise ValueError(
+                         f"Required column '{col}' is missing from the DataFrame. REQUIRED_COLS = {sticky_cols}"
+                     )
+             metric_cols = [col for col in df.columns if col.startswith(METRIC_PREFIX)]
+             assert (
+                 len(metric_cols) > 0
+             ), "No metric columns found in the DataFrame. Ensure that the CSV files contain metric columns starting with 'metric_'."
+             final_cols = sticky_cols + metric_cols
              df = df[final_cols]
              # !hahv debug
-             pprint('------ Final DataFrame Columns ------')
+             pprint("------ Final DataFrame Columns ------")
              csvfile.fn_display_df(df)
              # ! validate all rows in df before returning
              # make sure all rows will have at least values for REQUIRED_COLS and at least one metric column
             for index, row in df.iterrows():
-                 if not all(col in row and pd.notna(row[col]) for col in REQUIRED_COLS):
-                     raise ValueError(f"Row {index} is missing required columns or has NaN values in required columns: {row}")
+                 if not all(col in row and pd.notna(row[col]) for col in sticky_cols):
+                     raise ValueError(
+                         f"Row {index} is missing required columns or has NaN values in required columns: {row}"
+                     )
                  if not any(pd.notna(row[col]) for col in metric_cols):
                      raise ValueError(f"Row {index} has no metric values: {row}")
              # make sure these is no (experiment, dataset) pair that is duplicated
-             duplicates = df.duplicated(subset=['experiment', 'dataset'], keep=False)
+             duplicates = df.duplicated(subset=sticky_cols, keep=False)
              if duplicates.any():
-                 raise ValueError("Duplicate (experiment, dataset) pairs found in the DataFrame. Please ensure that each experiment-dataset combination is unique.")
+                 raise ValueError(
+                     "Duplicate (experiment, dataset) pairs found in the DataFrame. Please ensure that each experiment-dataset combination is unique."
+                 )
              return df

          def mk_perftb_report(df):
@@ -213,9 +254,9 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
              """
              perftb = PerfTB()
              # find all "dataset" values (unique)
-             dataset_names = list(df['dataset'].unique())
-             # find all columns that start with "metric_"
-             metric_cols = [col for col in df.columns if col.startswith('metric_')]
+             dataset_names = list(df["dataset"].unique())
+             # find all columns that start with METRIC_PREFIX
+             metric_cols = [col for col in df.columns if col.startswith(METRIC_PREFIX)]

              # Determine which metrics are associated with each dataset.
              # Since a dataset may appear in multiple rows and may not include all metrics in each, identify the row with the same dataset that contains the most non-NaN metric values. The set of metrics for that dataset is defined by the non-NaN metrics in that row.
@@ -225,7 +266,11 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
                  dataset_rows = df[df["dataset"] == dataset_name]
                  # Find the row with the most non-NaN metric values
                  max_non_nan_row = dataset_rows[metric_cols].count(axis=1).idxmax()
-                 metrics_for_dataset = dataset_rows.loc[max_non_nan_row, metric_cols].dropna().index.tolist()
+                 metrics_for_dataset = (
+                     dataset_rows.loc[max_non_nan_row, metric_cols]
+                     .dropna()
+                     .index.tolist()
+                 )
                  dataset_metrics[dataset_name] = metrics_for_dataset

              for dataset_name, metrics in dataset_metrics.items():
@@ -233,11 +278,11 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
                  perftb.add_dataset(dataset_name, metrics)

              for _, row in df.iterrows():
-                 dataset_name = row['dataset']
+                 dataset_name = row["dataset"]
                  ds_metrics = dataset_metrics.get(dataset_name)
                  if dataset_name in dataset_metrics:
                      # Add the metrics for this row to the performance table
-                     exp_name = row.get('experiment')
+                     exp_name = row.get("experiment")
                      exp_metric_values = {}
                      for metric in ds_metrics:
                          if metric in row and pd.notna(row[metric]):
@@ -245,7 +290,7 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
                      perftb.add_experiment(
                          experiment_name=exp_name,
                          dataset_name=dataset_name,
-                         metrics=exp_metric_values
+                         metrics=exp_metric_values,
                      )

              return perftb
@@ -260,9 +305,7 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
              if os.path.isdir(os.path.join(indir, d))
          ]
          if len(exp_dirs) == 0:
-             csv_perf_files = glob.glob(
-                 os.path.join(indir, f"*.csv")
-             )
+             csv_perf_files = glob.glob(os.path.join(indir, f"*.csv"))
              csv_perf_files = [
                  file_item
                  for file_item in csv_perf_files
@@ -273,13 +316,9 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
              # Collect all matching CSV files in those subdirs
              for exp_dir in exp_dirs:
                  # pprint(f"Searching in experiment directory: {exp_dir}")
-                 matched = glob.glob(
-                     os.path.join(exp_dir, f"*.csv")
-                 )
+                 matched = glob.glob(os.path.join(exp_dir, f"*.csv"))
                  matched = [
-                     file_item
-                     for file_item in matched
-                     if exp_csv_filter_fn(file_item)
+                     file_item for file_item in matched if exp_csv_filter_fn(file_item)
                  ]
                  csv_perf_files.extend(matched)

@@ -287,9 +326,11 @@ class PerfCalc(ABC): # Abstract base class for performance calculation
              len(csv_perf_files) > 0
          ), f"No CSV files matching pattern '{exp_csv_filter_fn}' found in the experiment directories."

-         assert len(csv_perf_files) > 0, f"No CSV files matching pattern '{exp_csv_filter_fn}' found in the experiment directories."
+         assert (
+             len(csv_perf_files) > 0
+         ), f"No CSV files matching pattern '{exp_csv_filter_fn}' found in the experiment directories."

          all_exp_perf_df = get_df_for_all_exp_perf(csv_perf_files, csv_sep=csv_sep)
-         csvfile.fn_display_df(all_exp_perf_df)
+         # csvfile.fn_display_df(all_exp_perf_df)
          perf_tb = mk_perftb_report(all_exp_perf_df)
          return perf_tb
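
`calc_and_save_exp_perfs` (renamed from `calc_save_exp_perfs`) now takes the raw metric data directly: a dict for a single run, or a list of dicts for several runs, plus optional `extra_data` of the same shape whose keys become extra columns between the required `experiment`/`dataset` columns and the `metric_*` columns. An illustrative sketch reusing the hypothetical `MyExperiment`/`MyExpConfig` from above; the paths are arbitrary, and it assumes the default CSV filter picks up the generated `*__perf.csv` files.

```python
import os

exp = MyExperiment(MyExpConfig())
os.makedirs("./out/baseline", exist_ok=True)

raw = {"accuracy": {"preds": [0, 1, 1], "target": [0, 1, 0]}}
extra = {"seed": 42}  # becomes a "seed" column between the required and metric_* columns

# a list of dicts (one per run) is also accepted for both arguments,
# producing one CSV row per run
rows, csv_path = exp.calc_and_save_exp_perfs(
    raw_metrics_data=raw,
    extra_data=extra,
    outdir="./out/baseline",  # writes a timestamped *__perf.csv here
)

# aggregate per-experiment *__perf.csv files into a single report table;
# the new include_file_name flag adds a "file_name" column, which also keeps
# repeated (experiment, dataset) rows from different files distinct
report = MyExperiment.gen_perf_report_for_multip_exps(
    indir="./out", include_file_name=True
)
```
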
{halib-0.1.66 → halib-0.1.70}/halib.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: halib
- Version: 0.1.66
+ Version: 0.1.70
  Summary: Small library for common tasks
  Author: Hoang Van Ha
  Author-email: hoangvanhauit@gmail.com
@@ -52,7 +52,12 @@ Dynamic: summary

  Helper package for coding and automation

- **Version 0.1.66**
+ **Version 0.1.70**
+
+ + `research/base_exp`: add base experiment class to handle common experiment tasks, including performance calculation and saving results.
+
+
+ **Version 0.1.67**

  + now use `uv` for venv management
  + `research/perfcalc`: support both torchmetrics and custom metrics for performance calculation
{halib-0.1.66 → halib-0.1.70}/halib.egg-info/SOURCES.txt
@@ -26,6 +26,8 @@ halib/online/gdrive_mkdir.py
  halib/online/gdrive_test.py
  halib/online/projectmake.py
  halib/research/__init__.py
+ halib/research/base_config.py
+ halib/research/base_exp.py
  halib/research/dataset.py
  halib/research/metrics.py
  halib/research/perfcalc.py
{halib-0.1.66 → halib-0.1.70}/setup.py
@@ -8,7 +8,7 @@ with open("requirements.txt") as f:

  setuptools.setup(
      name="halib",
-     version="0.1.66",
+     version="0.1.70",
      author="Hoang Van Ha",
      author_email="hoangvanhauit@gmail.com",
      description="Small library for common tasks",