halib 0.1.66__py3-none-any.whl → 0.1.70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- halib/research/base_config.py +100 -0
- halib/research/base_exp.py +99 -0
- halib/research/metrics.py +13 -0
- halib/research/perfcalc.py +144 -103
- {halib-0.1.66.dist-info → halib-0.1.70.dist-info}/METADATA +7 -2
- {halib-0.1.66.dist-info → halib-0.1.70.dist-info}/RECORD +9 -7
- {halib-0.1.66.dist-info → halib-0.1.70.dist-info}/WHEEL +0 -0
- {halib-0.1.66.dist-info → halib-0.1.70.dist-info}/licenses/LICENSE.txt +0 -0
- {halib-0.1.66.dist-info → halib-0.1.70.dist-info}/top_level.txt +0 -0
halib/research/base_config.py ADDED

@@ -0,0 +1,100 @@
+import os
+from rich.pretty import pprint
+from abc import ABC, abstractmethod
+from dataclass_wizard import YAMLWizard
+
+
+class NamedConfig(ABC):
+    """
+    Base class for named configurations.
+    All configurations should have a name.
+    """
+
+    @abstractmethod
+    def get_name(self):
+        """
+        Get the name of the configuration.
+        This method should be implemented in subclasses.
+        """
+        pass
+
+
+class ExpBaseConfig(ABC, YAMLWizard):
+    """
+    Base class for configuration objects.
+    What a cfg class must have:
+    1 - a dataset cfg
+    2 - a metric cfg
+    3 - a method cfg
+    """
+
+    # Save to yaml fil
+    def save_to_outdir(
+        self, filename: str = "config.yaml", outdir=None, override: bool = False
+    ) -> None:
+        """
+        Save the configuration to the output directory.
+        """
+        if outdir is not None:
+            output_dir = outdir
+        else:
+            output_dir = self.get_outdir()
+        os.makedirs(output_dir, exist_ok=True)
+        assert (output_dir is not None) and (
+            os.path.isdir(output_dir)
+        ), f"Output directory '{output_dir}' does not exist or is not a directory."
+        file_path = os.path.join(output_dir, filename)
+        if os.path.exists(file_path) and not override:
+            pprint(
+                f"File '{file_path}' already exists. Use 'override=True' to overwrite."
+            )
+        else:
+            # method of YAMLWizard to_yaml_file
+            self.to_yaml_file(file_path)
+
+    @classmethod
+    @abstractmethod
+    # load from a custom YAML file
+    def from_custom_yaml_file(cls, yaml_file: str):
+        """Load a configuration from a custom YAML file."""
+        pass
+
+    @abstractmethod
+    def get_cfg_name(self):
+        """
+        Get the name of the configuration.
+        This method should be implemented in subclasses.
+        """
+        pass
+
+    @abstractmethod
+    def get_outdir(self):
+        """
+        Get the output directory for the configuration.
+        This method should be implemented in subclasses.
+        """
+        return None
+
+    @abstractmethod
+    def get_general_cfg(self):
+        """
+        Get the general configuration like output directory, log settings, SEED, etc.
+        This method should be implemented in subclasses.
+        """
+        pass
+
+    @abstractmethod
+    def get_dataset_cfg(self) -> NamedConfig:
+        """
+        Get the dataset configuration.
+        This method should be implemented in subclasses.
+        """
+        pass
+
+    @abstractmethod
+    def get_metric_cfg(self) -> NamedConfig:
+        """
+        Get the metric configuration.
+        This method should be implemented in subclasses.
+        """
+        pass
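To see the contract in use, here is a minimal sketch (not part of the diff) of a concrete config built on these base classes. All names and values below (`SimpleNamedCfg`, `DemoExpConfig`, `cifar10`, the output paths) are hypothetical, and the sketch assumes `YAMLWizard` also supplies `from_yaml_file` as the counterpart of the `to_yaml_file` call above.

```python
from dataclasses import dataclass, field

from halib.research.base_config import NamedConfig, ExpBaseConfig


@dataclass
class SimpleNamedCfg(NamedConfig):
    name: str = "unnamed"

    def get_name(self):
        return self.name


@dataclass
class DemoExpConfig(ExpBaseConfig):
    cfg_name: str = "demo_exp"
    outdir: str = "./out/demo_exp"
    general: dict = field(default_factory=lambda: {"seed": 42})
    dataset: SimpleNamedCfg = field(default_factory=lambda: SimpleNamedCfg("cifar10"))
    metric: SimpleNamedCfg = field(default_factory=lambda: SimpleNamedCfg("accuracy"))

    @classmethod
    def from_custom_yaml_file(cls, yaml_file: str):
        # from_yaml_file comes with YAMLWizard; the "custom" loader is trivial here
        return cls.from_yaml_file(yaml_file)

    def get_cfg_name(self):
        return self.cfg_name

    def get_outdir(self):
        return self.outdir

    def get_general_cfg(self):
        return self.general

    def get_dataset_cfg(self) -> NamedConfig:
        return self.dataset

    def get_metric_cfg(self) -> NamedConfig:
        return self.metric


cfg = DemoExpConfig()
cfg.save_to_outdir()                      # writes ./out/demo_exp/config.yaml
print(cfg.get_dataset_cfg().get_name())   # "cifar10"
```

With this in place, `save_to_outdir()` writes `config.yaml` into `get_outdir()` and refuses to overwrite an existing file unless `override=True` is passed.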
halib/research/base_exp.py ADDED

@@ -0,0 +1,99 @@
+from abc import ABC, abstractmethod
+
+from base_config import ExpBaseConfig
+from perfcalc import PerfCalc
+from metrics import MetricsBackend
+
+# ! SEE https://github.com/hahv/base_exp for sample usage
+class BaseExperiment(PerfCalc, ABC):
+    """
+    Base class for experiments.
+    Orchestrates the experiment pipeline using a pluggable metrics backend.
+    """
+
+    def __init__(self, config: ExpBaseConfig):
+        self.config = config
+        self.metric_backend = None
+
+    # -----------------------
+    # PerfCalc Required Methods
+    # -----------------------
+    def get_dataset_name(self):
+        return self.config.get_dataset_cfg().get_name()
+
+    def get_experiment_name(self):
+        return self.config.get_cfg_name()
+
+    def get_metric_backend(self):
+        if not self.metric_backend:
+            self.metric_backend = self.prepare_metrics(self.config.get_metric_cfg())
+        return self.metric_backend
+
+    # -----------------------
+    # Abstract Experiment Steps
+    # -----------------------
+    @abstractmethod
+    def init_general(self, general_cfg):
+        """Setup general settings like SEED, logging, env variables."""
+        pass
+
+    @abstractmethod
+    def prepare_dataset(self, dataset_cfg):
+        """Load/prepare dataset."""
+        pass
+
+    @abstractmethod
+    def prepare_metrics(self, metric_cfg) -> MetricsBackend:
+        """
+        Prepare the metrics for the experiment.
+        This method should be implemented in subclasses.
+        """
+        pass
+
+    @abstractmethod
+    def exec_exp(self, *args, **kwargs):
+        """Run experiment process, e.g.: training/evaluation loop.
+        Return: raw_metrics_data, and extra_data as input for calc_and_save_exp_perfs
+        """
+        pass
+
+    def eval_exp(self):
+        """Optional: re-run evaluation from saved results."""
+        pass
+
+    # -----------------------
+    # Main Experiment Runner
+    # -----------------------
+    def run_exp(self, do_calc_metrics=True, *args, **kwargs):
+        """
+        Run the whole experiment pipeline.
+        Params:
+            + 'outfile' to save csv file results,
+            + 'outdir' to set output directory for experiment results.
+            + 'return_df' to return a DataFrame of results instead of a dictionary.
+
+        Full pipeline:
+            1. Init
+            2. Dataset
+            3. Metrics Preparation
+            4. Save Config
+            5. Execute
+            6. Calculate & Save Metrics
+        """
+        self.init_general(self.config.get_general_cfg())
+        self.prepare_dataset(self.config.get_dataset_cfg())
+
+        # Save config before running
+        self.config.save_to_outdir()
+
+        # Execute experiment
+        results = self.exec_exp(*args, **kwargs)
+        if do_calc_metrics:
+            metrics_data, extra_data = results
+            # Calculate & Save metrics
+            perf_results = self.calc_and_save_exp_perfs(
+                raw_metrics_data=metrics_data, extra_data=extra_data, *args, **kwargs
+            )
+            return perf_results
+        else:
+            return results
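An end-to-end sketch of a concrete experiment built on this class (again hypothetical: `TinyAccBackend`, `DemoExperiment`, and the toy data are illustrations, and since `MetricsBackend.__init__` is not shown in this diff, the backend sets `metric_names` and `metric_info` directly and assumes `compute_metrics` is the only remaining abstract hook):

```python
import random

from halib.research.base_exp import BaseExperiment
from halib.research.metrics import MetricsBackend


class TinyAccBackend(MetricsBackend):
    def __init__(self):
        # set the attributes calc_metrics relies on (base __init__ not shown in this diff)
        self.metric_names = ["accuracy"]
        self.metric_info = {"accuracy": {}}

    def compute_metrics(self, metric_info, metrics_data_dict, *args, **kwargs):
        d = metrics_data_dict["accuracy"]
        hits = sum(int((p >= 0.5) == bool(t)) for p, t in zip(d["preds"], d["target"]))
        return {"accuracy": hits / len(d["target"])}


class DemoExperiment(BaseExperiment):
    def init_general(self, general_cfg):
        random.seed(general_cfg.get("seed", 0))  # fix SEED, set up logging, ...

    def prepare_dataset(self, dataset_cfg):
        # load the dataset named by dataset_cfg.get_name(); kept trivial here
        self.samples = [(0.9, 1), (0.2, 0), (0.4, 1)]

    def prepare_metrics(self, metric_cfg) -> MetricsBackend:
        return TinyAccBackend()

    def exec_exp(self, *args, **kwargs):
        preds = [p for p, _ in self.samples]
        targets = [t for _, t in self.samples]
        raw_metrics_data = {"accuracy": {"preds": preds, "target": targets}}
        extra_data = {"epochs": 1}  # becomes an extra CSV column
        return raw_metrics_data, extra_data


exp = DemoExperiment(DemoExpConfig())  # DemoExpConfig from the config sketch above
rows, csv_path = exp.run_exp(outdir="./out/demo_exp")
```

Because `do_calc_metrics` defaults to True, `run_exp` hands the two values returned by `exec_exp` straight to `calc_and_save_exp_perfs`, which writes the perf CSV into `outdir` and returns the result rows plus the CSV path.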
halib/research/metrics.py CHANGED

@@ -44,6 +44,15 @@ class MetricsBackend(ABC):
     ) -> Dict[str, Any]:
         pass
 
+    def prepare_metrics_backend_data(
+        self, raw_metric_data, *args, **kwargs
+    ):
+        """
+        Prepare the data for the metrics backend.
+        This method can be overridden by subclasses to customize data preparation.
+        """
+        return raw_metric_data
+
     def calc_metrics(
         self, metrics_data_dict: Dict[str, Any], *args, **kwargs
     ) -> Dict[str, Any]:
@@ -55,6 +64,10 @@ class MetricsBackend(ABC):
         for metric in self.metric_names:
             if metric not in metrics_data_dict:
                 raise ValueError(f"Metric '{metric}' not found in provided data.")
+        # Prepare the data for the backend
+        metrics_data_dict = self.prepare_metrics_backend_data(
+            metrics_data_dict, *args, **kwargs
+        )
         # Call the abstract method to compute metrics
         return self.compute_metrics(self.metric_info, metrics_data_dict, *args, **kwargs)
 
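The new `prepare_metrics_backend_data` hook runs after the metric-name validation and before `compute_metrics`, so a subclass can reshape raw experiment output without touching the compute step; the default implementation returns the data unchanged, so existing backends keep working. A sketch (hypothetical subclass, with the same caveat about the unshown base `__init__`):

```python
from halib.research.metrics import MetricsBackend


class ThresholdBackend(MetricsBackend):
    def __init__(self):
        # set the attributes calc_metrics relies on (base __init__ not shown in this diff)
        self.metric_names = ["accuracy"]
        self.metric_info = {"accuracy": {}}

    def prepare_metrics_backend_data(self, raw_metric_data, *args, **kwargs):
        # normalize raw probabilities into hard 0/1 predictions for every metric
        return {
            name: {"preds": [int(p >= 0.5) for p in d["preds"]], "target": d["target"]}
            for name, d in raw_metric_data.items()
        }

    def compute_metrics(self, metric_info, metrics_data_dict, *args, **kwargs):
        d = metrics_data_dict["accuracy"]
        hits = sum(int(p == t) for p, t in zip(d["preds"], d["target"]))
        return {"accuracy": hits / len(d["target"])}


backend = ThresholdBackend()
print(backend.calc_metrics({"accuracy": {"preds": [0.9, 0.2], "target": [1, 1]}}))
# {'accuracy': 0.5}
```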
halib/research/perfcalc.py CHANGED

@@ -1,35 +1,27 @@
 import os
 import glob
-import
+from typing import Optional, Tuple
 import pandas as pd
 
-from typing import Dict
-from functools import wraps
 from rich.pretty import pprint
 
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 
 from ..filetype import csvfile
+from ..system import filesys as fs
 from ..common import now_str
 from ..research.perftb import PerfTB
 from ..research.metrics import *
 
-# # try to import torch, and torchmetrics
-# try:
-#     import torch
-#     import torchmetrics
-#     from torchmetrics import Metric
-# except ImportError:
-#     raise ImportError("Please install torch and torchmetrics to use this module.")
-
 
 REQUIRED_COLS = ["experiment", "dataset"]
 CSV_FILE_POSTFIX = "__perf"
+METRIC_PREFIX = "metric_"
 
-class PerfCalc(ABC):
+class PerfCalc(ABC):  # Abstract base class for performance calculation
     @abstractmethod
-    def get_experiment_name(self):
+    def get_experiment_name(self) -> str:
         """
         Return the name of the experiment.
         This function should be overridden by the subclass if needed.
@@ -37,7 +29,7 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
         pass
 
     @abstractmethod
-    def get_dataset_name(self):
+    def get_dataset_name(self) -> str:
         """
         Return the name of the dataset.
         This function should be overridden by the subclass if needed.
@@ -52,96 +44,128 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
         """
         pass
 
-
-
-
-
-
-
-
-
-        NOTE: that the data (for each metric) must be in the format expected by the torchmetrics instance (for that metric). E.g: {"accuracy": {"preds": [...], "target": [...]}, ...} since torchmetrics expects the data in a specific format.
-        """
-        pass
+    def valid_proc_extra_data(
+        self, proc_extra_data
+    ):
+        # make sure that all items in proc_extra_data are dictionaries, with same keys
+        if proc_extra_data is None or len(proc_extra_data) == 0:
+            return
+        if not all(isinstance(item, dict) for item in proc_extra_data):
+            raise TypeError("All items in proc_extra_data must be dictionaries")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return outdict, custom_fields
+        if not all(item.keys() == proc_extra_data[0].keys() for item in proc_extra_data):
+            raise ValueError("All dictionaries in proc_extra_data must have the same keys")
+
+    def valid_proc_metric_raw_data(
+        self, metric_names, proc_metric_raw_data
+    ):
+        # make sure that all items in proc_metric_raw_data are dictionaries, with same keys as metric_names
+        assert isinstance(proc_metric_raw_data, list) and len(proc_metric_raw_data) > 0, \
+            "raw_data_for_metrics must be a non-empty list of dictionaries"
+
+        # make sure that all items in proc_metric_raw_data are dictionaries with keys as metric_names
+        if not all(isinstance(item, dict) for item in proc_metric_raw_data):
+            raise TypeError("All items in raw_data_for_metrics must be dictionaries")
+        if not all(set(item.keys()) == set(metric_names) for item in proc_metric_raw_data):
+            raise ValueError(
+                "All dictionaries in raw_data_for_metrics must have the same keys as metric_names"
+            )
 
     # ! only need to override this method if torchmetrics are not used
-    def calc_exp_perf_metrics(
+    def calc_exp_perf_metrics(
+        self, metric_names, raw_metrics_data, extra_data=None, *args, **kwargs
+    ):
+        assert isinstance(raw_metrics_data, dict) or isinstance(raw_metrics_data, list), \
+            "raw_data_for_metrics must be a dictionary or a list"
+
+        if extra_data is not None:
+            assert isinstance(extra_data, type(raw_metrics_data)), \
+                "extra_data must be of the same type as raw_data_for_metrics (dict or list)"
+        # prepare raw_metric data for processing
+        proc_metric_raw_data_ls = raw_metrics_data if isinstance(raw_metrics_data, list) else [raw_metrics_data.copy()]
+        self.valid_proc_metric_raw_data(metric_names, proc_metric_raw_data_ls)
+        # prepare extra data for processing
+        proc_extra_data_ls = []
+        if extra_data is not None:
+            proc_extra_data_ls = extra_data if isinstance(extra_data, list) else [extra_data.copy()]
+            assert len(proc_extra_data_ls) == len(proc_metric_raw_data_ls), \
+                "extra_data must have the same length as raw_data_for_metrics if it is a list"
+            # validate the extra_data
+            self.valid_proc_extra_data(proc_extra_data_ls)
+
+        # calculate the metrics output results
         metrics_backend = self.get_metric_backend()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        (
-
-
+        proc_outdict_list = []
+        for idx, raw_metrics_data in enumerate(proc_metric_raw_data_ls):
+            out_dict = {
+                "dataset": self.get_dataset_name(),
+                "experiment": self.get_experiment_name(),
+            }
+            custom_fields = []
+            if len(proc_extra_data_ls) > 0:
+                # add extra data to the output dictionary
+                extra_data_item = proc_extra_data_ls[idx]
+                out_dict.update(extra_data_item)
+                custom_fields = list(extra_data_item.keys())
+            metric_results = metrics_backend.calc_metrics(
+                metrics_data_dict=raw_metrics_data, *args, **kwargs
+            )
+            metric_results_prefix = {f"metric_{k}": v for k, v in metric_results.items()}
+            out_dict.update(metric_results_prefix)
+            ordered_cols = (
+                REQUIRED_COLS + custom_fields + list(metric_results_prefix.keys())
+            )
+            out_dict = OrderedDict(
+                (col, out_dict[col]) for col in ordered_cols if col in out_dict
+            )
+            proc_outdict_list.append(out_dict)
+
+        return proc_outdict_list
 
     #! custom kwargs:
     #! outfile - if provided, will save the output to a CSV file with the given path
     #! outdir - if provided, will save the output to a CSV file in the given directory with a generated filename
     #! return_df - if True, will return a DataFrame instead of a dictionary
-
-
+    def calc_and_save_exp_perfs(
+        self,
+        raw_metrics_data: Union[List[dict], dict],
+        extra_data: Optional[Union[List[dict], dict]] = None,
+        *args,
+        **kwargs,
+    ) -> Tuple[Union[List[OrderedDict], pd.DataFrame], Optional[str]]:
        """
        Calculate the metrics.
        This function should be overridden by the subclass if needed.
        Must return a dictionary with keys as metric names and values as the calculated metrics.
        """
        metric_names = self.get_metric_backend().metric_names
-
+        out_dict_list = self.calc_exp_perf_metrics(
+            metric_names=metric_names, raw_metrics_data=raw_metrics_data,
+            extra_data=extra_data,
+            *args, **kwargs
+        )
        csv_outfile = kwargs.get("outfile", None)
        if csv_outfile is not None:
            filePathNoExt, _ = os.path.splitext(csv_outfile)
            # pprint(f"CSV Outfile Path (No Ext): {filePathNoExt}")
-           csv_outfile = f
+           csv_outfile = f"{filePathNoExt}{CSV_FILE_POSTFIX}.csv"
        elif "outdir" in kwargs:
            csvoutdir = kwargs["outdir"]
            csvfilename = f"{now_str()}_{self.get_dataset_name()}_{self.get_experiment_name()}_{CSV_FILE_POSTFIX}.csv"
            csv_outfile = os.path.join(csvoutdir, csvfilename)
 
        # convert out_dict to a DataFrame
-       df = pd.DataFrame(
+       df = pd.DataFrame(out_dict_list)
        # get the orders of the columns as the orders or the keys in out_dict
-       ordered_cols = list(
+       ordered_cols = list(out_dict_list[0].keys())
        df = df[ordered_cols]  # reorder columns
-
        if csv_outfile:
            df.to_csv(csv_outfile, index=False, sep=";", encoding="utf-8")
        return_df = kwargs.get("return_df", False)
-       if return_df:
+       if return_df:  # return DataFrame instead of dict if requested
            return df, csv_outfile
        else:
-           return
+           return out_dict_list, csv_outfile
 
    @staticmethod
    def default_exp_csv_filter_fn(exp_file_name: str) -> bool:
@@ -153,29 +177,37 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
 
    @classmethod
    def gen_perf_report_for_multip_exps(
-       cls, indir: str, exp_csv_filter_fn=default_exp_csv_filter_fn, csv_sep=";"
+       cls, indir: str, exp_csv_filter_fn=default_exp_csv_filter_fn, include_file_name=False, csv_sep=";"
    ) -> PerfTB:
        """
        Generate a performance report by scanning experiment subdirectories.
        Must return a dictionary with keys as metric names and values as performance tables.
        """
-       def get_df_for_all_exp_perf(csv_perf_files, csv_sep=
+       def get_df_for_all_exp_perf(csv_perf_files, csv_sep=";"):
            """
            Create a single DataFrame from all CSV files.
            Assumes all CSV files MAY have different metrics
            """
            cols = []
+           FILE_NAME_COL = "file_name" if include_file_name else None
+
            for csv_file in csv_perf_files:
                temp_df = pd.read_csv(csv_file, sep=csv_sep)
+               if FILE_NAME_COL:
+                   temp_df[FILE_NAME_COL] = fs.get_file_name(csv_file, split_file_ext=False)
+               # csvfile.fn_display_df(temp_df)
                temp_df_cols = temp_df.columns.tolist()
                for col in temp_df_cols:
                    if col not in cols:
                        cols.append(col)
+
            df = pd.DataFrame(columns=cols)
            for csv_file in csv_perf_files:
                temp_df = pd.read_csv(csv_file, sep=csv_sep)
+               if FILE_NAME_COL:
+                   temp_df[FILE_NAME_COL] = fs.get_file_name(csv_file, split_file_ext=False)
                # Drop all-NA columns to avoid dtype inconsistency
-               temp_df = temp_df.dropna(axis=1, how=
+               temp_df = temp_df.dropna(axis=1, how="all")
                # ensure all columns are present in the final DataFrame
                for col in cols:
                    if col not in temp_df.columns:
@@ -183,27 +215,36 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
            df = pd.concat([df, temp_df], ignore_index=True)
            # assert that REQUIRED_COLS are present in the DataFrame
            # pprint(df.columns.tolist())
-
+           sticky_cols = REQUIRED_COLS + ([FILE_NAME_COL] if include_file_name else [])  # columns that must always be present
+           for col in sticky_cols:
                if col not in df.columns:
-                   raise ValueError(
-
-
-
+                   raise ValueError(
+                       f"Required column '{col}' is missing from the DataFrame. REQUIRED_COLS = {sticky_cols}"
+                   )
+           metric_cols = [col for col in df.columns if col.startswith(METRIC_PREFIX)]
+           assert (
+               len(metric_cols) > 0
+           ), "No metric columns found in the DataFrame. Ensure that the CSV files contain metric columns starting with 'metric_'."
+           final_cols = sticky_cols + metric_cols
            df = df[final_cols]
            # !hahv debug
-           pprint(
+           pprint("------ Final DataFrame Columns ------")
            csvfile.fn_display_df(df)
            # ! validate all rows in df before returning
            # make sure all rows will have at least values for REQUIRED_COLS and at least one metric column
            for index, row in df.iterrows():
-               if not all(col in row and pd.notna(row[col]) for col in
-                   raise ValueError(
+               if not all(col in row and pd.notna(row[col]) for col in sticky_cols):
+                   raise ValueError(
+                       f"Row {index} is missing required columns or has NaN values in required columns: {row}"
+                   )
                if not any(pd.notna(row[col]) for col in metric_cols):
                    raise ValueError(f"Row {index} has no metric values: {row}")
            # make sure these is no (experiment, dataset) pair that is duplicated
-           duplicates = df.duplicated(subset=
+           duplicates = df.duplicated(subset=sticky_cols, keep=False)
            if duplicates.any():
-               raise ValueError(
+               raise ValueError(
+                   "Duplicate (experiment, dataset) pairs found in the DataFrame. Please ensure that each experiment-dataset combination is unique."
+               )
            return df
 
        def mk_perftb_report(df):
@@ -213,9 +254,9 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
            """
            perftb = PerfTB()
            # find all "dataset" values (unique)
-           dataset_names = list(df[
-           # find all columns that start with
-           metric_cols = [col for col in df.columns if col.startswith(
+           dataset_names = list(df["dataset"].unique())
+           # find all columns that start with METRIC_PREFIX
+           metric_cols = [col for col in df.columns if col.startswith(METRIC_PREFIX)]
 
            # Determine which metrics are associated with each dataset.
            # Since a dataset may appear in multiple rows and may not include all metrics in each, identify the row with the same dataset that contains the most non-NaN metric values. The set of metrics for that dataset is defined by the non-NaN metrics in that row.
@@ -225,7 +266,11 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
                dataset_rows = df[df["dataset"] == dataset_name]
                # Find the row with the most non-NaN metric values
                max_non_nan_row = dataset_rows[metric_cols].count(axis=1).idxmax()
-               metrics_for_dataset =
+               metrics_for_dataset = (
+                   dataset_rows.loc[max_non_nan_row, metric_cols]
+                   .dropna()
+                   .index.tolist()
+               )
                dataset_metrics[dataset_name] = metrics_for_dataset
 
            for dataset_name, metrics in dataset_metrics.items():
@@ -233,11 +278,11 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
                perftb.add_dataset(dataset_name, metrics)
 
            for _, row in df.iterrows():
-               dataset_name = row[
+               dataset_name = row["dataset"]
                ds_metrics = dataset_metrics.get(dataset_name)
                if dataset_name in dataset_metrics:
                    # Add the metrics for this row to the performance table
-                   exp_name = row.get(
+                   exp_name = row.get("experiment")
                    exp_metric_values = {}
                    for metric in ds_metrics:
                        if metric in row and pd.notna(row[metric]):
@@ -245,7 +290,7 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
                    perftb.add_experiment(
                        experiment_name=exp_name,
                        dataset_name=dataset_name,
-                       metrics=exp_metric_values
+                       metrics=exp_metric_values,
                    )
 
            return perftb
@@ -260,9 +305,7 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
            if os.path.isdir(os.path.join(indir, d))
        ]
        if len(exp_dirs) == 0:
-           csv_perf_files = glob.glob(
-               os.path.join(indir, f"*.csv")
-           )
+           csv_perf_files = glob.glob(os.path.join(indir, f"*.csv"))
            csv_perf_files = [
                file_item
                for file_item in csv_perf_files
@@ -273,13 +316,9 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
        # Collect all matching CSV files in those subdirs
        for exp_dir in exp_dirs:
            # pprint(f"Searching in experiment directory: {exp_dir}")
-           matched = glob.glob(
-               os.path.join(exp_dir, f"*.csv")
-           )
+           matched = glob.glob(os.path.join(exp_dir, f"*.csv"))
            matched = [
-               file_item
-               for file_item in matched
-               if exp_csv_filter_fn(file_item)
+               file_item for file_item in matched if exp_csv_filter_fn(file_item)
            ]
            csv_perf_files.extend(matched)
 
@@ -287,9 +326,11 @@ class PerfCalc(ABC):  # Abstract base class for performance calculation
            len(csv_perf_files) > 0
        ), f"No CSV files matching pattern '{exp_csv_filter_fn}' found in the experiment directories."
 
-       assert
+       assert (
+           len(csv_perf_files) > 0
+       ), f"No CSV files matching pattern '{exp_csv_filter_fn}' found in the experiment directories."
 
        all_exp_perf_df = get_df_for_all_exp_perf(csv_perf_files, csv_sep=csv_sep)
-       csvfile.fn_display_df(all_exp_perf_df)
+       # csvfile.fn_display_df(all_exp_perf_df)
        perf_tb = mk_perftb_report(all_exp_perf_df)
        return perf_tb
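The practical effect of the rework: `calc_exp_perf_metrics` now accepts a single run (a dict) or several runs (a list of dicts), with `extra_data` mirroring that shape, and every extra-data key becomes its own CSV column beside the `metric_*` columns. A hypothetical sketch reusing the `DemoExperiment` pieces from the earlier examples:

```python
# Hypothetical usage of the reworked PerfCalc entry point
# (DemoExperiment / DemoExpConfig come from the sketches above).
runs_raw = [
    {"accuracy": {"preds": [0.9, 0.2], "target": [1, 0]}},  # run / fold 1
    {"accuracy": {"preds": [0.7, 0.6], "target": [1, 0]}},  # run / fold 2
]
runs_extra = [  # same length as runs_raw; every dict must share the same keys
    {"fold": 1},
    {"fold": 2},
]

exp = DemoExperiment(DemoExpConfig())
rows, csv_path = exp.calc_and_save_exp_perfs(
    raw_metrics_data=runs_raw,
    extra_data=runs_extra,
    outdir="./out/demo_exp",  # saved as <now_str>_<dataset>_<experiment>___perf.csv
)
for row in rows:
    # OrderedDict with columns: experiment, dataset, fold, metric_accuracy
    print(dict(row))
```

Note that `gen_perf_report_for_multip_exps` rejects duplicated (experiment, dataset) rows across the scanned CSVs (plus `file_name` when `include_file_name=True`), so a multi-run CSV like this one is meant for per-run inspection rather than as report input.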
{halib-0.1.66.dist-info → halib-0.1.70.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: halib
-Version: 0.1.66
+Version: 0.1.70
 Summary: Small library for common tasks
 Author: Hoang Van Ha
 Author-email: hoangvanhauit@gmail.com
@@ -52,7 +52,12 @@ Dynamic: summary
 
 Helper package for coding and automation
 
-**Version 0.1.
+**Version 0.1.70**
+
++ `research/base_exp`: add base experiment class to handle common experiment tasks, including performance calculation and saving results.
+
+
+**Version 0.1.67**
 
 + now use `uv` for venv management
 + `research/perfcalc`: support both torchmetrics and custom metrics for performance calculation
{halib-0.1.66.dist-info → halib-0.1.70.dist-info}/RECORD CHANGED

@@ -28,10 +28,12 @@ halib/online/gdrive_mkdir.py,sha256=wSJkQMJCDuS1gxQ2lHQHq_IrJ4xR_SEoPSo9n_2WNFU,
 halib/online/gdrive_test.py,sha256=hMWzz4RqZwETHp4GG4WwVNFfYvFQhp2Boz5t-DqwMo0,1342
 halib/online/projectmake.py,sha256=Zrs96WgXvO4nIrwxnCOletL4aTBge-EoF0r7hpKO1w8,4034
 halib/research/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+halib/research/base_config.py,sha256=AIjVzl2ZJ9b8yIGb2X5EZwLmyGJ_9wNWqrib1nU3Wj0,2831
+halib/research/base_exp.py,sha256=mIyxVDKb40lngrEgW3_hUV5KPon25nxQeaWCUU_AWjQ,3185
 halib/research/benchquery.py,sha256=FuKnbWQtCEoRRtJAfN-zaN-jPiO_EzsakmTOMiqi7GQ,4626
 halib/research/dataset.py,sha256=QU0Hr5QFb8_XlvnOMgC9QJGIpwXAZ9lDd0RdQi_QRec,6743
-halib/research/metrics.py,sha256=
-halib/research/perfcalc.py,sha256=
+halib/research/metrics.py,sha256=Xgv0GUGo-o-RJaBOmkRCRpQJaYijF_1xeKkyYU_Bv4U,5249
+halib/research/perfcalc.py,sha256=qDa0sqfpWrwGZVJtjuUVFK7JX6j8xyXP9OnnfYmdamg,15898
 halib/research/perftb.py,sha256=vazU-dYBJhfc4sK4zFgxOvzeXGi-5TyPHCt20ItiWhY,30463
 halib/research/plot.py,sha256=-pDUk4z3C_GnyJ5zWmf-mGMdT4gaipVJWzIgcpIPiRk,9448
 halib/research/torchloader.py,sha256=yqUjcSiME6H5W210363HyRUrOi3ISpUFAFkTr1w4DCw,6503
@@ -49,8 +51,8 @@ halib/utils/gpu_mon.py,sha256=vD41_ZnmPLKguuq9X44SB_vwd9JrblO4BDzHLXZhhFY,2233
 halib/utils/listop.py,sha256=Vpa8_2fI0wySpB2-8sfTBkyi_A4FhoFVVvFiuvW8N64,339
 halib/utils/tele_noti.py,sha256=-4WXZelCA4W9BroapkRyIdUu9cUVrcJJhegnMs_WpGU,5928
 halib/utils/video.py,sha256=ZqzNVPgc1RZr_T0OlHvZ6SzyBpL7O27LtB86JMbBuR0,3059
-halib-0.1.
-halib-0.1.
-halib-0.1.
-halib-0.1.
-halib-0.1.
+halib-0.1.70.dist-info/licenses/LICENSE.txt,sha256=qZssdna4aETiR8znYsShUjidu-U4jUT9Q-EWNlZ9yBQ,1100
+halib-0.1.70.dist-info/METADATA,sha256=C5ei-WhAmt6SuG-vR8pIQd2Uat5HrlT20RX9JP3D0Q4,5706
+halib-0.1.70.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+halib-0.1.70.dist-info/top_level.txt,sha256=7AD6PLaQTreE0Fn44mdZsoHBe_Zdd7GUmjsWPyQ7I-k,6
+halib-0.1.70.dist-info/RECORD,,
{halib-0.1.66.dist-info → halib-0.1.70.dist-info}/WHEEL: file without changes
{halib-0.1.66.dist-info → halib-0.1.70.dist-info}/licenses/LICENSE.txt: file without changes
{halib-0.1.66.dist-info → halib-0.1.70.dist-info}/top_level.txt: file without changes