halib 0.1.7__py3-none-any.whl → 0.1.99__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. halib/__init__.py +84 -0
  2. halib/common.py +151 -0
  3. halib/cuda.py +39 -0
  4. halib/dataset.py +209 -0
  5. halib/filetype/csvfile.py +151 -45
  6. halib/filetype/ipynb.py +63 -0
  7. halib/filetype/jsonfile.py +1 -1
  8. halib/filetype/textfile.py +4 -4
  9. halib/filetype/videofile.py +44 -33
  10. halib/filetype/yamlfile.py +95 -0
  11. halib/gdrive.py +1 -1
  12. halib/online/gdrive.py +104 -54
  13. halib/online/gdrive_mkdir.py +29 -17
  14. halib/online/gdrive_test.py +31 -18
  15. halib/online/projectmake.py +58 -43
  16. halib/plot.py +296 -11
  17. halib/projectmake.py +1 -1
  18. halib/research/__init__.py +0 -0
  19. halib/research/base_config.py +100 -0
  20. halib/research/base_exp.py +100 -0
  21. halib/research/benchquery.py +131 -0
  22. halib/research/dataset.py +208 -0
  23. halib/research/flop_csv.py +34 -0
  24. halib/research/flops.py +156 -0
  25. halib/research/metrics.py +133 -0
  26. halib/research/mics.py +68 -0
  27. halib/research/params_gen.py +108 -0
  28. halib/research/perfcalc.py +336 -0
  29. halib/research/perftb.py +780 -0
  30. halib/research/plot.py +758 -0
  31. halib/research/profiler.py +300 -0
  32. halib/research/torchloader.py +162 -0
  33. halib/research/wandb_op.py +116 -0
  34. halib/rich_color.py +285 -0
  35. halib/sys/filesys.py +17 -10
  36. halib/system/__init__.py +0 -0
  37. halib/system/cmd.py +8 -0
  38. halib/system/filesys.py +124 -0
  39. halib/tele_noti.py +166 -0
  40. halib/torchloader.py +162 -0
  41. halib/utils/__init__.py +0 -0
  42. halib/utils/dataclass_util.py +40 -0
  43. halib/utils/dict_op.py +9 -0
  44. halib/utils/gpu_mon.py +58 -0
  45. halib/utils/listop.py +13 -0
  46. halib/utils/tele_noti.py +166 -0
  47. halib/utils/video.py +82 -0
  48. halib/videofile.py +1 -1
  49. halib-0.1.99.dist-info/METADATA +209 -0
  50. halib-0.1.99.dist-info/RECORD +64 -0
  51. {halib-0.1.7.dist-info → halib-0.1.99.dist-info}/WHEEL +1 -1
  52. halib-0.1.7.dist-info/METADATA +0 -59
  53. halib-0.1.7.dist-info/RECORD +0 -30
  54. {halib-0.1.7.dist-info → halib-0.1.99.dist-info/licenses}/LICENSE.txt +0 -0
  55. {halib-0.1.7.dist-info → halib-0.1.99.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,108 @@
+ from typing import Dict, Any, List
+ import numpy as np
+ from ..common import *
+ from ..filetype import yamlfile
+ import yaml
+ import os
+
+ class ParamGen:
+
+     @staticmethod
+     def build_from_file(params_file):
+         builder = ParamGen(params_file)
+         return builder.params
+
+     def __init__(self, params_file=None):
+         self.params = {}
+         assert os.path.isfile(params_file), f"params_file not found: {params_file}"
+         self.params = self._build(params_file)
+
+     def _expand_param(self, param_name: str, config: Dict[str, Any]) -> List[Any]:
+         """
+         Validates and expands the values for a single parameter configuration.
+
+         Args:
+             param_name: The name of the parameter being processed.
+             config: The configuration dictionary for this parameter.
+
+         Returns:
+             A list of the expanded values for the parameter.
+
+         Raises:
+             TypeError: If the configuration or its values have an incorrect type.
+             ValueError: If the configuration is missing keys or has an invalid structure.
+         """
+         # 1. Validate the configuration structure
+         if not isinstance(config, dict):
+             raise TypeError(f"Config for '{param_name}' must be a dictionary.")
+
+         if "type" not in config or "values" not in config:
+             raise ValueError(
+                 f"Config for '{param_name}' must contain 'type' and 'values' keys."
+             )
+
+         gen_type = config["type"]
+         values = config["values"]
+
+         # 2. Handle the generation based on type
+         if gen_type == "list":
+             # Ensure values are returned as a list, even if a single item was provided
+             return values if isinstance(values, list) else [values]
+
+         elif gen_type == "range":
+             if not isinstance(values, list) or len(values) != 3:
+                 raise ValueError(
+                     f"For 'range' type on '{param_name}', 'values' must be a list of 3 numbers "
+                     f"[start, end, step], but got: {values}"
+                 )
+
+             start, end, step = values
+             if all(isinstance(v, int) for v in values):
+                 return list(range(start, end, step))
+             elif all(isinstance(v, (int, float)) for v in values):
+                 # Use numpy for floating-point ranges
+                 temp_list = list(np.arange(start, end, step))
+                 # Convert numpy scalars to built-in floats
+                 return [float(v) for v in temp_list]
+             else:
+                 raise TypeError(
+                     f"All 'values' for 'range' on '{param_name}' must be numbers."
+                 )
+
+         else:
+             raise ValueError(
+                 f"Invalid 'type' for '{param_name}': '{gen_type}'. Must be 'list' or 'range'."
+             )
+
+     def _build(self, params_file):
+         """
+         Builds the full parameter configuration by expanding each parameter's values based on its type.
+
+         The YAML file is loaded into a dictionary where each key is a parameter name and each value
+         is a config dict specifying the 'type' ('list' or 'range') and 'values' for generation.
+
+         Args:
+             params_file: Path to the YAML parameter file.
+                 Example (loaded content):
+                 {
+                     "learning_rate": {"type": "range", "values": [0.01, 0.1, 0.01]},
+                     "optimizer": {"type": "list", "values": ["adam", "sgd"]},
+                     "epochs": {"type": "list", "values": 100}
+                 }
+
+         Returns:
+             A dictionary with parameter names mapped to their fully expanded list of values.
+         """
+         cfg_raw_dict = yamlfile.load_yaml(params_file, to_dict=True)
+         if not isinstance(cfg_raw_dict, dict):
+             raise TypeError("The entire parameter config must be a dictionary.")
+
+         # Use a dictionary comprehension for a clean and efficient build
+         return {
+             param_name: self._expand_param(param_name, config)
+             for param_name, config in cfg_raw_dict.items()
+         }
+
+     def save(self, outfile):
+         with open(outfile, "w") as f:
+             yaml.dump(self.params, f)
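
For orientation, here is a minimal usage sketch of the ParamGen API added above. The file name params.yaml, its contents, and the grid enumeration with itertools are illustrative assumptions; only build_from_file, the 'type'/'values' schema, and save come from the code in this diff.

# params.yaml (hypothetical content):
#   learning_rate: {type: range, values: [0.01, 0.1, 0.01]}
#   optimizer:     {type: list,  values: [adam, sgd]}
#   epochs:        {type: list,  values: 100}
import itertools
from halib.research.params_gen import ParamGen

params = ParamGen.build_from_file("params.yaml")
# e.g. {"learning_rate": [0.01, 0.02, ...], "optimizer": ["adam", "sgd"], "epochs": [100]}

# Enumerating the full parameter grid is left to the caller:
grid = [dict(zip(params, combo)) for combo in itertools.product(*params.values())]
print(len(grid), "parameter combinations")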
@@ -0,0 +1,336 @@
+ import os
+ import glob
+ from typing import Optional, Tuple, Union, List
+ import pandas as pd
+
+ from rich.pretty import pprint
+
+ from abc import ABC, abstractmethod
+ from collections import OrderedDict
+
+ from ..filetype import csvfile
+ from ..system import filesys as fs
+ from ..common import now_str
+ from ..research.perftb import PerfTB
+ from ..research.metrics import *
+
+
+ REQUIRED_COLS = ["experiment", "dataset"]
+ CSV_FILE_POSTFIX = "__perf"
+ METRIC_PREFIX = "metric_"
+
+ class PerfCalc(ABC):  # Abstract base class for performance calculation
+     @abstractmethod
+     def get_experiment_name(self) -> str:
+         """
+         Return the name of the experiment.
+         This method must be overridden by the subclass.
+         """
+         pass
+
+     @abstractmethod
+     def get_dataset_name(self) -> str:
+         """
+         Return the name of the dataset.
+         This method must be overridden by the subclass.
+         """
+         pass
+
+     @abstractmethod
+     def get_metric_backend(self) -> MetricsBackend:
+         """
+         Return the metrics backend, i.e. either a list of metric names to be used for performance
+         calculation or a dictionary with metric names as keys and torchmetrics.Metric instances as
+         values, e.g. {"accuracy": Accuracy(), "precision": Precision()}.
+         """
+         pass
+
+     def valid_proc_extra_data(
+         self, proc_extra_data
+     ):
+         # Make sure that all items in proc_extra_data are dictionaries with the same keys
+         if proc_extra_data is None or len(proc_extra_data) == 0:
+             return
+         if not all(isinstance(item, dict) for item in proc_extra_data):
+             raise TypeError("All items in proc_extra_data must be dictionaries")
+
+         if not all(item.keys() == proc_extra_data[0].keys() for item in proc_extra_data):
+             raise ValueError("All dictionaries in proc_extra_data must have the same keys")
+
+     def valid_proc_metric_raw_data(
+         self, metric_names, proc_metric_raw_data
+     ):
+         # Make sure proc_metric_raw_data is a non-empty list of dictionaries whose keys match metric_names
+         assert isinstance(proc_metric_raw_data, list) and len(proc_metric_raw_data) > 0, \
+             "raw_data_for_metrics must be a non-empty list of dictionaries"
+
+         if not all(isinstance(item, dict) for item in proc_metric_raw_data):
+             raise TypeError("All items in raw_data_for_metrics must be dictionaries")
+         if not all(set(item.keys()) == set(metric_names) for item in proc_metric_raw_data):
+             raise ValueError(
+                 "All dictionaries in raw_data_for_metrics must have the same keys as metric_names"
+             )
+
+     # ! Only override this method if torchmetrics are not used
+     def calc_exp_perf_metrics(
+         self, metric_names, raw_metrics_data, extra_data=None, *args, **kwargs
+     ):
+         assert isinstance(raw_metrics_data, (dict, list)), \
+             "raw_data_for_metrics must be a dictionary or a list"
+
+         if extra_data is not None:
+             assert isinstance(extra_data, type(raw_metrics_data)), \
+                 "extra_data must be of the same type as raw_data_for_metrics (dict or list)"
+         # Prepare the raw metric data for processing
+         proc_metric_raw_data_ls = raw_metrics_data if isinstance(raw_metrics_data, list) else [raw_metrics_data.copy()]
+         self.valid_proc_metric_raw_data(metric_names, proc_metric_raw_data_ls)
+         # Prepare the extra data for processing
+         proc_extra_data_ls = []
+         if extra_data is not None:
+             proc_extra_data_ls = extra_data if isinstance(extra_data, list) else [extra_data.copy()]
+             assert len(proc_extra_data_ls) == len(proc_metric_raw_data_ls), \
+                 "extra_data must have the same length as raw_data_for_metrics if it is a list"
+             # Validate the extra data
+             self.valid_proc_extra_data(proc_extra_data_ls)
+
+         # Calculate the metric results
+         metrics_backend = self.get_metric_backend()
+         proc_outdict_list = []
+         for idx, raw_metrics_data in enumerate(proc_metric_raw_data_ls):
+             out_dict = {
+                 "dataset": self.get_dataset_name(),
+                 "experiment": self.get_experiment_name(),
+             }
+             custom_fields = []
+             if len(proc_extra_data_ls) > 0:
+                 # Add extra data to the output dictionary
+                 extra_data_item = proc_extra_data_ls[idx]
+                 out_dict.update(extra_data_item)
+                 custom_fields = list(extra_data_item.keys())
+             metric_results = metrics_backend.calc_metrics(
+                 metrics_data_dict=raw_metrics_data, *args, **kwargs
+             )
+             metric_results_prefix = {f"metric_{k}": v for k, v in metric_results.items()}
+             out_dict.update(metric_results_prefix)
+             ordered_cols = (
+                 REQUIRED_COLS + custom_fields + list(metric_results_prefix.keys())
+             )
+             out_dict = OrderedDict(
+                 (col, out_dict[col]) for col in ordered_cols if col in out_dict
+             )
+             proc_outdict_list.append(out_dict)
+
+         return proc_outdict_list
+
+     #! Custom kwargs:
+     #!   outfile   - if provided, save the output to a CSV file at the given path
+     #!   outdir    - if provided, save the output to a CSV file in the given directory with a generated filename
+     #!   return_df - if True, return a DataFrame instead of a list of dictionaries
+     def calc_and_save_exp_perfs(
+         self,
+         raw_metrics_data: Union[List[dict], dict],
+         extra_data: Optional[Union[List[dict], dict]] = None,
+         *args,
+         **kwargs,
+     ) -> Tuple[Union[List[OrderedDict], pd.DataFrame], Optional[str]]:
+         """
+         Calculate the performance metrics for this experiment and optionally save them to CSV.
+         Returns a tuple (records, csv_outfile): `records` is a list of OrderedDicts (or a DataFrame
+         if `return_df=True`), and `csv_outfile` is the path of the written CSV file, or None if
+         neither `outfile` nor `outdir` was provided.
+         """
+         metric_names = self.get_metric_backend().metric_names
+         out_dict_list = self.calc_exp_perf_metrics(
+             metric_names=metric_names, raw_metrics_data=raw_metrics_data,
+             extra_data=extra_data,
+             *args, **kwargs
+         )
+         csv_outfile = kwargs.get("outfile", None)
+         if csv_outfile is not None:
+             filePathNoExt, _ = os.path.splitext(csv_outfile)
+             # pprint(f"CSV Outfile Path (No Ext): {filePathNoExt}")
+             csv_outfile = f"{filePathNoExt}{CSV_FILE_POSTFIX}.csv"
+         elif "outdir" in kwargs:
+             csvoutdir = kwargs["outdir"]
+             csvfilename = f"{now_str()}_{self.get_dataset_name()}_{self.get_experiment_name()}_{CSV_FILE_POSTFIX}.csv"
+             csv_outfile = os.path.join(csvoutdir, csvfilename)
+
+         # Convert the output dicts to a DataFrame
+         df = pd.DataFrame(out_dict_list)
+         # Keep the column order of the keys in the first output dict
+         ordered_cols = list(out_dict_list[0].keys())
+         df = df[ordered_cols]  # reorder columns
+         if csv_outfile:
+             df.to_csv(csv_outfile, index=False, sep=";", encoding="utf-8")
+         return_df = kwargs.get("return_df", False)
+         if return_df:  # return DataFrame instead of dict if requested
+             return df, csv_outfile
+         else:
+             return out_dict_list, csv_outfile
+
+     @staticmethod
+     def default_exp_csv_filter_fn(exp_file_name: str) -> bool:
+         """
+         Default filter function for experiment CSV files.
+         Returns True if the file name contains the "__perf.csv" postfix.
+         """
+         return "__perf.csv" in exp_file_name
+
+     @classmethod
+     def gen_perf_report_for_multip_exps(
+         cls, indir: str, exp_csv_filter_fn=default_exp_csv_filter_fn, include_file_name=False, csv_sep=";"
+     ) -> PerfTB:
+         """
+         Generate a performance report by scanning experiment subdirectories (or `indir` itself)
+         for per-experiment CSV files. Returns a PerfTB performance table built from those files.
+         """
+         def get_df_for_all_exp_perf(csv_perf_files, csv_sep=";"):
+             """
+             Create a single DataFrame from all CSV files.
+             Assumes the CSV files may have different metric columns.
+             """
+             cols = []
+             FILE_NAME_COL = "file_name" if include_file_name else None
+
+             for csv_file in csv_perf_files:
+                 temp_df = pd.read_csv(csv_file, sep=csv_sep)
+                 if FILE_NAME_COL:
+                     temp_df[FILE_NAME_COL] = fs.get_file_name(csv_file, split_file_ext=False)
+                 # csvfile.fn_display_df(temp_df)
+                 temp_df_cols = temp_df.columns.tolist()
+                 for col in temp_df_cols:
+                     if col not in cols:
+                         cols.append(col)
+
+             df = pd.DataFrame(columns=cols)
+             for csv_file in csv_perf_files:
+                 temp_df = pd.read_csv(csv_file, sep=csv_sep)
+                 if FILE_NAME_COL:
+                     temp_df[FILE_NAME_COL] = fs.get_file_name(csv_file, split_file_ext=False)
+                 # Drop all-NA columns to avoid dtype inconsistency
+                 temp_df = temp_df.dropna(axis=1, how="all")
+                 # Ensure all columns are present in the final DataFrame
+                 for col in cols:
+                     if col not in temp_df.columns:
+                         temp_df[col] = None  # fill missing columns with None
+                 df = pd.concat([df, temp_df], ignore_index=True)
+             # Assert that the required columns are present in the DataFrame
+             # pprint(df.columns.tolist())
+             sticky_cols = REQUIRED_COLS + ([FILE_NAME_COL] if include_file_name else [])  # columns that must always be present
+             for col in sticky_cols:
+                 if col not in df.columns:
+                     raise ValueError(
+                         f"Required column '{col}' is missing from the DataFrame. REQUIRED_COLS = {sticky_cols}"
+                     )
+             metric_cols = [col for col in df.columns if col.startswith(METRIC_PREFIX)]
+             assert (
+                 len(metric_cols) > 0
+             ), "No metric columns found in the DataFrame. Ensure that the CSV files contain metric columns starting with 'metric_'."
+             final_cols = sticky_cols + metric_cols
+             df = df[final_cols]
+             # # !hahv debug
+             # pprint("------ Final DataFrame Columns ------")
+             # csvfile.fn_display_df(df)
+             # ! Validate all rows in df before returning:
+             # every row must have values for the required columns and at least one metric column
+             for index, row in df.iterrows():
+                 if not all(col in row and pd.notna(row[col]) for col in sticky_cols):
+                     raise ValueError(
+                         f"Row {index} is missing required columns or has NaN values in required columns: {row}"
+                     )
+                 if not any(pd.notna(row[col]) for col in metric_cols):
+                     raise ValueError(f"Row {index} has no metric values: {row}")
+             # Make sure no (experiment, dataset) pair is duplicated
+             duplicates = df.duplicated(subset=sticky_cols, keep=False)
+             if duplicates.any():
+                 raise ValueError(
+                     "Duplicate (experiment, dataset) pairs found in the DataFrame. Please ensure that each experiment-dataset combination is unique."
+                 )
+             return df
+
+         def mk_perftb_report(df):
+             """
+             Create a performance report table from the DataFrame.
+             This function should be customized based on the specific requirements of the report.
+             """
+             perftb = PerfTB()
+             # Find all unique "dataset" values
+             dataset_names = list(df["dataset"].unique())
+             # Find all columns that start with METRIC_PREFIX
+             metric_cols = [col for col in df.columns if col.startswith(METRIC_PREFIX)]
+
+             # Determine which metrics are associated with each dataset.
+             # Since a dataset may appear in multiple rows and may not include all metrics in each,
+             # identify the row for that dataset with the most non-NaN metric values; the non-NaN
+             # metrics in that row define the metric set for the dataset.
+             dataset_metrics = {}
+             for dataset_name in dataset_names:
+                 dataset_rows = df[df["dataset"] == dataset_name]
+                 # Find the row with the most non-NaN metric values
+                 max_non_nan_row = dataset_rows[metric_cols].count(axis=1).idxmax()
+                 metrics_for_dataset = (
+                     dataset_rows.loc[max_non_nan_row, metric_cols]
+                     .dropna()
+                     .index.tolist()
+                 )
+                 dataset_metrics[dataset_name] = metrics_for_dataset
+
+             for dataset_name, metrics in dataset_metrics.items():
+                 # Register the dataset and its metrics in the performance table
+                 perftb.add_dataset(dataset_name, metrics)
+
+             for _, row in df.iterrows():
+                 dataset_name = row["dataset"]
+                 ds_metrics = dataset_metrics.get(dataset_name)
+                 if dataset_name in dataset_metrics:
+                     # Add the metrics for this row to the performance table
+                     exp_name = row.get("experiment")
+                     exp_metric_values = {}
+                     for metric in ds_metrics:
+                         if metric in row and pd.notna(row[metric]):
+                             exp_metric_values[metric] = row[metric]
+                     perftb.add_experiment(
+                         experiment_name=exp_name,
+                         dataset_name=dataset_name,
+                         metrics=exp_metric_values,
+                     )
+
+             return perftb
+
+         assert os.path.exists(indir), f"Input directory {indir} does not exist."
+
+         csv_perf_files = []
+         # Find experiment subdirectories
+         exp_dirs = [
+             os.path.join(indir, d)
+             for d in os.listdir(indir)
+             if os.path.isdir(os.path.join(indir, d))
+         ]
+         if len(exp_dirs) == 0:
+             csv_perf_files = glob.glob(os.path.join(indir, "*.csv"))
+             csv_perf_files = [
+                 file_item
+                 for file_item in csv_perf_files
+                 if exp_csv_filter_fn(file_item)
+             ]
+         else:
+             # Multiple experiment directories found:
+             # collect all matching CSV files in those subdirectories
+             for exp_dir in exp_dirs:
+                 # pprint(f"Searching in experiment directory: {exp_dir}")
+                 matched = glob.glob(os.path.join(exp_dir, "*.csv"))
+                 matched = [
+                     file_item for file_item in matched if exp_csv_filter_fn(file_item)
+                 ]
+                 csv_perf_files.extend(matched)
+
+         assert (
+             len(csv_perf_files) > 0
+         ), f"No CSV files matching filter '{exp_csv_filter_fn}' found in the experiment directories."
+
+         all_exp_perf_df = get_df_for_all_exp_perf(csv_perf_files, csv_sep=csv_sep)
+         # csvfile.fn_display_df(all_exp_perf_df)
+         perf_tb = mk_perftb_report(all_exp_perf_df)
+         return perf_tb
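
For orientation, a minimal sketch of how the new PerfCalc base class is meant to be subclassed. MetricsBackend lives in halib.research.metrics and is not shown in this diff, so a stand-in backend exposing only the two members PerfCalc actually uses (metric_names and calc_metrics) is substituted here; the class names, metric values, and paths are illustrative assumptions, not part of halib.

from halib.research.perfcalc import PerfCalc

class DummyAccuracyBackend:
    # Stand-in for halib.research.metrics.MetricsBackend (its constructor is not shown in this diff).
    metric_names = ["accuracy"]

    def calc_metrics(self, metrics_data_dict, *args, **kwargs):
        preds, labels = metrics_data_dict["accuracy"]
        return {"accuracy": sum(p == y for p, y in zip(preds, labels)) / len(labels)}

class MyExp(PerfCalc):
    def get_experiment_name(self) -> str:
        return "exp_baseline"

    def get_dataset_name(self) -> str:
        return "cifar10"

    def get_metric_backend(self):
        return DummyAccuracyBackend()

exp = MyExp()
records, csv_path = exp.calc_and_save_exp_perfs(
    raw_metrics_data={"accuracy": ([0, 1, 1, 0], [0, 1, 0, 0])},
    outdir="results",  # assumed to exist; a timestamped *__perf.csv is written here
)

# Later, aggregate every *__perf.csv found under results/ (or its subdirectories) into one table:
perf_tb = MyExp.gen_perf_report_for_multip_exps(indir="results")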