halib 0.1.55__tar.gz → 0.1.57__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {halib-0.1.55 → halib-0.1.57}/PKG-INFO +5 -1
  2. {halib-0.1.55 → halib-0.1.57}/README.md +4 -0
  3. halib-0.1.57/halib/research/perfcalc.py +314 -0
  4. {halib-0.1.55 → halib-0.1.57}/halib.egg-info/PKG-INFO +5 -1
  5. {halib-0.1.55 → halib-0.1.57}/halib.egg-info/SOURCES.txt +1 -0
  6. {halib-0.1.55 → halib-0.1.57}/setup.py +1 -1
  7. {halib-0.1.55 → halib-0.1.57}/.gitignore +0 -0
  8. {halib-0.1.55 → halib-0.1.57}/GDriveFolder.txt +0 -0
  9. {halib-0.1.55 → halib-0.1.57}/LICENSE.txt +0 -0
  10. {halib-0.1.55 → halib-0.1.57}/MANIFEST.in +0 -0
  11. {halib-0.1.55 → halib-0.1.57}/guide_publish_pip.pdf +0 -0
  12. {halib-0.1.55 → halib-0.1.57}/halib/__init__.py +0 -0
  13. {halib-0.1.55 → halib-0.1.57}/halib/common.py +0 -0
  14. {halib-0.1.55 → halib-0.1.57}/halib/cuda.py +0 -0
  15. {halib-0.1.55 → halib-0.1.57}/halib/filetype/__init__.py +0 -0
  16. {halib-0.1.55 → halib-0.1.57}/halib/filetype/csvfile.py +0 -0
  17. {halib-0.1.55 → halib-0.1.57}/halib/filetype/jsonfile.py +0 -0
  18. {halib-0.1.55 → halib-0.1.57}/halib/filetype/textfile.py +0 -0
  19. {halib-0.1.55 → halib-0.1.57}/halib/filetype/videofile.py +0 -0
  20. {halib-0.1.55 → halib-0.1.57}/halib/filetype/yamlfile.py +0 -0
  21. {halib-0.1.55 → halib-0.1.57}/halib/online/__init__.py +0 -0
  22. {halib-0.1.55 → halib-0.1.57}/halib/online/gdrive.py +0 -0
  23. {halib-0.1.55 → halib-0.1.57}/halib/online/gdrive_mkdir.py +0 -0
  24. {halib-0.1.55 → halib-0.1.57}/halib/online/gdrive_test.py +0 -0
  25. {halib-0.1.55 → halib-0.1.57}/halib/online/projectmake.py +0 -0
  26. {halib-0.1.55 → halib-0.1.57}/halib/research/__init__.py +0 -0
  27. {halib-0.1.55 → halib-0.1.57}/halib/research/dataset.py +0 -0
  28. {halib-0.1.55 → halib-0.1.57}/halib/research/perftb.py +0 -0
  29. {halib-0.1.55 → halib-0.1.57}/halib/research/plot.py +0 -0
  30. {halib-0.1.55 → halib-0.1.57}/halib/research/torchloader.py +0 -0
  31. {halib-0.1.55 → halib-0.1.57}/halib/research/wandb_op.py +0 -0
  32. {halib-0.1.55 → halib-0.1.57}/halib/rich_color.py +0 -0
  33. {halib-0.1.55 → halib-0.1.57}/halib/system/__init__.py +0 -0
  34. {halib-0.1.55 → halib-0.1.57}/halib/system/cmd.py +0 -0
  35. {halib-0.1.55 → halib-0.1.57}/halib/system/filesys.py +0 -0
  36. {halib-0.1.55 → halib-0.1.57}/halib/utils/__init__.py +0 -0
  37. {halib-0.1.55 → halib-0.1.57}/halib/utils/dataclass_util.py +0 -0
  38. {halib-0.1.55 → halib-0.1.57}/halib/utils/listop.py +0 -0
  39. {halib-0.1.55 → halib-0.1.57}/halib/utils/tele_noti.py +0 -0
  40. {halib-0.1.55 → halib-0.1.57}/halib.egg-info/dependency_links.txt +0 -0
  41. {halib-0.1.55 → halib-0.1.57}/halib.egg-info/requires.txt +0 -0
  42. {halib-0.1.55 → halib-0.1.57}/halib.egg-info/top_level.txt +0 -0
  43. {halib-0.1.55 → halib-0.1.57}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: halib
- Version: 0.1.55
+ Version: 0.1.57
  Summary: Small library for common tasks
  Author: Hoang Van Ha
  Author-email: hoangvanhauit@gmail.com
@@ -15,6 +15,10 @@ License-File: LICENSE.txt
 
  Helper package for coding and automation
 
+ **Version 0.1.57**
+
+ + add `research/perfcalc`: an abstract base class for performance calculation. This class needs to be inherited and implemented with the specific performance calculation logic.
+
  **Version 0.1.55**
 
  + add `util/dataclass_util` to help dynamically create `dataclass` classes from a dictionary or YAML file, including support for nested dataclasses. From there, we can use `dataclass_wizard` to create a list of `dataclass` classes with help from ChatGPT.
@@ -1,5 +1,9 @@
  Helper package for coding and automation
 
+ **Version 0.1.57**
+
+ + add `research/perfcalc`: an abstract base class for performance calculation. This class needs to be inherited and implemented with the specific performance calculation logic.
+
  **Version 0.1.55**
 
  + add `util/dataclass_util` to help dynamically create `dataclass` classes from a dictionary or YAML file, including support for nested dataclasses. From there, we can use `dataclass_wizard` to create a list of `dataclass` classes with help from ChatGPT.
@@ -0,0 +1,314 @@
+ import os
+ import glob
+ import inspect
+ import pandas as pd
+
+ from typing import Dict
+ from functools import wraps
+ from rich.pretty import pprint
+
+ from abc import ABC, abstractmethod
+
+ from ..filetype import csvfile
+ from ..common import now_str
+ from ..research.perftb import PerfTB
+
+ # try to import torch and torchmetrics
+ try:
+     import torch
+     import torchmetrics
+     from torchmetrics import Metric
+ except ImportError:
+     raise ImportError("Please install torch and torchmetrics to use this module.")
+
+
+ def validate_torch_metrics(fn):
+     @wraps(fn)
+     def wrapper(self, *args, **kwargs):
+         result = fn(self, *args, **kwargs)
+
+         if not isinstance(result, dict):
+             raise TypeError("torch_metrics() must return a dictionary")
+
+         for k, v in result.items():
+             if not isinstance(k, str):
+                 raise TypeError(f"Key '{k}' is not a string")
+             if not isinstance(v, Metric):
+                 raise TypeError(
+                     f"Value for key '{k}' is not a torchmetrics.Metric (got {type(v).__name__})"
+                 )
+
+         return result
+
+     return wrapper
+
+
+ def valid_custom_fields(fn):
+     @wraps(fn)
+     def wrapper(self, *args, **kwargs):
+         rs = fn(self, *args, **kwargs)
+         if not isinstance(rs, tuple) or len(rs) != 2:
+             raise ValueError("Function must return a tuple (outdict, custom_fields)")
+         outdict, custom_fields = rs
+         if not isinstance(outdict, dict):
+             raise TypeError("Output must be a dictionary")
+         if not isinstance(custom_fields, list):
+             raise TypeError("Custom fields must be a list")
+         for field in custom_fields:
+             if not isinstance(field, str):
+                 raise TypeError(f"Custom field '{field}' is not a string")
+         return outdict, custom_fields
+
+     return wrapper
+
+
+ REQUIRED_COLS = ["experiment", "dataset"]
+ CSV_FILE_POSTFIX = "__perf.csv"
+
+
+ class PerfCalc(ABC):  # Abstract base class for performance calculation
+     @abstractmethod
+     def get_exp_torch_metrics(self):
+         """
+         Return a dictionary of torchmetrics to be used for performance calculation.
+         Example: {"accuracy": Accuracy(), "precision": Precision()}
+         """
+         pass
+
+     @abstractmethod
+     def get_dataset_name(self):
+         """
+         Return the name of the dataset.
+         This function should be overridden by the subclass if needed.
+         """
+         pass
+
+     @abstractmethod
+     def prepare_exp_data_for_metrics(self, metric_names, *args, **kwargs):
+         """
+         Prepare the data for the metrics.
+         This function should be overridden by the subclass if needed.
+         Must return a dictionary with keys as metric names and values as the data to be used for those metrics.
+         NOTE: the data for each metric must be in the format expected by the torchmetrics instance for that metric,
+         e.g. {"accuracy": {"preds": [...], "target": [...]}, ...}, since torchmetrics expects the data in a specific format.
+         """
+         pass
+
+     @abstractmethod
+     def get_experiment_name(self):
+         """
+         Return the name of the experiment.
+         This function should be overridden by the subclass if needed.
+         """
+         pass
+
+     def calc_exp_outdict_custom_fields(self, outdict, *args, **kwargs):
+         """Can be overridden by the subclass to add custom fields to the output dictionary.
+         ! Must return the modified outdict and an ordered list of the custom fields added to it.
+         """
+         return outdict, []
+
+     #! custom kwargs:
+     #! outfile   - if provided, save the output to a CSV file at the given path
+     #! outdir    - if provided, save the output to a CSV file in the given directory with a generated filename
+     #! return_df - if True, return a DataFrame instead of a dictionary
+     def calculate_exp_perf_metrics(self, *args, **kwargs):
+         """
+         Calculate the performance metrics for this experiment.
+         Returns (out_dict, csv_outfile), or (df, csv_outfile) if return_df=True.
+         """
+         metric_names = list(self.get_exp_torch_metrics().keys())
+         out_dict = {metric: None for metric in metric_names}
+         out_dict["dataset"] = self.get_dataset_name()
+         out_dict["experiment"] = self.get_experiment_name()
+         out_dict, custom_fields = self.calc_exp_outdict_custom_fields(
+             outdict=out_dict, *args, **kwargs
+         )
+         torch_metrics_dict = self.get_exp_torch_metrics()
+         all_metric_data = self.prepare_exp_data_for_metrics(
+             metric_names, *args, **kwargs
+         )
+         for metric in metric_names:
+             if metric not in all_metric_data:
+                 raise ValueError(f"Metric '{metric}' not found in provided data.")
+             tmetric = torch_metrics_dict[metric]  # torchmetrics instance
+             metric_data = all_metric_data[metric]  # should be a dict of args/kwargs
+             # Inspect the expected parameters of the metric's update() method
+             sig = inspect.signature(tmetric.update)
+             expected_args = list(sig.parameters.values())
+             # Prepare the arguments in the correct order
+             if isinstance(metric_data, dict):
+                 # Match dict keys to parameter names
+                 metric_args = [metric_data[param.name] for param in expected_args]
+             elif isinstance(metric_data, (list, tuple)):
+                 metric_args = list(metric_data)
+             else:
+                 raise TypeError(f"Unsupported data format for metric '{metric}'")
+             # Call update and compute
+             tmetric.update(*metric_args)
+             computed_value = tmetric.compute()
+             # Ensure the computed value is converted to a scalar or a plain list
+             if isinstance(computed_value, torch.Tensor):
+                 if computed_value.numel() == 1:
+                     computed_value = computed_value.item()
+                 else:
+                     computed_value = computed_value.tolist()
+             out_dict[metric] = computed_value
+
+         # check whether a kwarg named "outfile" or "outdir" was provided
+         csv_outfile = kwargs.get("outfile", None)
+         if csv_outfile is not None:
+             # get the file name without the extension
+             filepath_no_ext, _ = os.path.splitext(csv_outfile)
+             # add the postfix to the file name (CSV_FILE_POSTFIX already ends with ".csv")
+             csvfilename = f"{now_str()}_{os.path.basename(filepath_no_ext)}{CSV_FILE_POSTFIX}"
+             csv_outfile = os.path.join(os.path.dirname(csv_outfile), csvfilename)
+         elif "outdir" in kwargs:
+             csvoutdir = kwargs["outdir"]
+             csvfilename = f"{now_str()}_{self.get_dataset_name()}_{self.get_experiment_name()}{CSV_FILE_POSTFIX}"
+             csv_outfile = os.path.join(csvoutdir, csvfilename)
+
+         # convert out_dict to a DataFrame
+         df = pd.DataFrame([out_dict])
+         ordered_cols = REQUIRED_COLS + custom_fields + metric_names
+         df = df[ordered_cols]  # reorder columns
+
+         if csv_outfile:
+             df.to_csv(csv_outfile, index=False, sep=";", encoding="utf-8")
+         return_df = kwargs.get("return_df", False)
+         if return_df:  # return DataFrame instead of dict if requested
+             return df, csv_outfile
+         else:
+             return out_dict, csv_outfile
+
+     @classmethod
+     def gen_perf_report_for_multip_exps(
+         cls, indir: str, exp_perf_csv_pattern="__perf", csv_sep=";"
+     ) -> PerfTB:
+         """
+         Generate a performance report by scanning experiment subdirectories.
+         Returns a PerfTB performance table aggregating all experiments.
+         """
+         def get_df_for_all_exp_perf(csv_perf_files, csv_sep=";"):
+             """
+             Create a single DataFrame from all CSV files.
+             Assumes the CSV files may have different metric columns.
+             """
+             cols = []
+             for csv_file in csv_perf_files:
+                 temp_df = pd.read_csv(csv_file, sep=csv_sep)
+                 temp_df_cols = temp_df.columns.tolist()
+                 for col in temp_df_cols:
+                     if col not in cols:
+                         cols.append(col)
+             df = pd.DataFrame(columns=cols)
+             for csv_file in csv_perf_files:
+                 temp_df = pd.read_csv(csv_file, sep=csv_sep)
+                 # Drop all-NA columns to avoid dtype inconsistency
+                 temp_df = temp_df.dropna(axis=1, how="all")
+                 # ensure all columns are present in the final DataFrame
+                 for col in cols:
+                     if col not in temp_df.columns:
+                         temp_df[col] = None  # fill missing columns with None
+                 df = pd.concat([df, temp_df], ignore_index=True)
+             # assert that REQUIRED_COLS are present in the DataFrame
+             pprint(df.columns.tolist())
+             for col in REQUIRED_COLS:
+                 if col not in df.columns:
+                     raise ValueError(
+                         f"Required column '{col}' is missing from the DataFrame. REQUIRED_COLS = {REQUIRED_COLS}"
+                     )
+             metric_cols = [col for col in df.columns if col.startswith("metric_")]
+             assert len(metric_cols) > 0, (
+                 "No metric columns found in the DataFrame. "
+                 "Ensure that the CSV files contain metric columns starting with 'metric_'."
+             )
+             final_cols = REQUIRED_COLS + metric_cols
+             df = df[final_cols]
+             # ! validate all rows in df before returning:
+             # make sure every row has values for REQUIRED_COLS and at least one metric column
+             for index, row in df.iterrows():
+                 if not all(col in row and pd.notna(row[col]) for col in REQUIRED_COLS):
+                     raise ValueError(
+                         f"Row {index} is missing required columns or has NaN values in required columns: {row}"
+                     )
+                 if not any(pd.notna(row[col]) for col in metric_cols):
+                     raise ValueError(f"Row {index} has no metric values: {row}")
+             # make sure there is no duplicated (experiment, dataset) pair
+             duplicates = df.duplicated(subset=["experiment", "dataset"], keep=False)
+             if duplicates.any():
+                 raise ValueError(
+                     "Duplicate (experiment, dataset) pairs found in the DataFrame. "
+                     "Please ensure that each experiment-dataset combination is unique."
+                 )
+             return df
+
+         def mk_perftb_report(df):
+             """
+             Create a performance report table from the DataFrame.
+             This function should be customized based on the specific requirements of the report.
+             """
+             perftb = PerfTB()
+             # find all unique "dataset" values
+             dataset_names = list(df["dataset"].unique())
+             # find all columns that start with "metric_"
+             metric_cols = [col for col in df.columns if col.startswith("metric_")]
+
+             # Determine which metrics are associated with each dataset.
+             # A dataset may appear in multiple rows, and each row may not include all metrics,
+             # so pick the row for that dataset with the most non-NaN metric values; its non-NaN
+             # metrics define the metric set for the dataset.
+             dataset_metrics = {}
+             for dataset_name in dataset_names:
+                 dataset_rows = df[df["dataset"] == dataset_name]
+                 # Find the row with the most non-NaN metric values
+                 max_non_nan_row = dataset_rows[metric_cols].count(axis=1).idxmax()
+                 metrics_for_dataset = (
+                     dataset_rows.loc[max_non_nan_row, metric_cols].dropna().index.tolist()
+                 )
+                 dataset_metrics[dataset_name] = metrics_for_dataset
+
+             for dataset_name, metrics in dataset_metrics.items():
+                 # Register the dataset and its metric columns in the performance table
+                 perftb.add_dataset(dataset_name, metrics)
+
+             for _, row in df.iterrows():
+                 dataset_name = row["dataset"]
+                 ds_metrics = dataset_metrics.get(dataset_name)
+                 if dataset_name in dataset_metrics:
+                     # Add this row's metric values to the performance table
+                     exp_name = row.get("experiment")
+                     exp_metric_values = {}
+                     for metric in ds_metrics:
+                         if metric in row and pd.notna(row[metric]):
+                             exp_metric_values[metric] = row[metric]
+                     perftb.add_experiment(
+                         experiment_name=exp_name,
+                         dataset_name=dataset_name,
+                         metrics=exp_metric_values,
+                     )
+
+             return perftb
+
+         assert os.path.exists(indir), f"Input directory {indir} does not exist."
+
+         # Find experiment subdirectories
+         exp_dirs = [
+             os.path.join(indir, d)
+             for d in os.listdir(indir)
+             if os.path.isdir(os.path.join(indir, d))
+         ]
+         assert exp_dirs, f"No experiment directories found in {indir}."
+
+         # Collect all matching CSV files in those subdirectories
+         csv_perf_files = []
+         for exp_dir in exp_dirs:
+             pprint(f"Searching in experiment directory: {exp_dir}")
+             matched = glob.glob(os.path.join(exp_dir, f"*{exp_perf_csv_pattern}*.csv"))
+             csv_perf_files.extend(matched)
+         assert len(csv_perf_files) > 0, (
+             f"No CSV files matching pattern '{exp_perf_csv_pattern}' found in the experiment directories."
+         )
+
+         all_exp_perf_df = get_df_for_all_exp_perf(csv_perf_files, csv_sep=csv_sep)
+         csvfile.fn_display_df(all_exp_perf_df)
+         perf_tb = mk_perftb_report(all_exp_perf_df)
+         return perf_tb
+
+
+ def main():
+     indir = "./zreport/test"
+     report_outfile = "./zreport/all.csv"
+     exp_perf_csv_pattern = "__perf"
+     csv_sep = ";"
+     perftb = PerfCalc.gen_perf_report_for_multip_exps(
+         indir, exp_perf_csv_pattern, csv_sep
+     )
+     perftb.to_csv(report_outfile, sep=csv_sep)
+     pprint(perftb)
+     perftb.plot(save_path="./zreport/test_csv.svg", open_plot=True)
+
+
+ if __name__ == "__main__":
+     main()
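
For orientation, here is a minimal sketch (not part of the released package) of how the new PerfCalc class might be subclassed, assuming torchmetrics' MulticlassAccuracy and toy prediction/target tensors; the subclass name, dataset/experiment names, and data are hypothetical. The metric key is prefixed with `metric_` because `gen_perf_report_for_multip_exps` only treats CSV columns starting with `metric_` as metric columns when building the aggregated report.

import torch
from torchmetrics.classification import MulticlassAccuracy

from halib.research.perfcalc import PerfCalc


class ToyExpPerf(PerfCalc):
    """Hypothetical subclass used only to illustrate the PerfCalc contract."""

    def get_exp_torch_metrics(self):
        # keys become column names; the "metric_" prefix lets the report step find them
        return {"metric_accuracy": MulticlassAccuracy(num_classes=3)}

    def get_dataset_name(self):
        return "toy_dataset"

    def get_experiment_name(self):
        return "exp_baseline"

    def prepare_exp_data_for_metrics(self, metric_names, *args, **kwargs):
        # each metric maps to the kwargs expected by its torchmetrics update() method
        preds = torch.tensor([0, 2, 1, 2])
        target = torch.tensor([0, 1, 1, 2])
        return {"metric_accuracy": {"preds": preds, "target": target}}


if __name__ == "__main__":
    # no outfile/outdir given, so nothing is written; the metrics dict and None are returned
    out_dict, csv_path = ToyExpPerf().calculate_exp_perf_metrics()
    print(out_dict, csv_path)

Passing `outdir="some/experiment/dir"` instead would write a `*__perf.csv` file there (its name prefixed with `now_str()`), which `PerfCalc.gen_perf_report_for_multip_exps` can later aggregate into a PerfTB when scanning experiment subdirectories.
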
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: halib
- Version: 0.1.55
+ Version: 0.1.57
  Summary: Small library for common tasks
  Author: Hoang Van Ha
  Author-email: hoangvanhauit@gmail.com
@@ -15,6 +15,10 @@ License-File: LICENSE.txt
 
  Helper package for coding and automation
 
+ **Version 0.1.57**
+
+ + add `research/perfcalc`: an abstract base class for performance calculation. This class needs to be inherited and implemented with the specific performance calculation logic.
+
  **Version 0.1.55**
 
  + add `util/dataclass_util` to help dynamically create `dataclass` classes from a dictionary or YAML file, including support for nested dataclasses. From there, we can use `dataclass_wizard` to create a list of `dataclass` classes with help from ChatGPT.
@@ -27,6 +27,7 @@ halib/online/gdrive_test.py
  halib/online/projectmake.py
  halib/research/__init__.py
  halib/research/dataset.py
+ halib/research/perfcalc.py
  halib/research/perftb.py
  halib/research/plot.py
  halib/research/torchloader.py
@@ -8,7 +8,7 @@ with open("requirements.txt") as f:
 
  setuptools.setup(
      name="halib",
-     version="0.1.55",
+     version="0.1.57",
      author="Hoang Van Ha",
      author_email="hoangvanhauit@gmail.com",
      description="Small library for common tasks",