halib-0.1.47-py3-none-any.whl → halib-0.1.48-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
halib/research/benchquery.py ADDED
@@ -0,0 +1,131 @@
+ import pandas as pd
+ from rich.pretty import pprint
+ from argparse import ArgumentParser
+
+ def cols_to_col_groups(df):
+     columns = list(df.columns)
+     # pprint(columns)
+
+     col_groups = []
+     current_group = []
+
+     def have_unnamed(col_group):
+         return any("unnamed" in col.lower() for col in col_group)
+
+     for i, col in enumerate(columns):
+         # Add the first column to the current group
+         if not current_group:
+             current_group.append(col)
+             continue
+
+         prev_col = columns[i - 1]
+         # Check if current column is "unnamed" or shares base name with previous
+         # Assuming "equal" means same base name (before any suffix like '_1')
+         base_prev = (
+             prev_col.split("_")[0].lower() if "_" in prev_col else prev_col.lower()
+         )
+         base_col = col.split("_")[0].lower() if "_" in col else col.lower()
+         is_unnamed = "unnamed" in col.lower()
+         is_equal = base_col == base_prev
+
+         if is_unnamed or is_equal:
+             # Add to current group
+             current_group.append(col)
+         else:
+             # Start a new group
+             col_groups.append(current_group)
+             current_group = [col]
+     # Append the last group
+     if current_group:
+         col_groups.append(current_group)
+     meta_dict = {"common_cols": [], "db_cols": []}
+     for group in col_groups:
+         if not have_unnamed(group):
+             meta_dict["common_cols"].extend(group)
+         else:
+             # find the first named (non-"unnamed") column in the group
+             named_col = next(
+                 (col for col in group if "unnamed" not in col.lower()), None
+             )
+             group_cols = [f"{named_col}_{i}" for i in range(len(group))]
+             meta_dict["db_cols"].extend(group_cols)
+     return meta_dict
+
+ # def bech_by_db_name(df, db_list="db1, db2", key_metrics="p, r, f1, acc"):
+
+
+ def str_2_list(input_str, sep=","):
+     out_ls = []
+     if len(input_str.strip()) == 0:
+         return out_ls
+     if sep not in input_str:
+         out_ls.append(input_str.strip())
+         return out_ls
+     else:
+         out_ls = [item.strip() for item in input_str.split(sep) if item.strip()]
+     return out_ls
+
+
+ def filter_bech_df_by_db_and_metrics(df, db_list="", key_metrics=""):
+     meta_cols_dict = cols_to_col_groups(df)
+     op_df = df.copy()
+     op_df.columns = (
+         meta_cols_dict["common_cols"].copy() + meta_cols_dict["db_cols"].copy()
+     )
+     filterd_cols = []
+     filterd_cols.extend(meta_cols_dict["common_cols"])
+
+     selected_db_list = str_2_list(db_list)
+     db_filted_cols = []
+     if len(selected_db_list) > 0:
+         for db_name in db_list.split(","):
+             db_name = db_name.strip()
+             for col_name in meta_cols_dict["db_cols"]:
+                 if db_name.lower() in col_name.lower():
+                     db_filted_cols.append(col_name)
+     else:
+         db_filted_cols = meta_cols_dict["db_cols"]
+
+     filterd_cols.extend(db_filted_cols)
+     df_filtered = op_df[filterd_cols].copy()
+     df_filtered  # no-op; leftover debugging expression
+
+     selected_metrics_ls = str_2_list(key_metrics)
+     if len(selected_metrics_ls) > 0:
+         # the first data row (second row of the CSV) holds the metric names
+         metrics_row = df_filtered.iloc[0].copy()
+         # only get the values in the db_filted_cols columns
+         metrics_values = metrics_row[db_filted_cols].values
+         keep_metrics_cols = []
+         # create a zip of db_filted_cols and metrics_values (in that metrics_row)
+         metrics_list = list(zip(metrics_values, db_filted_cols))
+         selected_metrics_ls = [metric.strip().lower() for metric in selected_metrics_ls]
+         for metric, col_name in metrics_list:
+             if metric.lower() in selected_metrics_ls:
+                 keep_metrics_cols.append(col_name)
+
+     else:
+         pprint("No metrics selected, keeping all db columns")
+         keep_metrics_cols = db_filted_cols
+
+     final_filterd_cols = meta_cols_dict["common_cols"].copy() + keep_metrics_cols
+     df_final = df_filtered[final_filterd_cols].copy()
+     return df_final
+
+
+ def parse_args():
+     parser = ArgumentParser(description="desc text")
+     parser.add_argument('-csv', '--csv', type=str, help='CSV file path', default=r"E:\Dev\__halib\test\bench.csv")
+     return parser.parse_args()
+
+
+ def main():
+     args = parse_args()
+     csv_file = args.csv
+     df = pd.read_csv(csv_file, sep=";", encoding="utf-8")
+     filtered_df = filter_bech_df_by_db_and_metrics(df, "bowfire", "acc")
+     print(filtered_df)
+
+ if __name__ == "__main__":
+     main()
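
The grouping logic above keys off pandas' auto-generated "Unnamed: N" headers, which appear when a benchmark CSV has a two-row header (dataset names on the first row, metric names on the second). A minimal usage sketch follows, assuming this first added file is importable as halib.research.benchquery (the path listed in the RECORD); the column layout and values are purely illustrative:

import pandas as pd
from halib.research import benchquery  # assumed import path, per the RECORD entries below

# Hypothetical benchmark table: dataset names on the header row, metric names on
# the first data row; pandas fills the blank header cells with "Unnamed: N".
df = pd.DataFrame({
    "method":     ["",    "resnet50", "mobilenet"],
    "backbone":   ["",    "r50",      "mbv3"],
    "db1":        ["acc", 0.91,       0.88],
    "Unnamed: 3": ["f1",  0.90,       0.86],
    "db2":        ["acc", 0.79,       0.81],
    "Unnamed: 5": ["f1",  0.77,       0.80],
})

print(benchquery.cols_to_col_groups(df))
# {'common_cols': ['method', 'backbone'], 'db_cols': ['db1_0', 'db1_1', 'db2_0', 'db2_1']}

# Keep only the db1 columns whose metric row says "acc"
print(benchquery.filter_bech_df_by_db_and_metrics(df, db_list="db1", key_metrics="acc"))
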
halib/research/dataset.py ADDED
@@ -0,0 +1,209 @@
+ # This script creates a test version
+ # of the watcam (wc) dataset
+ # for testing the tflite model
+
+ from argparse import ArgumentParser
+
+ from rich import inspect
+ from common import console, seed_everything, ConsoleLog
+ from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
+ from tqdm import tqdm
+ import os
+ import click
+ from torchvision.datasets import ImageFolder
+ import shutil
+ from rich.pretty import pprint
+ from system import filesys as fs
+ import glob
+
+
+ def parse_args():
+     parser = ArgumentParser(description="desc text")
+     parser.add_argument(
+         "-indir",
+         "--indir",
+         type=str,
+         help="original dataset path",
+     )
+     parser.add_argument(
+         "-outdir",
+         "--outdir",
+         type=str,
+         help="dataset out path",
+         default=".",  # default to current dir
+     )
+     parser.add_argument(
+         "-val_size",
+         "--val_size",
+         type=float,
+         help="validation size",
+         default=0.2,
+     )
+     # add using StratifiedShuffleSplit or ShuffleSplit
+     parser.add_argument(
+         "-seed",
+         "--seed",
+         type=int,
+         help="random seed",
+         default=42,
+     )
+     parser.add_argument(
+         "-inplace",
+         "--inplace",
+         action="store_true",
+         help="inplace operation, will overwrite the outdir if exists",
+     )
+
+     parser.add_argument(
+         "-stratified",
+         "--stratified",
+         action="store_true",
+         help="use StratifiedShuffleSplit instead of ShuffleSplit",
+     )
+     parser.add_argument(
+         "-no_train",
+         "--no_train",
+         action="store_true",
+         help="only create test set, no train set",
+     )
+     parser.add_argument(
+         "-reverse",
+         "--reverse",
+         action="store_true",
+         help="combine train and val set back to original dataset",
+     )
+     return parser.parse_args()
+
+
+ def move_images(image_paths, target_set_dir):
+     for img_path in tqdm(image_paths):
+         # get folder name of the image
+         img_dir = os.path.dirname(img_path)
+         out_cls_dir = os.path.join(target_set_dir, os.path.basename(img_dir))
+         if not os.path.exists(out_cls_dir):
+             os.makedirs(out_cls_dir)
+         # move the image to the class folder
+         shutil.move(img_path, out_cls_dir)
+
+
+ def split_dataset_cls(
+     indir, outdir, val_size, seed, inplace, stratified_split, no_train
+ ):
+     seed_everything(seed)
+     console.rule("Config confirm?")
+     pprint(locals())
+     click.confirm("Continue?", abort=True)
+     assert os.path.exists(indir), f"{indir} does not exist"
+
+     if not inplace:
+         assert (not inplace) and (
+             not os.path.exists(outdir)
+         ), f"{outdir} already exists; SKIP ...."
+
+     if inplace:
+         outdir = indir
+     if not os.path.exists(outdir):
+         os.makedirs(outdir)
+
+     console.rule("Creating train/val dataset")
+
+     sss = (
+         ShuffleSplit(n_splits=1, test_size=val_size)
+         if not stratified_split
+         else StratifiedShuffleSplit(n_splits=1, test_size=val_size)
+     )
+
+     pprint({"split strategy": sss, "indir": indir, "outdir": outdir})
+     dataset = ImageFolder(
+         root=indir,
+         transform=None,
+     )
+     train_dataset_indices = None
+     val_dataset_indices = None  # val here means test
+     for train_indices, val_indices in sss.split(dataset.samples, dataset.targets):
+         train_dataset_indices = train_indices
+         val_dataset_indices = val_indices
+
+     # get image paths for train/val split dataset
+     train_image_paths = [dataset.imgs[i][0] for i in train_dataset_indices]
+     val_image_paths = [dataset.imgs[i][0] for i in val_dataset_indices]
+
+     # start creating train/val folders then move images
+     out_train_dir = os.path.join(outdir, "train")
+     out_val_dir = os.path.join(outdir, "val")
+     if inplace:
+         assert not os.path.exists(out_train_dir), f"{out_train_dir} already exists"
+         assert not os.path.exists(out_val_dir), f"{out_val_dir} already exists"
+
+     os.makedirs(out_train_dir)
+     os.makedirs(out_val_dir)
+
+     if not no_train:
+         with ConsoleLog(f"Moving train images to {out_train_dir} "):
+             move_images(train_image_paths, out_train_dir)
+     else:
+         pprint("test only, skip moving train images")
+         # remove out_train_dir
+         shutil.rmtree(out_train_dir)
+
+     with ConsoleLog(f"Moving val images to {out_val_dir} "):
+         move_images(val_image_paths, out_val_dir)
+
+     if inplace:
+         pprint("remove all folders, except train and val")
+         for cls_dir in os.listdir(outdir):
+             if cls_dir not in ["train", "val"]:
+                 shutil.rmtree(os.path.join(indir, cls_dir))
+
+
+ def reverse_split_ds(indir):
+     console.rule(f"Reversing split dataset <{indir}>...")
+     ls_dirs = os.listdir(indir)
+     # make sure there are only two dirs 'train' and 'val'
+     assert len(ls_dirs) == 2, f"Expected exactly 2 dirs (train, val), found {len(ls_dirs)} dirs"
+     assert "train" in ls_dirs, f"train dir not found in {indir}"
+     assert "val" in ls_dirs, f"val dir not found in {indir}"
+     train_dir = os.path.join(indir, "train")
+     val_dir = os.path.join(indir, "val")
+     all_train_files = fs.filter_files_by_extension(
+         train_dir, ["jpg", "jpeg", "png", "bmp", "gif", "tiff"]
+     )
+     all_val_files = fs.filter_files_by_extension(
+         val_dir, ["jpg", "jpeg", "png", "bmp", "gif", "tiff"]
+     )
+     # move all files from train to indir
+     with ConsoleLog(f"Moving train images to {indir} "):
+         move_images(all_train_files, indir)
+     with ConsoleLog(f"Moving val images to {indir} "):
+         move_images(all_val_files, indir)
+     with ConsoleLog("Removing train and val dirs"):
+         # remove train and val dirs
+         shutil.rmtree(train_dir)
+         shutil.rmtree(val_dir)
+
+
+ def main():
+     args = parse_args()
+     indir = args.indir
+     outdir = args.outdir
+     if outdir == ".":
+         # get current folder of the indir
+         indir_parent_dir = os.path.dirname(os.path.normpath(indir))
+         indir_name = os.path.basename(indir)
+         outdir = os.path.join(indir_parent_dir, f"{indir_name}_split")
+     val_size = args.val_size
+     seed = args.seed
+     inplace = args.inplace
+     stratified_split = args.stratified
+     no_train = args.no_train
+     reverse = args.reverse
+     if not reverse:
+         split_dataset_cls(
+             indir, outdir, val_size, seed, inplace, stratified_split, no_train
+         )
+     else:
+         reverse_split_ds(indir)
+
+
+ if __name__ == "__main__":
+     main()
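
The split itself is plain scikit-learn index splitting over ImageFolder.samples/ImageFolder.targets; a small self-contained sketch of that step, using dummy labels in place of a real image folder:

from sklearn.model_selection import StratifiedShuffleSplit

# Stand-ins for ImageFolder.targets (one class label per image) and sample indices.
targets = [0] * 8 + [1] * 8
samples = list(range(len(targets)))

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(samples, targets))
print(len(train_idx), len(val_idx))          # 12 train / 4 val
print(sorted(targets[i] for i in val_idx))   # [0, 0, 1, 1] -> class ratio preserved
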
halib/research/plot.py ADDED
@@ -0,0 +1,301 @@
+ from ..common import now_str, norm_str, ConsoleLog
+ from ..filetype import csvfile
+ from ..system import filesys as fs
+ from functools import partial
+ from rich.console import Console
+ from rich.pretty import pprint
+ import click
+ import csv
+ import matplotlib
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import os
+ import pandas as pd
+ import seaborn as sns
+
+
+ console = Console()
+ desktop_path = os.path.expanduser("~/Desktop")
+ REQUIRED_COLUMNS = ["epoch", "train_loss", "val_loss", "train_acc", "val_acc"]
+
+ import csv
+
+
+ def get_delimiter(file_path, bytes=4096):
+     sniffer = csv.Sniffer()
+     data = open(file_path, "r").read(bytes)
+     delimiter = sniffer.sniff(data).delimiter
+     return delimiter
+
+
+ # Function to verify that the DataFrame has the required columns, and only the required columns
+ def verify_csv(csv_file, required_columns=REQUIRED_COLUMNS):
+     delimiter = get_delimiter(csv_file)
+     df = pd.read_csv(csv_file, sep=delimiter)
+     # change the column names to lower case
+     df.columns = [col.lower() for col in df.columns]
+     for col in required_columns:
+         if col not in df.columns:
+             raise ValueError(
+                 f"Required columns are: {REQUIRED_COLUMNS}, but found {df.columns}"
+             )
+     df = df[required_columns].copy()
+     return df
+
+
+ def get_valid_tags(csv_files, tags):
+     if tags is not None and len(tags) > 0:
+         assert all(
+             isinstance(tag, str) for tag in tags
+         ), "tags must be a list of strings"
+         assert all(
+             len(tag) > 0 for tag in tags
+         ), "tags must be a list of non-empty strings"
+         valid_tags = tags
+     else:
+         valid_tags = []
+         for csv_file in csv_files:
+             file_name = fs.get_file_name(csv_file, split_file_ext=True)[0]
+             tag = norm_str(file_name)
+             valid_tags.append(tag)
+     return valid_tags
+
+
+ def plot_ax(df, ax, metric="loss", tag=""):
+     pprint(locals())
+     # reset plt
+     assert metric in ["loss", "acc"], "metric must be either 'loss' or 'acc'"
+     part = ["train", "val"]
+     for p in part:
+         label = f"{tag}_{p}_{metric}"
+         ax.plot(df["epoch"], df[f"{p}_{metric}"], label=label)
+     return ax
+
+
+ def actual_plot_seaborn(frame, csv_files, axes, tags, log):
+     # clear the axes
+     for ax in axes:
+         ax.clear()
+     ls_df = []
+     valid_tags = get_valid_tags(csv_files, tags)
+     for csv_file in csv_files:
+         df = verify_csv(csv_file)
+         if log:
+             with ConsoleLog(f"plotting {csv_file}"):
+                 csvfile.fn_display_df(df)
+         ls_df.append(df)
+
+     ls_metrics = ["loss", "acc"]
+     for df_item, tag in zip(ls_df, valid_tags):
+         # add tag to columns, except epoch
+         df_item.columns = [
+             f"{tag}_{col}" if col != "epoch" else col for col in df_item.columns
+         ]
+     # merge the dataframes on the epoch column
+     df_combined = ls_df[0]
+     for df_item in ls_df[1:]:
+         df_combined = pd.merge(df_combined, df_item, on="epoch", how="outer")
+     # csvfile.fn_display_df(df_combined)
+
+     for i, metric in enumerate(ls_metrics):
+         tags_str = "+".join(valid_tags) if len(valid_tags) > 1 else valid_tags[0]
+         title = f"{tags_str}_{metric}-by-epoch"
+         cols = [col for col in df_combined.columns if col != "epoch" and metric in col]
+         cols = sorted(cols)
+         # pprint(cols)
+         plot_data = df_combined[cols]
+
+         # lines from the same csv file (same tag) should have the same marker
+         all_markers = [
+             marker for marker in plt.Line2D.markers if marker and marker != " "
+         ]
+         tag2marker = {tag: marker for tag, marker in zip(valid_tags, all_markers)}
+         plot_markers = []
+         for col in cols:
+             # find the tag:
+             tag = None
+             for valid_tag in valid_tags:
+                 if valid_tag in col:
+                     tag = valid_tag
+                     break
+             plot_markers.append(tag2marker[tag])
+         # pprint(list(zip(cols, plot_markers)))
+
+         # create color
+         sequential_palettes = [
+             "Reds",
+             "Greens",
+             "Blues",
+             "Oranges",
+             "Purples",
+             "Greys",
+             "BuGn",
+             "BuPu",
+             "GnBu",
+             "OrRd",
+             "PuBu",
+             "PuRd",
+             "RdPu",
+             "YlGn",
+             "PuBuGn",
+             "YlGnBu",
+             "YlOrBr",
+             "YlOrRd",
+         ]
+         # each csvfile (tag) should have a unique color
+         tag2palette = {
+             tag: palette for tag, palette in zip(valid_tags, sequential_palettes)
+         }
+         plot_colors = []
+         for tag in valid_tags:
+             palette = tag2palette[tag]
+             total_colors = 10
+             ls_colors = sns.color_palette(palette, total_colors).as_hex()
+             num_part = len(ls_metrics)
+             subarr = np.array_split(np.arange(total_colors), num_part)
+             for idx, col in enumerate(cols):
+                 if tag in col:
+                     chosen_color = ls_colors[
+                         subarr[int(idx % num_part)].mean().astype(int)
+                     ]
+                     plot_colors.append(chosen_color)
+
+         # pprint(list(zip(cols, plot_colors)))
+         sns.lineplot(
+             data=plot_data,
+             markers=plot_markers,
+             palette=plot_colors,
+             ax=axes[i],
+             dashes=False,
+         )
+         axes[i].set(xlabel="epoch", ylabel=metric, title=title)
+         axes[i].legend()
+         axes[i].grid()
+
+
+ def actual_plot(frame, csv_files, axes, tags, log):
+     ls_df = []
+     valid_tags = get_valid_tags(csv_files, tags)
+     for csv_file in csv_files:
+         df = verify_csv(csv_file)
+         if log:
+             with ConsoleLog(f"plotting {csv_file}"):
+                 csvfile.fn_display_df(df)
+         ls_df.append(df)
+
+     metric_values = ["loss", "acc"]
+     for i, metric in enumerate(metric_values):
+         for df_item, tag in zip(ls_df, valid_tags):
+             metric_ax = plot_ax(df_item, axes[i], metric, tag)
+
+         # set the title, xlabel, ylabel, legend, and grid
+         tags_str = "+".join(valid_tags) if len(valid_tags) > 1 else valid_tags[0]
+         metric_ax.set(
+             xlabel="epoch", ylabel=metric, title=f"{tags_str}_{metric}-by-epoch"
+         )
+         metric_ax.legend()
+         metric_ax.grid()
+
+
+ def plot_csv_files(
+     csv_files,
+     outdir="./out/plot",
+     tags=None,
+     log=False,
+     save_fig=False,
+     update_in_min=1,
+ ):
+     # if csv_files is a string, convert it to a list
+     if isinstance(csv_files, str):
+         csv_files = [csv_files]
+     # if tags is a string, convert it to a list
+     if isinstance(tags, str):
+         tags = [tags]
+     valid_tags = get_valid_tags(csv_files, tags)
+     assert len(valid_tags) == len(
+         csv_files
+     ), "Unable to determine tags for each csv file"
+     live_update_in_ms = int(update_in_min * 60 * 1000)
+     fig, axes = plt.subplots(2, 1, figsize=(10, 17))
+     if live_update_in_ms:  # live update in min should be > 0
+         from matplotlib.animation import FuncAnimation
+
+         anim = FuncAnimation(
+             fig,
+             partial(
+                 actual_plot_seaborn, csv_files=csv_files, axes=axes, tags=tags, log=log
+             ),
+             interval=live_update_in_ms,
+             blit=False,
+             cache_frame_data=False,
+         )
+         plt.show()
+     else:
+         actual_plot_seaborn(None, csv_files, axes, tags, log)
+         plt.show()
+
+     if save_fig:
+         os.makedirs(outdir, exist_ok=True)
+         tags_str = "+".join(valid_tags) if len(valid_tags) > 1 else valid_tags[0]
+         tag = f"{now_str()}_{tags_str}"
+         fig.savefig(f"{outdir}/{tag}_plot.png")
+         enable_plot_pgf()
+         fig.savefig(f"{outdir}/{tag}_plot.pdf")
+     if live_update_in_ms:
+         return anim
+
+
+ def enable_plot_pgf():
+     matplotlib.use("pdf")
+     matplotlib.rcParams.update(
+         {
+             "pgf.texsystem": "pdflatex",
+             "font.family": "serif",
+             "text.usetex": True,
+             "pgf.rcfonts": False,
+         }
+     )
+
+
+ def save_fig_latex_pgf(filename, directory="."):
+     enable_plot_pgf()
+     if ".pgf" not in filename:
+         filename = f"{directory}/{filename}.pgf"
+     plt.savefig(filename)
+
+
+ # https://click.palletsprojects.com/en/8.1.x/api/
+ @click.command()
+ @click.option("--csvfiles", "-f", multiple=True, type=str, help="csv files to plot")
+ @click.option(
+     "--outdir",
+     "-o",
+     type=str,
+     help="output directory for the plot",
+     default=str(desktop_path),
+ )
+ @click.option(
+     "--tags", "-t", multiple=True, type=str, help="tags for the csv files", default=[]
+ )
+ @click.option("--log", "-l", is_flag=True, help="log the csv files")
+ @click.option("--save_fig", "-s", is_flag=True, help="save the plot as a file")
+ @click.option(
+     "--update_in_min",
+     "-u",
+     type=float,
+     help="update the plot every x minutes",
+     default=0.0,
+ )
+ def main(
+     csvfiles,
+     outdir,
+     tags,
+     log,
+     save_fig,
+     update_in_min,
+ ):
+     plot_csv_files(list(csvfiles), outdir, list(tags), log, save_fig, update_in_min)
+
+
+ if __name__ == "__main__":
+     main()
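
A hedged usage sketch for plot_csv_files; the two CSV paths below are placeholders and must contain the REQUIRED_COLUMNS (epoch, train_loss, val_loss, train_acc, val_acc):

from halib.research import plot  # module path as listed in the RECORD below

# train_a.csv / train_b.csv are hypothetical training-log files.
plot.plot_csv_files(
    ["train_a.csv", "train_b.csv"],
    outdir="./out/plot",
    tags=["runA", "runB"],
    log=True,             # pretty-print each dataframe before plotting
    save_fig=True,        # writes <timestamp>_runA+runB_plot.png/.pdf into outdir
    update_in_min=0.0,    # 0 disables the FuncAnimation live refresh
)
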
halib/research/torchloader.py ADDED
@@ -0,0 +1,162 @@
+ """
+ * @author Hoang Van-Ha
+ * @email hoangvanhauit@gmail.com
+ * @create date 2024-03-27 15:40:22
+ * @modify date 2024-03-27 15:40:22
+ * @desc this module works as a utility tool for finding the best configuration for the dataloader (num_workers, batch_size, pin_memory, etc.) that fits your hardware.
+ """
+ from argparse import ArgumentParser
+ from ..common import *
+ from ..filetype import csvfile
+ from ..filetype.yamlfile import load_yaml
+ from rich import inspect
+ from torch.utils.data import DataLoader
+ from torchvision import datasets, transforms
+ from tqdm import tqdm
+ from typing import Union
+ import itertools as it  # for cartesian product
+ import os
+ import time
+ import traceback
+
+
+ def parse_args():
+     parser = ArgumentParser(description="desc text")
+     parser.add_argument("-cfg", "--cfg", type=str, help="cfg file for searching")
+     return parser.parse_args()
+
+
+ def get_test_range(cfg: dict, search_item="num_workers"):
+     item_search_cfg = cfg["search_space"].get(search_item, None)
+     if item_search_cfg is None:
+         raise ValueError(f"search_item: {search_item} not found in cfg")
+     if isinstance(item_search_cfg, list):
+         return item_search_cfg
+     elif isinstance(item_search_cfg, dict):
+         if "mode" in item_search_cfg:
+             mode = item_search_cfg["mode"]
+             assert mode in ["range", "list"], f"mode: {mode} not supported"
+             value_in_mode = item_search_cfg.get(mode, None)
+             if value_in_mode is None:
+                 raise ValueError(f"mode<{mode}>: data not found in <{search_item}>")
+             if mode == "range":
+                 assert len(value_in_mode) == 3, "range must have 3 values: start, stop, step"
+                 start = value_in_mode[0]
+                 stop = value_in_mode[1]
+                 step = value_in_mode[2]
+                 return list(range(start, stop, step))
+             elif mode == "list":
+                 return item_search_cfg["list"]
+     else:
+         return [item_search_cfg]  # for int, float, str, bool, etc.
+
+
+ def load_an_batch(loader_iter):
+     start = time.time()
+     next(loader_iter)
+     end = time.time()
+     return end - start
+
+
+ def test_dataloader_with_cfg(origin_dataloader: DataLoader, cfg: Union[dict, str]):
+     try:
+         if isinstance(cfg, str):
+             cfg = load_yaml(cfg, to_dict=True)
+         dfmk = csvfile.DFCreator()
+         search_items = ["batch_size", "num_workers", "persistent_workers", "pin_memory"]
+         batch_limit = cfg["general"]["batch_limit"]
+         csv_cfg = cfg["general"]["to_csv"]
+         log_batch_info = cfg["general"]["log_batch_info"]
+
+         save_to_csv = csv_cfg["enabled"]
+         log_dir = csv_cfg["log_dir"]
+         filename = csv_cfg["filename"]
+         filename = f"{now_str()}_{filename}.csv"
+         outfile = os.path.join(log_dir, filename)
+
+         dfmk.create_table(
+             "cfg_search",
+             (search_items + ["avg_time_taken"]),
+         )
+         ls_range_test = []
+         for item in search_items:
+             range_test = get_test_range(cfg, search_item=item)
+             range_test = [(item, i) for i in range_test]
+             ls_range_test.append(range_test)
+
+         all_combinations = list(it.product(*ls_range_test))
+
+         rows = []
+         for cfg_idx, combine in enumerate(all_combinations):
+             console.rule(f"Testing cfg {cfg_idx+1}/{len(all_combinations)}")
+             inspect(combine)
+             batch_size = combine[search_items.index("batch_size")][1]
+             num_workers = combine[search_items.index("num_workers")][1]
+             persistent_workers = combine[search_items.index("persistent_workers")][1]
+             pin_memory = combine[search_items.index("pin_memory")][1]
+
+             test_dataloader = DataLoader(origin_dataloader.dataset, batch_size=batch_size, num_workers=num_workers, persistent_workers=persistent_workers, pin_memory=pin_memory, shuffle=True)
+             row = [
+                 batch_size,
+                 num_workers,
+                 persistent_workers,
+                 pin_memory,
+                 0.0,
+             ]
+
+             # calculate the avg time taken to load the data for <batch_limit> batches
+             trainiter = iter(test_dataloader)
+             time_elapsed = 0
+             pprint('Start testing...')
+             for i in tqdm(range(batch_limit)):
+                 single_batch_time = load_an_batch(trainiter)
+                 if log_batch_info:
+                     pprint(f"Batch {i+1} took {single_batch_time:.4f} seconds to load")
+                 time_elapsed += single_batch_time
+             row[-1] = time_elapsed / batch_limit
+             rows.append(row)
+         dfmk.insert_rows('cfg_search', rows)
+         dfmk.fill_table_from_row_pool('cfg_search')
+         with ConsoleLog("results"):
+             csvfile.fn_display_df(dfmk['cfg_search'])
+         if save_to_csv:
+             dfmk["cfg_search"].to_csv(outfile, index=False)
+             console.print(f"[red] Data saved to <{outfile}> [/red]")
+
+     except Exception as e:
+         traceback.print_exc()
+         print(e)
+         # get current directory of this python file
+         current_dir = os.path.dirname(os.path.realpath(__file__))
+         standar_cfg_path = os.path.join(current_dir, "torchloader_search.yaml")
+         pprint(
+             f"Make sure you get the right <cfg.yaml> file. An example of <cfg.yaml> file can be found at this path: {standar_cfg_path}"
+         )
+         return
+
+ def main():
+     args = parse_args()
+     cfg_yaml = args.cfg
+     cfg_dict = load_yaml(cfg_yaml, to_dict=True)
+
+     # Define transforms for data augmentation and normalization
+     transform = transforms.Compose(
+         [
+             transforms.RandomHorizontalFlip(),  # Randomly flip images horizontally
+             transforms.RandomRotation(10),  # Randomly rotate images by 10 degrees
+             transforms.ToTensor(),  # Convert images to PyTorch tensors
+             transforms.Normalize(
+                 (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
+             ),  # Normalize pixel values to [-1, 1]
+         ]
+     )
+     test_dataset = datasets.CIFAR10(
+         root="./data", train=False, download=True, transform=transform
+     )
+     batch_size = 64
+     train_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
+     test_dataloader_with_cfg(train_loader, cfg_dict)
+
+
+ if __name__ == "__main__":
+     main()
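
The YAML passed via -cfg is consumed by get_test_range and test_dataloader_with_cfg; the dict below is an illustrative guess at that shape (the torchloader_search.yaml referenced in the error handler is the authoritative example):

# Illustrative config, matching only the keys read in test_dataloader_with_cfg above.
cfg = {
    "general": {
        "batch_limit": 20,          # batches timed per combination
        "log_batch_info": False,
        "to_csv": {"enabled": True, "log_dir": "./out", "filename": "loader_search"},
    },
    "search_space": {
        "batch_size": {"mode": "list", "list": [32, 64, 128]},
        "num_workers": {"mode": "range", "range": [2, 9, 2]},  # start, stop, step -> 2, 4, 6, 8
        "persistent_workers": [True, False],   # a plain list is used as-is
        "pin_memory": True,                    # a scalar becomes a single-value range
    },
}
# test_dataloader_with_cfg(existing_loader, cfg) would then time every combination
# (3 x 4 x 2 x 1 = 24 DataLoader configurations in this example).
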
halib/utils/listop.py ADDED
@@ -0,0 +1,13 @@
+ def subtract(list_a, list_b):
+     return [item for item in list_a if item not in list_b]
+
+
+ def union(list_a, list_b, no_duplicate=False):
+     if no_duplicate:
+         return list(set(list_a) | set(list_b))
+     else:
+         return list_a + list_b
+
+
+ def intersection(list_a, list_b):
+     return list(set(list_a) & set(list_b))
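
A quick semantics check for these helpers (assumed import path halib.utils.listop, per the RECORD below); note that the set-based union and intersection do not preserve element order:

from halib.utils import listop

print(listop.subtract([1, 2, 3], [2]))                  # [1, 3]
print(listop.union([1, 2], [2, 3]))                     # [1, 2, 2, 3]
print(listop.union([1, 2], [2, 3], no_duplicate=True))  # e.g. [1, 2, 3], order not guaranteed
print(listop.intersection([1, 2, 3], [2, 4]))           # [2]
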
halib/utils/tele_noti.py ADDED
@@ -0,0 +1,166 @@
+ # Watch a log file and send a telegram message when training reaches a certain epoch or ends
+
+ import os
+ import yaml
+ import asyncio
+ import telegram
+ import pandas as pd
+
+ from rich.pretty import pprint
+ from rich.console import Console
+ import plotly.graph_objects as go
+
+ from ..system import filesys as fs
+ from ..filetype import textfile, csvfile
+
+ from argparse import ArgumentParser
+
+ tele_console = Console()
+
+
+ def parse_args():
+     parser = ArgumentParser(description="desc text")
+     parser.add_argument(
+         "-cfg",
+         "--cfg",
+         type=str,
+         help="yaml file for tele",
+         default=r"E:\Dev\halib\cfg_tele_noti.yaml",
+     )
+
+     return parser.parse_args()
+
+
+ def get_watcher_message_df(target_file, num_last_lines):
+     file_ext = fs.get_file_name(target_file, split_file_ext=True)[1]
+     supported_ext = [".txt", ".log", ".csv"]
+     assert (
+         file_ext in supported_ext
+     ), f"File extension {file_ext} not supported. Supported extensions are {supported_ext}"
+     last_lines_df = None
+     if file_ext in [".txt", ".log"]:
+         lines = textfile.read_line_by_line(target_file)
+         if num_last_lines > len(lines):
+             num_last_lines = len(lines)
+         last_line_arr = lines[-num_last_lines:]
+         # add the most recent line that contains the word "epoch"
+         epoch_info_list = "Epoch: n/a"
+         for line in reversed(lines):
+             if "epoch" in line.lower():
+                 epoch_info_list = line
+                 break
+         last_line_arr.insert(0, epoch_info_list)  # insert at the beginning
+         dfCreator = csvfile.DFCreator()
+         dfCreator.create_table("last_lines", ["line"])
+         last_line_arr = [[line] for line in last_line_arr]
+         dfCreator.insert_rows("last_lines", last_line_arr)
+         dfCreator.fill_table_from_row_pool("last_lines")
+         last_lines_df = dfCreator["last_lines"].copy()
+     else:
+         df = pd.read_csv(target_file)
+         num_rows = len(df)
+         if num_last_lines > num_rows:
+             num_last_lines = num_rows
+         last_lines_df = df.tail(num_last_lines)
+     return last_lines_df
+
+
+ def df2img(df: pd.DataFrame, output_img_dir, decimal_places, out_img_scale):
+     df = df.round(decimal_places)
+     fig = go.Figure(
+         data=[
+             go.Table(
+                 header=dict(values=list(df.columns), align="center"),
+                 cells=dict(
+                     values=df.values.transpose(),
+                     fill_color=[["white", "lightgrey"] * df.shape[0]],
+                     align="center",
+                 ),
+             )
+         ]
+     )
+     if not os.path.exists(output_img_dir):
+         os.makedirs(output_img_dir)
+     img_path = os.path.normpath(os.path.join(output_img_dir, "last_lines.png"))
+     fig.write_image(img_path, scale=out_img_scale)
+     return img_path
+
+
+ def compose_message_and_img_path(
+     target_file, project, num_last_lines, decimal_places, out_img_scale, output_img_dir
+ ):
+     context_msg = f">> Project: {project} \n>> File: {target_file} \n>> Last {num_last_lines} lines:"
+     msg_df = get_watcher_message_df(target_file, num_last_lines)
+     try:
+         img_path = df2img(msg_df, output_img_dir, decimal_places, out_img_scale)
+     except Exception as e:
+         pprint(f"Error: {e}")
+         img_path = None
+     return context_msg, img_path
+
+
+ async def send_to_telegram(cfg_dict, interval_in_sec):
+     # pprint(cfg_dict)
+     token = cfg_dict["telegram"]["token"]
+     chat_id = cfg_dict["telegram"]["chat_id"]
+
+     noti_settings = cfg_dict["noti_settings"]
+     project = noti_settings["project"]
+     target_file = noti_settings["target_file"]
+     num_last_lines = noti_settings["num_last_lines"]
+     output_img_dir = noti_settings["output_img_dir"]
+     decimal_places = noti_settings["decimal_places"]
+     out_img_scale = noti_settings["out_img_scale"]
+
+     bot = telegram.Bot(token=token)
+     async with bot:
+         try:
+             context_msg, img_path = compose_message_and_img_path(
+                 target_file,
+                 project,
+                 num_last_lines,
+                 decimal_places,
+                 out_img_scale,
+                 output_img_dir,
+             )
+             time_now = next_time = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
+             sep_line = "-" * 50
+             context_msg = f"{sep_line}\n>> Time: {time_now}\n{context_msg}"
+             # calculate the next time to send message
+             next_time = pd.Timestamp.now() + pd.Timedelta(seconds=interval_in_sec)
+             next_time = next_time.strftime("%Y-%m-%d %H:%M:%S")
+             next_time_info = f"Next msg: {next_time}"
+             tele_console.rule()
+             tele_console.print("[green] Send message to telegram [/green]")
+             tele_console.print(
+                 f"[red] Next message will be sent at <{next_time}> [/red]"
+             )
+             await bot.send_message(text=context_msg, chat_id=chat_id)
+             if img_path:
+                 await bot.send_photo(chat_id=chat_id, photo=open(img_path, "rb"))
+             await bot.send_message(text=next_time_info, chat_id=chat_id)
+         except Exception as e:
+             pprint(f"Error: {e}")
+             pprint("Message not sent to telegram")
+
+
+ async def run_forever(cfg_path):
+     cfg_dict = yaml.safe_load(open(cfg_path, "r"))
+     noti_settings = cfg_dict["noti_settings"]
+     interval_in_min = noti_settings["interval_in_min"]
+     interval_in_sec = int(interval_in_min * 60)
+     pprint(
+         f"Message will be sent every {interval_in_min} minutes or {interval_in_sec} seconds"
+     )
+     while True:
+         await send_to_telegram(cfg_dict, interval_in_sec)
+         await asyncio.sleep(interval_in_sec)
+
+
+ async def main():
+     args = parse_args()
+     await run_forever(args.cfg)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
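
run_forever loads a YAML config (cfg_tele_noti.yaml); its shape, shown here as the equivalent Python dict with placeholder values, is inferred from the keys read in send_to_telegram and run_forever above, so the real file may carry additional fields:

# Dict produced by yaml.safe_load(cfg_tele_noti.yaml); all values are placeholders.
cfg_dict = {
    "telegram": {"token": "<bot-token>", "chat_id": "<chat-id>"},
    "noti_settings": {
        "project": "my-train-run",
        "target_file": "./logs/train_log.csv",  # .txt/.log/.csv are supported
        "num_last_lines": 5,
        "output_img_dir": "./out/tele",
        "decimal_places": 4,
        "out_img_scale": 2,
        "interval_in_min": 30,
    },
}
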
halib-0.1.47.dist-info/METADATA → halib-0.1.48.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: halib
- Version: 0.1.47
+ Version: 0.1.48
  Summary: Small library for common tasks
  Author: Hoang Van Ha
  Author-email: hoangvanhauit@gmail.com
@@ -43,6 +43,10 @@ Requires-Dist: tube-dl

  Helper package for coding and automation

+ **Version 0.1.48**
+
+ + add `research` module to help with research tasks, including `benchquery` for benchmarking queries from dataframe
+
  **Version 0.1.47**
  + add `pprint_box` to print object/string in a box frame (like in `inspect`)

halib-0.1.47.dist-info/RECORD → halib-0.1.48.dist-info/RECORD
@@ -27,14 +27,22 @@ halib/online/gdrive.py,sha256=RmF4y6UPxektkKIctmfT-pKWZsBM9FVUeld6zZmJkp0,7787
  halib/online/gdrive_mkdir.py,sha256=wSJkQMJCDuS1gxQ2lHQHq_IrJ4xR_SEoPSo9n_2WNFU,1474
  halib/online/gdrive_test.py,sha256=hMWzz4RqZwETHp4GG4WwVNFfYvFQhp2Boz5t-DqwMo0,1342
  halib/online/projectmake.py,sha256=Zrs96WgXvO4nIrwxnCOletL4aTBge-EoF0r7hpKO1w8,4034
+ halib/research/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ halib/research/benchquery.py,sha256=FuKnbWQtCEoRRtJAfN-zaN-jPiO_EzsakmTOMiqi7GQ,4626
+ halib/research/dataset.py,sha256=QU0Hr5QFb8_XlvnOMgC9QJGIpwXAZ9lDd0RdQi_QRec,6743
+ halib/research/plot.py,sha256=-pDUk4z3C_GnyJ5zWmf-mGMdT4gaipVJWzIgcpIPiRk,9448
+ halib/research/torchloader.py,sha256=yqUjcSiME6H5W210363HyRUrOi3ISpUFAFkTr1w4DCw,6503
  halib/sys/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  halib/sys/cmd.py,sha256=b2x7JPcNnFjLGheIESVYvqAb-w2UwBM1PAwYxMZ5YjA,228
  halib/sys/filesys.py,sha256=ERpnELLDKJoTIIKf-AajgkY62nID4qmqmX5TkE95APU,2931
  halib/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  halib/system/cmd.py,sha256=b2x7JPcNnFjLGheIESVYvqAb-w2UwBM1PAwYxMZ5YjA,228
  halib/system/filesys.py,sha256=ERpnELLDKJoTIIKf-AajgkY62nID4qmqmX5TkE95APU,2931
- halib-0.1.47.dist-info/LICENSE.txt,sha256=qZssdna4aETiR8znYsShUjidu-U4jUT9Q-EWNlZ9yBQ,1100
- halib-0.1.47.dist-info/METADATA,sha256=GsAawspTV3gRGBKKkxsSuTG9IlkE2cAIj3GzdlCNE68,3823
- halib-0.1.47.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- halib-0.1.47.dist-info/top_level.txt,sha256=7AD6PLaQTreE0Fn44mdZsoHBe_Zdd7GUmjsWPyQ7I-k,6
- halib-0.1.47.dist-info/RECORD,,
+ halib/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ halib/utils/listop.py,sha256=Vpa8_2fI0wySpB2-8sfTBkyi_A4FhoFVVvFiuvW8N64,339
+ halib/utils/tele_noti.py,sha256=-4WXZelCA4W9BroapkRyIdUu9cUVrcJJhegnMs_WpGU,5928
+ halib-0.1.48.dist-info/LICENSE.txt,sha256=qZssdna4aETiR8znYsShUjidu-U4jUT9Q-EWNlZ9yBQ,1100
+ halib-0.1.48.dist-info/METADATA,sha256=iaGDSmQyhQWr6hLkyRpK6ZpkW6tuoAoGOrPNuK3CQp8,3960
+ halib-0.1.48.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ halib-0.1.48.dist-info/top_level.txt,sha256=7AD6PLaQTreE0Fn44mdZsoHBe_Zdd7GUmjsWPyQ7I-k,6
+ halib-0.1.48.dist-info/RECORD,,