halib 0.1.7__py3-none-any.whl → 0.1.99__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. halib/__init__.py +84 -0
  2. halib/common.py +151 -0
  3. halib/cuda.py +39 -0
  4. halib/dataset.py +209 -0
  5. halib/filetype/csvfile.py +151 -45
  6. halib/filetype/ipynb.py +63 -0
  7. halib/filetype/jsonfile.py +1 -1
  8. halib/filetype/textfile.py +4 -4
  9. halib/filetype/videofile.py +44 -33
  10. halib/filetype/yamlfile.py +95 -0
  11. halib/gdrive.py +1 -1
  12. halib/online/gdrive.py +104 -54
  13. halib/online/gdrive_mkdir.py +29 -17
  14. halib/online/gdrive_test.py +31 -18
  15. halib/online/projectmake.py +58 -43
  16. halib/plot.py +296 -11
  17. halib/projectmake.py +1 -1
  18. halib/research/__init__.py +0 -0
  19. halib/research/base_config.py +100 -0
  20. halib/research/base_exp.py +100 -0
  21. halib/research/benchquery.py +131 -0
  22. halib/research/dataset.py +208 -0
  23. halib/research/flop_csv.py +34 -0
  24. halib/research/flops.py +156 -0
  25. halib/research/metrics.py +133 -0
  26. halib/research/mics.py +68 -0
  27. halib/research/params_gen.py +108 -0
  28. halib/research/perfcalc.py +336 -0
  29. halib/research/perftb.py +780 -0
  30. halib/research/plot.py +758 -0
  31. halib/research/profiler.py +300 -0
  32. halib/research/torchloader.py +162 -0
  33. halib/research/wandb_op.py +116 -0
  34. halib/rich_color.py +285 -0
  35. halib/sys/filesys.py +17 -10
  36. halib/system/__init__.py +0 -0
  37. halib/system/cmd.py +8 -0
  38. halib/system/filesys.py +124 -0
  39. halib/tele_noti.py +166 -0
  40. halib/torchloader.py +162 -0
  41. halib/utils/__init__.py +0 -0
  42. halib/utils/dataclass_util.py +40 -0
  43. halib/utils/dict_op.py +9 -0
  44. halib/utils/gpu_mon.py +58 -0
  45. halib/utils/listop.py +13 -0
  46. halib/utils/tele_noti.py +166 -0
  47. halib/utils/video.py +82 -0
  48. halib/videofile.py +1 -1
  49. halib-0.1.99.dist-info/METADATA +209 -0
  50. halib-0.1.99.dist-info/RECORD +64 -0
  51. {halib-0.1.7.dist-info → halib-0.1.99.dist-info}/WHEEL +1 -1
  52. halib-0.1.7.dist-info/METADATA +0 -59
  53. halib-0.1.7.dist-info/RECORD +0 -30
  54. {halib-0.1.7.dist-info → halib-0.1.99.dist-info/licenses}/LICENSE.txt +0 -0
  55. {halib-0.1.7.dist-info → halib-0.1.99.dist-info}/top_level.txt +0 -0
halib/torchloader.py ADDED
@@ -0,0 +1,162 @@
1
+ """
2
+ * @author Hoang Van-Ha
3
+ * @email hoangvanhauit@gmail.com
4
+ * @create date 2024-03-27 15:40:22
5
+ * @modify date 2024-03-27 15:40:22
6
+ * @desc this module works as a utility tool for finding the best configuration for dataloader (num_workers, batch_size, pin_memory, etc.) that fits your hardware.
7
+ """
8
+ from argparse import ArgumentParser
9
+ from .common import *
10
+ from .filetype import csvfile
11
+ from .filetype.yamlfile import load_yaml
12
+ from rich import inspect
13
+ from torch.utils.data import DataLoader
14
+ from torchvision import datasets, transforms
15
+ from tqdm import tqdm
16
+ from typing import Union
17
+ import itertools as it # for cartesian product
18
+ import os
19
+ import time
20
+ import traceback
21
+
22
+
23
def parse_args():
    """Parse CLI arguments; -cfg/--cfg gives the search-space config file."""
    arg_parser = ArgumentParser(description="desc text")
    arg_parser.add_argument("-cfg", "--cfg", type=str, help="cfg file for searching")
    return arg_parser.parse_args()
27
+
28
+
29
def get_test_range(cfg: dict, search_item="num_workers"):
    """Resolve the list of candidate values for *search_item* from the config.

    The value at ``cfg["search_space"][search_item]`` may be:
      * a list   -> returned as-is;
      * a dict   -> must carry a "mode" key ("range" or "list") plus the
                    matching data ("range": [start, stop, step], or "list");
      * a scalar -> wrapped in a one-element list (int, float, str, bool, ...).

    Args:
        cfg: parsed YAML config containing a "search_space" section.
        search_item: key inside "search_space" to resolve.

    Returns:
        list: candidate values to test for the search item.

    Raises:
        ValueError: if the item is absent, a dict entry has no "mode" key,
            or the mode's data is missing.
    """
    item_search_cfg = cfg["search_space"].get(search_item, None)
    if item_search_cfg is None:
        raise ValueError(f"search_item: {search_item} not found in cfg")
    if isinstance(item_search_cfg, list):
        return item_search_cfg
    elif isinstance(item_search_cfg, dict):
        # BUG FIX: a dict without a "mode" key previously fell through and
        # returned None silently; fail loudly instead.
        if "mode" not in item_search_cfg:
            raise ValueError(f"dict entry <{search_item}> must define a 'mode' key")
        mode = item_search_cfg["mode"]
        assert mode in ["range", "list"], f"mode: {mode} not supported"
        value_in_mode = item_search_cfg.get(mode, None)
        if value_in_mode is None:
            raise ValueError(f"mode<{mode}>: data not found in <{search_item}>")
        if mode == "range":
            assert len(value_in_mode) == 3, "range must have 3 values: start, stop, step"
            start, stop, step = value_in_mode
            return list(range(start, stop, step))
        else:  # mode == "list"
            return item_search_cfg["list"]
    else:
        return [item_search_cfg]  # for int, float, str, bool, etc.
52
+
53
+
54
def load_an_batch(loader_iter):
    """Pull one batch from *loader_iter* and return the elapsed time in seconds.

    Uses time.perf_counter() instead of time.time(): it is monotonic and has
    higher resolution, so short batch-load timings are not skewed by system
    clock adjustments.

    Args:
        loader_iter: iterator over a DataLoader; advanced by one batch.

    Returns:
        float: seconds spent in next(loader_iter).
    """
    start = time.perf_counter()
    next(loader_iter)
    end = time.perf_counter()
    return end - start
59
+
60
+
61
def test_dataloader_with_cfg(origin_dataloader: DataLoader, cfg: Union[dict, str]):
    """Grid-search DataLoader settings and report average batch-load times.

    For every combination of (batch_size, num_workers, persistent_workers,
    pin_memory) described in *cfg*, a new DataLoader is built over the SAME
    dataset as *origin_dataloader* and the average wall time to load
    ``batch_limit`` batches is measured. Results are rendered as a table and
    optionally written to CSV.

    Args:
        origin_dataloader: loader whose dataset is reused for every trial.
        cfg: config dict, or path to a YAML file (loaded via load_yaml).
    """
    try:
        if isinstance(cfg, str):
            cfg = load_yaml(cfg, to_dict=True)
        dfmk = csvfile.DFCreator()
        search_items = ["batch_size", "num_workers", "persistent_workers", "pin_memory"]
        batch_limit = cfg["general"]["batch_limit"]
        csv_cfg = cfg["general"]["to_csv"]
        log_batch_info = cfg["general"]["log_batch_info"]

        save_to_csv = csv_cfg["enabled"]
        log_dir = csv_cfg["log_dir"]
        filename = csv_cfg["filename"]
        # BUG FIX: the configured filename was immediately overwritten and
        # discarded; keep it, prefixed with a timestamp so runs don't collide.
        filename = f"{now_str()}_{filename}.csv"
        outfile = os.path.join(log_dir, filename)

        dfmk.create_table(
            "cfg_search",
            (search_items + ["avg_time_taken"]),
        )
        # Per search item, build a list of (item, value) pairs; the cartesian
        # product over all items enumerates every configuration to try.
        ls_range_test = []
        for item in search_items:
            range_test = get_test_range(cfg, search_item=item)
            ls_range_test.append([(item, v) for v in range_test])

        all_combinations = list(it.product(*ls_range_test))

        rows = []
        for cfg_idx, combine in enumerate(all_combinations):
            console.rule(f"Testing cfg {cfg_idx+1}/{len(all_combinations)}")
            inspect(combine)
            batch_size = combine[search_items.index("batch_size")][1]
            num_workers = combine[search_items.index("num_workers")][1]
            persistent_workers = combine[search_items.index("persistent_workers")][1]
            pin_memory = combine[search_items.index("pin_memory")][1]

            # Rebuild the loader on the same dataset with candidate settings.
            test_dataloader = DataLoader(
                origin_dataloader.dataset,
                batch_size=batch_size,
                num_workers=num_workers,
                persistent_workers=persistent_workers,
                pin_memory=pin_memory,
                shuffle=True,
            )
            row = [batch_size, num_workers, persistent_workers, pin_memory, 0.0]

            # calculate the avg time taken to load the data for <batch_limit> batches
            trainiter = iter(test_dataloader)
            time_elapsed = 0
            pprint('Start testing...')
            for i in tqdm(range(batch_limit)):
                single_batch_time = load_an_batch(trainiter)
                if log_batch_info:
                    pprint(f"Batch {i+1} took {single_batch_time:.4f} seconds to load")
                time_elapsed += single_batch_time
            row[-1] = time_elapsed / batch_limit
            rows.append(row)
        dfmk.insert_rows('cfg_search', rows)
        dfmk.fill_table_from_row_pool('cfg_search')
        with ConsoleLog("results"):
            csvfile.fn_display_df(dfmk['cfg_search'])
        if save_to_csv:
            dfmk["cfg_search"].to_csv(outfile, index=False)
            console.print(f"[red] Data saved to <{outfile}> [/red]")

    except Exception as e:
        traceback.print_exc()
        print(e)
        # Point the user to the reference config shipped next to this module.
        current_dir = os.path.dirname(os.path.realpath(__file__))
        standar_cfg_path = os.path.join(current_dir, "torchloader_search.yaml")
        pprint(
            f"Make sure you get the right <cfg.yaml> file. An example of <cfg.yaml> file can be found at this path: {standar_cfg_path}"
        )
        return
136
+
137
def main():
    """CLI entry: load the search config and benchmark loaders on CIFAR-10."""
    cli_args = parse_args()
    cfg_dict = load_yaml(cli_args.cfg, to_dict=True)

    # Augmentation + normalization pipeline for the probe dataset.
    transform = transforms.Compose(
        [
            transforms.RandomHorizontalFlip(),  # random horizontal flip
            transforms.RandomRotation(10),      # random rotation up to 10 degrees
            transforms.ToTensor(),              # PIL image -> tensor
            # Normalize pixel values to [-1, 1]
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )
    test_dataset = datasets.CIFAR10(
        root="./data", train=False, download=True, transform=transform
    )
    train_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)
    test_dataloader_with_cfg(train_loader, cfg_dict)


if __name__ == "__main__":
    main()
halib/utils/dataclass_util.py ADDED
@@ -0,0 +1,40 @@
1
+ import yaml
2
+ from typing import Any
3
+ from rich.pretty import pprint
4
+ from ..filetype import yamlfile
5
+ # from halib.filetype import yamlfile
6
+ from dataclasses import make_dataclass
7
+
8
def dict_to_dataclass(name: str, data: dict):
    """Recursively build a dynamic dataclass instance mirroring *data*.

    Nested dicts become nested dataclasses named after their (capitalized)
    key; None values are typed as Any.
    """
    field_specs = []
    init_kwargs = {}

    for key, value in data.items():
        if isinstance(value, dict):
            nested = dict_to_dataclass(key.capitalize(), value)
            field_specs.append((key, type(nested)))
            init_kwargs[key] = nested
        else:
            field_specs.append((key, Any if value is None else type(value)))
            init_kwargs[key] = value

    cls = make_dataclass(name.capitalize(), field_specs)
    return cls(**init_kwargs)
24
+
25
def yaml_to_dataclass(name: str, yaml_str: str):
    """Parse a YAML string and convert the resulting mapping to a dataclass."""
    parsed = yaml.safe_load(yaml_str)
    return dict_to_dataclass(name, parsed)
28
+
29
+
30
def yamlfile_to_dataclass(name: str, file_path: str):
    """Load a YAML file and convert it to a dataclass instance.

    The '__base__' key (config-inheritance metadata) is dropped before
    conversion, as it is not a real config field.
    """
    data_dict = yamlfile.load_yaml(file_path, to_dict=True)
    data_dict.pop("__base__", None)
    return dict_to_dataclass(name, data_dict)
35
+
36
# Manual smoke test: convert a sample YAML config into a nested dataclass.
if __name__ == "__main__":
    cfg = yamlfile_to_dataclass("Config", "test/dataclass_util_test_cfg.yaml")

    # NOTE: after printing this dataclass, the output can be copied into an
    # LLM (e.g. ChatGPT) to generate the equivalent static dataclass
    # definitions using `from dataclass_wizard import YAMLWizard`.
    pprint(cfg)
halib/utils/dict_op.py ADDED
@@ -0,0 +1,9 @@
1
def flatten_dict(d, parent_key="", sep="."):
    """Flatten a nested dict into one level, joining key paths with *sep*.

    Example: {"a": {"b": 1}} -> {"a.b": 1}.
    """
    flat = {}
    for key, value in d.items():
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat
halib/utils/gpu_mon.py ADDED
@@ -0,0 +1,58 @@
1
+ # install `pynvml_utils` package first
2
+ # see this repo: https://github.com/gpuopenanalytics/pynvml
3
+ from pynvml_utils import nvidia_smi
4
+ import time
5
+ import threading
6
+ from rich.pretty import pprint
7
+
8
class GPUMonitor:
    """Background sampler of GPU power draw and memory usage via nvidia-smi.

    A worker thread polls ``nvidia_smi.DeviceQuery`` every *interval* seconds
    between start() and stop(); get_stats() aggregates the collected samples.
    """

    def __init__(self, gpu_index=0, interval=0.01):
        self.nvsmi = nvidia_smi.getInstance()
        self.gpu_index = gpu_index
        self.interval = interval      # seconds between samples
        self.gpu_stats = []           # samples: power/power_unit/memory/memory_unit
        self._running = False
        self._thread = None

    def _monitor(self):
        """Polling loop executed on the worker thread."""
        while self._running:
            gpu_info = self.nvsmi.DeviceQuery("power.draw, memory.used")["gpu"][
                self.gpu_index
            ]
            sample = {
                "power": gpu_info["power_readings"]["power_draw"],
                "power_unit": gpu_info["power_readings"]["unit"],
                "memory": gpu_info["fb_memory_usage"]["used"],
                "memory_unit": gpu_info["fb_memory_usage"]["unit"],
            }
            self.gpu_stats.append(sample)
            time.sleep(self.interval)

    def start(self):
        """Begin sampling; any stats from a previous run are discarded."""
        if self._running:
            return
        self._running = True
        self.gpu_stats.clear()
        self._thread = threading.Thread(target=self._monitor)
        self._thread.start()

    def stop(self):
        """Stop sampling and join the worker thread."""
        if not self._running:
            return
        self._running = False
        self._thread.join()
        self._thread = None

    def get_stats(self):
        """Aggregate samples into average power and peak memory.

        Must be called after stop(). Samples with None readings are ignored.

        Returns:
            dict with keys "gpu_avg_power" and "gpu_avg_max_memory"
            (0 when no usable samples exist).
        """
        assert self._running is False, "GPU monitor is still running. Stop it first."

        powers = [s["power"] for s in self.gpu_stats if s["power"] is not None]
        memories = [s["memory"] for s in self.gpu_stats if s["memory"] is not None]
        avg_power = sum(powers) / len(powers) if powers else 0
        max_memory = max(memories) if memories else 0
        return {"gpu_avg_power": avg_power, "gpu_avg_max_memory": max_memory}
halib/utils/listop.py ADDED
@@ -0,0 +1,13 @@
1
def subtract(list_a, list_b):
    """Return elements of list_a that do not appear in list_b (order kept)."""
    remaining = []
    for item in list_a:
        if item not in list_b:
            remaining.append(item)
    return remaining
3
+
4
+
5
def union(list_a, list_b, no_duplicate=False):
    """Concatenate two lists; deduplicate (unordered) when no_duplicate is True."""
    if not no_duplicate:
        return list_a + list_b
    return list(set(list_a) | set(list_b))
10
+
11
+
12
def intersection(list_a, list_b):
    """Return the (unordered, deduplicated) common elements of both lists."""
    return list(set(list_a).intersection(set(list_b)))
@@ -0,0 +1,166 @@
1
+ # Watch a log file and send a telegram message when train reaches a certain epoch or end
2
+
3
+ import os
4
+ import yaml
5
+ import asyncio
6
+ import telegram
7
+ import pandas as pd
8
+
9
+ from rich.pretty import pprint
10
+ from rich.console import Console
11
+ import plotly.graph_objects as go
12
+
13
+ from ..system import filesys as fs
14
+ from ..filetype import textfile, csvfile
15
+
16
+ from argparse import ArgumentParser
17
+
18
+ tele_console = Console()
19
+
20
+
21
def parse_args():
    """Parse CLI arguments; -cfg/--cfg points to the notifier YAML config."""
    arg_parser = ArgumentParser(description="desc text")
    arg_parser.add_argument(
        "-cfg",
        "--cfg",
        type=str,
        help="yaml file for tele",
        default=r"E:\Dev\halib\cfg_tele_noti.yaml",
    )
    return arg_parser.parse_args()
32
+
33
+
34
def get_watcher_message_df(target_file, num_last_lines):
    """Build a DataFrame holding the last *num_last_lines* lines of *target_file*.

    Text/log files yield one row per line, with the most recent line
    mentioning "epoch" prepended as a status row; CSV files yield the last
    rows of the parsed table.

    Args:
        target_file: path to a .txt, .log or .csv file being watched.
        num_last_lines: number of trailing lines/rows to keep; clamped to
            the file length.

    Returns:
        pandas.DataFrame with the selected lines/rows.
    """
    # fs.get_file_name(..., split_file_ext=True) — presumably returns
    # (name, ext); index [1] takes the extension. TODO confirm against filesys.
    file_ext = fs.get_file_name(target_file, split_file_ext=True)[1]
    supported_ext = [".txt", ".log", ".csv"]
    assert (
        file_ext in supported_ext
    ), f"File extension {file_ext} not supported. Supported extensions are {supported_ext}"
    last_lines_df = None
    if file_ext in [".txt", ".log"]:
        lines = textfile.read_line_by_line(target_file)
        # clamp to the number of available lines
        if num_last_lines > len(lines):
            num_last_lines = len(lines)
        last_line_arr = lines[-num_last_lines:]
        # add a line start with word "epoch"
        epoch_info_list = "Epoch: n/a"
        # scan backwards so we pick up the MOST RECENT epoch mention
        for line in reversed(lines):
            if "epoch" in line.lower():
                epoch_info_list = line
                break
        last_line_arr.insert(0, epoch_info_list)  # insert at the beginning
        dfCreator = csvfile.DFCreator()
        dfCreator.create_table("last_lines", ["line"])
        # DFCreator consumes rows (lists), so wrap each line in a 1-item list
        last_line_arr = [[line] for line in last_line_arr]
        dfCreator.insert_rows("last_lines", last_line_arr)
        dfCreator.fill_table_from_row_pool("last_lines")
        last_lines_df = dfCreator["last_lines"].copy()
    else:
        # CSV: take the trailing rows of the parsed table
        df = pd.read_csv(target_file)
        num_rows = len(df)
        if num_last_lines > num_rows:
            num_last_lines = num_rows
        last_lines_df = df.tail(num_last_lines)
    return last_lines_df
66
+
67
+
68
def df2img(df: pd.DataFrame, output_img_dir, decimal_places, out_img_scale):
    """Render *df* as a plotly table image and return the written PNG path."""
    rounded = df.round(decimal_places)
    table = go.Table(
        header=dict(values=list(rounded.columns), align="center"),
        cells=dict(
            values=rounded.values.transpose(),
            # alternating white/lightgrey row striping
            fill_color=[["white", "lightgrey"] * rounded.shape[0]],
            align="center",
        ),
    )
    fig = go.Figure(data=[table])
    if not os.path.exists(output_img_dir):
        os.makedirs(output_img_dir)
    img_path = os.path.normpath(os.path.join(output_img_dir, "last_lines.png"))
    fig.write_image(img_path, scale=out_img_scale)
    return img_path
87
+
88
+
89
def compose_message_and_img_path(
    target_file, project, num_last_lines, decimal_places, out_img_scale, output_img_dir
):
    """Build the notification text and, best-effort, a rendered table image.

    Returns (context_msg, img_path); img_path is None when rendering fails.
    """
    context_msg = f">> Project: {project} \n>> File: {target_file} \n>> Last {num_last_lines} lines:"
    msg_df = get_watcher_message_df(target_file, num_last_lines)
    img_path = None
    try:
        img_path = df2img(msg_df, output_img_dir, decimal_places, out_img_scale)
    except Exception as e:
        # image rendering is optional; fall back to a text-only message
        pprint(f"Error: {e}")
    return context_msg, img_path
100
+
101
+
102
async def send_to_telegram(cfg_dict, interval_in_sec):
    """Send one status update (text plus optional table image) to Telegram.

    Reads bot credentials and watcher settings from *cfg_dict*, composes the
    message from the watched file's tail, announces when the next message is
    due (*interval_in_sec* seconds from now), and sends via python-telegram-bot.
    All errors are logged; nothing is raised to the caller.

    Args:
        cfg_dict: parsed YAML config with "telegram" and "noti_settings" sections.
        interval_in_sec: seconds until the next scheduled message.
    """
    # pprint(cfg_dict)
    token = cfg_dict["telegram"]["token"]
    chat_id = cfg_dict["telegram"]["chat_id"]

    noti_settings = cfg_dict["noti_settings"]
    project = noti_settings["project"]
    target_file = noti_settings["target_file"]
    num_last_lines = noti_settings["num_last_lines"]
    output_img_dir = noti_settings["output_img_dir"]
    decimal_places = noti_settings["decimal_places"]
    out_img_scale = noti_settings["out_img_scale"]

    bot = telegram.Bot(token=token)
    async with bot:
        try:
            context_msg, img_path = compose_message_and_img_path(
                target_file,
                project,
                num_last_lines,
                decimal_places,
                out_img_scale,
                output_img_dir,
            )
            # NOTE(review): `next_time` assigned here is overwritten below;
            # only `time_now` is actually used from this line.
            time_now = next_time = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
            sep_line = "-" * 50
            context_msg = f"{sep_line}\n>> Time: {time_now}\n{context_msg}"
            # calculate the next time to send message
            next_time = pd.Timestamp.now() + pd.Timedelta(seconds=interval_in_sec)
            next_time = next_time.strftime("%Y-%m-%d %H:%M:%S")
            next_time_info = f"Next msg: {next_time}"
            tele_console.rule()
            tele_console.print("[green] Send message to telegram [/green]")
            tele_console.print(
                f"[red] Next message will be sent at <{next_time}> [/red]"
            )
            await bot.send_message(text=context_msg, chat_id=chat_id)
            if img_path:
                # NOTE(review): file handle passed to send_photo is never
                # explicitly closed — left to garbage collection.
                await bot.send_photo(chat_id=chat_id, photo=open(img_path, "rb"))
            await bot.send_message(text=next_time_info, chat_id=chat_id)
        except Exception as e:
            pprint(f"Error: {e}")
            pprint("Message not sent to telegram")
145
+
146
+
147
async def run_forever(cfg_path):
    """Send a Telegram status message every configured interval, forever.

    Args:
        cfg_path: path to the YAML config holding "telegram" and
            "noti_settings" (including "interval_in_min").
    """
    # BUG FIX: the config file handle was opened inline and never closed;
    # use a context manager so it is released deterministically.
    with open(cfg_path, "r") as cfg_file:
        cfg_dict = yaml.safe_load(cfg_file)
    noti_settings = cfg_dict["noti_settings"]
    interval_in_min = noti_settings["interval_in_min"]
    interval_in_sec = int(interval_in_min * 60)
    pprint(
        f"Message will be sent every {interval_in_min} minutes or {interval_in_sec} seconds"
    )
    while True:
        await send_to_telegram(cfg_dict, interval_in_sec)
        await asyncio.sleep(interval_in_sec)
158
+
159
+
160
async def main():
    """Async entry point: parse CLI args and run the notification loop."""
    cli_args = parse_args()
    await run_forever(cli_args.cfg)


if __name__ == "__main__":
    asyncio.run(main())
halib/utils/video.py ADDED
@@ -0,0 +1,82 @@
1
+ import os
2
+ import cv2
3
+ from ..filetype import csvfile
4
+ from ..system import filesys as fs
5
+
6
+
7
class VideoUtils:
    """Helpers for extracting metadata from video files and directories."""

    @staticmethod
    def _default_meta_extractor(video_path):
        """Read basic metadata (size, frame count, fps) via OpenCV.

        Returns None when the file cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Error: Could not open video file {video_path}")
            return None
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        return {
            "video_path": video_path,
            "width": width,
            "height": height,
            "frame_count": frame_count,
            "fps": fps
        }

    @staticmethod
    def get_video_meta_dict(video_path, meta_dict_extractor_func=None):
        """Extract a metadata dict for one video, optionally via a custom extractor.

        A custom extractor must accept exactly one argument (the path) and
        return a dict that contains at least 'video_path'.
        """
        assert os.path.exists(video_path), f"Video file {video_path} does not exist"
        if meta_dict_extractor_func and callable(meta_dict_extractor_func):
            assert meta_dict_extractor_func.__code__.co_argcount == 1, "meta_dict_extractor_func must take exactly one argument (video_path)"
            custom_meta = meta_dict_extractor_func(video_path)
            assert isinstance(custom_meta, dict), "meta_dict_extractor_func must return a dictionary"
            assert 'video_path' in custom_meta, "meta_dict must contain 'video_path'"
            return custom_meta
        return VideoUtils._default_meta_extractor(video_path=video_path)

    @staticmethod
    def get_video_dir_meta_df(video_dir, video_exts=['.mp4', '.avi', '.mov', '.mkv'], search_recursive=False, csv_outfile=None):
        """Collect metadata for every video under *video_dir* into a DataFrame.

        Optionally writes the table to *csv_outfile* (';'-separated).
        Returns a copy of the assembled DataFrame.
        """
        assert os.path.exists(video_dir), f"Video directory {video_dir} does not exist"
        video_files = fs.filter_files_by_extension(video_dir, video_exts, recursive=search_recursive)
        assert len(video_files) > 0, f"No video files found in {video_dir} with extensions {video_exts}"
        # skip files whose metadata could not be extracted (None)
        meta_rows = [m for m in (VideoUtils.get_video_meta_dict(v) for v in video_files) if m]
        dfmk = csvfile.DFCreator()
        columns = list(meta_rows[0].keys())
        assert len(columns) > 0, "No video metadata found"
        assert 'video_path' in columns, "video_path column not found in video metadata"
        # keep video_path as the leading column
        columns.remove('video_path')
        columns.insert(0, 'video_path')
        dfmk.create_table("video_meta", columns)
        dfmk.insert_rows("video_meta", [[meta[col] for col in columns] for meta in meta_rows])
        dfmk.fill_table_from_row_pool("video_meta")

        if csv_outfile:
            dfmk["video_meta"].to_csv(csv_outfile, index=False, sep=";")
        return dfmk["video_meta"].copy()
78
+
79
+
80
+
81
+
82
+
halib/videofile.py CHANGED
@@ -5,7 +5,7 @@ import enlighten
5
5
  from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
6
6
  from tube_dl import Youtube, Playlist
7
7
 
8
- from halib.sys import filesys
8
+ from halib.system import filesys
9
9
  from halib.filetype import textfile
10
10
 
11
11