dragon-ml-toolbox 1.1.2: dragon_ml_toolbox-1.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dragon-ml-toolbox might be problematic.
- dragon_ml_toolbox-1.1.2.dist-info/METADATA +114 -0
- dragon_ml_toolbox-1.1.2.dist-info/RECORD +16 -0
- dragon_ml_toolbox-1.1.2.dist-info/WHEEL +5 -0
- dragon_ml_toolbox-1.1.2.dist-info/top_level.txt +1 -0
- ml_tools/MICE_imputation.py +178 -0
- ml_tools/__init__.py +0 -0
- ml_tools/data_exploration.py +751 -0
- ml_tools/datasetmaster.py +595 -0
- ml_tools/ensemble_learning.py +701 -0
- ml_tools/handle_excel.py +310 -0
- ml_tools/logger.py +145 -0
- ml_tools/particle_swarm_optimization.py +467 -0
- ml_tools/pytorch_models.py +227 -0
- ml_tools/trainer.py +366 -0
- ml_tools/utilities.py +168 -0
- ml_tools/vision_helpers.py +218 -0
ml_tools/handle_excel.py
ADDED
@@ -0,0 +1,310 @@
import os
from openpyxl import load_workbook, Workbook
import pandas as pd
from typing import List, Optional


def unmerge_and_split_excel(filepath: str) -> None:
    """
    Processes a single Excel file:
    - Unmerges all merged cells (vertical and horizontal),
    - Fills each merged region with the top-left cell value,
    - Splits each sheet into a separate Excel file,
    - Saves all results in the same directory as the input file.

    Parameters:
        filepath (str): Full path to the Excel file to process.
    """
    wb = load_workbook(filepath)
    base_dir = os.path.dirname(os.path.abspath(filepath))
    base_name = os.path.splitext(os.path.basename(filepath))[0]

    total_output_files = 0

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        new_wb = Workbook()
        new_ws = new_wb.active
        new_ws.title = sheet_name

        # Copy all cell values
        for row in ws.iter_rows():
            for cell in row:
                new_ws.cell(row=cell.row, column=cell.column, value=cell.value)

        # Fill and unmerge merged regions
        for merged_range in list(ws.merged_cells.ranges):
            min_row, min_col, max_row, max_col = (
                merged_range.min_row, merged_range.min_col,
                merged_range.max_row, merged_range.max_col
            )
            value = ws.cell(row=min_row, column=min_col).value
            for row in range(min_row, max_row + 1):
                for col in range(min_col, max_col + 1):
                    new_ws.cell(row=row, column=col, value=value)

        # Construct flat output file name
        sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
        output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
        output_path = os.path.join(base_dir, output_filename)
        new_wb.save(output_path)

        # print(f"Saved: {output_path}")
        total_output_files += 1

    print(f"✅ Processed file: {filepath} into {total_output_files} output file(s).")
    return None
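
# Usage sketch (illustrative only; the path and sheet names below are hypothetical):
#
#   unmerge_and_split_excel("data/quarterly_report.xlsx")
#   # A workbook with sheets "Q1" and "Q2" would yield data/quarterly_report_Q1.xlsx
#   # and data/quarterly_report_Q2.xlsx, with every merged region filled in.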


def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
    """
    Processes all Excel files in the input directory:
    - Unmerges all merged cells (vertical and horizontal),
    - Fills each merged region with the top-left cell value,
    - Splits each sheet into separate Excel files,
    - Saves all results into the output directory.

    Parameters:
        input_dir (str): Directory containing Excel files to process.
        output_dir (str): Directory to save processed Excel files.
    """
    raw_files = [f for f in os.listdir(input_dir) if f.endswith(('.xlsx', '.xls'))]
    excel_files = [os.path.join(input_dir, f) for f in raw_files if not f.startswith('~')]

    if not excel_files:
        raise FileNotFoundError(f"No valid Excel files found in directory: {input_dir}")

    os.makedirs(output_dir, exist_ok=True)
    total_output_files = 0

    for file_path in excel_files:
        wb = load_workbook(file_path)
        base_name = os.path.splitext(os.path.basename(file_path))[0]

        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            new_wb = Workbook()
            new_ws = new_wb.active
            new_ws.title = sheet_name

            # Copy all cell values
            for row in ws.iter_rows():
                for cell in row:
                    new_ws.cell(row=cell.row, column=cell.column, value=cell.value)

            # Fill and unmerge merged regions
            for merged_range in list(ws.merged_cells.ranges):
                min_row, min_col, max_row, max_col = (
                    merged_range.min_row, merged_range.min_col,
                    merged_range.max_row, merged_range.max_col
                )
                value = ws.cell(row=min_row, column=min_col).value
                for row in range(min_row, max_row + 1):
                    for col in range(min_col, max_col + 1):
                        new_ws.cell(row=row, column=col, value=value)

            # Construct flat output file name
            sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
            output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
            output_path = os.path.join(output_dir, output_filename)
            new_wb.save(output_path)

            # print(f"Saved: {output_path}")
            total_output_files += 1

    print(f"✅ Processed {len(excel_files)} input Excel file(s) with a total of {total_output_files} output Excel file(s).")
    return None
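
# Usage sketch (illustrative only; directory names are hypothetical):
#
#   unmerge_and_split_from_directory(input_dir="raw_excel", output_dir="clean_excel")
#   # Every sheet of every workbook in raw_excel/ becomes its own
#   # <workbook>_<sheet>.xlsx file inside clean_excel/.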


def validate_excel_schema(
    target_dir: str,
    expected_columns: List[str],
    strict: bool = False
) -> None:
    """
    Validates that each Excel file in a directory conforms to the expected column schema.

    Parameters:
        target_dir (str): Path to the directory containing Excel files.
        expected_columns (list[str]): List of expected column names.
        strict (bool): If True, columns must match exactly (names and order).
                       If False, columns must contain at least all expected names.

    Returns:
        None. A summary is printed, listing the paths of any files that fail validation.
    """
    invalid_files = []
    expected_set = set(expected_columns)

    excel_seen = 0

    for filename in os.listdir(target_dir):
        if not filename.lower().endswith(".xlsx"):
            continue  # Skip non-Excel files

        if filename.startswith("~"):  # Skip temporary files
            continue

        file_path = os.path.join(target_dir, filename)
        excel_seen += 1
        try:
            wb = load_workbook(file_path, read_only=True)
            ws = wb.active  # Only check the first worksheet

            header = [cell.value for cell in next(ws.iter_rows(max_row=1))]

            if strict:
                if header != expected_columns:
                    invalid_files.append(file_path)
            else:
                header_set = set(header)
                if not expected_set.issubset(header_set):
                    invalid_files.append(file_path)

        except Exception as e:
            print(f"Error processing '{file_path}': {e}")
            invalid_files.append(file_path)

    valid_excel_number = excel_seen - len(invalid_files)
    print(f"{valid_excel_number} out of {excel_seen} Excel files conform to the schema.")
    if invalid_files:
        print(f"⚠️ {len(invalid_files)} Excel files are invalid:")
        for file in invalid_files:
            print(f"  - {file}")

    return None
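
# Usage sketch (illustrative only; directory and column names are hypothetical):
#
#   validate_excel_schema("clean_excel", expected_columns=["ID", "Name", "Score"])
#   # Non-strict mode: a file whose header is ["ID", "Name", "Score", "Notes"] passes,
#   # while one missing "Score" is reported as invalid.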


def vertical_merge_transform_excel(
    target_dir: str,
    csv_filename: str,
    output_dir: str,
    target_columns: Optional[List[str]] = None,
    rename_columns: Optional[List[str]] = None
) -> None:
    """
    Merges multiple Excel files in a directory vertically and saves the result as a single CSV file.

    Constraints:
    - Only the first worksheet of each Excel file is processed.
    - All Excel files must have either the same column names or a common subset if `target_columns` is provided.
    - If `rename_columns` is provided, it must match the length of `target_columns` (if used) or the original columns. Names match by position.

    Parameters:
        target_dir (str): Directory containing Excel files.
        csv_filename (str): Output CSV filename.
        output_dir (str): Directory to save the output CSV file.
        target_columns (list[str] | None): Columns to select from each Excel file.
        rename_columns (list[str] | None): Optional renaming for columns. Position-based matching.
    """
    raw_excel_files = [f for f in os.listdir(target_dir) if f.endswith(('.xlsx', '.xls'))]
    excel_files = [f for f in raw_excel_files if not f.startswith('~')]  # Exclude temporary files

    if not excel_files:
        raise ValueError("No Excel files found in the target directory.")

    os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
    csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
    csv_path = os.path.join(output_dir, csv_filename)

    dataframes = []
    for file in excel_files:
        file_path = os.path.join(target_dir, file)
        df = pd.read_excel(file_path, engine='openpyxl')

        if target_columns is not None:
            missing = [col for col in target_columns if col not in df.columns]
            if missing:
                raise ValueError(f"Missing columns in {file}: {missing}")
            df = df[target_columns]

        dataframes.append(df)

    merged_df = pd.concat(dataframes, ignore_index=True)

    if rename_columns is not None:
        expected_len = len(target_columns if target_columns is not None else merged_df.columns)
        if len(rename_columns) != expected_len:
            raise ValueError("Length of rename_columns must match the selected columns")
        merged_df.columns = rename_columns

    merged_df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"✅ Merged {len(dataframes)} Excel files into '{csv_filename}'.")


def horizontal_merge_transform_excel(
    target_dir: str,
    csv_filename: str,
    output_dir: str,
    drop_columns: Optional[List[str]] = None,
    skip_duplicates: bool = False
) -> None:
    """
    Horizontally concatenates Excel files (first sheet of each) by aligning rows and expanding columns.
    Then saves the result as a .csv file.

    Constraints:
    - Excel files may have different numbers of rows; shorter ones are padded with empty rows.
    - Only the first sheet in each Excel file is used.
    - Columns in `drop_columns` will be excluded from the result.
    - If `skip_duplicates` is False, duplicate columns are suffixed with "_copy1", "_copy2", etc.
      If True, only the first occurrence of each column name is kept.

    Parameters:
        target_dir (str): Directory containing Excel files.
        csv_filename (str): Name of the output CSV file.
        output_dir (str): Directory to save the output CSV file.
        drop_columns (list[str] | None): Columns to exclude from each file before merging.
        skip_duplicates (bool): Whether to skip duplicate columns or rename them.
    """
    raw_excel_files = [f for f in os.listdir(target_dir) if f.endswith(('.xlsx', '.xls'))]
    excel_files = [f for f in raw_excel_files if not f.startswith('~')]  # Exclude temporary files
    if not excel_files:
        raise ValueError("No Excel files found in the target directory.")

    os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
    csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
    csv_path = os.path.join(output_dir, csv_filename)

    dataframes = []
    max_rows = 0

    for file in excel_files:
        file_path = os.path.join(target_dir, file)
        df = pd.read_excel(file_path, engine='openpyxl')

        if drop_columns is not None:
            df = df.drop(columns=[col for col in drop_columns if col in df.columns])

        max_rows = max(max_rows, len(df))
        dataframes.append(df)

    # Pad shorter dataframes with empty rows so all have max_rows rows
    padded_dataframes = []
    for df in dataframes:
        padded_df = df.reindex(range(max_rows)).reset_index(drop=True)
        padded_dataframes.append(padded_df)

    merged_df = pd.concat(padded_dataframes, axis=1)

    duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()

    if skip_duplicates:
        merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
    else:
        # Rename duplicates positionally: second "A" becomes "A_copy1", third "A_copy2", etc.
        seen = {}
        new_cols = []
        for col in merged_df.columns:
            base_col = col
            count = seen.get(base_col, 0)
            if count:
                while f"{base_col}_copy{count}" in seen:
                    count += 1
                col = f"{base_col}_copy{count}"
            seen[col] = count + 1
            new_cols.append(col)
        merged_df.columns = new_cols

    merged_df.to_csv(csv_path, index=False, encoding='utf-8')

    print(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
    if duplicate_columns:
        print(f"⚠️ Duplicate columns: {duplicate_columns}")
ml_tools/logger.py
ADDED
@@ -0,0 +1,145 @@
import os
from datetime import datetime
from typing import Union, List, Dict, Any
import pandas as pd
from openpyxl.styles import Font, PatternFill
import traceback
import json
from .utilities import sanitize_filename  # relative import, so the module resolves when imported as part of the ml_tools package


def custom_logger(
    data: Union[
        List[Any],
        Dict[Any, Any],
        pd.DataFrame,
        str,
        BaseException
    ],
    save_directory: str,
    log_name: str,
) -> None:
    """
    Logs various data types to corresponding output formats:

    - list[Any] → .txt
        Each element is written on a new line.

    - dict[str, list[Any]] → .csv
        Dictionary is treated as tabular data; keys become columns, values become rows.

    - dict[str, scalar] → .json
        Dictionary is treated as structured data and serialized as JSON.

    - pandas.DataFrame → .xlsx
        Written to an Excel file with styled headers.

    - str → .log
        Plain text string is written to a .log file.

    - BaseException → .log
        Full traceback is logged for debugging purposes.

    Args:
        data: The data to be logged. Must be one of the supported types.
        save_directory: Directory where the log will be saved. Created if it does not exist.
        log_name: Base name for the log file. Timestamp will be appended automatically.

    Raises:
        ValueError: If the data type is unsupported.
    """
    try:
        os.makedirs(save_directory, exist_ok=True)
        timestamp = datetime.now().strftime(r"%Y%m%d_%H%M")
        log_name = sanitize_filename(log_name)
        base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")

        if isinstance(data, list):
            _log_list_to_txt(data, base_path + ".txt")

        elif isinstance(data, dict):
            if all(isinstance(v, list) for v in data.values()):
                _log_dict_to_csv(data, base_path + ".csv")
            else:
                _log_dict_to_json(data, base_path + ".json")

        elif isinstance(data, pd.DataFrame):
            _log_dataframe_to_xlsx(data, base_path + ".xlsx")

        elif isinstance(data, str):
            _log_string_to_log(data, base_path + ".log")

        elif isinstance(data, BaseException):
            _log_exception_to_log(data, base_path + ".log")

        else:
            raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")

        print(f"Log saved to: {base_path}")

    except Exception as e:
        print(f"Error in custom_logger: {e}")


def _log_list_to_txt(data: List[Any], path: str) -> None:
    log_lines = []
    for item in data:
        try:
            log_lines.append(str(item).strip())
        except Exception:
            log_lines.append(f"(unrepresentable item of type {type(item)})")

    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(log_lines))


def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
    sanitized_dict = {}
    max_length = max(len(v) for v in data.values()) if data else 0

    for key, value in data.items():
        if not isinstance(value, list):
            raise ValueError(f"Dictionary value for key '{key}' must be a list.")
        sanitized_key = str(key).strip().replace('\n', '_').replace('\r', '_')
        padded_value = value + [None] * (max_length - len(value))
        sanitized_dict[sanitized_key] = padded_value

    df = pd.DataFrame(sanitized_dict)
    df.to_csv(path, index=False)
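
# Padding example (illustrative only): ragged lists are aligned to the longest one,
# so {"a": [1, 2, 3], "b": [4]} is written as a 3-row CSV in which column "b"
# holds 4 followed by two empty cells.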


def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
    writer = pd.ExcelWriter(path, engine='openpyxl')
    data.to_excel(writer, index=True, sheet_name='Data')

    worksheet = writer.sheets['Data']

    header_font = Font(bold=True)
    header_fill = PatternFill(
        start_color="ADD8E6",  # Light blue
        end_color="ADD8E6",
        fill_type="solid"
    )

    # Style the header row
    for cell in worksheet[1]:
        cell.font = header_font
        cell.fill = header_fill

    writer.close()


def _log_string_to_log(data: str, path: str) -> None:
    with open(path, 'w', encoding='utf-8') as f:
        f.write(data.strip() + '\n')


def _log_exception_to_log(exc: BaseException, path: str) -> None:
    with open(path, 'w', encoding='utf-8') as f:
        f.write("Exception occurred:\n")
        traceback.print_exception(type(exc), exc, exc.__traceback__, file=f)


def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)