PyPI - datasmryzr - Versions diffs - 0.0.1__py3-none-any.whl - Mend

datasmryzr 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

datasmryzr/__about__.py +4 -0
datasmryzr/__init__.py +3 -0
datasmryzr/annotate.py +270 -0
datasmryzr/clusters.py +315 -0
datasmryzr/core_genome.py +297 -0
datasmryzr/datasmryzr.py +97 -0
datasmryzr/distances.py +106 -0
datasmryzr/pangenome.py +211 -0
datasmryzr/smryz.py +472 -0
datasmryzr/summary.py +110 -0
datasmryzr/tables.py +243 -0
datasmryzr/templates/base_config.json +28 -0
datasmryzr/templates/report.html.j2 +899 -0
datasmryzr/tree.py +27 -0
datasmryzr/utils.py +94 -0
datasmryzr-0.0.1.dist-info/METADATA +54 -0
datasmryzr-0.0.1.dist-info/RECORD +20 -0
datasmryzr-0.0.1.dist-info/WHEEL +4 -0
datasmryzr-0.0.1.dist-info/entry_points.txt +2 -0
datasmryzr-0.0.1.dist-info/licenses/LICENSE.txt +9 -0

datasmryzr/__about__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2025-present Kristy <kristyhoran15@gmail.com>
+#
+# SPDX-License-Identifier: MIT
+__version__ = "0.0.1"

datasmryzr/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2025-present Kristy <kristyhoran15@gmail.com>
+#
+# SPDX-License-Identifier: MIT

datasmryzr/annotate.py ADDED Viewed

@@ -0,0 +1,270 @@
+"""
+This module provides functions for generating metadata annotations
+and legends for a DataFrame, mapping metadata variables to colors.
+"""
+import pandas as pd
+import json
+from mycolorpy import colorlist as mcp
+from datasmryzr import utils
+def _open_file(file_path:str) -> pd.DataFrame:
+    """
+    Load a delimited text file into a DataFrame.
+    The separator is not fixed: ``sep=None`` together with the python
+    engine asks pandas to detect it from the file.
+    Args:
+        file_path (str): Path to the file.
+    Returns:
+        pd.DataFrame: The parsed table.
+    """
+    return pd.read_csv(file_path, engine = 'python', sep = None)
+def _check_vals(df:pd.DataFrame,
+                cols:list,
+                cfg:dict) -> list:
+    """
+    Validates and filters the specified columns from a DataFrame based on
+    their values and configuration settings.
+    A column is kept when every one of its unique values is a string, or
+    when it is listed under 'categorical_columns' in the configuration.
+    The first column of the DataFrame (the identifier column) is never kept.
+    Args:
+        df (pd.DataFrame): The input DataFrame to check.
+        cols (list): A list of column names to validate.
+        cfg (dict): A configuration dictionary. The optional key
+            'categorical_columns' lists columns to treat as categorical
+            regardless of their values.
+    Returns:
+        list: The valid column names, in the order given in `cols`.
+    Raises:
+        ValueError: If none of the specified columns are valid or if none of
+        the columns exist in the DataFrame.
+    """
+    id_col = df.columns[0]
+    categorical = cfg.get('categorical_columns') or []
+    found_any = False
+    final_cols = []
+    for col in cols:
+        if col not in df.columns:
+            continue
+        found_any = True
+        if col == id_col:
+            # The identifier column is not an annotation.
+            continue
+        # Every value has to be a string. Checking only the last unique value
+        # would let mixed numeric/text columns slip through.
+        is_string = all(isinstance(val, str) for val in df[col].unique())
+        if is_string or col in categorical:
+            final_cols.append(col)
+    if final_cols:
+        return final_cols
+    joined = ', '.join(cols)
+    if found_any:
+        raise ValueError(
+            f"Columns {joined} do not contain any valid values. "
+            "Please check the column names."
+        )
+    raise ValueError(
+        f"None of the columns {joined} are in the dataframe or "
+        "in the correct format - only non-numerical data can be included. "
+        "Please check the column names."
+    )
+def _get_cols(cols: list, df: pd.DataFrame, cfg: dict) -> list:
+    """
+    Resolve the requested columns and validate them with `_check_vals`.
+    Args:
+        cols (list): Column names to use, or the string "all" to request
+        every column of the DataFrame.
+        df (pd.DataFrame): The DataFrame the columns belong to.
+        cfg (dict): Configuration dictionary handed on to `_check_vals`.
+    Returns:
+        list: The column names that passed validation.
+    """
+    requested = df.columns.tolist() if cols == "all" else cols
+    return _check_vals(df=df, cols=requested, cfg=cfg)
+def _get_colors(df:pd.DataFrame,
+                cols:list) -> dict:
+    """
+    Build a mapping of CSS-compatible colour names to colour codes.
+    For each column, one colour per unique value is requested from the
+    "tab20b" colormap. Each colour code is stored under a name made by
+    replacing "#" with "a" (presumably so a hex code such as "#393b79"
+    becomes a usable CSS class name - confirm against the report template).
+    Args:
+        df (pd.DataFrame): The input DataFrame containing the data.
+        cols (list): Column names whose unique values need colours.
+    Returns:
+        dict: Colour name -> colour code, in first-seen order. The order is
+        deterministic, so anything pairing values with colours by position
+        gives the same result on every run.
+    """
+    colors_css: dict = {}
+    for col in cols:
+        n_values = len(df[col].unique())
+        for cl in mcp.gen_color(cmap="tab20b", n=n_values):
+            # setdefault de-duplicates while keeping insertion order; a set
+            # would make the order depend on string hashing.
+            colors_css.setdefault(cl.replace("#", "a"), cl)
+    return colors_css
+def _make_legend(df:pd.DataFrame,
+                 cols:list,
+                 color_css:dict) -> dict:
+    """
+    Pair the unique values of each column with colour names.
+    Args:
+        df (pd.DataFrame): The input DataFrame containing the data.
+        cols (list): Column names to build legend entries for.
+        color_css (dict): Colour mapping; only its keys (the colour names)
+        are used, in dictionary order.
+    Returns:
+        dict: For every column in `cols`, a list of single-entry
+        dictionaries of the form {value: colour_name}.
+    Notes:
+        - Values equal to "NA" get no legend entry.
+        - Values and colour names are paired by position, so pairing stops
+        at the shorter of the two; surplus values stay unmapped.
+    """
+    legend: dict = {}
+    for col in cols:
+        palette = list(color_css.keys())
+        entries = []
+        shown = (val for val in df[col].unique() if val != "NA")
+        for val, color in zip(shown, palette):
+            entries.append({val: color})
+        legend[col] = entries
+    return legend
+def _get_metadata_tree(df:pd.DataFrame,
+                       cols:list,
+                       legend: dict,
+                       color_css:dict) -> dict:
+    """
+    Build the per-row annotation structure from a DataFrame.
+    Each row is keyed by its value in the first column of the DataFrame
+    (`tiplabel`). Under that key, every column in `cols` (other than the
+    first column itself) gets a dictionary holding the value as "label" and
+    the resolved colour code as "colour".
+    Args:
+        df (pd.DataFrame): Input data; the first column supplies the keys.
+        cols (list): Column names to include.
+        legend (dict): Column name -> list of single-entry dictionaries
+                        mapping a value to a colour name.
+                        Example: { "col1": [{"value1": "red"}, {"value3": "blue"}] }.
+        color_css (dict): Colour name -> CSS colour code.
+                            Example: { "red": "#FF0000", "blue": "#0000FF" }.
+    Returns:
+        dict: Nested dictionary as described above. A value without a legend
+                entry, or a colour name missing from `color_css`, falls back
+                to the colour "white".
+    Example:
+        Input DataFrame:
+            +---------+--------+--------+
+            | tiplabel| col1   | col2   |
+            +---------+--------+--------+
+            | A       | value1 | value2 |
+            | B       | value3 | value4 |
+            +---------+--------+--------+
+        cols = ["col1", "col2"]
+        legend = {
+            "col1": [{"value1": "red"}, {"value3": "blue"}],
+            "col2": [{"value2": "green"}, {"value4": "yellow"}]
+        }
+        color_css = {"red": "#FF0000", "blue": "#0000FF", "green": "#00FF00", "yellow": "#FFFF00"}
+        Output:
+        {
+            "A": {
+                "col1": {"colour": "#FF0000", "label": "value1"},
+                "col2": {"colour": "#00FF00", "label": "value2"}
+            },
+            "B": {
+                "col1": {"colour": "#0000FF", "label": "value3"},
+                "col2": {"colour": "#FFFF00", "label": "value4"}
+            }
+        }
+    """
+    tiplabel = df.columns[0]
+    metadata_tree = {}
+    for _, row in df.iterrows():
+        annotations = {}
+        for col in cols:
+            if col == tiplabel:
+                continue
+            value = row[col]
+            colour_name = "white"
+            # The first legend entry that knows this value wins.
+            for lg in legend[col]:
+                if value in lg:
+                    colour_name = lg[value]
+                    break
+            annotations[col] = {
+                "colour": color_css.get(colour_name, "white"),
+                "label": value,
+            }
+        metadata_tree[row[tiplabel]] = annotations
+    return metadata_tree
+def construct_annotations(path: str,
+                          cols: list,
+                          config:str) -> dict:
+    """
+    Build the annotation structures for a metadata file.
+    The file is read, missing values are replaced with "NA", the usable
+    columns are selected, and colours, legend and per-row metadata are
+    derived from them. Without a path, empty structures are returned.
+    Args:
+        path (str): Path to the metadata file. If empty, default values
+            are returned.
+        cols (list): Column names to use (passed to `_get_cols`).
+        config (str): Passed to `utils.get_config` to obtain the
+            configuration dictionary.
+    Returns:
+        dict: A dictionary containing the following keys:
+        - "metadata_tree" (dict): Per-row colour/label data.
+        - "metadata_columns" (list): The validated column names.
+        - "colors_css" (dict): Colour name to colour code mapping.
+        - "legend": Legend entries per column (an empty list when no
+          path is given, otherwise the dict built by `_make_legend`).
+    """
+    result = {
+        "metadata_tree": {},
+        "metadata_columns": [],
+        "colors_css": {},
+        "legend": [],
+    }
+    if path:
+        table = _open_file(path).fillna("NA")
+        settings = utils.get_config(config)
+        selected = _get_cols(cols=cols, df=table, cfg=settings)
+        palette = _get_colors(df=table, cols=selected)
+        legend = _make_legend(df=table, cols=selected, color_css=palette)
+        result["metadata_tree"] = _get_metadata_tree(
+            df=table, cols=selected, legend=legend, color_css=palette
+        )
+        result["metadata_columns"] = selected
+        result["colors_css"] = palette
+        result["legend"] = legend
+    return result

datasmryzr/clusters.py ADDED Viewed

@@ -0,0 +1,315 @@
+"""
+This module provides functions for processing pairwise distances between isolates,
+including generating histograms and heatmaps for visualization.
+"""
+import pandas as pd
+import pathlib
+import json
+import altair as alt
+from datasmryzr.utils import check_file_exists
+from datasmryzr.distances import _get_distances
+def _get_cluster_table(
+        clusters: str
+    ) -> pd.DataFrame:
+    """
+    Read the cluster table, with every column loaded as strings.
+    The separator is detected by pandas (``sep=None`` with the python
+    engine). Any failure is printed and an empty DataFrame is returned, so
+    callers can test `.empty` instead of handling exceptions.
+    Args:
+        clusters (str): Path to the cluster table.
+    Returns:
+        pd.DataFrame: The table, or an empty DataFrame if reading failed.
+    """
+    try:
+        table = pd.read_csv(clusters, dtype=str, engine='python', sep=None)
+    except Exception as e:
+        print(e)
+        table = pd.DataFrame()
+    return table
+def _get_distance_data(
+        cluster_df: pd.DataFrame,
+        distances_df: pd.DataFrame
+    ) -> pd.DataFrame:
+    """
+    Placeholder: not implemented.
+    Both arguments are ignored and None is returned, despite the
+    annotated return type.
+    """
+    return None
+def _combine_cluster_ids(
+                         clusters:pd.DataFrame) -> pd.DataFrame:
+    """
+    Prefix each cluster ID with the ID of its cluster at the next larger
+    threshold, joined by ":".
+    Thresholds are processed from largest to smallest, so prefixes build up
+    level by level. If the value at the larger threshold contains "UC",
+    that value is copied down unchanged instead of being joined.
+    Args:
+        clusters (pd.DataFrame): Cluster table with "Tx:<threshold>" columns.
+        It is modified in place.
+    Returns:
+        pd.DataFrame: The same DataFrame object, with the threshold columns
+        rewritten.
+    """
+    if clusters.empty:
+        # Nothing to combine; a row-wise apply on an empty frame does not
+        # produce a column that can be assigned back.
+        return clusters
+    thresholds = _get_thresholds(clusters)
+    # Walk adjacent (larger, smaller) threshold pairs. Each step reads the
+    # column written by the previous step, so prefixes accumulate.
+    for parent, child in zip(thresholds, thresholds[1:]):
+        parent_col = f"Tx:{parent}"
+        child_col = f"Tx:{child}"
+        # .iloc makes the positional access explicit; plain x[0] on a
+        # label-indexed row relies on a deprecated pandas fallback.
+        clusters[child_col] = clusters[[parent_col, child_col]].apply(
+            lambda x: ':'.join(x) if "UC" not in f"{x.iloc[0]}" else x.iloc[0],
+            axis = 1,
+        )
+    return clusters
+def _get_thresholds(clusters: pd.DataFrame) -> list:
+    """
+    Extract the thresholds encoded in the "Tx:<threshold>" column names.
+    Only string column names starting with "Tx:" are considered. Other
+    columns, including ones that merely contain "Tx", are ignored.
+    Args:
+        clusters (pd.DataFrame): Cluster table.
+    Returns:
+        list: The thresholds as integers, sorted from largest to smallest.
+    Raises:
+        ValueError: If the text after "Tx:" is not an integer.
+    """
+    return sorted(
+        (
+            int(t.split(':')[1])
+            for t in clusters.columns
+            if isinstance(t, str) and t.startswith("Tx:")
+        ),
+        reverse=True,
+    )
+def _create_tree_for_traversal(
+                               clusters: pd.DataFrame) -> dict:
+    """
+    Build a parent -> children mapping over the cluster IDs.
+    The key 'all' lists the clusters at the largest threshold (excluding
+    the exact value "UC"). Every other key is a cluster ID whose value is
+    the list of cluster IDs its rows carry at the next smaller threshold;
+    clusters at the smallest threshold map to an empty list. Rows whose ID
+    contains "UC" at a level are dropped from that level downwards.
+    Args:
+        clusters (pd.DataFrame): Cluster table with "Tx:<threshold>" columns.
+    Returns:
+        dict: The mapping described above.
+    """
+    levels = _get_thresholds(clusters)
+    top_ids = clusters[f"Tx:{levels[0]}"].unique()
+    tree = {'all': [c for c in top_ids if c != "UC"]}
+    for depth, level in enumerate(levels):
+        level_col = f"Tx:{level}"
+        clusters = clusters[~clusters[level_col].str.contains("UC")]
+        has_next = depth + 1 < len(levels)
+        for cl in clusters[level_col].unique():
+            if cl in tree:
+                continue
+            if has_next:
+                next_col = f"Tx:{levels[depth + 1]}"
+                tree[cl] = list(clusters[clusters[level_col] == cl][next_col].unique())
+            else:
+                tree[cl] = []
+    return tree
+def _construct_table_dict(tree, node, clusters, visited=None):
+    """
+    Recursively convert the cluster tree into nested row dictionaries.
+    Args:
+        tree (dict): Parent -> list of child cluster IDs.
+        node (str): The cluster ID to build a row for.
+        clusters (pd.DataFrame): Cluster table, used to count the rows
+        carrying `node`.
+        visited (set, optional): IDs already handled. Shared across the
+        recursion so that no ID is expanded twice.
+    Returns:
+        dict: {'Cluster ID': node, 'Num seqs': <row count>, '_children':
+        [<same structure for each child>]}. IDs containing "UC" are never
+        expanded and never added as children.
+    """
+    if visited is None:
+        visited = set()
+    visited.add(node)
+    size = 0
+    for col in clusters.columns:
+        members = clusters[clusters[col] == node]
+        if not members.empty:
+            # If several columns contain the ID, the last one supplies the count.
+            size = members.shape[0]
+    data = {'Cluster ID': node, 'Num seqs': size, '_children': []}
+    if "UC" in node:
+        return data
+    for child in tree[node]:
+        if child in visited or "UC" in child:
+            continue
+        data['_children'].append(
+            _construct_table_dict(tree = tree, node = child, clusters = clusters, visited = visited)
+        )
+    return data
+def get_cluster_distances(
+        clusters: str,
+        distances: str
+    ) -> pd.DataFrame:
+    """
+    Collect, per cluster, the distance-matrix rows and columns of its isolates.
+    Args:
+        clusters (str): Path to the cluster table. Its isolate identifiers
+        are read from a column named 'ID'.
+        distances (str): Path to a tab-separated distance matrix with an
+        "Isolate" column and one column per isolate.
+    Returns:
+        dict: Cluster ID -> {'table': list of row records, 'columns': list
+        of column definition dicts}. Empty if the distance file does not
+        exist or the cluster table is empty. (The annotated return type says
+        DataFrame, but a dict is what is returned.)
+    """
+    def _column_spec(name, is_id):
+        # Text column for the identifier, numeric "<=" filter for the rest.
+        if is_id:
+            return {'field': name, 'title': name, 'type': 'string', 'headerFilter':'input',
+                    'headerFilterPlaceholder':f'Search {name}',
+                    'formatter':"textarea"}
+        return {'field': name, 'title': name, 'type': 'number', 'headerFilter':'number', 'headerFilterFunc':"<=",
+                'headerFilterPlaceholder':'Less than ...',
+                'formatter':"number",}
+    cluster_df = _combine_cluster_ids(_get_cluster_table(clusters))
+    thresholds = _get_thresholds(cluster_df)
+    dists = {}
+    if not check_file_exists(distances) or cluster_df.empty:
+        return dists
+    distances_df = pd.read_csv(distances, sep = "\t")
+    tree = _create_tree_for_traversal(cluster_df)
+    id_col = distances_df.columns[0]
+    for cl in tree:
+        for th in thresholds:
+            level = cluster_df[f"Tx:{th}"]
+            if cl not in level.values:
+                continue
+            isolates = list(cluster_df[level == cl]['ID'])
+            ccols = ["Isolate", *isolates]
+            rows = distances_df[distances_df["Isolate"].isin(isolates)][ccols]
+            dists[cl] = {
+                'table': rows.to_dict(orient='records'),
+                'columns': [_column_spec(col, col == id_col) for col in ccols]
+            }
+    return dists
+def _save_cluster_table(cluster_table: dict) -> str:
+    """
+    Write the cluster table as indented JSON to "clusters.json" in the
+    current working directory.
+    Args:
+        cluster_table (dict): JSON-serialisable data to write.
+    Returns:
+        str: Path of the written file.
+    """
+    target = pathlib.Path.cwd().joinpath("clusters.json")
+    with target.open('w') as handle:
+        json.dump(cluster_table, handle, indent=4)
+    return str(target)
+def get_cluster_table(
+        clusters: str,
+        distances: str
+    ) -> str:
+    """
+    Build the nested cluster table and save it as JSON.
+    Args:
+        clusters (str): Path to the cluster table.
+        distances (str): Path to the distances file; only used to check
+        that distance data is available.
+    Returns:
+        str: Path of the written JSON file. An empty dict is returned
+        instead when either input yields no data.
+    """
+    distances_df = _get_distances(distances)
+    cluster_df = _get_cluster_table(clusters)
+    # The thresholds are not used below; the call is kept so that malformed
+    # column names fail here, before anything is written.
+    _get_thresholds(cluster_df)
+    if cluster_df.empty or distances_df.empty:
+        return {}
+    combined = _combine_cluster_ids(cluster_df)
+    hierarchy = _create_tree_for_traversal(combined)
+    root = _construct_table_dict(tree = hierarchy, node = 'all', clusters = combined)
+    # Only the children of the artificial 'all' root are saved.
+    return _save_cluster_table(root['_children'])
+def _get_clustered(clusters: str, threshold:int) -> pd.DataFrame:
+    """
+    List the identifiers of the rows that are clustered at a threshold.
+    A row counts as clustered when its "Tx:<threshold>" value does not
+    contain "UC". Identifiers come from the first column of the table.
+    Args:
+        clusters: The cluster table (used as a DataFrame, despite the
+        annotation).
+        threshold (int): Threshold selecting the "Tx:<threshold>" column.
+    Returns:
+        list: Identifiers of the clustered rows.
+    """
+    unclustered = clusters[f"Tx:{threshold}"].str.contains("UC")
+    id_column = clusters.columns[0]
+    return clusters[~unclustered][id_column].tolist()
+def _cluster_statistics(
+        cluster_df: str,
+        distances_df: str,
+        thresholds: list,
+        id_col: str = None
+    ) -> pd.DataFrame:
+    """
+    Collect pairwise distances per cluster and threshold, labelled as
+    intra- or inter-cluster measurements.
+    Args:
+        cluster_df: Cluster table with "Tx:<threshold>" columns. Annotated
+        as str, but indexed as a DataFrame throughout.
+        distances_df: Distances with "Isolate1" and "Isolate2" columns.
+        Annotated as str, but used as a DataFrame.
+        thresholds (list): Thresholds to process.
+        id_col (str): Name of the identifier column in `cluster_df`.
+    Returns:
+        pd.DataFrame: The selected distance rows with the added columns
+        "pair", "Cluster ID", "SNP Threshold" and "Measurement", with
+        duplicates of ("pair", "Cluster ID") removed (first occurrence kept).
+    Notes:
+        NOTE(review): if no rows are collected, pd.concat is called with an
+        empty list - presumably that raises; the visible caller wraps this
+        function in a broad try/except.
+    """
+    # Holds both intra- and inter-cluster frames, despite the name.
+    intra_clusters = []
+    for th in thresholds:
+        clustered = _get_clustered(cluster_df, th)
+        # Keep only distances where both isolates are clustered at this threshold.
+        cdf = distances_df[distances_df["Isolate1"].isin(clustered) & distances_df["Isolate2"].isin(clustered)]
+        for cl in cluster_df[f"Tx:{th}"].unique():
+            if "UC" not in cl:
+                cldf = cluster_df[cluster_df[f"Tx:{th}"] == cl]
+                if not cdf.empty:
+                    # Intra-cluster: both isolates of the pair belong to cluster `cl`.
+                    tmp = cdf[cdf["Isolate1"].isin(cldf[id_col] )]
+                    tmp = tmp[tmp["Isolate2"].isin(cldf[id_col])]
+                    # Order-independent key for the pair, used for de-duplication below.
+                    tmp["pair"] = tmp[["Isolate1", "Isolate2"]].apply(lambda x: "_".join(sorted(x)), axis=1)
+                    tmp["Cluster ID"] = f"{cl}"
+                    tmp["SNP Threshold"] = th
+                    tmp["Measurement"] = "Intra-cluster distance"
+                    intra_clusters.append(tmp)
+                # Rows belonging to any other (non-"UC") cluster at this threshold.
+                inter = cluster_df[(cluster_df[f"Tx:{th}"] != cl) & (~cluster_df[f"Tx:{th}"].str.contains("UC"))]
+                for cluster in inter[f"Tx:{th}"].unique():
+                    intery = inter[inter[f"Tx:{th}"] == cluster]
+                    for i in cldf[id_col].unique(): # get each isolate in the cluster
+                        # Members of the other cluster plus isolate `i` itself.
+                        interx = pd.concat([intery, cldf[cldf[id_col] == i]])
+                        tmp2 = cdf[cdf["Isolate1"]== i]
+                        # NOTE(review): this EXCLUDES pairs whose second isolate is in the
+                        # other cluster (or is `i`), keeping all remaining pairs of `i` -
+                        # confirm that this is the intended inter-cluster selection.
+                        tmp2 = tmp2[~tmp2["Isolate2"].isin(interx[id_col])]
+                        tmp2["pair"] = tmp2[["Isolate1", "Isolate2"]].apply(lambda x: "_".join(sorted(x)), axis=1)
+                        tmp2["Cluster ID"] = f"{cl}"
+                        tmp2["SNP Threshold"] = th
+                        tmp2["Measurement"] = "Inter-cluster distance"
+                        intra_clusters.append(tmp2)
+    cdf_all = pd.concat(intra_clusters, ignore_index=True)
+    # First occurrence wins, so for a given pair and cluster the frame that
+    # was appended earlier (the intra-cluster one, when present) is kept.
+    cdf_all.drop_duplicates(subset=["pair", "Cluster ID"], inplace=True)
+    return cdf_all
+def _generate_cluster_graphs(
+        cdf_all: pd.DataFrame,
+        clusters: pd.DataFrame,
+        id_col: str,
+        thresholds: list
+    ) -> str:
+    """
+    Build the cluster summary charts and return them as a JSON string.
+    For every threshold one row of charts is produced: a bar for the
+    unclustered rows, bars with the number of sequences per cluster, and
+    box plots overlaid with points for the intra- and inter-cluster
+    distances. The rows are stacked vertically.
+    Args:
+        cdf_all (pd.DataFrame): Distances with "Measurement",
+        "SNP Threshold", "Cluster ID", "Distance" and "pair" columns.
+        clusters (pd.DataFrame): Cluster table with "Tx:<threshold>" columns.
+        id_col (str): Name of the identifier column in `clusters`.
+        thresholds (list): Thresholds to draw, one chart row each.
+    Returns:
+        str: The combined chart serialised with `to_json()`.
+    """
+    charts = []
+    total = clusters.shape[0]
+    for th in thresholds:
+        # ":" separates field name and type in the encoding shorthand, so
+        # the "Tx:<th>" column is renamed to "Tx_<th>" for plotting.
+        field = f"Tx_{th}"
+        clustered_list = _get_clustered(clusters, th)
+        tmp = clusters[clusters[id_col].isin(clustered_list)]
+        tmp = tmp.rename(columns={f"Tx:{th}": field})
+        uc = clusters[~clusters[id_col].isin(clustered_list)]
+        uc = uc.rename(columns={f"Tx:{th}": field})
+        uc[field] = "UC"
+        num_cls = tmp.shape[0]
+        clustered_graph = alt.Chart(tmp).mark_bar().encode(
+            x=alt.X(f'{field}:N', title = None),
+            y=alt.Y('count():Q', title = None).scale(domain=[0, total]),
+            # The colour must use the same renamed field as x; a plain
+            # (non-f) string here would name a field that does not exist.
+            color=alt.Color(f'{field}:N', scale=alt.Scale(scheme='viridis')).legend(None),
+            tooltip=[f'{field}:N', 'count():Q']
+        ).properties(
+            width=200,
+            title = "Number sequences per cluster"
+        )
+        unclustered_graph  = alt.Chart(uc).mark_bar(color="grey").encode(
+            x=alt.X(f'{field}:N', title = None),
+            y=alt.Y('count():Q', title = "Number of Isolates").scale(domain=[0, total]),
+            tooltip=[f'{field}:N', 'count():Q']
+        ).properties(
+            width=300/(num_cls + 1),
+        )
+        graphs = [unclustered_graph,clustered_graph]
+        for m in ["Intra-cluster distance", "Inter-cluster distance"]:
+            subset = cdf_all[(cdf_all["Measurement"] == m) & (cdf_all["SNP Threshold"] == th)]
+            box = alt.Chart(subset).mark_boxplot(extent='min-max', opacity=.3).encode(
+                    y=alt.Y("Distance:Q", sort=None, title = f"Pairwise Distance (threshold: {th})"),
+                    x=alt.X('Cluster ID:N'),
+                    color=alt.Color('Cluster ID').scale(scheme='viridis').legend(None)
+                )
+            scatter = alt.Chart(subset).mark_circle(size=80).encode(
+                    y=alt.Y("Distance:Q", sort=None),
+                    x=alt.X('Cluster ID:N'),
+                    color=alt.Color('Cluster ID').scale(scheme='viridis').legend(None),
+                    tooltip=['pair', 'Distance'],
+                )
+            chart = box + scatter
+            chart = chart.properties(title=f"{m}", width = 500)
+            graphs.append(chart)
+        graph = alt.hconcat(*graphs).resolve_scale(
+            y='independent').properties(
+            title = alt.Title(f"SNP threshold {th}", anchor='start', fontSize=20, dy=-10, baseline='middle')
+        )
+        charts.append(graph)
+    final_chart = alt.vconcat(*charts).configure_axis(
+                        grid=False
+                                    ).configure_view(
+                                    stroke=None
+                                )
+    return final_chart.to_json()
+def get_cluster_graphs(
+        clusters: str,
+        distances: str
+    ) -> dict:
+    """
+    Produce the cluster charts for a cluster table and a distances file.
+    Args:
+        clusters (str): Path to the cluster table.
+        distances (str): Path to the distances file.
+    Returns:
+        The chart specification returned by `_generate_cluster_graphs`. If
+        computing the statistics or the charts fails, the error is printed
+        and an empty dict is returned.
+    """
+    distances_df = _get_distances(distances)
+    cluster_df = _get_cluster_table(clusters)
+    # Thresholds are read before the IDs are combined, as in the original order.
+    thresholds = _get_thresholds(cluster_df)
+    cluster_df = _combine_cluster_ids(cluster_df)
+    id_col = cluster_df.columns[0]
+    try:
+        stats = _cluster_statistics(cluster_df = cluster_df, distances_df = distances_df, thresholds= thresholds, id_col = id_col)
+        chart_json = _generate_cluster_graphs(cdf_all = stats, clusters= cluster_df, id_col = id_col, thresholds= thresholds)
+    except Exception as e:
+        print(e)
+        return {}
+    return chart_json
+# <button class="btn btn-sm btn-outline-secondary" style= "margin:2px;" id="information-button" data-bs-toggle="modal" data-bs-target="#myModal"><i class="bi bi-info-circle" style = "font-size: 1.2rem;"></i> Info</button>