ssi-analysis-result-parsers 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,669 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/49_Ecoli_parser.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['thresholds', 'samplesheet_path', 'output_dir', 'output_path', 'sample_sheet_df', 'sample_output_df', 'original_cols',
5
+ 'output_cols', 'output_initial_cols', 'output_specific_cols', 'ERR3528110_res_path', 'ERR3528110_input_df',
6
+ 'ERR3528110_row', 'gene_hits', 'parsed_hits', 'O_gene_alleles', 'H_gene_alleles', 'O_type', 'H_type',
7
+ 'O_gene_keys', 'H_gene_keys', 'O_genes_no', 'H_genes_no', 'ERR14229029_row', 'ERR14229029_expected_values',
8
+ 'ERR14229029_values', 'test_cases', 'setup_logging', 'get_threshold', 'process_res_file', 'EcoliResults',
9
+ 'ecoli_parser']
10
+
11
+ # %% ../nbs/49_Ecoli_parser.ipynb 3
12
+ import os
13
+ import sys
14
+ import pandas as pd
15
+ from pathlib import Path
16
+ import logging
17
+ from datetime import datetime
18
+ from typing import List, Dict
19
+ from fastcore.script import call_parse
20
+
21
+ # import functions from core module (optional, but most likely needed).
22
+ from . import core
23
+
24
+ # %% ../nbs/49_Ecoli_parser.ipynb 6
25
+ thresholds = {
26
+ "stx": [98, 98],
27
+ "wzx": [98, 98],
28
+ "wzy": [98, 98],
29
+ "wzt": [98, 98],
30
+ "wzm": [98, 98],
31
+ "fliC": [90, 90],
32
+ "fli": [90, 90],
33
+ "eae": [95, 95],
34
+ "ehxA": [95, 95],
35
+ "other": [98, 98],
36
+ }
37
+
38
+ # %% ../nbs/49_Ecoli_parser.ipynb 9
39
+ def setup_logging(log_dir: str, sample_name: str) -> None:
40
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
41
+ os.makedirs(log_dir, exist_ok=True)
42
+ log_file = os.path.join(log_dir, f"{sample_name}_kma_fbi.log")
43
+
44
+ logger = logging.getLogger()
45
+ while logger.hasHandlers():
46
+ logger.removeHandler(logger.handlers[0])
47
+
48
+ logging.basicConfig(
49
+ filename=log_file,
50
+ filemode="a",
51
+ format="%(asctime)s - %(levelname)s - %(message)s",
52
+ level=logging.INFO,
53
+ )
54
+
55
+ console_handler = logging.StreamHandler(sys.stdout)
56
+ console_handler.setFormatter(logging.Formatter("%(message)s"))
57
+ logger.addHandler(console_handler)
58
+
59
+ logging.info(f"Logging started for {log_file}")
60
+
61
+ # %% ../nbs/49_Ecoli_parser.ipynb 11
62
+ def get_threshold(template_name: str, thresholds: Dict[str, List[int]]) -> List[int]:
63
+ """
64
+ Returns the coverage and identity threshold for a given gene.
65
+
66
+ Args:
67
+ template_name (str): Name of the template (gene) from the .res file.
68
+ thresholds (Dict[str, List[int]]): Dictionary of gene thresholds.
69
+
70
+ Returns:
71
+ List[int]: A list of two integers: [coverage_threshold, identity_threshold].
72
+ """
73
+ for key in thresholds:
74
+ if key in template_name:
75
+ return thresholds[key]
76
+ return thresholds["other"]
77
+
78
+
79
+ def process_res_file(res_file_path: str) -> pd.DataFrame:
80
+ """
81
+ Reads and filters a KMA .res file based on predefined thresholds.
82
+
83
+ Args:
84
+ res_file_path (str): Path to the .res file.
85
+         Gene-specific cutoffs come from the module-level thresholds dict (not a parameter).
86
+
87
+ Returns:
88
+ pd.DataFrame: Filtered results DataFrame.
89
+ """
90
+ try:
91
+ res_df = pd.read_csv(res_file_path, sep="\t")
92
+ except FileNotFoundError:
93
+ raise FileNotFoundError(f"File not found: {res_file_path}")
94
+ except pd.errors.EmptyDataError:
95
+ raise ValueError(f"File is empty or not properly formatted: {res_file_path}")
96
+
97
+ required_columns = {"#Template", "Template_Coverage", "Query_Identity", "Depth"}
98
+ if not required_columns.issubset(res_df.columns):
99
+ raise ValueError(f"Missing expected columns in {res_file_path}")
100
+
101
+ res_df["threshold"] = res_df["#Template"].apply(
102
+ lambda x: get_threshold(x, thresholds)
103
+ )
104
+ res_df_filtered = res_df[
105
+ (res_df["Template_Coverage"] >= res_df["threshold"].apply(lambda x: x[0]))
106
+ & (res_df["Query_Identity"] >= res_df["threshold"].apply(lambda x: x[1]))
107
+ ]
108
+ return res_df_filtered
109
+
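To make the substring-based lookup above concrete, here is a minimal usage sketch (editor's illustration, not part of the package); the template names and the .res path are hypothetical, and the import path follows the module layout recorded in this wheel.

    from ssi_analysis_result_parsers.Ecoli_parser import get_threshold, process_res_file, thresholds

    # get_threshold returns the first [coverage, identity] pair whose key is a substring of the
    # template name; names matching no key fall back to the "other" cutoffs of [98, 98].
    assert get_threshold("1__wzx__O103__X", thresholds) == [98, 98]
    assert get_threshold("3__fliC__H2__X", thresholds) == [90, 90]
    assert get_threshold("1__adk__adk__X", thresholds) == [98, 98]

    # process_res_file reads a tab-separated KMA .res file (hypothetical path) and keeps only the
    # rows whose Template_Coverage and Query_Identity meet their gene-specific cutoffs.
    filtered = process_res_file("some_sample.res")
    print(filtered[["#Template", "Template_Coverage", "Query_Identity", "Depth"]])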
110
+ # %% ../nbs/49_Ecoli_parser.ipynb 13
111
+ class EcoliResults:
112
+ """
113
+ Object for holding and processing E. coli typing results.
114
+
115
+     This class stores summary typing data for multiple samples, provides utilities for per-sample processing, and exports results in tab-separated format (.tsv).
116
+ """
117
+
118
+     # converts the sample results dict to a pandas DataFrame
119
+ def __init__(self, results_dict: dict):
120
+ """
121
+ Initializes the EcoliResults object with typing result data.
122
+
123
+ Args:
124
+ results_dict (dict): Dictionary where keys are sample names and values are summary result dictionaries.
125
+ """
126
+ self.results_dict = results_dict
127
+ self.results_df = pd.DataFrame.from_dict(
128
+ results_dict, orient="index"
129
+ ).reset_index(names="sample_name")
130
+
131
+ @staticmethod
132
+ def summarize_single_sample(
133
+ sample_name: str, res_path: str, verbose_flag: int = 1
134
+ ) -> dict:
135
+ """
136
+ Processes a single sample KMA .res file and returns a summary dictionary.
137
+
138
+ Args:
139
+ sample_name (str): Sample identifier.
140
+ res_path (str): Path to the sample's .res file.
141
+ verbose_flag (int, optional): Include verbose info if set to 1. Default is 1.
142
+
143
+ Returns:
144
+ Dict[str, str]: Summary values extracted from the .res file.
145
+ """
146
+ log_dir = "examples/Log"
147
+ setup_logging(log_dir, sample_name)
148
+
149
+ NA_string = "-"
150
+ output_data = {
151
+ "stx": NA_string,
152
+ "OH": NA_string,
153
+ "wzx": NA_string,
154
+ "wzy": NA_string,
155
+ "wzt": NA_string,
156
+ "wzm": NA_string,
157
+ "eae": NA_string,
158
+ "ehxA": NA_string,
159
+ "Other": NA_string,
160
+ }
161
+
162
+ try:
163
+ logging.info(f"Processing .res file: {res_path}")
164
+ filtered_df = process_res_file(res_path)
165
+ except Exception as e:
166
+ logging.error(f"Failed to process {res_path}: {e}")
167
+ return output_data
168
+
169
+ gene_map = {
170
+ "wzx": "wzx",
171
+ "wzy": "wzy",
172
+ "wzt": "wzt",
173
+ "wzm": "wzm",
174
+ "eae": "eae",
175
+ "ehxA": "ehxA",
176
+ }
177
+ toxin = "stx"
178
+ stx_alleles = set()
179
+ fli = NA_string
180
+ fliC = NA_string
181
+
182
+ for template in filtered_df["#Template"]:
183
+ parts = template.split("__")
184
+ if len(parts) < 3:
185
+ continue
186
+ gene, allele = parts[1], parts[2]
187
+
188
+ if gene in ["eae", "ehxA"]:
189
+ output_data[gene] = "Positive"
190
+ elif gene in gene_map:
191
+ output_data[gene] = allele
192
+ elif gene == "fliC":
193
+ fliC = allele
194
+ elif gene == "fli":
195
+ fli = allele
196
+ elif gene.startswith(toxin):
197
+ stx_alleles.add(allele)
198
+ elif gene not in thresholds:
199
+ output_data["Other"] = allele
200
+
201
+ if stx_alleles:
202
+ output_data[toxin] = ";".join(sorted(stx_alleles))
203
+
204
+         # serotype-specific requirements
205
+ wzx, wzy, wzt, wzm = (
206
+ output_data["wzx"],
207
+ output_data["wzy"],
208
+ output_data["wzt"],
209
+ output_data["wzm"],
210
+ )
211
+ Otype = "-"
212
+ if (
213
+ wzx != NA_string
214
+ and wzy != NA_string
215
+ and wzx == wzy
216
+ and wzt == NA_string
217
+ and wzm == NA_string
218
+ ):
219
+ Otype = wzx
220
+ output_data["wzx"] = output_data["wzy"] = NA_string
221
+ elif (
222
+ wzt != NA_string
223
+ and wzm != NA_string
224
+ and wzt == wzm
225
+ and wzx == NA_string
226
+ and wzy == NA_string
227
+ ):
228
+ Otype = wzt
229
+ output_data["wzt"] = output_data["wzm"] = NA_string
230
+
231
+ Htype = fli if fli != NA_string else fliC
232
+ output_data["OH"] = f"{Otype};{Htype}"
233
+
234
+ # adding the additional depth, template coverage and query identity information
235
+ if verbose_flag == 1:
236
+ verbose_parts = []
237
+ for _, row in filtered_df.iterrows():
238
+ parts = row["#Template"].split("__")
239
+ if len(parts) >= 3:
240
+ gene, allele = parts[1], parts[2]
241
+ depth = row["Depth"]
242
+ coverage = row["Template_Coverage"]
243
+ identity = row["Query_Identity"]
244
+ verbose_parts.append(
245
+ f"{gene}_{allele}_{depth:.2f}_{coverage:.2f}_{identity:.2f}"
246
+ )
247
+ output_data["verbose"] = ";".join(verbose_parts)
248
+
249
+ logging.info(f"Successfully processed sample: {sample_name}")
250
+ return output_data
251
+
252
+ @classmethod
253
+ def from_samplesheet(
254
+ cls,
255
+ samplesheet_path: Path,
256
+ verbose: int = 1,
257
+ results_base: str = "examples/Results/{sample_name}/kma/{sample_name}.res",
258
+ ) -> "EcoliResults":
259
+ """
260
+ Loads sample data from a samplesheet and summarizes each sample.
261
+
262
+ Args:
263
+ samplesheet_path (Path): Path to the samplesheet TSV file.
264
+             verbose (int, optional): Whether to include verbose output per sample. Default is 1.
+             results_base (str, optional): Path template for locating each sample's .res file; "{sample_name}" is substituted per sample.
265
+
266
+ Returns:
267
+ EcoliResults: An instance of the class populated with summaries for all samples.
268
+ """
269
+ df = pd.read_csv(samplesheet_path, sep="\t")
270
+ df.columns = df.columns.str.strip()
271
+ # print("I AM INSIDE FROM SAMPLESHEET")
272
+ # if "Illumina_read_files" in df.columns and ("read1" not in df.columns or "read2" not in df.columns):
273
+ # df[["read1", "read2"]] = df["Illumina_read_files"].str.split(",", expand=True)
274
+
275
+ results_dict = {}
276
+ for idx, row in df.iterrows():
277
+ sample_name = row["sample_name"]
278
+ res_path = Path(
279
+ results_base.format(sample_name=sample_name)
280
+ ) # results_base / sample_name / "kma" / f"{sample_name}.res"
281
+ # print(f"The res path is : {res_path}")
282
+ summary = cls.summarize_single_sample(
283
+ sample_name, res_path, verbose_flag=verbose
284
+ )
285
+ results_dict[sample_name] = summary
286
+
287
+ # Convert to DataFrame
288
+ result_df = pd.DataFrame.from_dict(results_dict, orient="index").reset_index(
289
+ names="sample_name"
290
+ )
291
+
292
+ # Merge with original metadata
293
+ merged_df = df.merge(result_df, on="sample_name", how="left")
294
+
295
+ # Create and return object
296
+ obj = cls(results_dict)
297
+ obj.results_df = merged_df
298
+ return obj
299
+
300
+ def write_tsv(self, output_file: Path):
301
+ """
302
+ Writes the summarized typing results to a TSV file.
303
+
304
+ Args:
305
+ output_file (Path): Destination file path for the output table.
306
+ """
307
+ self.results_df.to_csv(output_file, sep="\t", index=False)
308
+
309
+ def __repr__(self):
310
+ """
311
+ Returns a concise summary of the results object.
312
+
313
+ Returns:
314
+ str: A string with sample and variable counts.
315
+ """
316
+ return f"<EcoliResults: {len(self.results_df)} samples, {len(self.results_df.columns)} variables>"
317
+
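As an orientation aid, a minimal sketch of driving EcoliResults directly from Python (editor's illustration; the samplesheet and results paths are hypothetical). The same flow is exposed on the command line through the get_ecoli_results entry point added further down in this diff.

    from pathlib import Path
    from ssi_analysis_result_parsers.Ecoli_parser import EcoliResults

    # The samplesheet is tab-separated and must contain a "sample_name" column; results_base is a
    # format template in which "{sample_name}" is substituted for every sample in the sheet.
    results = EcoliResults.from_samplesheet(
        Path("samplesheet.tsv"),
        verbose=1,
        results_base="results/{sample_name}/kma/{sample_name}.res",
    )
    results.write_tsv(Path("ecoli_summary.tsv"))  # samplesheet columns plus stx/OH/gene columns
    print(results)  # <EcoliResults: N samples, M variables>

Note that summarize_single_sample writes per-sample logs under examples/Log/ relative to the working directory.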
318
+ # %% ../nbs/49_Ecoli_parser.ipynb 15
319
+ @call_parse
320
+ def ecoli_parser(
321
+ samplesheet_path: Path, # Input samplesheet
322
+ output_file: Path = None, # Path to output
323
+ verbose: int = 1, # Verbosity,
324
+ results_base: str = "examples/Results/{sample_name}/kma/{sample_name}.res", # Path template for .res files
325
+ ):
326
+ results = EcoliResults.from_samplesheet(
327
+ samplesheet_path, verbose=verbose, results_base=results_base
328
+ )
329
+ if output_file:
330
+ results.write_tsv(output_file)
331
+ else:
332
+ print(results.results_df)
333
+
334
+ # %% ../nbs/49_Ecoli_parser.ipynb 17
335
+ # | eval: true
336
+ import pandas as pd
337
+ from pathlib import Path
338
+ import os
339
+
340
+ # Define paths
341
+ samplesheet_path = Path("test_input/Ecoli/samplesheet.tsv")
342
+ output_dir = Path("test_output/Ecoli")
343
+
344
+ # Create output directory
345
+ if not output_dir.exists():
346
+ output_dir.mkdir(parents=True, exist_ok=True)
347
+
348
+ output_path = output_dir / "KMA_cases_parser.tsv"
349
+
350
+ # Assert input exists
351
+ assert samplesheet_path.exists(), f"File does not exist: {samplesheet_path}"
352
+ print(output_path)
353
+
354
+ # try the ecoli parser to see if the wrangling functionality works
355
+ try:
356
+ ecoli_parser(
357
+ samplesheet_path=samplesheet_path,
358
+ output_file=output_path,
359
+ verbose=1,
360
+ results_base="test_input/Ecoli/{sample_name}.res",
361
+ )
362
+ except Exception as e:
363
+ raise AssertionError(f"Parser execution failed: {e}")
364
+
365
+ # compare the output with the expected results based on input to ensure correct wrangling
366
+
367
+ # read the created output files and check the information
368
+ sample_sheet_df = pd.read_csv(samplesheet_path, sep="\t")
369
+ sample_output_df = pd.read_csv(output_path, sep="\t")
370
+
371
+ ### Test case 1. Check that the data structure is correct
372
+ original_cols = sample_sheet_df.columns.tolist()
373
+ output_cols = sample_output_df.columns.tolist()
374
+ output_initial_cols = sample_output_df.columns[: len(original_cols)].tolist()
375
+ output_specific_cols = sample_output_df.columns[len(original_cols) :].tolist()
376
+
377
+ assert (
378
+ original_cols == output_initial_cols
379
+ ), f"Mismatch in first columns:\nExpected: {original_cols}\nGot: {output_initial_cols}"
380
+
381
+ assert output_specific_cols
382
+
383
+ ### Test case 2. Check sample ERR3528110, which is correctly identified as E. coli, and ensure the data wrangling behaves as expected
384
+ ERR3528110_res_path = "test_input/Ecoli/ERR3528110.res"
385
+ ERR3528110_input_df = pd.read_csv(ERR3528110_res_path, sep="\t")
386
+
387
+ ERR3528110_row = (
388
+ sample_output_df[sample_output_df["sample_name"] == "ERR3528110"]
389
+ .iloc[:, len(original_cols) : len(output_cols)]
390
+ .iloc[0]
391
+ )
392
+
393
+ # extract the original genes from the res
394
+ gene_hits = ERR3528110_input_df["#Template"].tolist()
395
+
396
+ parsed_hits = []
397
+
398
+ for hit in gene_hits:
399
+ parts = hit.split("__")
400
+ assert (
401
+         len(parts) >= 3
402
+     ), f"Unexpected KMA result format in: '{hit}'. Expected at least 3 '__' parts (e.g., ref__gene__allele) as of ecoli fbi 24-04-2025."
403
+ gene, allele = parts[1], parts[2]
404
+ parsed_hits.append((gene, allele))
405
+
406
+ # Extract OH genes
407
+ O_gene_alleles = {
408
+ gene: allele for gene, allele in parsed_hits if gene in {"wzx", "wzy", "wzt", "wzm"}
409
+ }
410
+ H_gene_alleles = {
411
+ gene: allele for gene, allele in parsed_hits if gene in {"fli", "fliC"}
412
+ }
413
+
414
+ O_type = ERR3528110_row["OH"].split(";")[0]
415
+ H_type = ERR3528110_row["OH"].split(";")[1]
416
+
417
+ O_gene_keys = set(O_gene_alleles.keys())
418
+ H_gene_keys = set(H_gene_alleles.keys())
419
+
420
+ O_genes_no = len(O_gene_keys)
421
+ H_genes_no = len(H_gene_keys)
422
+
423
+ # O typing scenarios
424
+ # Case 1: wzx/wzy match
425
+ if O_gene_keys == {"wzx", "wzy"} and O_gene_alleles["wzx"] == O_gene_alleles["wzy"]:
426
+ expected_otype = O_gene_alleles["wzx"]
427
+     assert O_type == expected_otype, f"Expected O-type '{expected_otype}', got '{O_type}'"
428
+ # wzx/wzy should be suppressed
429
+ assert ERR3528110_row["wzx"] == "-", "wzx column should be '-' when OH is used"
430
+ assert ERR3528110_row["wzy"] == "-", "wzy column should be '-' when OH is used"
431
+ # print(f"O-type correctly assigned from matching wzx/wzy: {O_type}")
432
+
433
+ # Case 2: wzt/wzm match
434
+ elif O_gene_keys == {"wzt", "wzm"} and O_gene_alleles["wzt"] == O_gene_alleles["wzm"]:
435
+ expected_otype = O_gene_alleles["wzt"]
436
+     assert O_type == expected_otype, f"Expected O-type '{expected_otype}', got '{O_type}'"
437
+ assert ERR3528110_row["wzt"] == "-", "wzt column should be '-' when OH is used"
438
+ assert ERR3528110_row["wzm"] == "-", "wzm column should be '-' when OH is used"
439
+ # print(f"O-type correctly assigned from matching wzt/wzm: {O_type}")
440
+
441
+ # Case 3: Conflict (≥3 genes, or 2 mismatched genes)
442
+ elif O_genes_no >= 3 or (
443
+ (O_gene_keys == {"wzx", "wzy"} and O_gene_alleles["wzx"] != O_gene_alleles["wzy"])
444
+ or (
445
+ O_gene_keys == {"wzt", "wzm"} and O_gene_alleles["wzt"] != O_gene_alleles["wzm"]
446
+ )
447
+ ):
448
+ assert O_type == "-", f"Expected OH = '-' due to conflict, got: '{O_type}'"
449
+ for gene in O_gene_keys:
450
+ assert (
451
+ ERR3528110_row[gene] == O_gene_alleles[gene]
452
+ ), f"{gene} column should contain '{O_gene_alleles[gene]}'"
453
+ # print("Conflict in O-typing correctly led to OH = '-' and individual gene columns retained.")
454
+
455
+ # H typing scenarios
456
+
457
+ # Case 1: If fli is present it will always take precedence over fliC
458
+ if H_gene_keys == {"fli"}:
459
+ expected_htype = H_gene_alleles["fli"]
460
+ assert (
461
+ H_type == expected_htype
462
+ ), f"Expected OH '{expected_htype}' from 'fli', got '{H_type}'"
463
+
464
+     # Case 2: fliC is used only when it is the sole H gene present
465
+ elif H_gene_keys == {"fliC"}:
466
+ expected_htype = H_gene_alleles["fliC"]
467
+ assert (
468
+ H_type == expected_htype
469
+ ), f"Expected OH '{expected_htype}' from 'fliC', got '{H_type}'"
470
+
471
+     # Case 3: if neither fli nor fliC is present, the H type remains "-"
472
+ else:
473
+ assert H_type == "-", f"Expected H-type '-', but got '{H_type}'"
474
+
475
+ ### Test case 3. Check that sample ERR14229029, listed as E. coli in the samplesheet but erroneously classified as such, produces an empty result row
476
+
477
+ ERR14229029_row = (
478
+ sample_output_df[sample_output_df["sample_name"] == "ERR14229029"]
479
+ .iloc[:, len(original_cols) : len(output_cols)]
480
+ .iloc[0]
481
+ )
482
+
483
+ ERR14229029_expected_values = [
484
+ "-",
485
+ "-;-",
486
+ "-",
487
+ "-",
488
+ "-",
489
+ "-",
490
+ "-",
491
+ "-",
492
+ "-",
493
+ float("nan"),
494
+ ]
495
+ ERR14229029_values = [ERR14229029_row[col] for col in output_specific_cols]
496
+
497
+ for col, actual, expected in zip(
498
+ output_specific_cols, ERR14229029_values, ERR14229029_expected_values
499
+ ):
500
+ if pd.isna(expected):
501
+ assert pd.isna(actual), f"{col}: Expected NaN, got {actual}"
502
+ else:
503
+ assert actual == expected, f"{col}: Expected '{expected}', got '{actual}'"
504
+
505
+ # %% ../nbs/49_Ecoli_parser.ipynb 19
506
+ import os
507
+ from tempfile import TemporaryDirectory
508
+ from pathlib import Path
509
+
510
+ test_cases = [
511
+ # sample_name, res_content, expected_oh, expected_stx, expected_eae, expected_ehxA
512
+ (
513
+ "sample1",
514
+ "1__wzx__O103__X\t100\t100\t60\n2__wzy__O103__X\t100\t100\t65\n3__fliC__H2__X\t100\t100\t70",
515
+ "O103;H2",
516
+ "-",
517
+ "-",
518
+ "-",
519
+ ),
520
+ (
521
+ "sample2",
522
+ "1__wzt__O8__X\t100\t100\t60\n2__wzm__O8__X\t100\t100\t65\n3__fliC__H10__X\t100\t100\t70\n4__stx2__stx2-a__X\t100\t100\t90\n5__eae__eae-5__X\t100\t100\t80",
523
+ "O8;H10",
524
+ "stx2-a",
525
+ "Positive",
526
+ "-",
527
+ ),
528
+ ("sample3", "1__fliC__H7__X\t100\t100\t70", "-;H7", "-", "-", "-"),
529
+ (
530
+ "sample4",
531
+ "bad_line\n2__wzy__O111__X\t100\t100\t70\n3__fliC__H11__X\t100\t100\t70",
532
+ "-;H11",
533
+ "-",
534
+ "-",
535
+ "-",
536
+ ),
537
+ ("sample5", "", "-;-", "-", "-", "-"),
538
+ (
539
+ "sample6",
540
+ "1__wzx__O157__X\t100\t100\t60\n2__wzy__O157__X\t100\t100\t65\n3__wzt__O8__X\t100\t100\t60\n4__wzm__O8__X\t100\t100\t65\n5__fli__H2__X\t100\t100\t70",
541
+ "-;H2",
542
+ "-",
543
+ "-",
544
+ "-",
545
+ ),
546
+ (
547
+ "sample7",
548
+ "1__wzx__O157__X\t100\t100\t60\n2__wzy__O111__X\t100\t100\t65\n3__fliC__H9__X\t100\t100\t70",
549
+ "-;H9",
550
+ "-",
551
+ "-",
552
+ "-",
553
+ ),
554
+ (
555
+ "sample8",
556
+ "1__fli__H1__X\t100\t100\t70\n2__fliC__H12__X\t100\t100\t70",
557
+ "-;H1",
558
+ "-",
559
+ "-",
560
+ "-",
561
+ ),
562
+ (
563
+ "sample9",
564
+ "1__wzx__O157__X\t100\t100\t60\n2__wzy__O157__X\t100\t100\t65\n3__wzt__O8__X\t100\t100\t60\n4__wzm__O8__X\t100\t100\t65\n5__fliC__H10__X\t100\t100\t70\n6__fli__H2__X\t100\t100\t70\n7__stx1__stx1-a__X\t100\t100\t90\n8__stx2__stx2-d__X\t100\t100\t90\n9__stx2__stx2-a__X\t100\t100\t90\n10__eae__eae-42-5__X\t100\t100\t80\n11__ehxA__ehxA-7__X\t100\t100\t80",
565
+ "-;H2",
566
+ "stx1-a;stx2-a;stx2-d",
567
+ "Positive",
568
+ "Positive",
569
+ ),
570
+ (
571
+ "sample10",
572
+ "1__adk__adk__X\t100\t100\t70\n2__fliC__H4__X\t100\t100\t70",
573
+ "-;H4",
574
+ "-",
575
+ "-",
576
+ "-",
577
+ ),
578
+ (
579
+ "sample11",
580
+ "1__eae__eae-1__X\t100\t94\t70\n2__fliC__H6__X\t100\t100\t70",
581
+ "-;H6",
582
+ "-",
583
+ "-",
584
+ "-",
585
+ ),
586
+ (
587
+ "sample12",
588
+ "1__stx1__stx1a__X\t100\t100\t80\n2__stx2__stx2c__X\t100\t100\t85\n3__fli__H21__X\t100\t100\t70",
589
+ "-;H21",
590
+ "stx1a;stx2c",
591
+ "-",
592
+ "-",
593
+ ),
594
+ ]
595
+
596
+ for (
597
+ sample_name,
598
+ res_content,
599
+ expected_oh,
600
+ expected_stx,
601
+ expected_eae,
602
+ expected_ehxA,
603
+ ) in test_cases:
604
+ with TemporaryDirectory() as tmpdir:
605
+ tmpdir = Path(tmpdir)
606
+ os.chdir(tmpdir)
607
+
608
+ res_dir = tmpdir / f"examples/Results/{sample_name}/kma"
609
+ res_dir.mkdir(parents=True)
610
+ res_file = res_dir / f"{sample_name}.res"
611
+ res_file.write_text(
612
+ "#Template\tTemplate_Coverage\tQuery_Identity\tDepth\n" + res_content
613
+ )
614
+
615
+ sheet = tmpdir / "samplesheet.tsv"
616
+ sheet.write_text(
617
+ "sample_name\tIllumina_read_files\tNanopore_read_file\tassembly_file\torganism\tvariant\tnotes\n"
618
+ f"{sample_name}\tread1.fastq,read2.fastq\t-\t-\tEcoli\t-\t-\n"
619
+ )
620
+
621
+ results = EcoliResults.from_samplesheet(sheet)
622
+ df = results.results_df
623
+ row = df.iloc[0]
624
+
625
+ # general output and functionality test
626
+ assert row["sample_name"] == sample_name
627
+
628
+ if row["OH"] != expected_oh:
629
+ raise AssertionError(
630
+ f"\nSample: {sample_name}\nExpected OH: {expected_oh}\nActual OH: {row['OH']}"
631
+ )
632
+ assert row["OH"] == expected_oh
633
+
634
+ if row["stx"] != expected_stx:
635
+ raise AssertionError(
636
+ f"\nSample: {sample_name}\nExpected stx: {expected_stx}\nActual stx: {row['stx']}"
637
+ )
638
+ assert row["stx"] == expected_stx
639
+
640
+ if row["eae"] != expected_eae:
641
+ raise AssertionError(
642
+ f"\nSample: {sample_name}\nExpected eae: {expected_eae}\nActual eae: {row['eae']}"
643
+ )
644
+ assert row["eae"] == expected_eae
645
+
646
+ if row["ehxA"] != expected_ehxA:
647
+ raise AssertionError(
648
+ f"\nSample: {sample_name}\nExpected ehxA: {expected_ehxA}\nActual ehxA: {row['ehxA']}"
649
+ )
650
+ assert row["ehxA"] == expected_ehxA
651
+
652
+         # sample-specific information tests
653
+
654
+         # without conflicting O and H typing, the OH column should be filled and the four O-gene columns left empty
655
+ if sample_name == "sample1":
656
+ assert row["wzx"] == "-"
657
+ assert row["wzy"] == "-"
658
+ assert row["wzt"] == "-"
659
+ assert row["wzm"] == "-"
660
+         # with conflicts, the OH column should remain empty and the four conflicting gene columns stay filled
661
+ elif sample_name == "sample6":
662
+ assert row["wzx"] == "O157"
663
+ assert row["wzy"] == "O157"
664
+ assert row["wzt"] == "O8"
665
+ assert row["wzm"] == "O8"
666
+ elif sample_name == "sample10":
667
+ assert row["Other"] == "adk"
668
+
669
+ print("All 12 syntehtic E. coli sample inline tests passed.")
@@ -193,7 +193,6 @@ def extract_emm_type(emm_blast_tsv: Path):
193
193
  f"ENN{blast_df_unique.iloc[2]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[2]['pident'],2)} and length {blast_df_unique.iloc[2]['length']}/{blast_df_unique.iloc[2]['qlen']}"
194
194
  )
195
195
  else:
196
- note_to_add = "EMM and EMM-like genes found on multiple contigs"
197
196
  emm_genes = []
198
197
  for index, row in blast_df_unique.iterrows():
199
198
  if row["length"] < row["qlen"] or row["pident"] < 100:
@@ -1 +1 @@
1
- __version__ = "0.0.9"
1
+ __version__ = "0.0.10"
@@ -5,7 +5,27 @@ d = { 'settings': { 'branch': 'main',
5
5
  'doc_host': 'https://thej-ssi.github.io',
6
6
  'git_url': 'https://github.com/thej-ssi/ssi_analysis_result_parsers',
7
7
  'lib_path': 'ssi_analysis_result_parsers'},
8
- 'syms': { 'ssi_analysis_result_parsers.Legionella_parser': { 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults': ( 'legionella_parser.html#legionellaresults',
8
+ 'syms': { 'ssi_analysis_result_parsers.Ecoli_parser': { 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults': ( 'ecoli_parser.html#ecoliresults',
9
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
10
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.__init__': ( 'ecoli_parser.html#ecoliresults.__init__',
11
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
12
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.__repr__': ( 'ecoli_parser.html#ecoliresults.__repr__',
13
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
14
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.from_samplesheet': ( 'ecoli_parser.html#ecoliresults.from_samplesheet',
15
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
16
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.summarize_single_sample': ( 'ecoli_parser.html#ecoliresults.summarize_single_sample',
17
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
18
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.write_tsv': ( 'ecoli_parser.html#ecoliresults.write_tsv',
19
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
20
+ 'ssi_analysis_result_parsers.Ecoli_parser.ecoli_parser': ( 'ecoli_parser.html#ecoli_parser',
21
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
22
+ 'ssi_analysis_result_parsers.Ecoli_parser.get_threshold': ( 'ecoli_parser.html#get_threshold',
23
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
24
+ 'ssi_analysis_result_parsers.Ecoli_parser.process_res_file': ( 'ecoli_parser.html#process_res_file',
25
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
26
+ 'ssi_analysis_result_parsers.Ecoli_parser.setup_logging': ( 'ecoli_parser.html#setup_logging',
27
+ 'ssi_analysis_result_parsers/Ecoli_parser.py')},
28
+ 'ssi_analysis_result_parsers.Legionella_parser': { 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults': ( 'legionella_parser.html#legionellaresults',
9
29
  'ssi_analysis_result_parsers/Legionella_parser.py'),
10
30
  'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.__repr__': ( 'legionella_parser.html#legionellaresults.__repr__',
11
31
  'ssi_analysis_result_parsers/Legionella_parser.py'),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ssi_analysis_result_parsers
3
- Version: 0.0.9
3
+ Version: 0.0.10
4
4
  Summary: TODO
5
5
  Home-page: https://github.com/thej-ssi/ssi_analysis_result_parsers
6
6
  Author: Thor Bech Johannesen
@@ -1,16 +1,20 @@
1
+ ssi_analysis_result_parsers/Ecoli_parser.py,sha256=lKNnx27KqxpX3yv1RAP-sBATMGf9yjfSs4-d50mR2cA,22704
1
2
  ssi_analysis_result_parsers/Legionella_parser.py,sha256=nBOPrelzCMh8IMEDje6jtU9sz92sEyuxddBe0MntTfo,6879
2
- ssi_analysis_result_parsers/Spyogenes_parser.py,sha256=AaA-63AWbRdjkqdZQahGktwDObsSS9KxDwroftzoDiw,12804
3
- ssi_analysis_result_parsers/__init__.py,sha256=46Yjk3fz9o8aTN8E95McnzpJcjGzVJmHmQqUZ5mXzfc,22
4
- ssi_analysis_result_parsers/_modidx.py,sha256=kz1oGnDbstzvo_tNuFiX5wkhfhNmiqSiCkwBnzplRU0,14895
3
+ ssi_analysis_result_parsers/Spyogenes_parser.py,sha256=Cjibp7iKGofjSp-igm-jmjBVkQ6-zxYQWVSZT-Vx3Fo,12731
4
+ ssi_analysis_result_parsers/__init__.py,sha256=-nNlMKS9nph3FR78_ZG9RGKrbxseeNp2K6nMr0pVGaU,23
5
+ ssi_analysis_result_parsers/_modidx.py,sha256=JAUPTOicf6tKcLhA8DOvsehlZxy6LDPxQDlootV_InE,18281
5
6
  ssi_analysis_result_parsers/blast_parser.py,sha256=pIzMGk5-VyTy8uzFncTiIsy80wQxl9NbNiGI_K7XMaM,8658
6
7
  ssi_analysis_result_parsers/core.py,sha256=8CzFMDrGJ24D9aoIebLsG8tx-OxvYJod1cxBITqNfaY,12258
7
8
  ssi_analysis_result_parsers/hello_world.py,sha256=jpN94sqYuNHqUbUZMCJ35qGY5iLPB_emucgnDGDUk_U,1895
8
9
  ssi_analysis_result_parsers/some_string.py,sha256=JwmAXKbX_JgY8UGh4FAu5-7ZjezcAEhq4Q2B73pWp2M,923
9
10
  ssi_analysis_result_parsers/config/config.default.env,sha256=Zt6bfPbVV3rYCksoebX1ruAdFgeD9wqAnKDtswhtJJM,1390
10
11
  ssi_analysis_result_parsers/config/config.default.yaml,sha256=3qgUrUtQpxrzYv7WQaHsvz9dQB0RALKNU0idxv7oRqM,460
11
- ssi_analysis_result_parsers-0.0.9.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
12
+ ssi_analysis_result_parsers-0.0.10.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
12
13
  test_input/.DS_Store,sha256=sdTEvl9DTKPHNPYYjMqDepX7q7ZETlonk21tGEuWLao,6148
13
14
  test_input/empty_file.txt,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ test_input/Ecoli/ERR14229029.res,sha256=AmVZwbiUTjOQLe7SmSKWt9-URdcrsLSxt9hHUh-nFUY,129
16
+ test_input/Ecoli/ERR3528110.res,sha256=DmiDRfX9LPypAEzVeO1RHaPoqEpZwq8ZtQDJ1KOWwHc,461
17
+ test_input/Ecoli/samplesheet.tsv,sha256=sSPrVrloOWvfmnp2Lnn8H6mCkiWsZUFV0wrovk3jH-Q,416
14
18
  test_input/Legionella/batch_parser_file_paths.tsv,sha256=AikBS_Ez1xO3UrEQ19AY3z6drBDdMAiSGK66NLeyYj4,356
15
19
  test_input/Legionella/lag-1_blast.tsv,sha256=MN5QL_iBn9gQ8VTYEcTnT0JwKgpkD8G15-QFOrSWxkU,1133
16
20
  test_input/Legionella/lag-1_blast_2.tsv,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -34,8 +38,9 @@ test_input/blast_parser/gene_presence_absence_test.tsv,sha256=qCvMkBC-1GuXx83RDh
34
38
  test_output/output_with_sample_name.tsv,sha256=NQG7WaxczuWCCsX2a9MUxCCYpbuAirz9gw08OLdEdUo,41
35
39
  test_output/test.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
36
40
  test_output/test_batch_output.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
37
- ssi_analysis_result_parsers-0.0.9.dist-info/METADATA,sha256=gpcbUHkicHpg_derxQTQetNGZOoj5X5mPbcxI_lElm0,2765
38
- ssi_analysis_result_parsers-0.0.9.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
39
- ssi_analysis_result_parsers-0.0.9.dist-info/entry_points.txt,sha256=noBLlB4hLmYcqns7KdQPVURO27kZ_zWMsPHYkRlBGEE,631
40
- ssi_analysis_result_parsers-0.0.9.dist-info/top_level.txt,sha256=3q56bBc2Wv2a6ZQ1l_9m66vot2-Qu6tM9tDr3QQ8auM,81
41
- ssi_analysis_result_parsers-0.0.9.dist-info/RECORD,,
41
+ test_output/Ecoli/KMA_cases_parser.tsv,sha256=Wf3JkSppRN5AK2zRJmFQlwVfCMyJfgyyBpTjb1sK6Uw,586
42
+ ssi_analysis_result_parsers-0.0.10.dist-info/METADATA,sha256=BZSffWWanmoRU6UPhhJl6jc1pBomIvjyX79m-H39DAI,2766
43
+ ssi_analysis_result_parsers-0.0.10.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
44
+ ssi_analysis_result_parsers-0.0.10.dist-info/entry_points.txt,sha256=1btBaEDONU_OZlGK4iXe0-my25NFtup33MfYFS-Oj24,705
45
+ ssi_analysis_result_parsers-0.0.10.dist-info/top_level.txt,sha256=dhzhsC8l7PYeBYNT8JzHPz3BriLAw3llVo0jHn175WI,90
46
+ ssi_analysis_result_parsers-0.0.10.dist-info/RECORD,,
@@ -3,6 +3,7 @@ blast_parser_allele_matches = ssi_analysis_result_parsers.blast_parser:allele_ma
3
3
  blast_parser_presence_absence = ssi_analysis_result_parsers.blast_parser:presence_absence
4
4
  get_Spyogenes_results = ssi_analysis_result_parsers.Spyogenes_parser:Spyogenes_parser
5
5
  get_Spyogenes_results_batch = ssi_analysis_result_parsers.Spyogenes_parser:Spyogenes_batch_parser
6
+ get_ecoli_results = ssi_analysis_result_parsers.Ecoli_parser:ecoli_parser
6
7
  get_leg_results = ssi_analysis_result_parsers.Legionella_parser:legionella_parser
7
8
  get_leg_results_batch = ssi_analysis_result_parsers.Legionella_parser:legionella_batch_parser
8
9
 
@@ -0,0 +1 @@
1
+ #Template Score Expected Template_length Template_Identity Template_Coverage Query_Identity Query_Coverage Depth q_value p_value
@@ -0,0 +1,4 @@
1
+ #Template Score Expected Template_length Template_Identity Template_Coverage Query_Identity Query_Coverage Depth q_value p_value
2
+ 1__wzx__O6__AJ426045 20056 153 1257 99.92 100.00 99.92 100.00 16.07 19601.01 1.0e-26
3
+ 2__wzy__O6__AJ426423 23540 159 1344 100.00 100.00 100.00 100.00 17.35 23065.87 1.0e-26
4
+ 5__fliC__H1__AB028471 107030 73 1788 100.00 100.00 100.00 100.00 62.14 106810.34 1.0e-26
@@ -0,0 +1,3 @@
1
+ sample_name Illumina_read_files Nanopore_read_file assembly_file organism variant notes
2
+ ERR3528110 examples/Dataset/reads/ERR3528110_1.fastq.gz,examples/Dataset/reads/ERR3528110_2.fastq.gz Na examples/Dataset/assemblies/ERR3528110.fasta E.coli Na Na
3
+ ERR14229029 examples/Dataset/reads/ERR14229029_1.fastq.gz,examples/Dataset/reads/ERR14229029_2.fastq.gz Na examples/Dataset/assemblies/ERR14229029.fasta E.coli Na Na
@@ -0,0 +1,3 @@
1
+ sample_name Illumina_read_files Nanopore_read_file assembly_file organism variant notes stx OH wzx wzy wzt wzm eae ehxA Other verbose
2
+ ERR3528110 examples/Dataset/reads/ERR3528110_1.fastq.gz,examples/Dataset/reads/ERR3528110_2.fastq.gz Na examples/Dataset/assemblies/ERR3528110.fasta E.coli Na Na - O6;H1 - - - - - - - wzx_O6_16.07_100.00_99.92;wzy_O6_17.35_100.00_100.00;fliC_H1_62.14_100.00_100.00
3
+ ERR14229029 examples/Dataset/reads/ERR14229029_1.fastq.gz,examples/Dataset/reads/ERR14229029_2.fastq.gz Na examples/Dataset/assemblies/ERR14229029.fasta E.coli Na Na - -;- - - - - - - -