ssi-analysis-result-parsers 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,7 @@
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/39_Legionella_parser.ipynb.

  # %% auto 0
- __all__ = ['extract_legionella_sbt', 'legionella_summary', 'legionella_batch_from_sheet', 'LegionellaResults',
-            'legionella_batch_from_dict', 'legionella_parser', 'legionella_batch_parser']
+ __all__ = ['extract_legionella_sbt', 'LegionellaResults', 'legionella_parser', 'legionella_batch_parser']

  # %% ../nbs/39_Legionella_parser.ipynb 3
  # standard libs
@@ -25,8 +24,10 @@ import json # for nicely printing json and yaml
  # import functions from core module (optional, but most likely needed).
  from ssi_analysis_result_parsers import (
      core,
+     blast_parser,
  )
- from .blast_parser import extract_presence_absence
+
+ # from ssi_analysis_result_parsers.blast_parser import extract_presence_absence

  # Project specific libraries
  from pathlib import Path
@@ -36,14 +37,22 @@ import sys
  # %% ../nbs/39_Legionella_parser.ipynb 6
  def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
      """
-     Returns dictionary of results found in the Legionella SBT summary output
+     Extract results from a Legionella SBT results file.
+     Returns a dictionary with the ST, the allele variant for each gene, and notes from the output.
      """
      if os.path.exists(legionella_sbt_results_tsv):
-         df = pandas.read_csv(legionella_sbt_results_tsv, sep="\t")
-         df.set_index("sample", inplace=True, drop=True)
-         d = df.to_dict(orient="index")
-         fname = next(iter(d))
-         return d[fname]
+         try:
+             df = pandas.read_csv(legionella_sbt_results_tsv, sep="\t")
+             df.set_index("sample", inplace=True, drop=True)
+             d = df.to_dict(orient="index")
+             fname = next(iter(d))
+             return d[fname]
+         except pandas.errors.EmptyDataError:
+             print(
+                 f"Legionella SBT output empty at {legionella_sbt_results_tsv}",
+                 file=sys.stderr,
+             )
+             return None
      else:
          print(
              f"No Legionella SBT output found at {legionella_sbt_results_tsv}",
@@ -52,46 +61,16 @@ def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
          return None


- def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
-     sbt_results_dict = extract_legionella_sbt(
-         legionella_sbt_results_tsv=legionella_sbt_results_tsv
-     )
-     lag1_blast_dict = extract_presence_absence(
-         blast_output_tsv=lag1_blast_tsv,
-         hits_as_string=False,
-         include_match_stats=False,
-         gene_names=["lag-1"],
-     )
-     results_dict = core.update_results_dict(
-         sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: "
-     )
-     if results_dict is None:
-         return {}
-     return results_dict
-
-
- def legionella_batch_from_sheet(file_paths: dict, output_file: Path = None):
-     results_dict = {}
-     for sample_name, path_dict in file_paths.items():
-         legionella_results = legionella_summary(
-             legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-             lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
-         )
-         results_dict[sample_name] = legionella_results
-     if output_file is not None:
-         df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
-             names="sample_name"
-         )
-         df.to_csv(output_file, sep="\t", index=False)
-     return results_dict
-
-
  class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths(
          cls, legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path, sample_name=None
      ):
+         """
+         Alternative constructor for initializing results for a single sample.
+         Initializes a LegionellaResults instance from paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          legionella_results = cls.legionella_summary(
              legionella_sbt_results_tsv=legionella_sbt_results_tsv,
              lag1_blast_tsv=lag1_blast_tsv,
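
As an illustration of the single-sample constructor documented above (paths point at test files shipped in this wheel; the keyword names mirror the signature in the diff):

    from pathlib import Path
    from ssi_analysis_result_parsers.Legionella_parser import LegionellaResults

    results = LegionellaResults.from_tool_paths(
        legionella_sbt_results_tsv=Path("test_input/Legionella/test.sbt.tsv"),
        lag1_blast_tsv=Path("test_input/Legionella/lag-1_blast.tsv"),
        sample_name="sample_1",
    )
    print(results)  # the __repr__ further down reports sample and result-variable counts
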
@@ -100,6 +79,10 @@ class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths_dict(cls, file_paths: dict):
+         """
+         Alternative constructor for initializing results for multiple samples.
+         Initializes a LegionellaResults instance from a dictionary of paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          results_dict = {}
          for sample_name, path_dict in file_paths.items():
              legionella_results = cls.legionella_summary(
@@ -111,6 +94,10 @@ class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths_dataframe(cls, file_paths_df: pandas.DataFrame):
+         """
+         Alternative constructor for initializing results for multiple samples.
+         Initializes a LegionellaResults instance from a DataFrame of paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          file_paths = file_paths_df.to_dict(orient="index")
          results_dict = {}
          for sample_name, path_dict in file_paths.items():
@@ -124,24 +111,22 @@ class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths_tsv(cls, tool_paths_tsv: Path):
+         """
+         Alternative constructor for initializing results for multiple samples.
+         Initializes a LegionellaResults instance from a tsv file with paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          file_paths_df = pandas.read_csv(tool_paths_tsv, sep="\t")
          file_paths_df.set_index("sample_name", inplace=True, drop=True)
-         # return_cls =
-         # results_dict = file_paths_df.to_dict(orient="index")
          return cls.from_tool_paths_dataframe(file_paths_df)
-         """for sample_name, path_dict in file_paths.items():
-             legionella_results = cls.legionella_summary(legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-                                                         lag1_blast_tsv=Path(path_dict["lag1_blast_results"]))
-             results_dict[sample_name] = legionella_results
-         return cls(results_dict)"""

+     @staticmethod
      def legionella_summary(
          legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path
      ) -> dict:
          sbt_results_dict = extract_legionella_sbt(
              legionella_sbt_results_tsv=legionella_sbt_results_tsv
          )
-         lag1_blast_dict = extract_presence_absence(
+         lag1_blast_dict = blast_parser.extract_presence_absence(
              blast_output_tsv=lag1_blast_tsv,
              hits_as_string=False,
              include_match_stats=False,
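
For illustration, the tsv-based batch constructor above can be exercised with the test sheet shipped in this wheel (whose two new rows, added at the end of this diff, point at the empty test file):

    from ssi_analysis_result_parsers.Legionella_parser import LegionellaResults

    batch = LegionellaResults.from_tool_paths_tsv(
        "test_input/Legionella/batch_parser_file_paths.tsv"
    )
    print(batch.results_df)  # one row per sample_name listed in the sheet
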
@@ -150,43 +135,13 @@ class LegionellaResults(core.PipelineResults):
          results_dict = core.update_results_dict(
              sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: "
          )
+         if results_dict is None:
+             return {}
          return results_dict

      def __repr__(self):
          return f"< Legionella analysis results object. {len(self.results_df)} samples with {len(self.results_df.columns)} result variables > "

-
- def legionella_batch_from_dict(file_paths: dict, output_file: Path = None):
-     results_dict = {}
-     for sample_name, path_dict in file_paths.items():
-         legionella_results = legionella_summary(
-             legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-             lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
-         )
-         results_dict[sample_name] = legionella_results
-     if output_file is not None:
-         df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
-             names="sample_name"
-         )
-         df.to_csv(output_file, sep="\t", index=False)
-     return results_dict
-
-
- def legionella_batch_from_sheet(file_paths: dict, output_file: Path = None):
-     results_dict = {}
-     for sample_name, path_dict in file_paths.items():
-         legionella_results = legionella_summary(
-             legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-             lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
-         )
-         results_dict[sample_name] = legionella_results
-     if output_file is not None:
-         df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
-             names="sample_name"
-         )
-         df.to_csv(output_file, sep="\t", index=False)
-     return results_dict
-
  # %% ../nbs/39_Legionella_parser.ipynb 9
  @call_parse
  def legionella_parser(
@@ -1 +1 @@
- __version__ = "0.0.3"
+ __version__ = "0.0.5"
@@ -21,16 +21,10 @@ d = { 'settings': { 'branch': 'main',
  'ssi_analysis_result_parsers/Legionella_parser.py'),
  'ssi_analysis_result_parsers.Legionella_parser.extract_legionella_sbt': ( 'legionella_parser.html#extract_legionella_sbt',
  'ssi_analysis_result_parsers/Legionella_parser.py'),
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_from_dict': ( 'legionella_parser.html#legionella_batch_from_dict',
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_from_sheet': ( 'legionella_parser.html#legionella_batch_from_sheet',
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
  'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_parser': ( 'legionella_parser.html#legionella_batch_parser',
  'ssi_analysis_result_parsers/Legionella_parser.py'),
  'ssi_analysis_result_parsers.Legionella_parser.legionella_parser': ( 'legionella_parser.html#legionella_parser',
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_summary': ( 'legionella_parser.html#legionella_summary',
- 'ssi_analysis_result_parsers/Legionella_parser.py')},
+ 'ssi_analysis_result_parsers/Legionella_parser.py')},
  'ssi_analysis_result_parsers.blast_parser': { 'ssi_analysis_result_parsers.blast_parser.allele_matches': ( 'blast_parser.html#allele_matches',
  'ssi_analysis_result_parsers/blast_parser.py'),
  'ssi_analysis_result_parsers.blast_parser.extract_allele_matches': ( 'blast_parser.html#extract_allele_matches',
@@ -51,21 +51,31 @@ def extract_presence_absence(
      if os.path.exists(blast_output_tsv):
          try:
              blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
+             header_list = tsv_header.split(" ")
+             if len(header_list) == len(blast_df.columns):
+                 blast_df.columns = tsv_header.split(" ")
+                 blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
+                 blast_df_unique = (
+                     blast_df.sort_values(by=["bitscore"], ascending=False)
+                     .groupby("qseqid")
+                     .first()
+                 )
+                 blast_df_filtered = blast_df_unique.query(
+                     "plen > @plen_threshold and pident > @pident_threshold"
+                 )
+                 blast_dict = dict(blast_df_filtered.to_dict(orient="index"))
+             else:
+                 print(
+                     f"Failed to parse {blast_output_tsv}. Number of columns does not match length of provided header string",
+                     file=sys.stderr,
+                 )
+                 return None

-             blast_df.columns = tsv_header.split(" ")
-             blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
-             blast_df_unique = (
-                 blast_df.sort_values(by=["bitscore"], ascending=False)
-                 .groupby("qseqid")
-                 .first()
-             )
-             blast_df_filtered = blast_df_unique.query(
-                 "plen > @plen_threshold and pident > @pident_threshold"
-             )
-             blast_dict = dict(blast_df_filtered.to_dict(orient="index"))
          except pandas.errors.EmptyDataError:
              blast_dict = {}
              print(f"Blast output file {blast_output_tsv} empty. Assuming 0 blast hits.")
+         except Exception as e:
+             print(f"Error parsing blast: {e}")
          if hits_as_string:

              results = []
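
To illustrate the parser this hunk modifies, a hedged example call against the shipped test files (the keyword names are copied from the legionella_summary call above; the exact shape of the returned mapping depends on the parser's other options):

    from ssi_analysis_result_parsers import blast_parser

    # Non-empty BLAST output: rows are filtered by plen/pident and reduced to one hit per query.
    lag1_hits = blast_parser.extract_presence_absence(
        blast_output_tsv="test_input/Legionella/lag-1_blast.tsv",
        hits_as_string=False,
        include_match_stats=False,
        gene_names=["lag-1"],
    )
    print(lag1_hits)

    # The zero-byte test file exercises the EmptyDataError branch, which assumes 0 blast hits.
    no_hits = blast_parser.extract_presence_absence(
        blast_output_tsv="test_input/Legionella/lag-1_blast_2.tsv",
        hits_as_string=False,
        include_match_stats=False,
        gene_names=["lag-1"],
    )
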
@@ -196,7 +196,6 @@ def get_samplesheet(sample_sheet_config: dict) -> pd.DataFrame:
  class PipelineResults:

      def __init__(self, results_dict):
-         print(results_dict)
          self.results_dict = results_dict
          self.results_df = pandas.DataFrame.from_dict(results_dict, orient="index")

@@ -207,12 +206,17 @@ class PipelineResults:

      @classmethod
      def from_results_dataframe(cls, results_df: pandas.DataFrame):
-         # results_df = results_df.set_index("sample_name")
+         """
+         Alternative constructor for initializing from a DataFrame instead of a dictionary
+         """
          results_dict = results_df.to_dict(orient="index")
          return cls(results_dict)

      @classmethod
      def from_results_tsv(cls, results_tsv: Path):
+         """
+         Alternative constructor for initializing from a tsv file instead of a dictionary
+         """
          results_df = pandas.read_csv(results_tsv, sep="\t")
          results_df.set_index("sample_name", inplace=True, drop=True)
          results_dict = results_df.to_dict(orient="index")
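
A brief illustration of the new alternative constructor (the file below is one of the test outputs shipped in this wheel, assumed to carry the sample_name column the constructor expects; results are exposed both as results_dict and as results_df):

    from ssi_analysis_result_parsers import core

    pipeline_results = core.PipelineResults.from_results_tsv("test_output/test.tsv")
    print(pipeline_results.results_df.head())
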
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ssi_analysis_result_parsers
- Version: 0.0.4
+ Version: 0.0.6
  Summary: TODO
  Home-page: https://github.com/thej-ssi/ssi_analysis_result_parsers
  Author: Thor Bech Johannesen
@@ -1,15 +1,16 @@
- ssi_analysis_result_parsers/Legionella_parser.py,sha256=CP5r1RriVd4zxeTBokLJYcu5iS6xbK3pBzI6xwITSm0,8894
- ssi_analysis_result_parsers/__init__.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
- ssi_analysis_result_parsers/_modidx.py,sha256=JY_GM0tMojzTtX9O4D8as4k5a-sXqkxkb7ZUEPzhuMk,12232
- ssi_analysis_result_parsers/blast_parser.py,sha256=L7EdW2LUwSS2OQO7WZUAxP6whJXdvTILE2a3O59uv-s,7441
- ssi_analysis_result_parsers/core.py,sha256=6TGURv8spPdBpwKv6LvqvbVzJChdeHwsG3WQ6QLUuvE,12124
+ ssi_analysis_result_parsers/Legionella_parser.py,sha256=an9Rm9r4N3wQXy0qhUpvacy4Wb6HxUXFdDA7D6YsQyY,7237
+ ssi_analysis_result_parsers/__init__.py,sha256=S7u1lbuWmM3A3ajykBialmPoJUK6Jg-WmNqM-9OZFdk,22
+ ssi_analysis_result_parsers/_modidx.py,sha256=ysvICOsqtGaXuCYPu-UuRGVRhZDJ-O9X3o9lE7rzzGI,11089
+ ssi_analysis_result_parsers/blast_parser.py,sha256=EBqWlx8bDlaSzqAZomiUGnT2DGaaA-L7ukny7SEJbpk,7915
+ ssi_analysis_result_parsers/core.py,sha256=8CzFMDrGJ24D9aoIebLsG8tx-OxvYJod1cxBITqNfaY,12258
  ssi_analysis_result_parsers/hello_world.py,sha256=jpN94sqYuNHqUbUZMCJ35qGY5iLPB_emucgnDGDUk_U,1895
  ssi_analysis_result_parsers/some_string.py,sha256=JwmAXKbX_JgY8UGh4FAu5-7ZjezcAEhq4Q2B73pWp2M,923
  ssi_analysis_result_parsers/config/config.default.env,sha256=Zt6bfPbVV3rYCksoebX1ruAdFgeD9wqAnKDtswhtJJM,1390
  ssi_analysis_result_parsers/config/config.default.yaml,sha256=3qgUrUtQpxrzYv7WQaHsvz9dQB0RALKNU0idxv7oRqM,460
- ssi_analysis_result_parsers-0.0.4.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
+ ssi_analysis_result_parsers-0.0.6.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
  test_input/.DS_Store,sha256=sdTEvl9DTKPHNPYYjMqDepX7q7ZETlonk21tGEuWLao,6148
- test_input/Legionella/batch_parser_file_paths.tsv,sha256=zls11lmEA5U89d8RsX6PR8M1zXNVimeL4raqdZ3ijvQ,210
+ test_input/empty_file.txt,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ test_input/Legionella/batch_parser_file_paths.tsv,sha256=AikBS_Ez1xO3UrEQ19AY3z6drBDdMAiSGK66NLeyYj4,356
  test_input/Legionella/lag-1_blast.tsv,sha256=MN5QL_iBn9gQ8VTYEcTnT0JwKgpkD8G15-QFOrSWxkU,1133
  test_input/Legionella/lag-1_blast_2.tsv,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test_input/Legionella/test.sbt.tsv,sha256=ibhaH3is2dxHaABPvR2QM2HAq9bKOs1AwOTmrwSrcd8,168
@@ -20,8 +21,8 @@ test_input/blast_parser/gene_presence_absence_test.tsv,sha256=qCvMkBC-1GuXx83RDh
  test_output/output_with_sample_name.tsv,sha256=NQG7WaxczuWCCsX2a9MUxCCYpbuAirz9gw08OLdEdUo,41
  test_output/test.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
  test_output/test_batch_output.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
- ssi_analysis_result_parsers-0.0.4.dist-info/METADATA,sha256=r6IJQQ7JgRD_--UlM80lNMH5ZzlxYQCmvaVBIyPhF7k,2765
- ssi_analysis_result_parsers-0.0.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- ssi_analysis_result_parsers-0.0.4.dist-info/entry_points.txt,sha256=eG2NzlNDoG__0PPHl3eoKK5EXIz02BGhRX-L2aWgKCY,447
- ssi_analysis_result_parsers-0.0.4.dist-info/top_level.txt,sha256=3q56bBc2Wv2a6ZQ1l_9m66vot2-Qu6tM9tDr3QQ8auM,81
- ssi_analysis_result_parsers-0.0.4.dist-info/RECORD,,
+ ssi_analysis_result_parsers-0.0.6.dist-info/METADATA,sha256=WUMU9Lfanw3DLtDNZUzKIZaBU071v00068cENqOkpq8,2765
+ ssi_analysis_result_parsers-0.0.6.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ ssi_analysis_result_parsers-0.0.6.dist-info/entry_points.txt,sha256=eG2NzlNDoG__0PPHl3eoKK5EXIz02BGhRX-L2aWgKCY,447
+ ssi_analysis_result_parsers-0.0.6.dist-info/top_level.txt,sha256=3q56bBc2Wv2a6ZQ1l_9m66vot2-Qu6tM9tDr3QQ8auM,81
+ ssi_analysis_result_parsers-0.0.6.dist-info/RECORD,,
@@ -1,3 +1,5 @@
  sample_name sbt_results lag1_blast_results
  sample_1 test_input/Legionella/test.sbt.tsv test_input/Legionella/lag-1_blast.tsv
  sample_2 test_input/Legionella/test2.sbt.tsv test_input/Legionella/lag-1_blast_2.tsv
+ sample_3 test_input/Legionella/test2.sbt.tsv test_input/empty_file.txt
+ sample_4 test_input/empty_file.txt test_input/Legionella/lag-1_blast_2.tsv
File without changes