ssi-analysis-result-parsers 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,7 @@
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/39_Legionella_parser.ipynb.

  # %% auto 0
- __all__ = ['extract_legionella_sbt', 'legionella_summary', 'legionella_batch_from_sheet', 'LegionellaResults',
-            'legionella_batch_from_dict', 'legionella_parser', 'legionella_batch_parser']
+ __all__ = ['extract_legionella_sbt', 'LegionellaResults', 'legionella_parser', 'legionella_batch_parser']

  # %% ../nbs/39_Legionella_parser.ipynb 3
  # standard libs
@@ -25,8 +24,10 @@ import json # for nicely printing json and yaml
  # import functions from core module (optional, but most likely needed).
  from ssi_analysis_result_parsers import (
      core,
+     blast_parser,
  )
- from .blast_parser import extract_presence_absence
+
+ # from ssi_analysis_result_parsers.blast_parser import extract_presence_absence

  # Project specific libraries
  from pathlib import Path
@@ -36,14 +37,22 @@ import sys
  # %% ../nbs/39_Legionella_parser.ipynb 6
  def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
      """
-     Returns dictionary of results found in the Legionella SBT summary output
+     Extract results from a Legionella SBT results file.
+     Returns a dictionary with the ST, the allele variant for each gene, and notes from the output.
      """
      if os.path.exists(legionella_sbt_results_tsv):
-         df = pandas.read_csv(legionella_sbt_results_tsv, sep="\t")
-         df.set_index("sample", inplace=True, drop=True)
-         d = df.to_dict(orient="index")
-         fname = next(iter(d))
-         return d[fname]
+         try:
+             df = pandas.read_csv(legionella_sbt_results_tsv, sep="\t")
+             df.set_index("sample", inplace=True, drop=True)
+             d = df.to_dict(orient="index")
+             fname = next(iter(d))
+             return d[fname]
+         except pandas.errors.EmptyDataError:
+             print(
+                 f"Legionella SBT output empty at {legionella_sbt_results_tsv}",
+                 file=sys.stderr,
+             )
+             return None
      else:
          print(
              f"No Legionella SBT output found at {legionella_sbt_results_tsv}",
@@ -52,46 +61,16 @@ def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
          return None


- def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
-     sbt_results_dict = extract_legionella_sbt(
-         legionella_sbt_results_tsv=legionella_sbt_results_tsv
-     )
-     lag1_blast_dict = extract_presence_absence(
-         blast_output_tsv=lag1_blast_tsv,
-         hits_as_string=False,
-         include_match_stats=False,
-         gene_names=["lag-1"],
-     )
-     results_dict = core.update_results_dict(
-         sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: "
-     )
-     if results_dict is None:
-         return {}
-     return results_dict
-
-
- def legionella_batch_from_sheet(file_paths: dict, output_file: Path = None):
-     results_dict = {}
-     for sample_name, path_dict in file_paths.items():
-         legionella_results = legionella_summary(
-             legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-             lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
-         )
-         results_dict[sample_name] = legionella_results
-     if output_file is not None:
-         df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
-             names="sample_name"
-         )
-         df.to_csv(output_file, sep="\t", index=False)
-     return results_dict
-
-
  class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths(
          cls, legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path, sample_name=None
      ):
+         """
+         Alternative constructor for initializing results for a single sample.
+         Initializes a LegionellaResults instance from paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          legionella_results = cls.legionella_summary(
              legionella_sbt_results_tsv=legionella_sbt_results_tsv,
              lag1_blast_tsv=lag1_blast_tsv,
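
As an illustration of the single-sample constructor documented above (paths point at test files shipped in this wheel; the keyword names mirror the signature in the diff):

    from pathlib import Path
    from ssi_analysis_result_parsers.Legionella_parser import LegionellaResults

    results = LegionellaResults.from_tool_paths(
        legionella_sbt_results_tsv=Path("test_input/Legionella/test.sbt.tsv"),
        lag1_blast_tsv=Path("test_input/Legionella/lag-1_blast.tsv"),
        sample_name="sample_1",
    )
    print(results)  # the __repr__ further down reports sample and result-variable counts
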
@@ -100,6 +79,10 @@ class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths_dict(cls, file_paths: dict):
+         """
+         Alternative constructor for initializing results for multiple samples.
+         Initializes a LegionellaResults instance from a dictionary of paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          results_dict = {}
          for sample_name, path_dict in file_paths.items():
              legionella_results = cls.legionella_summary(
@@ -111,6 +94,10 @@ class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths_dataframe(cls, file_paths_df: pandas.DataFrame):
+         """
+         Alternative constructor for initializing results for multiple samples.
+         Initializes a LegionellaResults instance from a DataFrame of paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          file_paths = file_paths_df.to_dict(orient="index")
          results_dict = {}
          for sample_name, path_dict in file_paths.items():
@@ -124,24 +111,22 @@ class LegionellaResults(core.PipelineResults):

      @classmethod
      def from_tool_paths_tsv(cls, tool_paths_tsv: Path):
+         """
+         Alternative constructor for initializing results for multiple samples.
+         Initializes a LegionellaResults instance from a tsv file with paths to tool outputs (Legionella SBT and lag-1 presence BLAST).
+         """
          file_paths_df = pandas.read_csv(tool_paths_tsv, sep="\t")
          file_paths_df.set_index("sample_name", inplace=True, drop=True)
-         # return_cls =
-         # results_dict = file_paths_df.to_dict(orient="index")
          return cls.from_tool_paths_dataframe(file_paths_df)
-         """for sample_name, path_dict in file_paths.items():
-             legionella_results = cls.legionella_summary(legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-                                                         lag1_blast_tsv=Path(path_dict["lag1_blast_results"]))
-             results_dict[sample_name] = legionella_results
-         return cls(results_dict)"""

+     @staticmethod
      def legionella_summary(
          legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path
      ) -> dict:
          sbt_results_dict = extract_legionella_sbt(
              legionella_sbt_results_tsv=legionella_sbt_results_tsv
          )
-         lag1_blast_dict = extract_presence_absence(
+         lag1_blast_dict = blast_parser.extract_presence_absence(
              blast_output_tsv=lag1_blast_tsv,
              hits_as_string=False,
              include_match_stats=False,
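
For illustration, the tsv-based batch constructor above can be exercised with the test sheet shipped in this wheel (whose two new rows, added at the end of this diff, point at the empty test file):

    from ssi_analysis_result_parsers.Legionella_parser import LegionellaResults

    batch = LegionellaResults.from_tool_paths_tsv(
        "test_input/Legionella/batch_parser_file_paths.tsv"
    )
    print(batch.results_df)  # one row per sample_name listed in the sheet
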
@@ -150,43 +135,13 @@ class LegionellaResults(core.PipelineResults):
          results_dict = core.update_results_dict(
              sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: "
          )
+         if results_dict is None:
+             return {}
          return results_dict

      def __repr__(self):
          return f"< Legionella analysis results object. {len(self.results_df)} samples with {len(self.results_df.columns)} result variables > "

-
- def legionella_batch_from_dict(file_paths: dict, output_file: Path = None):
-     results_dict = {}
-     for sample_name, path_dict in file_paths.items():
-         legionella_results = legionella_summary(
-             legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-             lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
-         )
-         results_dict[sample_name] = legionella_results
-     if output_file is not None:
-         df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
-             names="sample_name"
-         )
-         df.to_csv(output_file, sep="\t", index=False)
-     return results_dict
-
-
- def legionella_batch_from_sheet(file_paths: dict, output_file: Path = None):
-     results_dict = {}
-     for sample_name, path_dict in file_paths.items():
-         legionella_results = legionella_summary(
-             legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
-             lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
-         )
-         results_dict[sample_name] = legionella_results
-     if output_file is not None:
-         df = pandas.DataFrame.from_dict(results_dict, orient="index").reset_index(
-             names="sample_name"
-         )
-         df.to_csv(output_file, sep="\t", index=False)
-     return results_dict
-
  # %% ../nbs/39_Legionella_parser.ipynb 9
  @call_parse
  def legionella_parser(
@@ -1 +1 @@
- __version__ = "0.0.3"
+ __version__ = "0.0.5"
@@ -21,16 +21,10 @@ d = { 'settings': { 'branch': 'main',
  'ssi_analysis_result_parsers/Legionella_parser.py'),
  'ssi_analysis_result_parsers.Legionella_parser.extract_legionella_sbt': ( 'legionella_parser.html#extract_legionella_sbt',
  'ssi_analysis_result_parsers/Legionella_parser.py'),
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_from_dict': ( 'legionella_parser.html#legionella_batch_from_dict',
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_from_sheet': ( 'legionella_parser.html#legionella_batch_from_sheet',
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
  'ssi_analysis_result_parsers.Legionella_parser.legionella_batch_parser': ( 'legionella_parser.html#legionella_batch_parser',
  'ssi_analysis_result_parsers/Legionella_parser.py'),
  'ssi_analysis_result_parsers.Legionella_parser.legionella_parser': ( 'legionella_parser.html#legionella_parser',
- 'ssi_analysis_result_parsers/Legionella_parser.py'),
- 'ssi_analysis_result_parsers.Legionella_parser.legionella_summary': ( 'legionella_parser.html#legionella_summary',
- 'ssi_analysis_result_parsers/Legionella_parser.py')},
+ 'ssi_analysis_result_parsers/Legionella_parser.py')},
  'ssi_analysis_result_parsers.blast_parser': { 'ssi_analysis_result_parsers.blast_parser.allele_matches': ( 'blast_parser.html#allele_matches',
  'ssi_analysis_result_parsers/blast_parser.py'),
  'ssi_analysis_result_parsers.blast_parser.extract_allele_matches': ( 'blast_parser.html#extract_allele_matches',
@@ -51,21 +51,31 @@ def extract_presence_absence(
      if os.path.exists(blast_output_tsv):
          try:
              blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
+             header_list = tsv_header.split(" ")
+             if len(header_list) == len(blast_df.columns):
+                 blast_df.columns = tsv_header.split(" ")
+                 blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
+                 blast_df_unique = (
+                     blast_df.sort_values(by=["bitscore"], ascending=False)
+                     .groupby("qseqid")
+                     .first()
+                 )
+                 blast_df_filtered = blast_df_unique.query(
+                     "plen > @plen_threshold and pident > @pident_threshold"
+                 )
+                 blast_dict = dict(blast_df_filtered.to_dict(orient="index"))
+             else:
+                 print(
+                     f"Failed to parse {blast_output_tsv}. Number of columns does not match length of provided header string",
+                     file=sys.stderr,
+                 )
+                 return None

-             blast_df.columns = tsv_header.split(" ")
-             blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
-             blast_df_unique = (
-                 blast_df.sort_values(by=["bitscore"], ascending=False)
-                 .groupby("qseqid")
-                 .first()
-             )
-             blast_df_filtered = blast_df_unique.query(
-                 "plen > @plen_threshold and pident > @pident_threshold"
-             )
-             blast_dict = dict(blast_df_filtered.to_dict(orient="index"))
          except pandas.errors.EmptyDataError:
              blast_dict = {}
              print(f"Blast output file {blast_output_tsv} empty. Assuming 0 blast hits.")
+         except Exception as e:
+             print(f"Error parsing blast: {e}")
          if hits_as_string:

              results = []
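
To illustrate the parser this hunk modifies, a hedged example call against the shipped test files (the keyword names are copied from the legionella_summary call above; the exact shape of the returned mapping depends on the parser's other options):

    from ssi_analysis_result_parsers import blast_parser

    # Non-empty BLAST output: rows are filtered by plen/pident and reduced to one hit per query.
    lag1_hits = blast_parser.extract_presence_absence(
        blast_output_tsv="test_input/Legionella/lag-1_blast.tsv",
        hits_as_string=False,
        include_match_stats=False,
        gene_names=["lag-1"],
    )
    print(lag1_hits)

    # The zero-byte test file exercises the EmptyDataError branch, which assumes 0 blast hits.
    no_hits = blast_parser.extract_presence_absence(
        blast_output_tsv="test_input/Legionella/lag-1_blast_2.tsv",
        hits_as_string=False,
        include_match_stats=False,
        gene_names=["lag-1"],
    )
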
@@ -196,7 +196,6 @@ def get_samplesheet(sample_sheet_config: dict) -> pd.DataFrame:
  class PipelineResults:

      def __init__(self, results_dict):
-         print(results_dict)
          self.results_dict = results_dict
          self.results_df = pandas.DataFrame.from_dict(results_dict, orient="index")

@@ -207,12 +206,17 @@ class PipelineResults:

      @classmethod
      def from_results_dataframe(cls, results_df: pandas.DataFrame):
-         # results_df = results_df.set_index("sample_name")
+         """
+         Alternative constructor for initializing from a DataFrame instead of a dictionary
+         """
          results_dict = results_df.to_dict(orient="index")
          return cls(results_dict)

      @classmethod
      def from_results_tsv(cls, results_tsv: Path):
+         """
+         Alternative constructor for initializing from a tsv file instead of a dictionary
+         """
          results_df = pandas.read_csv(results_tsv, sep="\t")
          results_df.set_index("sample_name", inplace=True, drop=True)
          results_dict = results_df.to_dict(orient="index")
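
A brief illustration of the new alternative constructor (the file below is one of the test outputs shipped in this wheel, assumed to carry the sample_name column the constructor expects; results are exposed both as results_dict and as results_df):

    from ssi_analysis_result_parsers import core

    pipeline_results = core.PipelineResults.from_results_tsv("test_output/test.tsv")
    print(pipeline_results.results_df.head())
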
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ssi_analysis_result_parsers
- Version: 0.0.4
+ Version: 0.0.6
  Summary: TODO
  Home-page: https://github.com/thej-ssi/ssi_analysis_result_parsers
  Author: Thor Bech Johannesen
@@ -1,15 +1,16 @@
- ssi_analysis_result_parsers/Legionella_parser.py,sha256=CP5r1RriVd4zxeTBokLJYcu5iS6xbK3pBzI6xwITSm0,8894
- ssi_analysis_result_parsers/__init__.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
- ssi_analysis_result_parsers/_modidx.py,sha256=JY_GM0tMojzTtX9O4D8as4k5a-sXqkxkb7ZUEPzhuMk,12232
- ssi_analysis_result_parsers/blast_parser.py,sha256=L7EdW2LUwSS2OQO7WZUAxP6whJXdvTILE2a3O59uv-s,7441
- ssi_analysis_result_parsers/core.py,sha256=6TGURv8spPdBpwKv6LvqvbVzJChdeHwsG3WQ6QLUuvE,12124
+ ssi_analysis_result_parsers/Legionella_parser.py,sha256=an9Rm9r4N3wQXy0qhUpvacy4Wb6HxUXFdDA7D6YsQyY,7237
+ ssi_analysis_result_parsers/__init__.py,sha256=S7u1lbuWmM3A3ajykBialmPoJUK6Jg-WmNqM-9OZFdk,22
+ ssi_analysis_result_parsers/_modidx.py,sha256=ysvICOsqtGaXuCYPu-UuRGVRhZDJ-O9X3o9lE7rzzGI,11089
+ ssi_analysis_result_parsers/blast_parser.py,sha256=EBqWlx8bDlaSzqAZomiUGnT2DGaaA-L7ukny7SEJbpk,7915
+ ssi_analysis_result_parsers/core.py,sha256=8CzFMDrGJ24D9aoIebLsG8tx-OxvYJod1cxBITqNfaY,12258
  ssi_analysis_result_parsers/hello_world.py,sha256=jpN94sqYuNHqUbUZMCJ35qGY5iLPB_emucgnDGDUk_U,1895
  ssi_analysis_result_parsers/some_string.py,sha256=JwmAXKbX_JgY8UGh4FAu5-7ZjezcAEhq4Q2B73pWp2M,923
  ssi_analysis_result_parsers/config/config.default.env,sha256=Zt6bfPbVV3rYCksoebX1ruAdFgeD9wqAnKDtswhtJJM,1390
  ssi_analysis_result_parsers/config/config.default.yaml,sha256=3qgUrUtQpxrzYv7WQaHsvz9dQB0RALKNU0idxv7oRqM,460
- ssi_analysis_result_parsers-0.0.4.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
+ ssi_analysis_result_parsers-0.0.6.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
  test_input/.DS_Store,sha256=sdTEvl9DTKPHNPYYjMqDepX7q7ZETlonk21tGEuWLao,6148
- test_input/Legionella/batch_parser_file_paths.tsv,sha256=zls11lmEA5U89d8RsX6PR8M1zXNVimeL4raqdZ3ijvQ,210
+ test_input/empty_file.txt,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ test_input/Legionella/batch_parser_file_paths.tsv,sha256=AikBS_Ez1xO3UrEQ19AY3z6drBDdMAiSGK66NLeyYj4,356
  test_input/Legionella/lag-1_blast.tsv,sha256=MN5QL_iBn9gQ8VTYEcTnT0JwKgpkD8G15-QFOrSWxkU,1133
  test_input/Legionella/lag-1_blast_2.tsv,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test_input/Legionella/test.sbt.tsv,sha256=ibhaH3is2dxHaABPvR2QM2HAq9bKOs1AwOTmrwSrcd8,168
@@ -20,8 +21,8 @@ test_input/blast_parser/gene_presence_absence_test.tsv,sha256=qCvMkBC-1GuXx83RDh
  test_output/output_with_sample_name.tsv,sha256=NQG7WaxczuWCCsX2a9MUxCCYpbuAirz9gw08OLdEdUo,41
  test_output/test.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
  test_output/test_batch_output.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
- ssi_analysis_result_parsers-0.0.4.dist-info/METADATA,sha256=r6IJQQ7JgRD_--UlM80lNMH5ZzlxYQCmvaVBIyPhF7k,2765
- ssi_analysis_result_parsers-0.0.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- ssi_analysis_result_parsers-0.0.4.dist-info/entry_points.txt,sha256=eG2NzlNDoG__0PPHl3eoKK5EXIz02BGhRX-L2aWgKCY,447
- ssi_analysis_result_parsers-0.0.4.dist-info/top_level.txt,sha256=3q56bBc2Wv2a6ZQ1l_9m66vot2-Qu6tM9tDr3QQ8auM,81
- ssi_analysis_result_parsers-0.0.4.dist-info/RECORD,,
+ ssi_analysis_result_parsers-0.0.6.dist-info/METADATA,sha256=WUMU9Lfanw3DLtDNZUzKIZaBU071v00068cENqOkpq8,2765
+ ssi_analysis_result_parsers-0.0.6.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ ssi_analysis_result_parsers-0.0.6.dist-info/entry_points.txt,sha256=eG2NzlNDoG__0PPHl3eoKK5EXIz02BGhRX-L2aWgKCY,447
+ ssi_analysis_result_parsers-0.0.6.dist-info/top_level.txt,sha256=3q56bBc2Wv2a6ZQ1l_9m66vot2-Qu6tM9tDr3QQ8auM,81
+ ssi_analysis_result_parsers-0.0.6.dist-info/RECORD,,
@@ -1,3 +1,5 @@
  sample_name sbt_results lag1_blast_results
  sample_1 test_input/Legionella/test.sbt.tsv test_input/Legionella/lag-1_blast.tsv
  sample_2 test_input/Legionella/test2.sbt.tsv test_input/Legionella/lag-1_blast_2.tsv
+ sample_3 test_input/Legionella/test2.sbt.tsv test_input/empty_file.txt
+ sample_4 test_input/empty_file.txt test_input/Legionella/lag-1_blast_2.tsv
File without changes