ssi-analysis-result-parsers 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,669 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/49_Ecoli_parser.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['thresholds', 'samplesheet_path', 'output_dir', 'output_path', 'sample_sheet_df', 'sample_output_df', 'original_cols',
5
+ 'output_cols', 'output_initial_cols', 'output_specific_cols', 'ERR3528110_res_path', 'ERR3528110_input_df',
6
+ 'ERR3528110_row', 'gene_hits', 'parsed_hits', 'O_gene_alleles', 'H_gene_alleles', 'O_type', 'H_type',
7
+ 'O_gene_keys', 'H_gene_keys', 'O_genes_no', 'H_genes_no', 'ERR14229029_row', 'ERR14229029_expected_values',
8
+ 'ERR14229029_values', 'test_cases', 'setup_logging', 'get_threshold', 'process_res_file', 'EcoliResults',
9
+ 'ecoli_parser']
10
+
11
+ # %% ../nbs/49_Ecoli_parser.ipynb 3
12
+ import os
13
+ import sys
14
+ import pandas as pd
15
+ from pathlib import Path
16
+ import logging
17
+ from datetime import datetime
18
+ from typing import List, Dict
19
+ from fastcore.script import call_parse
20
+
21
+ # import functions from core module (optional, but most likely needed).
22
+ from . import core
23
+
24
+ # %% ../nbs/49_Ecoli_parser.ipynb 6
25
+ thresholds = {
26
+ "stx": [98, 98],
27
+ "wzx": [98, 98],
28
+ "wzy": [98, 98],
29
+ "wzt": [98, 98],
30
+ "wzm": [98, 98],
31
+ "fliC": [90, 90],
32
+ "fli": [90, 90],
33
+ "eae": [95, 95],
34
+ "ehxA": [95, 95],
35
+ "other": [98, 98],
36
+ }
37
+
38
+ # %% ../nbs/49_Ecoli_parser.ipynb 9
39
+ def setup_logging(log_dir: str, sample_name: str) -> None:
40
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
41
+ os.makedirs(log_dir, exist_ok=True)
42
+ log_file = os.path.join(log_dir, f"{sample_name}_kma_fbi.log")
43
+
44
+ logger = logging.getLogger()
45
+ while logger.hasHandlers():
46
+ logger.removeHandler(logger.handlers[0])
47
+
48
+ logging.basicConfig(
49
+ filename=log_file,
50
+ filemode="a",
51
+ format="%(asctime)s - %(levelname)s - %(message)s",
52
+ level=logging.INFO,
53
+ )
54
+
55
+ console_handler = logging.StreamHandler(sys.stdout)
56
+ console_handler.setFormatter(logging.Formatter("%(message)s"))
57
+ logger.addHandler(console_handler)
58
+
59
+ logging.info(f"Logging started for {log_file}")
60
+
61
+ # %% ../nbs/49_Ecoli_parser.ipynb 11
62
+ def get_threshold(template_name: str, thresholds: Dict[str, List[int]]) -> List[int]:
63
+ """
64
+ Returns the coverage and identity threshold for a given gene.
65
+
66
+ Args:
67
+ template_name (str): Name of the template (gene) from the .res file.
68
+ thresholds (Dict[str, List[int]]): Dictionary of gene thresholds.
69
+
70
+ Returns:
71
+ List[int]: A list of two integers: [coverage_threshold, identity_threshold].
72
+ """
73
+ for key in thresholds:
74
+ if key in template_name:
75
+ return thresholds[key]
76
+ return thresholds["other"]
77
+
78
+
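A minimal sketch of how this substring lookup behaves, assuming the module is importable as ssi_analysis_result_parsers.Ecoli_parser and using illustrative template names in the ref__gene__allele__accession layout seen in the test data:

from ssi_analysis_result_parsers.Ecoli_parser import get_threshold, thresholds

for name in (
    "4__stx2__stx2-a__X",  # contains "stx"  -> [98, 98]
    "3__fliC__H2__X",      # contains "fliC" -> [90, 90]
    "1__adk__adk__X",      # no key matches  -> falls back to "other": [98, 98]
):
    print(name, get_threshold(name, thresholds))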
79
+ def process_res_file(res_file_path: str) -> pd.DataFrame:
80
+ """
81
+ Reads and filters a KMA .res file based on predefined thresholds.
82
+
83
+ Args:
84
+ res_file_path (str): Path to the .res file.
85
+         Note: gene-specific thresholds are taken from the module-level 'thresholds' dict.
86
+
87
+ Returns:
88
+ pd.DataFrame: Filtered results DataFrame.
89
+ """
90
+ try:
91
+ res_df = pd.read_csv(res_file_path, sep="\t")
92
+ except FileNotFoundError:
93
+ raise FileNotFoundError(f"File not found: {res_file_path}")
94
+ except pd.errors.EmptyDataError:
95
+ raise ValueError(f"File is empty or not properly formatted: {res_file_path}")
96
+
97
+ required_columns = {"#Template", "Template_Coverage", "Query_Identity", "Depth"}
98
+ if not required_columns.issubset(res_df.columns):
99
+ raise ValueError(f"Missing expected columns in {res_file_path}")
100
+
101
+ res_df["threshold"] = res_df["#Template"].apply(
102
+ lambda x: get_threshold(x, thresholds)
103
+ )
104
+ res_df_filtered = res_df[
105
+ (res_df["Template_Coverage"] >= res_df["threshold"].apply(lambda x: x[0]))
106
+ & (res_df["Query_Identity"] >= res_df["threshold"].apply(lambda x: x[1]))
107
+ ]
108
+ return res_df_filtered
109
+
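For reference, a small sketch of the filter above applied to a two-row .res table written to a temporary file (the rows are illustrative; only the wzx hit clears its 98/98 thresholds):

import os
import tempfile

from ssi_analysis_result_parsers.Ecoli_parser import process_res_file

rows = (
    "#Template\tTemplate_Coverage\tQuery_Identity\tDepth\n"
    "1__wzx__O103__X\t100.0\t100.0\t60.0\n"  # coverage/identity >= 98/98 -> kept
    "2__eae__eae-1__X\t100.0\t94.0\t70.0\n"  # identity 94 < 95 for eae -> dropped
)
with tempfile.NamedTemporaryFile("w", suffix=".res", delete=False) as fh:
    fh.write(rows)
    res_path = fh.name
try:
    filtered = process_res_file(res_path)
    print(filtered["#Template"].tolist())  # ['1__wzx__O103__X']
finally:
    os.remove(res_path)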
110
+ # %% ../nbs/49_Ecoli_parser.ipynb 13
111
+ class EcoliResults:
112
+ """
113
+ Object for holding and processing E. coli typing results.
114
+
115
+     This class stores summary typing data for multiple samples, provides utilities for per-sample processing, and exports results in a tab-separated format (.tsv).
116
+ """
117
+
118
+     # converts the sample results dict to a pandas DataFrame
119
+ def __init__(self, results_dict: dict):
120
+ """
121
+ Initializes the EcoliResults object with typing result data.
122
+
123
+ Args:
124
+ results_dict (dict): Dictionary where keys are sample names and values are summary result dictionaries.
125
+ """
126
+ self.results_dict = results_dict
127
+ self.results_df = pd.DataFrame.from_dict(
128
+ results_dict, orient="index"
129
+ ).reset_index(names="sample_name")
130
+
131
+ @staticmethod
132
+ def summarize_single_sample(
133
+ sample_name: str, res_path: str, verbose_flag: int = 1
134
+ ) -> dict:
135
+ """
136
+ Processes a single sample KMA .res file and returns a summary dictionary.
137
+
138
+ Args:
139
+ sample_name (str): Sample identifier.
140
+ res_path (str): Path to the sample's .res file.
141
+ verbose_flag (int, optional): Include verbose info if set to 1. Default is 1.
142
+
143
+ Returns:
144
+ Dict[str, str]: Summary values extracted from the .res file.
145
+ """
146
+ log_dir = "examples/Log"
147
+ setup_logging(log_dir, sample_name)
148
+
149
+ NA_string = "-"
150
+ output_data = {
151
+ "stx": NA_string,
152
+ "OH": NA_string,
153
+ "wzx": NA_string,
154
+ "wzy": NA_string,
155
+ "wzt": NA_string,
156
+ "wzm": NA_string,
157
+ "eae": NA_string,
158
+ "ehxA": NA_string,
159
+ "Other": NA_string,
160
+ }
161
+
162
+ try:
163
+ logging.info(f"Processing .res file: {res_path}")
164
+ filtered_df = process_res_file(res_path)
165
+ except Exception as e:
166
+ logging.error(f"Failed to process {res_path}: {e}")
167
+ return output_data
168
+
169
+ gene_map = {
170
+ "wzx": "wzx",
171
+ "wzy": "wzy",
172
+ "wzt": "wzt",
173
+ "wzm": "wzm",
174
+ "eae": "eae",
175
+ "ehxA": "ehxA",
176
+ }
177
+ toxin = "stx"
178
+ stx_alleles = set()
179
+ fli = NA_string
180
+ fliC = NA_string
181
+
182
+ for template in filtered_df["#Template"]:
183
+ parts = template.split("__")
184
+ if len(parts) < 3:
185
+ continue
186
+ gene, allele = parts[1], parts[2]
187
+
188
+ if gene in ["eae", "ehxA"]:
189
+ output_data[gene] = "Positive"
190
+ elif gene in gene_map:
191
+ output_data[gene] = allele
192
+ elif gene == "fliC":
193
+ fliC = allele
194
+ elif gene == "fli":
195
+ fli = allele
196
+ elif gene.startswith(toxin):
197
+ stx_alleles.add(allele)
198
+ elif gene not in thresholds:
199
+ output_data["Other"] = allele
200
+
201
+ if stx_alleles:
202
+ output_data[toxin] = ";".join(sorted(stx_alleles))
203
+
204
+ # serotype specific requirements
205
+ wzx, wzy, wzt, wzm = (
206
+ output_data["wzx"],
207
+ output_data["wzy"],
208
+ output_data["wzt"],
209
+ output_data["wzm"],
210
+ )
211
+ Otype = "-"
212
+ if (
213
+ wzx != NA_string
214
+ and wzy != NA_string
215
+ and wzx == wzy
216
+ and wzt == NA_string
217
+ and wzm == NA_string
218
+ ):
219
+ Otype = wzx
220
+ output_data["wzx"] = output_data["wzy"] = NA_string
221
+ elif (
222
+ wzt != NA_string
223
+ and wzm != NA_string
224
+ and wzt == wzm
225
+ and wzx == NA_string
226
+ and wzy == NA_string
227
+ ):
228
+ Otype = wzt
229
+ output_data["wzt"] = output_data["wzm"] = NA_string
230
+
231
+ Htype = fli if fli != NA_string else fliC
232
+ output_data["OH"] = f"{Otype};{Htype}"
233
+
234
+ # adding the additional depth, template coverage and query identity information
235
+ if verbose_flag == 1:
236
+ verbose_parts = []
237
+ for _, row in filtered_df.iterrows():
238
+ parts = row["#Template"].split("__")
239
+ if len(parts) >= 3:
240
+ gene, allele = parts[1], parts[2]
241
+ depth = row["Depth"]
242
+ coverage = row["Template_Coverage"]
243
+ identity = row["Query_Identity"]
244
+ verbose_parts.append(
245
+ f"{gene}_{allele}_{depth:.2f}_{coverage:.2f}_{identity:.2f}"
246
+ )
247
+ output_data["verbose"] = ";".join(verbose_parts)
248
+
249
+ logging.info(f"Successfully processed sample: {sample_name}")
250
+ return output_data
251
+
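To make the serotype rules above concrete, this is the summary dictionary the method is expected to return for a sample whose filtered hits are wzx__O103, wzy__O103 and fliC__H2 (mirroring the synthetic sample1 case further down; the verbose entry is only added when verbose_flag == 1):

expected_summary = {
    "stx": "-",
    "OH": "O103;H2",  # O-type from matching wzx/wzy (no wzt/wzm), H-type from fliC
    "wzx": "-",       # suppressed once the O-type has been assigned
    "wzy": "-",
    "wzt": "-",
    "wzm": "-",
    "eae": "-",
    "ehxA": "-",
    "Other": "-",
    "verbose": "wzx_O103_60.00_100.00_100.00;wzy_O103_65.00_100.00_100.00;fliC_H2_70.00_100.00_100.00",
}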
252
+ @classmethod
253
+ def from_samplesheet(
254
+ cls,
255
+ samplesheet_path: Path,
256
+ verbose: int = 1,
257
+ results_base: str = "examples/Results/{sample_name}/kma/{sample_name}.res",
258
+ ) -> "EcoliResults":
259
+ """
260
+ Loads sample data from a samplesheet and summarizes each sample.
261
+
262
+ Args:
263
+ samplesheet_path (Path): Path to the samplesheet TSV file.
264
+ verbose (int, optional): Whether to include verbose output per sample. Default is 1.
265
+
266
+ Returns:
267
+ EcoliResults: An instance of the class populated with summaries for all samples.
268
+ """
269
+ df = pd.read_csv(samplesheet_path, sep="\t")
270
+ df.columns = df.columns.str.strip()
271
+ # print("I AM INSIDE FROM SAMPLESHEET")
272
+ # if "Illumina_read_files" in df.columns and ("read1" not in df.columns or "read2" not in df.columns):
273
+ # df[["read1", "read2"]] = df["Illumina_read_files"].str.split(",", expand=True)
274
+
275
+ results_dict = {}
276
+ for idx, row in df.iterrows():
277
+ sample_name = row["sample_name"]
278
+ res_path = Path(
279
+ results_base.format(sample_name=sample_name)
280
+ ) # results_base / sample_name / "kma" / f"{sample_name}.res"
281
+ # print(f"The res path is : {res_path}")
282
+ summary = cls.summarize_single_sample(
283
+ sample_name, res_path, verbose_flag=verbose
284
+ )
285
+ results_dict[sample_name] = summary
286
+
287
+ # Convert to DataFrame
288
+ result_df = pd.DataFrame.from_dict(results_dict, orient="index").reset_index(
289
+ names="sample_name"
290
+ )
291
+
292
+ # Merge with original metadata
293
+ merged_df = df.merge(result_df, on="sample_name", how="left")
294
+
295
+ # Create and return object
296
+ obj = cls(results_dict)
297
+ obj.results_df = merged_df
298
+ return obj
299
+
300
+ def write_tsv(self, output_file: Path):
301
+ """
302
+ Writes the summarized typing results to a TSV file.
303
+
304
+ Args:
305
+ output_file (Path): Destination file path for the output table.
306
+ """
307
+ self.results_df.to_csv(output_file, sep="\t", index=False)
308
+
309
+ def __repr__(self):
310
+ """
311
+ Returns a concise summary of the results object.
312
+
313
+ Returns:
314
+ str: A string with sample and variable counts.
315
+ """
316
+ return f"<EcoliResults: {len(self.results_df)} samples, {len(self.results_df.columns)} variables>"
317
+
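As a usage sketch (the file name is illustrative), an EcoliResults object can also be constructed directly from a results dictionary and exported:

from pathlib import Path

from ssi_analysis_result_parsers.Ecoli_parser import EcoliResults

results = EcoliResults(
    {"sampleA": {"stx": "stx2-a", "OH": "O157;H7", "eae": "Positive", "ehxA": "-"}}
)
print(results)  # <EcoliResults: 1 samples, 5 variables>
results.write_tsv(Path("sampleA_summary.tsv"))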
318
+ # %% ../nbs/49_Ecoli_parser.ipynb 15
319
+ @call_parse
320
+ def ecoli_parser(
321
+ samplesheet_path: Path, # Input samplesheet
322
+ output_file: Path = None, # Path to output
323
+     verbose: int = 1,  # Verbosity
324
+ results_base: str = "examples/Results/{sample_name}/kma/{sample_name}.res", # Path template for .res files
325
+ ):
326
+ results = EcoliResults.from_samplesheet(
327
+ samplesheet_path, verbose=verbose, results_base=results_base
328
+ )
329
+ if output_file:
330
+ results.write_tsv(output_file)
331
+ else:
332
+ print(results.results_df)
333
+
334
+ # %% ../nbs/49_Ecoli_parser.ipynb 17
335
+ # | eval: true
336
+ import pandas as pd
337
+ from pathlib import Path
338
+ import os
339
+
340
+ # Define paths
341
+ samplesheet_path = Path("test_input/Ecoli/samplesheet.tsv")
342
+ output_dir = Path("test_output/Ecoli")
343
+
344
+ # Create output directory
345
+ if not output_dir.exists():
346
+ output_dir.mkdir(parents=True, exist_ok=True)
347
+
348
+ output_path = output_dir / "KMA_cases_parser.tsv"
349
+
350
+ # Assert input exists
351
+ assert samplesheet_path.exists(), f"File does not exist: {samplesheet_path}"
352
+ print(output_path)
353
+
354
+ # try the ecoli parser to see if the wrangling functionality works
355
+ try:
356
+ ecoli_parser(
357
+ samplesheet_path=samplesheet_path,
358
+ output_file=output_path,
359
+ verbose=1,
360
+ results_base="test_input/Ecoli/{sample_name}.res",
361
+ )
362
+ except Exception as e:
363
+ raise AssertionError(f"Parser execution failed: {e}")
364
+
365
+ # compare the output with the expected results based on input to ensure correct wrangling
366
+
367
+ # read the created output files and check the information
368
+ sample_sheet_df = pd.read_csv(samplesheet_path, sep="\t")
369
+ sample_output_df = pd.read_csv(output_path, sep="\t")
370
+
371
+ ### Test case 1. Check that the data structure is correct
372
+ original_cols = sample_sheet_df.columns.tolist()
373
+ output_cols = sample_output_df.columns.tolist()
374
+ output_initial_cols = sample_output_df.columns[: len(original_cols)].tolist()
375
+ output_specific_cols = sample_output_df.columns[len(original_cols) :].tolist()
376
+
377
+ assert (
378
+ original_cols == output_initial_cols
379
+ ), f"Mismatch in first columns:\nExpected: {original_cols}\nGot: {output_initial_cols}"
380
+
381
+ assert output_specific_cols
382
+
383
+ ### Test case 2. Check sample ERR3528110, which is correctly classified as E. coli, and ensure the data wrangling behaves as expected
384
+ ERR3528110_res_path = "test_input/Ecoli/ERR3528110.res"
385
+ ERR3528110_input_df = pd.read_csv(ERR3528110_res_path, sep="\t")
386
+
387
+ ERR3528110_row = (
388
+ sample_output_df[sample_output_df["sample_name"] == "ERR3528110"]
389
+ .iloc[:, len(original_cols) : len(output_cols)]
390
+ .iloc[0]
391
+ )
392
+
393
+ # extract the original genes from the res
394
+ gene_hits = ERR3528110_input_df["#Template"].tolist()
395
+
396
+ parsed_hits = []
397
+
398
+ for hit in gene_hits:
399
+ parts = hit.split("__")
400
+ assert (
401
+         len(parts) >= 3
402
+     ), f"Unexpected KMA result format in: '{hit}'. Expected at least 3 '__' parts (e.g., ref__gene__allele) as of ecoli fbi 24-04-2025."
403
+ gene, allele = parts[1], parts[2]
404
+ parsed_hits.append((gene, allele))
405
+
406
+ # Extract OH genes
407
+ O_gene_alleles = {
408
+ gene: allele for gene, allele in parsed_hits if gene in {"wzx", "wzy", "wzt", "wzm"}
409
+ }
410
+ H_gene_alleles = {
411
+ gene: allele for gene, allele in parsed_hits if gene in {"fli", "fliC"}
412
+ }
413
+
414
+ O_type = ERR3528110_row["OH"].split(";")[0]
415
+ H_type = ERR3528110_row["OH"].split(";")[1]
416
+
417
+ O_gene_keys = set(O_gene_alleles.keys())
418
+ H_gene_keys = set(H_gene_alleles.keys())
419
+
420
+ O_genes_no = len(O_gene_keys)
421
+ H_genes_no = len(H_gene_keys)
422
+
423
+ # O typing scenarios
424
+ # Case 1: wzx/wzy match
425
+ if O_gene_keys == {"wzx", "wzy"} and O_gene_alleles["wzx"] == O_gene_alleles["wzy"]:
426
+ expected_otype = O_gene_alleles["wzx"]
427
+ assert O_type == expected_otype, f"Expected OH '{expected_otype}', got '{O_type}'"
428
+ # wzx/wzy should be suppressed
429
+ assert ERR3528110_row["wzx"] == "-", "wzx column should be '-' when OH is used"
430
+ assert ERR3528110_row["wzy"] == "-", "wzy column should be '-' when OH is used"
431
+ # print(f"O-type correctly assigned from matching wzx/wzy: {O_type}")
432
+
433
+ # Case 2: wzt/wzm match
434
+ elif O_gene_keys == {"wzt", "wzm"} and O_gene_alleles["wzt"] == O_gene_alleles["wzm"]:
435
+ expected_otype = O_gene_alleles["wzt"]
436
+ assert O_type == expected_otype, f"Expected OH '{expected_otype}', got '{O_type}'"
437
+ assert ERR3528110_row["wzt"] == "-", "wzt column should be '-' when OH is used"
438
+ assert ERR3528110_row["wzm"] == "-", "wzm column should be '-' when OH is used"
439
+ # print(f"O-type correctly assigned from matching wzt/wzm: {O_type}")
440
+
441
+ # Case 3: Conflict (≥3 genes, or 2 mismatched genes)
442
+ elif O_genes_no >= 3 or (
443
+ (O_gene_keys == {"wzx", "wzy"} and O_gene_alleles["wzx"] != O_gene_alleles["wzy"])
444
+ or (
445
+ O_gene_keys == {"wzt", "wzm"} and O_gene_alleles["wzt"] != O_gene_alleles["wzm"]
446
+ )
447
+ ):
448
+ assert O_type == "-", f"Expected OH = '-' due to conflict, got: '{O_type}'"
449
+ for gene in O_gene_keys:
450
+ assert (
451
+ ERR3528110_row[gene] == O_gene_alleles[gene]
452
+ ), f"{gene} column should contain '{O_gene_alleles[gene]}'"
453
+ # print("Conflict in O-typing correctly led to OH = '-' and individual gene columns retained.")
454
+
455
+ # H typing scenarios
456
+
457
+ # Case 1: If fli is present it will always take precedence over fliC
458
+ if H_gene_keys == {"fli"}:
459
+ expected_htype = H_gene_alleles["fli"]
460
+ assert (
461
+ H_type == expected_htype
462
+ ), f"Expected OH '{expected_htype}' from 'fli', got '{H_type}'"
463
+
464
+ # Case 2: fliC is used only when it is the sole H gene present
465
+ elif H_gene_keys == {"fliC"}:
466
+ expected_htype = H_gene_alleles["fliC"]
467
+ assert (
468
+ H_type == expected_htype
469
+ ), f"Expected OH '{expected_htype}' from 'fliC', got '{H_type}'"
470
+
471
+ # Case 3: if neither gene is present the H-type remains empty
472
+ else:
473
+ assert H_type == "-", f"Expected H-type '-', but got '{H_type}'"
474
+
475
+ ### Test case 3. Check that sample ERR14229029, which is listed as E. coli in the samplesheet, yields an empty result, as it was erroneously classified as E. coli
476
+
477
+ ERR14229029_row = (
478
+ sample_output_df[sample_output_df["sample_name"] == "ERR14229029"]
479
+ .iloc[:, len(original_cols) : len(output_cols)]
480
+ .iloc[0]
481
+ )
482
+
483
+ ERR14229029_expected_values = [
484
+ "-",
485
+ "-;-",
486
+ "-",
487
+ "-",
488
+ "-",
489
+ "-",
490
+ "-",
491
+ "-",
492
+ "-",
493
+ float("nan"),
494
+ ]
495
+ ERR14229029_values = [ERR14229029_row[col] for col in output_specific_cols]
496
+
497
+ for col, actual, expected in zip(
498
+ output_specific_cols, ERR14229029_values, ERR14229029_expected_values
499
+ ):
500
+ if pd.isna(expected):
501
+ assert pd.isna(actual), f"{col}: Expected NaN, got {actual}"
502
+ else:
503
+ assert actual == expected, f"{col}: Expected '{expected}', got '{actual}'"
504
+
505
+ # %% ../nbs/49_Ecoli_parser.ipynb 19
506
+ import os
507
+ from tempfile import TemporaryDirectory
508
+ from pathlib import Path
509
+
510
+ test_cases = [
511
+ # sample_name, res_content, expected_oh, expected_stx, expected_eae, expected_ehxA
512
+ (
513
+ "sample1",
514
+ "1__wzx__O103__X\t100\t100\t60\n2__wzy__O103__X\t100\t100\t65\n3__fliC__H2__X\t100\t100\t70",
515
+ "O103;H2",
516
+ "-",
517
+ "-",
518
+ "-",
519
+ ),
520
+ (
521
+ "sample2",
522
+ "1__wzt__O8__X\t100\t100\t60\n2__wzm__O8__X\t100\t100\t65\n3__fliC__H10__X\t100\t100\t70\n4__stx2__stx2-a__X\t100\t100\t90\n5__eae__eae-5__X\t100\t100\t80",
523
+ "O8;H10",
524
+ "stx2-a",
525
+ "Positive",
526
+ "-",
527
+ ),
528
+ ("sample3", "1__fliC__H7__X\t100\t100\t70", "-;H7", "-", "-", "-"),
529
+ (
530
+ "sample4",
531
+ "bad_line\n2__wzy__O111__X\t100\t100\t70\n3__fliC__H11__X\t100\t100\t70",
532
+ "-;H11",
533
+ "-",
534
+ "-",
535
+ "-",
536
+ ),
537
+ ("sample5", "", "-;-", "-", "-", "-"),
538
+ (
539
+ "sample6",
540
+ "1__wzx__O157__X\t100\t100\t60\n2__wzy__O157__X\t100\t100\t65\n3__wzt__O8__X\t100\t100\t60\n4__wzm__O8__X\t100\t100\t65\n5__fli__H2__X\t100\t100\t70",
541
+ "-;H2",
542
+ "-",
543
+ "-",
544
+ "-",
545
+ ),
546
+ (
547
+ "sample7",
548
+ "1__wzx__O157__X\t100\t100\t60\n2__wzy__O111__X\t100\t100\t65\n3__fliC__H9__X\t100\t100\t70",
549
+ "-;H9",
550
+ "-",
551
+ "-",
552
+ "-",
553
+ ),
554
+ (
555
+ "sample8",
556
+ "1__fli__H1__X\t100\t100\t70\n2__fliC__H12__X\t100\t100\t70",
557
+ "-;H1",
558
+ "-",
559
+ "-",
560
+ "-",
561
+ ),
562
+ (
563
+ "sample9",
564
+ "1__wzx__O157__X\t100\t100\t60\n2__wzy__O157__X\t100\t100\t65\n3__wzt__O8__X\t100\t100\t60\n4__wzm__O8__X\t100\t100\t65\n5__fliC__H10__X\t100\t100\t70\n6__fli__H2__X\t100\t100\t70\n7__stx1__stx1-a__X\t100\t100\t90\n8__stx2__stx2-d__X\t100\t100\t90\n9__stx2__stx2-a__X\t100\t100\t90\n10__eae__eae-42-5__X\t100\t100\t80\n11__ehxA__ehxA-7__X\t100\t100\t80",
565
+ "-;H2",
566
+ "stx1-a;stx2-a;stx2-d",
567
+ "Positive",
568
+ "Positive",
569
+ ),
570
+ (
571
+ "sample10",
572
+ "1__adk__adk__X\t100\t100\t70\n2__fliC__H4__X\t100\t100\t70",
573
+ "-;H4",
574
+ "-",
575
+ "-",
576
+ "-",
577
+ ),
578
+ (
579
+ "sample11",
580
+ "1__eae__eae-1__X\t100\t94\t70\n2__fliC__H6__X\t100\t100\t70",
581
+ "-;H6",
582
+ "-",
583
+ "-",
584
+ "-",
585
+ ),
586
+ (
587
+ "sample12",
588
+ "1__stx1__stx1a__X\t100\t100\t80\n2__stx2__stx2c__X\t100\t100\t85\n3__fli__H21__X\t100\t100\t70",
589
+ "-;H21",
590
+ "stx1a;stx2c",
591
+ "-",
592
+ "-",
593
+ ),
594
+ ]
595
+
596
+ for (
597
+ sample_name,
598
+ res_content,
599
+ expected_oh,
600
+ expected_stx,
601
+ expected_eae,
602
+ expected_ehxA,
603
+ ) in test_cases:
604
+ with TemporaryDirectory() as tmpdir:
605
+ tmpdir = Path(tmpdir)
606
+ os.chdir(tmpdir)
607
+
608
+ res_dir = tmpdir / f"examples/Results/{sample_name}/kma"
609
+ res_dir.mkdir(parents=True)
610
+ res_file = res_dir / f"{sample_name}.res"
611
+ res_file.write_text(
612
+ "#Template\tTemplate_Coverage\tQuery_Identity\tDepth\n" + res_content
613
+ )
614
+
615
+ sheet = tmpdir / "samplesheet.tsv"
616
+ sheet.write_text(
617
+ "sample_name\tIllumina_read_files\tNanopore_read_file\tassembly_file\torganism\tvariant\tnotes\n"
618
+ f"{sample_name}\tread1.fastq,read2.fastq\t-\t-\tEcoli\t-\t-\n"
619
+ )
620
+
621
+ results = EcoliResults.from_samplesheet(sheet)
622
+ df = results.results_df
623
+ row = df.iloc[0]
624
+
625
+ # general output and functionality test
626
+ assert row["sample_name"] == sample_name
627
+
628
+ if row["OH"] != expected_oh:
629
+ raise AssertionError(
630
+ f"\nSample: {sample_name}\nExpected OH: {expected_oh}\nActual OH: {row['OH']}"
631
+ )
632
+ assert row["OH"] == expected_oh
633
+
634
+ if row["stx"] != expected_stx:
635
+ raise AssertionError(
636
+ f"\nSample: {sample_name}\nExpected stx: {expected_stx}\nActual stx: {row['stx']}"
637
+ )
638
+ assert row["stx"] == expected_stx
639
+
640
+ if row["eae"] != expected_eae:
641
+ raise AssertionError(
642
+ f"\nSample: {sample_name}\nExpected eae: {expected_eae}\nActual eae: {row['eae']}"
643
+ )
644
+ assert row["eae"] == expected_eae
645
+
646
+ if row["ehxA"] != expected_ehxA:
647
+ raise AssertionError(
648
+ f"\nSample: {sample_name}\nExpected ehxA: {expected_ehxA}\nActual ehxA: {row['ehxA']}"
649
+ )
650
+ assert row["ehxA"] == expected_ehxA
651
+
652
+ # sample specific information tests
653
+
654
+         # without conflicting O and H typing, the OH column should be filled and the remaining four gene columns empty
655
+ if sample_name == "sample1":
656
+ assert row["wzx"] == "-"
657
+ assert row["wzy"] == "-"
658
+ assert row["wzt"] == "-"
659
+ assert row["wzm"] == "-"
660
+         # with conflicts the OH should remain empty and the four 'conflicting' gene columns remain filled
661
+ elif sample_name == "sample6":
662
+ assert row["wzx"] == "O157"
663
+ assert row["wzy"] == "O157"
664
+ assert row["wzt"] == "O8"
665
+ assert row["wzm"] == "O8"
666
+ elif sample_name == "sample10":
667
+ assert row["Other"] == "adk"
668
+
669
+ print("All 12 syntehtic E. coli sample inline tests passed.")
@@ -56,7 +56,14 @@ def extract_emm_type(emm_blast_tsv: Path):
56
56
  """
57
57
 
58
58
  emm_types_in_emm_plus_mrp_operons = [] ### to update
59
- mrp_types_in_emm_plus_mrp_operons = ["156"] ### to update
59
+ mrp_types_in_emm_plus_mrp_operons = [
60
+ "134",
61
+ "156",
62
+ "159",
63
+ "164",
64
+ "174",
65
+ "205",
66
+ ] ### to update
60
67
  emm_blast_tsv = Path(emm_blast_tsv)
61
68
  emm_typing_results = {"EMM_type": "-", "ENN_type": "-", "MRP_type": "-"}
62
69
  if not emm_blast_tsv.exists():
@@ -85,87 +92,118 @@ def extract_emm_type(emm_blast_tsv: Path):
85
92
  .groupby("extended_sstart")
86
93
  .first()
87
94
  )
88
- print(blast_df_unique)
89
- if blast_df_unique.shape[0] == 1:
90
- emm_typing_results["EMM_type"] = blast_df_unique.iloc[0]["qseqid"][3:]
91
- if (
92
- blast_df_unique.iloc[0]["length"] < 180
93
- or blast_df_unique.iloc[0]["pident"] < 100
94
- ):
95
- emm_typing_results["EMM_type"] += "*"
96
- notes.append(
97
- f"EMM{blast_df_unique.iloc[0]['qseqid'][3:]} with {round(blast_df_unique.iloc[0]['pident'],2)} and length {blast_df_unique.iloc[0]['length']}/{blast_df_unique.iloc[0]['qlen']}"
98
- )
99
- else:
100
- if blast_df_unique.iloc[0]["sstart"] < blast_df_unique.iloc[0]["send"]:
101
- blast_df_unique = blast_df_unique.sort_values(by=["sstart"], ascending=True)
102
- else:
103
- blast_df_unique = blast_df_unique.sort_values(
104
- by=["sstart"], ascending=False
95
+
96
+ if blast_df_unique.shape[0] == 0:
97
+ notes.append("No blast hits found for EMM genes")
98
+ elif len(set(blast_df_unique["sseqid"])) == 1:
99
+ if blast_df_unique.shape[0] == 1:
100
+ emm_typing_results["EMM_type"] = (
101
+ "EMM" + blast_df_unique.iloc[0]["qseqid"][3:]
105
102
  )
106
- if blast_df_unique.shape[0] == 2:
107
- emm_typing_results["EMM_type"] = blast_df_unique.iloc[0]["qseqid"][3:]
108
103
  if (
109
- blast_df_unique.iloc[0]["length"] < 180
104
+ blast_df_unique.iloc[0]["length"] < blast_df_unique.iloc[0]["qlen"]
110
105
  or blast_df_unique.iloc[0]["pident"] < 100
111
106
  ):
112
107
  emm_typing_results["EMM_type"] += "*"
113
108
  notes.append(
114
- f"EMM{blast_df_unique.iloc[0]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[0]['pident'],2)} and length {blast_df_unique.iloc[0]['length']}/{blast_df_unique.iloc[0]['qlen']}"
109
+ f"EMM{blast_df_unique.iloc[0]['qseqid'][3:]} with {round(blast_df_unique.iloc[0]['pident'],2)} and length {blast_df_unique.iloc[0]['length']}/{blast_df_unique.iloc[0]['qlen']}"
110
+ )
111
+ else:
112
+ if blast_df_unique.iloc[0]["sstart"] < blast_df_unique.iloc[0]["send"]:
113
+ blast_df_unique = blast_df_unique.sort_values(
114
+ by=["sstart"], ascending=True
115
+ )
116
+ else:
117
+ blast_df_unique = blast_df_unique.sort_values(
118
+ by=["sstart"], ascending=False
115
119
  )
120
+ if blast_df_unique.shape[0] == 2:
121
+ emm_typing_results["EMM_type"] = (
122
+ "EMM" + blast_df_unique.iloc[0]["qseqid"][3:]
123
+ )
124
+ if (
125
+ blast_df_unique.iloc[0]["length"] < blast_df_unique.iloc[0]["qlen"]
126
+ or blast_df_unique.iloc[0]["pident"] < 100
127
+ ):
128
+ emm_typing_results["EMM_type"] += "*"
129
+ notes.append(
130
+ f"EMM{blast_df_unique.iloc[0]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[0]['pident'],2)} and length {blast_df_unique.iloc[0]['length']}/{blast_df_unique.iloc[0]['qlen']}"
131
+ )
116
132
 
117
- emm_typing_results["ENN_type"] = blast_df_unique.iloc[1]["qseqid"][3:]
118
- if (
119
- blast_df_unique.iloc[1]["length"] < 180
120
- or blast_df_unique.iloc[1]["pident"] < 100
121
- ):
122
- emm_typing_results["ENN_type"] += "*"
123
- notes.append(
124
- f"ENN{blast_df_unique.iloc[1]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[1]['pident'],2)} and length {blast_df_unique.iloc[1]['length']}/{blast_df_unique.iloc[1]['qlen']}"
133
+ emm_typing_results["ENN_type"] = (
134
+ "ENN" + blast_df_unique.iloc[1]["qseqid"][3:]
125
135
  )
126
- emm_maintype = blast_df_unique.iloc[0]["qseqid"][3:].split(".")[0]
127
- mrp_maintype = blast_df_unique.iloc[1]["qseqid"][3:].split(".")[0]
128
- if (
129
- mrp_maintype in emm_types_in_emm_plus_mrp_operons
130
- or emm_maintype in mrp_types_in_emm_plus_mrp_operons
131
- ):
132
- emm_typing_results["MRP_type"] = emm_typing_results["EMM_type"]
133
- emm_typing_results["EMM_type"] = emm_typing_results["ENN_type"]
134
- emm_typing_results["ENN_type"] = "-"
135
- notes.append(f"EMM redesignated due to known MRP+EMM operon")
136
+ if (
137
+ blast_df_unique.iloc[1]["length"] < blast_df_unique.iloc[1]["qlen"]
138
+ or blast_df_unique.iloc[1]["pident"] < 100
139
+ ):
140
+ emm_typing_results["ENN_type"] += "*"
141
+ notes.append(
142
+ f"ENN{blast_df_unique.iloc[1]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[1]['pident'],2)} and length {blast_df_unique.iloc[1]['length']}/{blast_df_unique.iloc[1]['qlen']}"
143
+ )
144
+ emm_maintype = blast_df_unique.iloc[0]["qseqid"][3:].split(".")[0]
145
+ mrp_maintype = blast_df_unique.iloc[1]["qseqid"][3:].split(".")[0]
146
+ if (
147
+ mrp_maintype in emm_types_in_emm_plus_mrp_operons
148
+ or emm_maintype in mrp_types_in_emm_plus_mrp_operons
149
+ ):
150
+ emm_typing_results["MRP_type"] = (
151
+ "MRP" + emm_typing_results["EMM_type"][3:]
152
+ )
153
+ emm_typing_results["EMM_type"] = (
154
+ "EMM" + emm_typing_results["ENN_type"][3:]
155
+ )
156
+ emm_typing_results["ENN_type"] = "-"
157
+ notes.append(f"EMM redesignated due to known MRP+EMM operon")
136
158
 
137
- elif blast_df_unique.shape[0] == 3:
138
- emm_typing_results["MRP_type"] = blast_df_unique.iloc[0]["qseqid"][3:]
139
- if (
140
- blast_df_unique.iloc[0]["length"] < 180
141
- or blast_df_unique.iloc[0]["pident"] < 100
142
- ):
143
- emm_typing_results["MRP_type"] += "*"
144
- notes.append(
145
- f"MRP{blast_df_unique.iloc[0]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[0]['pident'],2)} and length {blast_df_unique.iloc[0]['length']}/{blast_df_unique.iloc[0]['qlen']}"
159
+ elif blast_df_unique.shape[0] == 3:
160
+ emm_typing_results["MRP_type"] = (
161
+ "MRP" + blast_df_unique.iloc[0]["qseqid"][3:]
146
162
  )
163
+ if (
164
+ blast_df_unique.iloc[0]["length"] < blast_df_unique.iloc[0]["qlen"]
165
+ or blast_df_unique.iloc[0]["pident"] < 100
166
+ ):
167
+ emm_typing_results["MRP_type"] += "*"
168
+ notes.append(
169
+ f"MRP{blast_df_unique.iloc[0]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[0]['pident'],2)} and length {blast_df_unique.iloc[0]['length']}/{blast_df_unique.iloc[0]['qlen']}"
170
+ )
147
171
 
148
- emm_typing_results["EMM_type"] = blast_df_unique.iloc[1]["qseqid"][3:]
149
- if (
150
- blast_df_unique.iloc[1]["length"] < 180
151
- or blast_df_unique.iloc[1]["pident"] < 100
152
- ):
153
- emm_typing_results["EMM_type"] += "*"
154
- notes.append(
155
- f"EMM{blast_df_unique.iloc[1]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[1]['pident'],2)} and length {blast_df_unique.iloc[1]['length']}/{blast_df_unique.iloc[1]['qlen']}"
172
+ emm_typing_results["EMM_type"] = (
173
+ "EMM" + blast_df_unique.iloc[1]["qseqid"][3:]
156
174
  )
175
+ if (
176
+ blast_df_unique.iloc[1]["length"] < blast_df_unique.iloc[1]["qlen"]
177
+ or blast_df_unique.iloc[1]["pident"] < 100
178
+ ):
179
+ emm_typing_results["EMM_type"] += "*"
180
+ notes.append(
181
+ f"EMM{blast_df_unique.iloc[1]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[1]['pident'],2)} and length {blast_df_unique.iloc[1]['length']}/{blast_df_unique.iloc[1]['qlen']}"
182
+ )
157
183
 
158
- emm_typing_results["ENN_type"] = blast_df_unique.iloc[2]["qseqid"][3:]
159
- if (
160
- blast_df_unique.iloc[2]["length"] < 180
161
- or blast_df_unique.iloc[2]["pident"] < 100
162
- ):
163
- emm_typing_results["ENN_type"] += "*"
164
- notes.append(
165
- f"ENN{blast_df_unique.iloc[2]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[2]['pident'],2)} and length {blast_df_unique.iloc[2]['length']}/{blast_df_unique.iloc[2]['qlen']}"
184
+ emm_typing_results["ENN_type"] = (
185
+ "ENN" + blast_df_unique.iloc[2]["qseqid"][3:]
166
186
  )
167
- elif blast_df_unique.shape[0] == 0:
168
- notes.append("No blast hits found for EMM genes")
187
+ if (
188
+ blast_df_unique.iloc[2]["length"] < blast_df_unique.iloc[2]["qlen"]
189
+ or blast_df_unique.iloc[2]["pident"] < 100
190
+ ):
191
+ emm_typing_results["ENN_type"] += "*"
192
+ notes.append(
193
+ f"ENN{blast_df_unique.iloc[2]['qseqid'][3:]} with pident {round(blast_df_unique.iloc[2]['pident'],2)} and length {blast_df_unique.iloc[2]['length']}/{blast_df_unique.iloc[2]['qlen']}"
194
+ )
195
+ else:
196
+ emm_genes = []
197
+ for index, row in blast_df_unique.iterrows():
198
+ if row["length"] < row["qlen"] or row["pident"] < 100:
199
+ emm_genes.append(row["qseqid"][3:] + "*")
200
+ else:
201
+ emm_genes.append(row["qseqid"][3:])
202
+ notes.append(
203
+ "EMM and EMM-like genes found on multiple contigs. Alleles found: "
204
+ + "/".join(emm_genes)
205
+ )
206
+
169
207
  emm_typing_results["emm_typing_notes"] = ", ".join(notes)
170
208
  return emm_typing_results
171
209
 
@@ -1 +1 @@
1
- __version__ = "0.0.7"
1
+ __version__ = "0.0.10"
@@ -5,7 +5,27 @@ d = { 'settings': { 'branch': 'main',
5
5
  'doc_host': 'https://thej-ssi.github.io',
6
6
  'git_url': 'https://github.com/thej-ssi/ssi_analysis_result_parsers',
7
7
  'lib_path': 'ssi_analysis_result_parsers'},
8
- 'syms': { 'ssi_analysis_result_parsers.Legionella_parser': { 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults': ( 'legionella_parser.html#legionellaresults',
8
+ 'syms': { 'ssi_analysis_result_parsers.Ecoli_parser': { 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults': ( 'ecoli_parser.html#ecoliresults',
9
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
10
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.__init__': ( 'ecoli_parser.html#ecoliresults.__init__',
11
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
12
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.__repr__': ( 'ecoli_parser.html#ecoliresults.__repr__',
13
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
14
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.from_samplesheet': ( 'ecoli_parser.html#ecoliresults.from_samplesheet',
15
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
16
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.summarize_single_sample': ( 'ecoli_parser.html#ecoliresults.summarize_single_sample',
17
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
18
+ 'ssi_analysis_result_parsers.Ecoli_parser.EcoliResults.write_tsv': ( 'ecoli_parser.html#ecoliresults.write_tsv',
19
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
20
+ 'ssi_analysis_result_parsers.Ecoli_parser.ecoli_parser': ( 'ecoli_parser.html#ecoli_parser',
21
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
22
+ 'ssi_analysis_result_parsers.Ecoli_parser.get_threshold': ( 'ecoli_parser.html#get_threshold',
23
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
24
+ 'ssi_analysis_result_parsers.Ecoli_parser.process_res_file': ( 'ecoli_parser.html#process_res_file',
25
+ 'ssi_analysis_result_parsers/Ecoli_parser.py'),
26
+ 'ssi_analysis_result_parsers.Ecoli_parser.setup_logging': ( 'ecoli_parser.html#setup_logging',
27
+ 'ssi_analysis_result_parsers/Ecoli_parser.py')},
28
+ 'ssi_analysis_result_parsers.Legionella_parser': { 'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults': ( 'legionella_parser.html#legionellaresults',
9
29
  'ssi_analysis_result_parsers/Legionella_parser.py'),
10
30
  'ssi_analysis_result_parsers.Legionella_parser.LegionellaResults.__repr__': ( 'legionella_parser.html#legionellaresults.__repr__',
11
31
  'ssi_analysis_result_parsers/Legionella_parser.py'),
@@ -74,8 +74,8 @@ def extract_presence_absence(
74
74
  except pandas.errors.EmptyDataError:
75
75
  blast_dict = {}
76
76
  print(f"Blast output file {blast_output_tsv} empty. Assuming 0 blast hits.")
77
- except Exception as e:
78
- print(f"Error parsing blast: e")
77
+ # except Exception as e:
78
+ # print(f"Error parsing blast: e")
79
79
  if hits_as_string:
80
80
 
81
81
  results = []
@@ -105,6 +105,7 @@ def extract_presence_absence(
105
105
 
106
106
  else:
107
107
  print(f"No blast output found at {blast_output_tsv}", file=sys.stderr)
108
+ return None
108
109
 
109
110
 
110
111
  def extract_allele_matches(
@@ -122,21 +123,38 @@ def extract_allele_matches(
122
123
  allele_dict = {}
123
124
  detailed_dict = {}
124
125
  if os.path.exists(blast_output_tsv):
125
- blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
126
- blast_df.columns = tsv_header.split(" ")
127
- blast_df.set_index("qseqid", drop=False)
128
- blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
129
- blast_df[["gene", "allele"]] = blast_df["qseqid"].str.split("_", expand=True)
130
- blast_df_unique = (
131
- blast_df.sort_values(by=["bitscore"], ascending=False)
132
- .groupby("gene")
133
- .first()
134
- )
135
- for gene, d in blast_df_unique.to_dict(orient="index").items():
136
- allele_dict[gene] = d["allele"]
137
- detailed_dict[gene] = f"{d['allele']}__{d['pident']}__{d['plen']}"
126
+ try:
127
+ blast_df = pandas.read_csv(blast_output_tsv, sep="\t", header=None)
128
+ header_list = tsv_header.split(" ")
129
+ if len(header_list) == len(blast_df.columns):
130
+ blast_df.columns = tsv_header.split(" ")
131
+ blast_df.set_index("qseqid", drop=False)
132
+ blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
133
+ blast_df[["gene", "allele"]] = blast_df["qseqid"].str.split(
134
+ "_", expand=True
135
+ )
136
+ blast_df_unique = (
137
+ blast_df.sort_values(by=["bitscore"], ascending=False)
138
+ .groupby("gene")
139
+ .first()
140
+ )
141
+ for gene, d in blast_df_unique.to_dict(orient="index").items():
142
+ allele_dict[gene] = d["allele"]
143
+ detailed_dict[gene] = f"{d['allele']}__{d['pident']}__{d['plen']}"
144
+ else:
145
+ print(
146
+ f"Failed to parse {blast_output_tsv}. Number of columns do not match length of provided header string",
147
+ file=sys.stderr,
148
+ )
149
+ return None
150
+
151
+ except pandas.errors.EmptyDataError:
152
+ detailed_dict = {}
153
+ allele_dict = {}
154
+ print(f"Blast output file {blast_output_tsv} empty. Assuming 0 blast hits.")
138
155
  else:
139
156
  print(f"No blast output found at {blast_output_tsv}", file=sys.stderr)
157
+ return None
140
158
 
141
159
  if include_match_stats:
142
160
  return detailed_dict
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ssi_analysis_result_parsers
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: TODO
5
5
  Home-page: https://github.com/thej-ssi/ssi_analysis_result_parsers
6
6
  Author: Thor Bech Johannesen
@@ -1,22 +1,26 @@
1
+ ssi_analysis_result_parsers/Ecoli_parser.py,sha256=lKNnx27KqxpX3yv1RAP-sBATMGf9yjfSs4-d50mR2cA,22704
1
2
  ssi_analysis_result_parsers/Legionella_parser.py,sha256=nBOPrelzCMh8IMEDje6jtU9sz92sEyuxddBe0MntTfo,6879
2
- ssi_analysis_result_parsers/Spyogenes_parser.py,sha256=fTud7InvlPPPH1YHt5wcK1sGPZQlgNvIyN28qhcJLi8,11295
3
- ssi_analysis_result_parsers/__init__.py,sha256=R9xOYoYrWKcfO5zvTeGC3m_eDNOvxMd8CocQs2tLufo,22
4
- ssi_analysis_result_parsers/_modidx.py,sha256=kz1oGnDbstzvo_tNuFiX5wkhfhNmiqSiCkwBnzplRU0,14895
5
- ssi_analysis_result_parsers/blast_parser.py,sha256=EBqWlx8bDlaSzqAZomiUGnT2DGaaA-L7ukny7SEJbpk,7915
3
+ ssi_analysis_result_parsers/Spyogenes_parser.py,sha256=Cjibp7iKGofjSp-igm-jmjBVkQ6-zxYQWVSZT-Vx3Fo,12731
4
+ ssi_analysis_result_parsers/__init__.py,sha256=-nNlMKS9nph3FR78_ZG9RGKrbxseeNp2K6nMr0pVGaU,23
5
+ ssi_analysis_result_parsers/_modidx.py,sha256=JAUPTOicf6tKcLhA8DOvsehlZxy6LDPxQDlootV_InE,18281
6
+ ssi_analysis_result_parsers/blast_parser.py,sha256=pIzMGk5-VyTy8uzFncTiIsy80wQxl9NbNiGI_K7XMaM,8658
6
7
  ssi_analysis_result_parsers/core.py,sha256=8CzFMDrGJ24D9aoIebLsG8tx-OxvYJod1cxBITqNfaY,12258
7
8
  ssi_analysis_result_parsers/hello_world.py,sha256=jpN94sqYuNHqUbUZMCJ35qGY5iLPB_emucgnDGDUk_U,1895
8
9
  ssi_analysis_result_parsers/some_string.py,sha256=JwmAXKbX_JgY8UGh4FAu5-7ZjezcAEhq4Q2B73pWp2M,923
9
10
  ssi_analysis_result_parsers/config/config.default.env,sha256=Zt6bfPbVV3rYCksoebX1ruAdFgeD9wqAnKDtswhtJJM,1390
10
11
  ssi_analysis_result_parsers/config/config.default.yaml,sha256=3qgUrUtQpxrzYv7WQaHsvz9dQB0RALKNU0idxv7oRqM,460
11
- ssi_analysis_result_parsers-0.0.8.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
12
+ ssi_analysis_result_parsers-0.0.10.dist-info/licenses/LICENSE,sha256=p6aTb6QIfqyZ2Uux2VjV4F2zthdUSHZOjB4mfwGc7fo,1094
12
13
  test_input/.DS_Store,sha256=sdTEvl9DTKPHNPYYjMqDepX7q7ZETlonk21tGEuWLao,6148
13
14
  test_input/empty_file.txt,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ test_input/Ecoli/ERR14229029.res,sha256=AmVZwbiUTjOQLe7SmSKWt9-URdcrsLSxt9hHUh-nFUY,129
16
+ test_input/Ecoli/ERR3528110.res,sha256=DmiDRfX9LPypAEzVeO1RHaPoqEpZwq8ZtQDJ1KOWwHc,461
17
+ test_input/Ecoli/samplesheet.tsv,sha256=sSPrVrloOWvfmnp2Lnn8H6mCkiWsZUFV0wrovk3jH-Q,416
14
18
  test_input/Legionella/batch_parser_file_paths.tsv,sha256=AikBS_Ez1xO3UrEQ19AY3z6drBDdMAiSGK66NLeyYj4,356
15
19
  test_input/Legionella/lag-1_blast.tsv,sha256=MN5QL_iBn9gQ8VTYEcTnT0JwKgpkD8G15-QFOrSWxkU,1133
16
20
  test_input/Legionella/lag-1_blast_2.tsv,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
21
  test_input/Legionella/test.sbt.tsv,sha256=ibhaH3is2dxHaABPvR2QM2HAq9bKOs1AwOTmrwSrcd8,168
18
22
  test_input/Legionella/test2.sbt.tsv,sha256=uJyVGHKXPmnvaXSt_84_buATOyl79H6vZjkWRitca9k,170
19
- test_input/Spyogenes/batch_parser_file_paths.tsv,sha256=PzJO_X6jUdVz4Z6BU2NI_7H1oclIDXkfLPhR9Mj1X8E,231
23
+ test_input/Spyogenes/batch_parser_file_paths.tsv,sha256=Va9rulA_fbK6k9IkIa0Lr2z5qG-spquc056_gL5bG1I,547
20
24
  test_input/Spyogenes/emm_typing/Mga.fasta,sha256=WvDUm_tiUfAfcBAh9fwOHc8F0WkvjaxNErzUsMNV1w4,1630
21
25
  test_input/Spyogenes/emm_typing/emm_clusters.txt,sha256=ggYNeQjAIIokLHX6vXc7ER6PIIwbhQR5OahD3cxu88c,3479
22
26
  test_input/Spyogenes/emm_typing/test1.emm.blast.tsv,sha256=Xncpf4b0WEtbxBNKYaA84yPxwzyWG8S8QIJ7ifP-lIk,10759
@@ -27,14 +31,16 @@ test_input/Spyogenes/emm_typing/test3.emm.blast.tsv,sha256=Z5MV_PMF6GjQokkxP4w30
27
31
  test_input/Spyogenes/emm_typing/test4.emm.blast.tsv,sha256=tM9-iXa3STGyR0_DpIYfgETZBirxz4-uX5f0rq4Ni-A,4989
28
32
  test_input/Spyogenes/emm_typing/test5.emm.blast.tsv,sha256=gNAtzRdO1nBUwCH4Az57-_-s4aXoWgjZW14iA3Tjp-4,8347
29
33
  test_input/Spyogenes/emm_typing/test6.emm.blast.tsv,sha256=mSySwORBVGGXVoSGECoYVw9OVDcUVaExMXVtb5DAwqE,11698
34
+ test_input/Spyogenes/emm_typing/test7.emm.blast.tsv,sha256=ff308ZD95mQZu3K2OXUKEldL5uRwE_rxVF-Gb8VDv8A,10376
30
35
  test_input/blast_parser/allele_matches_test.tsv,sha256=7vfQAOxz3fKc84HtxN9eoCyQoF9G8MFd-GKH3Krw_Cs,233035
31
36
  test_input/blast_parser/empty_gene_presence_absense_test.tsv,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
37
  test_input/blast_parser/gene_presence_absence_test.tsv,sha256=qCvMkBC-1GuXx83RDhnGAuuBXAlIq4e_IW0rrNVn2yA,1447
33
38
  test_output/output_with_sample_name.tsv,sha256=NQG7WaxczuWCCsX2a9MUxCCYpbuAirz9gw08OLdEdUo,41
34
39
  test_output/test.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
35
40
  test_output/test_batch_output.tsv,sha256=6DGzarXMkUP03Z58vZimc-gu1K2k84zxZLWWF2HROCg,277
36
- ssi_analysis_result_parsers-0.0.8.dist-info/METADATA,sha256=6TO2iB06jnXSGOlf5Jpd4gnNrNdmSekVYeirLnUTNSs,2765
37
- ssi_analysis_result_parsers-0.0.8.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
38
- ssi_analysis_result_parsers-0.0.8.dist-info/entry_points.txt,sha256=noBLlB4hLmYcqns7KdQPVURO27kZ_zWMsPHYkRlBGEE,631
39
- ssi_analysis_result_parsers-0.0.8.dist-info/top_level.txt,sha256=3q56bBc2Wv2a6ZQ1l_9m66vot2-Qu6tM9tDr3QQ8auM,81
40
- ssi_analysis_result_parsers-0.0.8.dist-info/RECORD,,
41
+ test_output/Ecoli/KMA_cases_parser.tsv,sha256=Wf3JkSppRN5AK2zRJmFQlwVfCMyJfgyyBpTjb1sK6Uw,586
42
+ ssi_analysis_result_parsers-0.0.10.dist-info/METADATA,sha256=BZSffWWanmoRU6UPhhJl6jc1pBomIvjyX79m-H39DAI,2766
43
+ ssi_analysis_result_parsers-0.0.10.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
44
+ ssi_analysis_result_parsers-0.0.10.dist-info/entry_points.txt,sha256=1btBaEDONU_OZlGK4iXe0-my25NFtup33MfYFS-Oj24,705
45
+ ssi_analysis_result_parsers-0.0.10.dist-info/top_level.txt,sha256=dhzhsC8l7PYeBYNT8JzHPz3BriLAw3llVo0jHn175WI,90
46
+ ssi_analysis_result_parsers-0.0.10.dist-info/RECORD,,
@@ -3,6 +3,7 @@ blast_parser_allele_matches = ssi_analysis_result_parsers.blast_parser:allele_ma
3
3
  blast_parser_presence_absence = ssi_analysis_result_parsers.blast_parser:presence_absence
4
4
  get_Spyogenes_results = ssi_analysis_result_parsers.Spyogenes_parser:Spyogenes_parser
5
5
  get_Spyogenes_results_batch = ssi_analysis_result_parsers.Spyogenes_parser:Spyogenes_batch_parser
6
+ get_ecoli_results = ssi_analysis_result_parsers.Ecoli_parser:ecoli_parser
6
7
  get_leg_results = ssi_analysis_result_parsers.Legionella_parser:legionella_parser
7
8
  get_leg_results_batch = ssi_analysis_result_parsers.Legionella_parser:legionella_batch_parser
8
9
 
@@ -0,0 +1 @@
1
+ #Template Score Expected Template_length Template_Identity Template_Coverage Query_Identity Query_Coverage Depth q_value p_value
@@ -0,0 +1,4 @@
1
+ #Template Score Expected Template_length Template_Identity Template_Coverage Query_Identity Query_Coverage Depth q_value p_value
2
+ 1__wzx__O6__AJ426045 20056 153 1257 99.92 100.00 99.92 100.00 16.07 19601.01 1.0e-26
3
+ 2__wzy__O6__AJ426423 23540 159 1344 100.00 100.00 100.00 100.00 17.35 23065.87 1.0e-26
4
+ 5__fliC__H1__AB028471 107030 73 1788 100.00 100.00 100.00 100.00 62.14 106810.34 1.0e-26
@@ -0,0 +1,3 @@
1
+ sample_name Illumina_read_files Nanopore_read_file assembly_file organism variant notes
2
+ ERR3528110 examples/Dataset/reads/ERR3528110_1.fastq.gz,examples/Dataset/reads/ERR3528110_2.fastq.gz Na examples/Dataset/assemblies/ERR3528110.fasta E.coli Na Na
3
+ ERR14229029 examples/Dataset/reads/ERR14229029_1.fastq.gz,examples/Dataset/reads/ERR14229029_2.fastq.gz Na examples/Dataset/assemblies/ERR14229029.fasta E.coli Na Na
@@ -1,5 +1,10 @@
1
1
  sample_name emm_results
2
2
  sample_1 test_input/Spyogenes/emm_typing/test1.emm.blast.tsv
3
3
  sample_2 test_input/Spyogenes/emm_typing/test2.emm.blast.tsv
4
- sample_3 test_input/files_that_does_not_exist.tsv
5
- sample_4 test_input/empty_file.txt
4
+ sample_3 test_input/Spyogenes/emm_typing/test3.emm.blast.tsv
5
+ sample_4 test_input/Spyogenes/emm_typing/test4.emm.blast.tsv
6
+ sample_5 test_input/Spyogenes/emm_typing/test5.emm.blast.tsv
7
+ sample_6 test_input/Spyogenes/emm_typing/test6.emm.blast.tsv
8
+ sample_7 test_input/Spyogenes/emm_typing/test7.emm.blast.tsv
9
+ sample_empty test_input/empty_file.txt
10
+ sample_nonexist test_input/files_that_does_not_exist.tsv
@@ -0,0 +1,40 @@
1
+ EMM203.3 GAS-2025-0367_5_42.2241 96.111 180 180 1 180 55602 55781 GGTTTTGCAAACCAAACGGAAGTAAGAGCTGAAGGGGTAAACCCGACTACGAACTTGCCAGAGAAGGCTAAATATGCCGCAGTGAAAGATGAGAATACTGGTTTACGTGGTGATCAGAAAAAATTAGTAAAAAAACTTGAAGAAGAACAAGAGAAGAGCAAAAATCTAGAAAAGCAAAAA 5.51e-81 294
2
+ EMM203.4 GAS-2025-0367_5_42.2241 96.667 180 180 1 180 55602 55781 GGTTTTGCAAACCAAACGGAAGTAAGAGCTGAAGGGGTAAACCCGACTACGAACTTGCCAGAGAAGGCTAAATATGCCGCAGTGAAAGATGAGAATACTGGTTTACGTGGTGATCAGAAAAAATTAGTAAAAAAACTTGAAGAAGAACAAGAGAAGAGCAAAAATCTAGAAAAGCAAAAA 1.18e-82 300
3
+ EMM203.5 GAS-2025-0367_5_42.2241 93.333 180 180 1 180 55602 55781 GGTTTTGCAAACCAAACGGAAGTAAGAGCTGAAGGGGTAAACCCGACTACGAACTTGCCAGAGAAGGCTAAATATGCCGCAGTGAAAGATGAGAATACTGGTTTACGTGGTGATCAGAAAAAATTAGTAAAAAAACTTGAAGAAGAACAAGAGAAGAGCAAAAATCTAGAAAAGCAAAAA 1.20e-72 267
4
+ EMM236.0 GAS-2025-0367_5_42.2241 93.333 180 180 1 180 55602 55781 GGTTTTGCAAACCAAACGGAAGTAAGAGCTGAAGGGGTAAACCCGACTACGAACTTGCCAGAGAAGGCTAAATATGCCGCAGTGAAAGATGAGAATACTGGTTTACGTGGTGATCAGAAAAAATTAGTAAAAAAACTTGAAGAAGAACAAGAGAAGAGCAAAAATCTAGAAAAGCAAAAA 1.20e-72 267
5
+ EMM236.1 GAS-2025-0367_5_42.2241 93.889 180 180 1 180 55602 55781 GGTTTTGCAAACCAAACGGAAGTAAGAGCTGAAGGGGTAAACCCGACTACGAACTTGCCAGAGAAGGCTAAATATGCCGCAGTGAAAGATGAGAATACTGGTTTACGTGGTGATCAGAAAAAATTAGTAAAAAAACTTGAAGAAGAACAAGAGAAGAGCAAAAATCTAGAAAAGCAAAAA 2.58e-74 272
6
+ EMM236.3 GAS-2025-0367_5_42.2241 93.889 180 180 1 180 55602 55781 GGTTTTGCAAACCAAACGGAAGTAAGAGCTGAAGGGGTAAACCCGACTACGAACTTGCCAGAGAAGGCTAAATATGCCGCAGTGAAAGATGAGAATACTGGTTTACGTGGTGATCAGAAAAAATTAGTAAAAAAACTTGAAGAAGAACAAGAGAAGAGCAAAAATCTAGAAAAGCAAAAA 2.58e-74 272
7
+ EMM28.0 GAS-2025-0367_2_61.3538 100.000 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 1.17e-92 333
8
+ EMM28.1 GAS-2025-0367_2_61.3538 98.788 165 180 1 165 202215 202379 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAA 5.51e-81 294
9
+ EMM28.10 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
10
+ EMM28.11 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
11
+ EMM28.12 GAS-2025-0367_2_61.3538 98.333 180 180 1 180 202215 202391 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTA---AAGAAGAAGAACCTAGGTATAAA 1.52e-86 313
12
+ EMM28.13 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
13
+ EMM28.14 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
14
+ EMM28.15 GAS-2025-0367_2_61.3538 98.333 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 1.18e-87 316
15
+ EMM28.16 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
16
+ EMM28.17 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
17
+ EMM28.18 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
18
+ EMM28.19 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
19
+ EMM28.2 GAS-2025-0367_2_61.3538 99.390 164 180 1 164 202215 202378 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGA 4.26e-82 298
20
+ EMM28.20 GAS-2025-0367_2_61.3538 98.889 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 2.53e-89 322
21
+ EMM28.21 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
22
+ EMM28.22 GAS-2025-0367_2_61.3538 98.765 162 180 1 162 202215 202376 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAA 2.56e-79 289
23
+ EMM28.23 GAS-2025-0367_2_61.3538 98.889 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 2.53e-89 322
24
+ EMM28.24 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
25
+ EMM28.25 GAS-2025-0367_2_61.3538 100.000 175 180 1 175 202215 202389 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATA 7.02e-90 324
26
+ EMM28.26 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
27
+ EMM28.27 GAS-2025-0367_2_61.3538 98.889 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 2.53e-89 322
28
+ EMM28.28 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
29
+ EMM28.29 GAS-2025-0367_2_61.3538 98.361 183 180 1 180 202215 202397 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCATTG 3.27e-88 318
30
+ EMM28.3 GAS-2025-0367_2_61.3538 99.394 165 180 1 165 202215 202379 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAA 1.18e-82 300
31
+ EMM28.30 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
32
+ EMM28.31 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
33
+ EMM28.32 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
34
+ EMM28.33 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
35
+ EMM28.4 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
36
+ EMM28.5 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
37
+ EMM28.6 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
38
+ EMM28.7 GAS-2025-0367_2_61.3538 99.383 162 180 1 162 202215 202376 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAA 5.51e-81 294
39
+ EMM28.8 GAS-2025-0367_2_61.3538 99.444 180 180 1 180 202215 202394 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCTAAAGAAGAAGAACCTAGGTATAAAGCA 5.43e-91 327
40
+ EMM28.9 GAS-2025-0367_2_61.3538 98.333 180 180 1 180 202215 202391 GGCTTTGCAAACCAAACAGAAGTTAAGGCTGCGGAGTCTCCAAAAAGTACTGAGACTTCTGCTAATGGAGCTGATAAATTAGCTGATGCATACAACACATTGCTTACTGAACATGAGAAACTCAGAGATGAGTATTATACATTAATTGATGCT---AAAGAAGAAGAACCTAGGTATAAA 1.52e-86 313
@@ -0,0 +1,3 @@
1
+ sample_name Illumina_read_files Nanopore_read_file assembly_file organism variant notes stx OH wzx wzy wzt wzm eae ehxA Other verbose
2
+ ERR3528110 examples/Dataset/reads/ERR3528110_1.fastq.gz,examples/Dataset/reads/ERR3528110_2.fastq.gz Na examples/Dataset/assemblies/ERR3528110.fasta E.coli Na Na - O6;H1 - - - - - - - wzx_O6_16.07_100.00_99.92;wzy_O6_17.35_100.00_100.00;fliC_H1_62.14_100.00_100.00
3
+ ERR14229029 examples/Dataset/reads/ERR14229029_1.fastq.gz,examples/Dataset/reads/ERR14229029_2.fastq.gz Na examples/Dataset/assemblies/ERR14229029.fasta E.coli Na Na - -;- - - - - - - -