mgnify-pipelines-toolkit 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mgnify-pipelines-toolkit might be problematic.

@@ -0,0 +1,181 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import logging
+ import os
+ from pathlib import Path
+
+ from mgnify_pipelines_toolkit.analysis.assembly.go_utils import parse_interproscan_tsv
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+ )
+
+
+ def parse_args():
+
+     description = "Go slim pipeline."
+     parser = argparse.ArgumentParser(description=description)
+     parser.add_argument(
+         "-go", "--go_obo", help="Gene Ontology basic file.", required=True
+     )
+     parser.add_argument(
+         "-gb", "--go_banding", help="Subset GO banding file.", required=True
+     )
+     parser.add_argument(
+         "-gaf",
+         "--gaf_input",
+         help="GAF file, generated by generate_gaf.py",
+         required=True,
+     )
+     parser.add_argument(
+         "-i", "--ips_input", help="InterProScan result file.", required=True
+     )
+     parser.add_argument("-o", "--output", help="GO summary output file.", required=True)
+     args = parser.parse_args()
+
+     go_obo = args.go_obo
+     go_banding = args.go_banding
+     gaf_input = args.gaf_input
+     ips_input = args.ips_input
+     output = args.output
+
+     return go_obo, go_banding, gaf_input, ips_input, output
+
+
+ def parse_mapped_gaf_file(gaf_file: Path) -> defaultdict[set]:
+
+     mapped_go_dict = defaultdict(set)
+     if os.path.exists(gaf_file):
+         handle = open(gaf_file, "r")
+         for line in handle:
+             if not line.startswith("!"):
+                 line = line.strip()
+                 splitted_line = line.split("\t")
+                 go_id = splitted_line[1]
+                 mapped_go_id = splitted_line[4]
+                 mapped_go_dict[go_id].add(mapped_go_id)
+
+     return mapped_go_dict
+
+
+ def get_go_slim_summary(go_slim_banding_file, goslims2_protein_count):
+     summary = []
+
+     fr = open(go_slim_banding_file, "r")
+
+     for line in fr:
+         if line.startswith("GO"):
+             line = line.strip()
+             line_chunks = line.split("\t")
+             go_id = line_chunks[0]
+             term = line_chunks[1]
+             category = line_chunks[2]
+             # Default value for the count
+             count = 0
+             if go_id in goslims2_protein_count:
+                 count = goslims2_protein_count[go_id]
+             summary.append((go_id, term, category, count))
+     return summary
+
+
+ def write_go_summary_to_file(go_summary, output_file):
+     fw = open(output_file, "w")
+     for go, term, category, count in go_summary:
+         fw.write('","'.join(['"' + go, term, category, str(count) + '"']) + "\n")
+     fw.close()
+
+
+ def parse_gene_ontology(obo_file):
+     """
+     Parses OBO formatted file.
+     :param obo_file:
+     :return:
+     """
+     go_term_tuples = []
+     fr = open(obo_file, "r")
+     id, term, category = "", "", ""
+     for line in fr:
+         line = line.strip()
+         split_line = line.split(": ")
+         if line.startswith("id:"):
+             id = split_line[1]
+         elif line.startswith("name:"):
+             term = split_line[1]
+         elif line.startswith("namespace"):
+             category = split_line[1]
+         else:
+             if id.startswith("GO:") and id and term and category:
+                 item = (id, term, category)
+                 go_term_tuples.append(item)
+                 id, term, category = "", "", ""
+     fr.close()
+     return go_term_tuples
+
+
+ def get_full_go_summary(core_gene_ontology, go2protein_count_dict, top_level_go_ids):
+     summary = []
+
+     for go_id, term, category in core_gene_ontology:
+
+         if (go_id in go2protein_count_dict) and (
+             go_id not in top_level_go_ids
+         ):  # make sure that top level terms are not included (they tell you nothing!)
+             count = go2protein_count_dict[go_id]
+             summary.append((go_id, term, category, count))
+     summary.sort(key=lambda x: (x[2], -x[3]))
+     return summary
+
+
+ def main():
+
+     go_obo, go_banding, gaf_input, ips_input, output = parse_args()
+
+     logging.info("Parsing the InterProScan input: " + ips_input)
+     go2protein_count_dict = parse_interproscan_tsv(ips_input)
+     logging.info("Finished parsing.")
+
+     # Generate GO summary
+     logging.info("Loading full Gene ontology: " + go_obo)
+     go_term_tuples = parse_gene_ontology(go_obo)
+     logging.info("Finished loading.")
+
+     logging.info("Generating full GO summary...")
+     top_level_go_ids = ["GO:0008150", "GO:0003674", "GO:0005575"]
+     full_go_summary = get_full_go_summary(
+         go_term_tuples, go2protein_count_dict, top_level_go_ids
+     )
+     logging.info("Finished generation.")
+
+     logging.info("Writing full GO summary: " + output)
+     write_go_summary_to_file(full_go_summary, output)
+     logging.info("Finished writing.")
+
+     mapped_go_terms = parse_mapped_gaf_file(gaf_input)
+     logging.info("Getting GO slim counts")
+     goslims2_protein_count = parse_interproscan_tsv(ips_input, mapped_go_terms)
+
+     go_slim_summary = get_go_slim_summary(go_banding, goslims2_protein_count)
+     go_slim_output_file = output + "_slim"
+     logging.info("Writing GO slim summary: " + go_slim_output_file)
+     write_go_summary_to_file(go_slim_summary, go_slim_output_file)
+     logging.info("Finished writing.")
+
+
+ if __name__ == "__main__":
+     main()
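
The mapped GAF consumed by parse_mapped_gaf_file above is tab-separated, with comment lines starting with "!"; per the parser, only the second and fifth columns (the annotated GO term and the slim term it maps to) are read. A minimal sketch of the resulting mapping shape, using illustrative GO identifiers and placeholder columns:

    from collections import defaultdict

    # Illustrative mapped-GAF lines; only 0-based columns 1 and 4 are used.
    toy_gaf = [
        "!gaf-version: 2.2",
        "DB\tGO:0006099\t.\t.\tGO:0008152",
        "DB\tGO:0006099\t.\t.\tGO:0044237",
    ]

    mapped_go_dict = defaultdict(set)
    for line in toy_gaf:
        if not line.startswith("!"):
            fields = line.split("\t")
            mapped_go_dict[fields[1]].add(fields[4])

    print(dict(mapped_go_dict))
    # {'GO:0006099': {'GO:0008152', 'GO:0044237'}} (set order may vary)

Each summary row is then written by write_go_summary_to_file as a fully quoted CSV line, e.g. "GO:0006099","tricarboxylic acid cycle","biological_process","4".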
@@ -0,0 +1,382 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import click
+ from collections import defaultdict
+ import glob
+ import logging
+ from pathlib import Path
+ from typing import Union, List
+
+ import pandas as pd
+
+ from mgnify_pipelines_toolkit.constants.db_labels import TAXDB_LABELS, ASV_TAXDB_LABELS
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
+     _SILVA_TAX_RANKS,
+     _PR2_TAX_RANKS,
+ )
+ from mgnify_pipelines_toolkit.schemas.schemas import (
+     AmpliconPassedRunsSchema,
+     AmpliconNonINSDCPassedRunsSchema,
+     TaxonSchema,
+     PR2TaxonSchema,
+ )
+
+ logging.basicConfig(level=logging.DEBUG)
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ def get_tax_file(
+     run_acc: str, analyses_dir: Path, db_label: str
+ ) -> Union[Path, List[Path]]:
+     """Takes path information for a particular analysis and db_label combo, and returns any existing files.
+
+     :param run_acc: Run accession for the tax file that should be retrieved.
+     :type run_acc: str
+     :param analyses_dir: The path to the directory containing all of the analyses,
+         including the tax file corresponding to :param:`run_acc`.
+     :type analyses_dir: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``TAXDB_LABELS`` and ``ASV_TAXDB_LABELS``.
+     :type db_label: str
+     :return: Either a :class:`Path` object if :param:`db_label` comes from ``TAXDB_LABELS``,
+         or a list of :class:`Path` objects if from ``ASV_TAXDB_LABELS``.
+     :rtype: Union[Path, List[Path]]
+     """
+
+     tax_file = None
+
+     db_path = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}")
+
+     if not db_path.exists():
+         logging.debug(
+             f"DB {db_path} doesn't exist for {run_acc}. Skipping"
+         )  # or error?
+         return
+
+     if db_label in TAXDB_LABELS:
+         tax_file = Path(
+             f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt"
+         )
+         if not tax_file.exists():
+             logging.error(
+                 f"DB path exists but file doesn't - exiting. Path: {tax_file}"
+             )
+             exit(1)
+
+         file_size = tax_file.stat().st_size
+         if (
+             file_size == 0
+         ):  # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+             # so need to skip those. Should probably fix that at some point
+             logging.debug(
+                 f"File {tax_file} exists but is empty, so will be skipping it."
+             )
+             tax_file = None
+     elif db_label in ASV_TAXDB_LABELS:
+         # ASV tax files could have up to two files, one for each amplified region (maximum two from the pipeline).
+         # So will need to handle this differently to closed-reference files
+         asv_tax_files = glob.glob(
+             f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt"
+         )
+         asv_tax_files = [
+             Path(file) for file in asv_tax_files if "concat" not in file
+         ]  # Have to filter out concatenated file if it exists
+
+         tax_file = asv_tax_files
+
+     return tax_file
+
+
+ def parse_one_tax_file(
+     run_acc: str, tax_file: Path, long_tax_ranks: list
+ ) -> pd.DataFrame:
+     """Parses a taxonomy file, and returns it as a pandas DataFrame object.
+
+     :param run_acc: Run accession of the taxonomy file that will be parsed.
+     :type run_acc: str
+     :param tax_file: Taxonomy file that will be parsed.
+     :type tax_file: Path
+     :param long_tax_ranks: Either the imported list _SILVA_TAX_RANKS or _PR2_TAX_RANKS
+         to validate the taxonomic ranks of the file.
+     :type long_tax_ranks: list
+     :return: The parsed :param:`tax_file` as a :class:`pd.DataFrame` object
+     :rtype: pd.DataFrame
+     """
+
+     res_df = pd.read_csv(tax_file, sep="\t", names=["Count"] + long_tax_ranks)
+     res_df = res_df.fillna("")
+
+     # Two different schemas used for validation depending on the database
+     # because PR2 schema has different taxonomic ranks than the standard
+     if len(long_tax_ranks) == 8:
+         TaxonSchema(res_df)
+     elif len(long_tax_ranks) == 9:
+         PR2TaxonSchema(res_df)
+
+     res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
+         lambda x: ";".join(x).strip(";"), axis=1
+     )
+     final_df = res_df.iloc[:, [0, -1]]
+     final_df = final_df.set_index("full_taxon")
+     final_df.columns = [run_acc]
+
+     return final_df
+
+
+ def generate_db_summary(
+     db_label: str, tax_dfs: defaultdict[Path], output_prefix: str
+ ) -> None:
+     """Takes run accessions paired with taxonomy dataframes in the form of a dictionary,
+     plus the respective db_label, joins them together, and generates a study-wide summary
+     in the form of a .tsv file.
+
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``TAXDB_LABELS`` and ``ASV_TAXDB_LABELS``.
+     :param tax_dfs: Dictionary where the key is a run accession,
+         and values are either one parsed taxonomy dataframe if :param:`db_label` comes from ``TAXDB_LABELS``,
+         or a list of at least 1 and at most 2 dataframes if it comes from ``ASV_TAXDB_LABELS``.
+         These dataframes are parsed by :func:`parse_one_tax_file`.
+     :type tax_dfs: defaultdict[Path]
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     if db_label in TAXDB_LABELS:
+         df_list = []
+
+         if "PR2" in db_label:
+             long_tax_ranks = _PR2_TAX_RANKS
+         else:
+             long_tax_ranks = _SILVA_TAX_RANKS
+
+         for run_acc, tax_df in tax_dfs.items():
+             res_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
+             df_list.append(res_df)
+
+         res_df = pd.concat(df_list, axis=1).fillna(0)
+         res_df = res_df.sort_index()
+         res_df = res_df.astype(int)
+
+         res_df.to_csv(
+             f"{output_prefix}_{db_label}_study_summary.tsv",
+             sep="\t",
+             index_label="taxonomy",
+         )
+
+     elif db_label in ASV_TAXDB_LABELS:
+
+         if "PR2" in db_label:
+             long_tax_ranks = _PR2_TAX_RANKS
+         else:
+             long_tax_ranks = _SILVA_TAX_RANKS
+
+         amp_region_dict = defaultdict(list)
+
+         for (
+             run_acc,
+             tax_df_asv_lst,
+         ) in (
+             tax_dfs.items()
+         ):  # each `tax_file` will be a list containing at most two files (one for each amp_region)
+             for tax_df in tax_df_asv_lst:
+                 amp_region = str(tax_df).split("_")[
+                     -5
+                 ]  # there are a lot of underscores in these names... but it is consistent
+                 # e.g. ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt
+                 amp_region_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
+                 amp_region_dict[amp_region].append(amp_region_df)
+
+         for amp_region, amp_region_dfs in amp_region_dict.items():
+             if (
+                 len(amp_region_dfs) > 1
+             ):  # Need at least two analyses with this amp_region to bother with the summary
+                 amp_res_df = amp_region_dfs[0]
+                 for amp_df in amp_region_dfs[1:]:
+                     amp_res_df = amp_res_df.join(amp_df, how="outer")
+                 amp_res_df = amp_res_df.fillna(0)
+                 amp_res_df = amp_res_df.astype(int)
+
+                 amp_res_df.to_csv(
+                     f"{output_prefix}_{db_label}_{amp_region}_asv_study_summary.tsv",
+                     sep="\t",
+                     index_label="taxonomy",
+                 )
+
+
+ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List]:
+     """Matches different summary files of the same database label and analysis
+     type (and amplified region for ASVs) into a dictionary to help merge
+     the correct summaries.
+
+     :param all_study_summaries: List of file paths to different summary files
+     :type all_study_summaries: List[str]
+     :return: Organised dictionary where each summary is paired to a specific
+         database label key to be merged together.
+     :rtype: defaultdict[List]
+     """
+     summaries_dict = defaultdict(list)
+
+     for summary in all_study_summaries:
+         summary_path = Path(summary)
+         summary_filename = summary_path.stem
+
+         temp_lst = summary_filename.split("_")
+         if "asv_study_summary" in summary_filename:
+             summary_db_label = "_".join(
+                 temp_lst[1:3]
+             )  # For ASVs we need to include the amp_region in the label
+         else:
+             summary_db_label = temp_lst[
+                 1
+             ]  # For closed reference, just the db_label is needed
+
+         summaries_dict[summary_db_label].append(summary_path)
+
+     return summaries_dict
+
+
+ @cli.command(
+     "summarise",
+     options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
+     short_help="Generate study-level analysis summaries.",
+ )
+ @click.option(
+     "-r",
+     "--runs",
+     required=True,
+     help="CSV file containing successful analyses generated by the pipeline",
+     type=click.Path(exists=True, path_type=Path, dir_okay=False),
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory containing all of the individual analysis subdirectories to summarise",
+     type=click.Path(exists=True, path_type=Path, file_okay=False),
+ )
+ @click.option(
+     "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
+ )
+ @click.option(
+     "--non_insdc",
+     default=False,
+     is_flag=True,
+     help="If run accessions aren't INSDC-formatted",
+ )
+ def summarise_analyses(
+     runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
+ ) -> None:
+     """Function that will take a file of pipeline-successful run accessions
+     that should be used for the generation of the relevant db-specific
+     study-level summary files. For ASV results, these will also be on a
+     per-amplified-region basis.
+     \f
+
+     :param runs: Path to a qc_passed_runs file from the pipeline execution.
+         Contains the accessions of runs that should therefore be included in the generated
+         summaries.
+     :type runs: Path
+     :param analyses_dir: The path to the directory containing all of the analyses.
+     :type analyses_dir: Path
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+     runs_df = pd.read_csv(runs, names=["run", "status"])
+
+     if not non_insdc:
+         AmpliconPassedRunsSchema(
+             runs_df
+         )  # Run validation on the successful_runs .csv file
+     else:
+         AmpliconNonINSDCPassedRunsSchema(runs_df)
+
+     all_db_labels = TAXDB_LABELS + ASV_TAXDB_LABELS
+     for db_label in all_db_labels:
+
+         tax_files = defaultdict(Path)
+         for i in range(0, len(runs_df)):
+             run_acc = runs_df.loc[i, "run"]
+             tax_file = get_tax_file(run_acc, analyses_dir, db_label)
+
+             if tax_file:
+                 tax_files[run_acc] = tax_file
+
+         if (
+             len(tax_files) > 1
+         ):  # If at least two analyses have results from the current DB, generate a study-level summary for it
+             generate_db_summary(db_label, tax_files, output_prefix)
+
+
+ @cli.command(
+     "merge",
+     options_metavar="-a <analyses_dir> -p <output_prefix>",
+     short_help="Merge multiple study-level analysis summaries.",
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory containing all of the study summaries to merge",
+     type=click.Path(exists=True, file_okay=False),
+ )
+ @click.option(
+     "-p",
+     "--output_prefix",
+     required=True,
+     help="Prefix to merged summary files",
+     type=str,
+ )
+ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
+     """Function that will take a directory path containing study-level
+     summaries that should be merged together on a per-db-per-amplified-region
+     basis.
+     \f
+
+     :param analyses_dir: The filepath to the directory containing all of the analyses.
+     :type analyses_dir: str
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     # TODO: The way we grab all the summaries might change depending on how the prefect side does things
+     all_study_summaries = glob.glob(f"{analyses_dir}/*_study_summary.tsv")
+
+     summaries_dict = organise_study_summaries(all_study_summaries)
+
+     for db_label, summaries in summaries_dict.items():
+         if len(summaries) > 1:
+             res_df = pd.read_csv(summaries[0], sep="\t", index_col=0)
+             for summary in summaries[1:]:
+                 curr_df = pd.read_csv(summary, sep="\t", index_col=0)
+                 res_df = res_df.join(curr_df, how="outer")
+             res_df = res_df.fillna(0)
+             res_df = res_df.astype(int)
+
+             res_df = res_df.reindex(sorted(res_df.columns), axis=1)
+             res_df.to_csv(
+                 f"{output_prefix}_{db_label}_study_summary.tsv",
+                 sep="\t",
+                 index_label="taxonomy",
+             )
+
+
+ if __name__ == "__main__":
+     cli()
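
Both the summarise and merge commands build the same wide shape: one row per semicolon-joined lineage, one column of counts per run accession, outer-joined and zero-filled. A minimal sketch of that join step, with made-up run accessions and lineage strings:

    import pandas as pd

    # Made-up per-run count tables, indexed by lineage as in parse_one_tax_file.
    run1 = pd.DataFrame(
        {"ERR0000001": [10, 5]},
        index=["sk__Bacteria;p__Firmicutes", "sk__Archaea"],
    )
    run2 = pd.DataFrame({"ERR0000002": [7]}, index=["sk__Bacteria;p__Firmicutes"])
    run1.index.name = run2.index.name = "taxonomy"

    # Outer join keeps the union of lineages; absent taxa become zero counts.
    merged = run1.join(run2, how="outer").fillna(0).astype(int)
    print(merged)
    #                             ERR0000001  ERR0000002
    # taxonomy
    # sk__Archaea                          5           0
    # sk__Bacteria;p__Firmicutes          10           7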
@@ -0,0 +1,21 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # taxonomy_summary labels for closed-reference method
+ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
+
+ # taxonomy_summary for ASV method
+ ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
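
These two lists drive the directory scan in the summarise command above: a label from TAXDB_LABELS maps to a single closed-reference result file per run, while a label from ASV_TAXDB_LABELS may yield one file per amplified region. A small sketch of that dispatch:

    from mgnify_pipelines_toolkit.constants.db_labels import (
        ASV_TAXDB_LABELS,
        TAXDB_LABELS,
    )

    for db_label in TAXDB_LABELS + ASV_TAXDB_LABELS:
        method = "ASV" if db_label in ASV_TAXDB_LABELS else "closed-reference"
        print(f"{db_label}: {method}, under taxonomy-summary/{db_label}/")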
@@ -35,3 +35,7 @@ _PR2_TAX_RANKS = [
      "Genus",
      "Species",
  ]
+
+ SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
+
+ SHORT_PR2_TAX_RANKS = ["d", "sg", "dv", "sdv", "c", "o", "f", "g", "s"]
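
The short codes appear to pair index-wise with the long rank lists above (nine ranks for PR2, ending in Genus and Species). A hedged sketch of building a prefixed lineage string, assuming that pairing and the code__Name convention seen in the pipeline's taxonomy strings; the example lineage values are illustrative only:

    SHORT_PR2_TAX_RANKS = ["d", "sg", "dv", "sdv", "c", "o", "f", "g", "s"]
    example_lineage = [
        "Eukaryota", "TSAR", "Stramenopiles", "Gyrista", "Bacillariophyceae",
        "Bacillariales", "Bacillariaceae", "Nitzschia", "Nitzschia sp.",
    ]

    taxon_string = ";".join(
        f"{code}__{name}"
        for code, name in zip(SHORT_PR2_TAX_RANKS, example_lineage)
    )
    print(taxon_string)  # d__Eukaryota;sg__TSAR;dv__Stramenopiles;...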