bactopia 2.1.6__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bactopia-2.1.6 → bactopia-2.2.0}/PKG-INFO +2 -1
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/common.py +1 -1
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/search.py +83 -14
- bactopia-2.2.0/bactopia/databases/ena.py +118 -0
- bactopia-2.2.0/bactopia/databases/sra.py +103 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/docs.py +1 -1
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/nextflow/nextflow.config.j2 +3 -3
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nextflow.config.j2 +1 -1
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nextflow.config.j2 +2 -2
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/nextflow.config.j2 +4 -4
- {bactopia-2.1.6 → bactopia-2.2.0}/pyproject.toml +2 -1
- bactopia-2.1.6/bactopia/databases/ena.py +0 -86
- {bactopia-2.1.6 → bactopia-2.2.0}/LICENSE +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/README.md +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/atb.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/atb/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/atb/atb_downloader.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/atb/atb_formatter.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/catalog.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/citations.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/datasets.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/docs.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/download.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/helpers/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/helpers/merge_schemas.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/lint.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/bracken_to_excel.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/check_assembly_accession.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/check_fastqs.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/cleanup_coverage.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/kraken_bracken_summary.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/mask_consensus.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/scrubber_summary.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/teton_prepare.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/prepare.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/prune.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pubmlst/build.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pubmlst/setup.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/review.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/scaffold.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/status.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/summary.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/sysinfo.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/testing.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/update.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/workflows.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/conda.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/ncbi.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/pubmlst/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/pubmlst/constants.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/pubmlst/utils.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/citations.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/models.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/module_rules.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/subworkflow_rules.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/workflow_rules.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/runner.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/nf.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/outputs.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parse.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/amrfinderplus.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/annotator.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/ariba.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/assembler.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/blast.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/citations.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/coverage.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/error.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/gather.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/generic.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/kraken.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/mapping.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/mlst.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/nextflow.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/parsables.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/qc.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/sketcher.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/variants.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/versions.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/workflows.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/reports/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/reports/templates/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/scaffold.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/summary.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/__init__.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/bactopia/llms.txt.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/logos.py +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/nextflow/params.config.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/nextflow/process.config.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/main.nf.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/module.config.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/schema.json.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/main.nf.test.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nf-test.config.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/main.nf.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/main.nf.test.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nf-test.config.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nftignore.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/main.nf.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/main.nf.test.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/nf-test.config.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/nftignore.j2 +0 -0
- {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bactopia
|
|
3
|
-
Version: 2.1.6
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: A Python package for working with Bactopia
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -20,6 +20,7 @@ Requires-Dist: jinja2 (>=3.1.6)
|
|
|
20
20
|
Requires-Dist: openpyxl (>=3.1.0)
|
|
21
21
|
Requires-Dist: pandas (>=2.2.0)
|
|
22
22
|
Requires-Dist: psutil (>=5.9.0)
|
|
23
|
+
Requires-Dist: pysradb (>=2.2.0)
|
|
23
24
|
Requires-Dist: pyyaml (>=6.0)
|
|
24
25
|
Requires-Dist: rauth (>=0.7.3)
|
|
25
26
|
Requires-Dist: requests (>=2.28.2)
|
|
@@ -26,7 +26,7 @@ def common_options(fn):
|
|
|
26
26
|
def setup_logging(verbose: bool, silent: bool) -> None:
|
|
27
27
|
"""Configure root logger with RichHandler at the appropriate level."""
|
|
28
28
|
logging.basicConfig(
|
|
29
|
-
format="%(
|
|
29
|
+
format="%(message)s",
|
|
30
30
|
datefmt="%Y-%m-%d %H:%M:%S",
|
|
31
31
|
handlers=[
|
|
32
32
|
RichHandler(rich_tracebacks=True, console=rich.console.Console(stderr=True))
|
|
@@ -30,6 +30,8 @@ click.rich_click.OPTION_GROUPS = {
|
|
|
30
30
|
"name": "Query Options",
|
|
31
31
|
"options": [
|
|
32
32
|
"--exact-taxon",
|
|
33
|
+
"--provider",
|
|
34
|
+
"--only-provider",
|
|
33
35
|
"--limit",
|
|
34
36
|
"--accession-limit",
|
|
35
37
|
"--biosample-subset",
|
|
@@ -192,6 +194,9 @@ def parse_query(q, accession_limit, exact_taxon=False):
|
|
|
192
194
|
run_accessions = []
|
|
193
195
|
|
|
194
196
|
for query in queries:
|
|
197
|
+
query = query.strip()
|
|
198
|
+
if not query:
|
|
199
|
+
continue
|
|
195
200
|
try:
|
|
196
201
|
taxon_id = int(query)
|
|
197
202
|
if exact_taxon:
|
|
@@ -244,6 +249,18 @@ def parse_query(q, accession_limit, exact_taxon=False):
|
|
|
244
249
|
help="Taxon ID or Study, BioSample, or Run accession (can also be comma separated or a file of accessions)",
|
|
245
250
|
)
|
|
246
251
|
@click.option("--exact-taxon", is_flag=True, help="Exclude Taxon ID descendants")
|
|
252
|
+
@click.option(
|
|
253
|
+
"--provider",
|
|
254
|
+
default="ena",
|
|
255
|
+
show_default=True,
|
|
256
|
+
type=click.Choice(["ena", "sra"], case_sensitive=False),
|
|
257
|
+
help="Provider to query first, falls back to the other",
|
|
258
|
+
)
|
|
259
|
+
@click.option(
|
|
260
|
+
"--only-provider",
|
|
261
|
+
is_flag=True,
|
|
262
|
+
help="Only query the given --provider, skip fallback",
|
|
263
|
+
)
|
|
247
264
|
@click.option(
|
|
248
265
|
"--outdir", "-o", default="./", show_default=True, help="Directory to write output"
|
|
249
266
|
)
|
|
@@ -316,6 +333,8 @@ def parse_query(q, accession_limit, exact_taxon=False):
|
|
|
316
333
|
def search(
|
|
317
334
|
query,
|
|
318
335
|
exact_taxon,
|
|
336
|
+
provider,
|
|
337
|
+
only_provider,
|
|
319
338
|
outdir,
|
|
320
339
|
prefix,
|
|
321
340
|
limit,
|
|
@@ -379,12 +398,31 @@ def search(
|
|
|
379
398
|
accessions_file = f"{outdir}/{prefix}-accessions.txt".replace("//", "/")
|
|
380
399
|
filtered_file = f"{outdir}/{prefix}-filtered.txt".replace("//", "/")
|
|
381
400
|
summary_file = f"{outdir}/{prefix}-search.txt".replace("//", "/")
|
|
401
|
+
|
|
402
|
+
if not force:
|
|
403
|
+
existing = [
|
|
404
|
+
f
|
|
405
|
+
for f in [metadata_file, accessions_file, filtered_file, summary_file]
|
|
406
|
+
if Path(f).exists()
|
|
407
|
+
]
|
|
408
|
+
if existing:
|
|
409
|
+
logging.error(
|
|
410
|
+
f"Output files already exist: {', '.join(existing)}. "
|
|
411
|
+
"Use --force to overwrite."
|
|
412
|
+
)
|
|
413
|
+
sys.exit(1)
|
|
414
|
+
|
|
382
415
|
genome_sizes = get_ncbi_genome_size() if use_ncbi_genome_size else None
|
|
383
416
|
for query_type, ena_query, sra_query in queries:
|
|
384
417
|
logging.info(f"Submitting query (type - {query_type})")
|
|
385
418
|
is_accession = True if query_type.endswith("accession") else False
|
|
386
|
-
success, query_results = get_run_info(
|
|
387
|
-
sra_query,
|
|
419
|
+
success, query_results, source = get_run_info(
|
|
420
|
+
sra_query,
|
|
421
|
+
ena_query,
|
|
422
|
+
is_accession,
|
|
423
|
+
limit=limit,
|
|
424
|
+
provider=provider,
|
|
425
|
+
only_provider=only_provider,
|
|
388
426
|
)
|
|
389
427
|
results += query_results
|
|
390
428
|
if success:
|
|
@@ -395,8 +433,35 @@ def search(
|
|
|
395
433
|
genome_size=genome_size,
|
|
396
434
|
genome_sizes=genome_sizes,
|
|
397
435
|
)
|
|
436
|
+
|
|
437
|
+
# Fallback: provider returned results but none passed filtering
|
|
438
|
+
if not query_accessions and not only_provider:
|
|
439
|
+
fallback = "sra" if source == "ena" else "ena"
|
|
440
|
+
logging.info(
|
|
441
|
+
f"Accession found on {source.upper()}, but missing "
|
|
442
|
+
f"metadata, checking {fallback.upper()}..."
|
|
443
|
+
)
|
|
444
|
+
fb_success, fb_results, fb_source = get_run_info(
|
|
445
|
+
sra_query,
|
|
446
|
+
ena_query,
|
|
447
|
+
is_accession,
|
|
448
|
+
limit=limit,
|
|
449
|
+
provider=fallback,
|
|
450
|
+
only_provider=True,
|
|
451
|
+
)
|
|
452
|
+
if fb_success:
|
|
453
|
+
results += fb_results
|
|
454
|
+
source = fb_source
|
|
455
|
+
query_accessions, query_filtered = parse_accessions(
|
|
456
|
+
fb_results,
|
|
457
|
+
min_read_length=min_read_length,
|
|
458
|
+
min_base_count=min_base_count,
|
|
459
|
+
genome_size=genome_size,
|
|
460
|
+
genome_sizes=genome_sizes,
|
|
461
|
+
)
|
|
462
|
+
|
|
463
|
+
WARNING_MESSAGE = None
|
|
398
464
|
if len(query_accessions):
|
|
399
|
-
WARNING_MESSAGE = None
|
|
400
465
|
if query_type == "biosample" and biosample_subset > 0:
|
|
401
466
|
if len(query_accessions) > biosample_subset:
|
|
402
467
|
WARNING_MESSAGE = f"WARNING: Selected {biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}"
|
|
@@ -404,20 +469,19 @@ def search(
|
|
|
404
469
|
query_accessions, biosample_subset
|
|
405
470
|
)
|
|
406
471
|
accessions = list(set(accessions + query_accessions))
|
|
407
|
-
filtered["min_base_count"] += query_filtered["min_base_count"]
|
|
408
|
-
filtered["min_read_length"] += query_filtered["min_read_length"]
|
|
409
|
-
filtered["technical"] += query_filtered["technical"]
|
|
410
|
-
for filtered_sample in query_filtered["filtered"]:
|
|
411
|
-
filtered["filtered"][filtered_sample["accession"]] = (
|
|
412
|
-
filtered_sample["reason"]
|
|
413
|
-
)
|
|
414
472
|
else:
|
|
415
473
|
if query_results:
|
|
416
|
-
WARNING_MESSAGE = f"WARNING: {query} did not return any Illumina or Ont results from
|
|
474
|
+
WARNING_MESSAGE = f"WARNING: {query} did not return any Illumina or Ont results from {source.upper()}."
|
|
417
475
|
else:
|
|
418
|
-
WARNING_MESSAGE = (
|
|
419
|
-
|
|
420
|
-
|
|
476
|
+
WARNING_MESSAGE = f"WARNING: {query} did not return any results from {source.upper()}."
|
|
477
|
+
|
|
478
|
+
filtered["min_base_count"] += query_filtered["min_base_count"]
|
|
479
|
+
filtered["min_read_length"] += query_filtered["min_read_length"]
|
|
480
|
+
filtered["technical"] += query_filtered["technical"]
|
|
481
|
+
for filtered_sample in query_filtered["filtered"]:
|
|
482
|
+
filtered["filtered"][filtered_sample["accession"]] = filtered_sample[
|
|
483
|
+
"reason"
|
|
484
|
+
]
|
|
421
485
|
|
|
422
486
|
# Create Summary
|
|
423
487
|
query_string = query
|
|
@@ -435,6 +499,7 @@ def search(
|
|
|
435
499
|
summary.append(
|
|
436
500
|
f"DATE: {datetime.datetime.now().replace(microsecond=0).isoformat()}"
|
|
437
501
|
)
|
|
502
|
+
summary.append(f"PROVIDER: {source.upper()}")
|
|
438
503
|
summary.append(f"LIMIT: {limit}")
|
|
439
504
|
summary.append(f"RESULTS: {len(results)} ({metadata_file})")
|
|
440
505
|
summary.append(
|
|
@@ -462,6 +527,10 @@ def search(
|
|
|
462
527
|
else:
|
|
463
528
|
logging.error(f"ERROR: Unable to retrieve metadata for query ({query})")
|
|
464
529
|
|
|
530
|
+
if not results:
|
|
531
|
+
logging.error("No results found, skipping output files.")
|
|
532
|
+
sys.exit(1)
|
|
533
|
+
|
|
465
534
|
# Output the results
|
|
466
535
|
logging.info(f"Writing results to {metadata_file}")
|
|
467
536
|
with open(metadata_file, "w") as output_fh:
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_ena_metadata(query: str, is_accession: bool, limit: int):
|
|
9
|
+
"""Fetch metadata from ENA.
|
|
10
|
+
https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
query (str): The query to search for.
|
|
14
|
+
is_accession (bool): If the query is an accession or not.
|
|
15
|
+
limit (int): The maximum number of records to return.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
list: Records associated with the accession.
|
|
19
|
+
"""
|
|
20
|
+
data = {
|
|
21
|
+
"dataPortal": "ena",
|
|
22
|
+
"dccDataOnly": "false",
|
|
23
|
+
"download": "false",
|
|
24
|
+
"result": "read_run",
|
|
25
|
+
"format": "tsv",
|
|
26
|
+
"limit": limit,
|
|
27
|
+
"fields": "all",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
if is_accession:
|
|
31
|
+
data["includeAccessions"] = query
|
|
32
|
+
else:
|
|
33
|
+
data["query"] = (
|
|
34
|
+
f'"{query} AND library_source=GENOMIC AND '
|
|
35
|
+
"(library_strategy=OTHER OR library_strategy=WGS OR "
|
|
36
|
+
"library_strategy=WGA) AND (library_selection=MNase OR "
|
|
37
|
+
"library_selection=RANDOM OR library_selection=unspecified OR "
|
|
38
|
+
'library_selection="size fractionation")"'
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
|
|
42
|
+
|
|
43
|
+
r = requests.post(ENA_URL, headers=headers, data=data)
|
|
44
|
+
if r.status_code == requests.codes.ok:
|
|
45
|
+
data = []
|
|
46
|
+
col_names = None
|
|
47
|
+
for line in r.text.split("\n"):
|
|
48
|
+
cols = line.split("\t")
|
|
49
|
+
if line:
|
|
50
|
+
if col_names:
|
|
51
|
+
data.append(dict(zip(col_names, cols)))
|
|
52
|
+
else:
|
|
53
|
+
col_names = cols
|
|
54
|
+
return [True, data]
|
|
55
|
+
else:
|
|
56
|
+
return [False, [r.status_code, r.text]]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_run_info(
|
|
60
|
+
sra_query: str,
|
|
61
|
+
ena_query: str,
|
|
62
|
+
is_accession: bool,
|
|
63
|
+
limit: int = 1000000,
|
|
64
|
+
provider: str = "ena",
|
|
65
|
+
only_provider: bool = False,
|
|
66
|
+
) -> tuple:
|
|
67
|
+
"""Retrieve a list of samples available from ENA and/or SRA.
|
|
68
|
+
|
|
69
|
+
By default, the provider is queried first and the other is used as fallback. When
|
|
70
|
+
only_provider is True, no fallback is attempted.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
sra_query: A formatted query for SRA searches.
|
|
74
|
+
ena_query: A formatted query for ENA searches.
|
|
75
|
+
is_accession: If the query is an accession or not.
|
|
76
|
+
limit: The maximum number of records to return.
|
|
77
|
+
provider: Which provider to query first ("ena" or "sra").
|
|
78
|
+
only_provider: If True, skip fallback to the other provider.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
tuple: (success, data, source) where source is "ena", "sra", or "none".
|
|
82
|
+
"""
|
|
83
|
+
from bactopia.databases.sra import get_sra_metadata
|
|
84
|
+
|
|
85
|
+
fallback = "sra" if provider == "ena" else "ena"
|
|
86
|
+
|
|
87
|
+
def _query_ena():
|
|
88
|
+
logging.debug("Querying ENA for metadata...")
|
|
89
|
+
success, data = get_ena_metadata(ena_query, is_accession, limit=limit)
|
|
90
|
+
if success and data:
|
|
91
|
+
return True, data
|
|
92
|
+
if not success:
|
|
93
|
+
logging.warning(f"ENA query failed (status {data[0]}).")
|
|
94
|
+
else:
|
|
95
|
+
logging.debug("ENA query returned no results.")
|
|
96
|
+
return False, []
|
|
97
|
+
|
|
98
|
+
def _query_sra():
|
|
99
|
+
logging.debug("Querying SRA for metadata...")
|
|
100
|
+
return get_sra_metadata(sra_query, is_accession, limit=limit)
|
|
101
|
+
|
|
102
|
+
query_fn = {"ena": _query_ena, "sra": _query_sra}
|
|
103
|
+
|
|
104
|
+
success, data = query_fn[provider]()
|
|
105
|
+
if success:
|
|
106
|
+
return True, data, provider
|
|
107
|
+
|
|
108
|
+
if only_provider:
|
|
109
|
+
logging.error(f"{provider.upper()} returned no results (--only-provider).")
|
|
110
|
+
return False, [], "none"
|
|
111
|
+
|
|
112
|
+
logging.info(f"No results from {provider.upper()}, checking {fallback.upper()}...")
|
|
113
|
+
success, data = query_fn[fallback]()
|
|
114
|
+
if success:
|
|
115
|
+
return True, data, fallback
|
|
116
|
+
|
|
117
|
+
logging.error("Both ENA and SRA returned no results.")
|
|
118
|
+
return False, [], "none"
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from pysradb.sraweb import SRAweb
|
|
4
|
+
|
|
5
|
+
INSTRUMENT_PLATFORM_MAP = {
|
|
6
|
+
"illumina": "ILLUMINA",
|
|
7
|
+
"nextseq": "ILLUMINA",
|
|
8
|
+
"hiseq": "ILLUMINA",
|
|
9
|
+
"miseq": "ILLUMINA",
|
|
10
|
+
"novaseq": "ILLUMINA",
|
|
11
|
+
"miniseq": "ILLUMINA",
|
|
12
|
+
"genome analyzer": "ILLUMINA",
|
|
13
|
+
"minion": "OXFORD_NANOPORE",
|
|
14
|
+
"gridion": "OXFORD_NANOPORE",
|
|
15
|
+
"promethion": "OXFORD_NANOPORE",
|
|
16
|
+
"nanopore": "OXFORD_NANOPORE",
|
|
17
|
+
"pacbio": "PACBIO_SMRT",
|
|
18
|
+
"sequel": "PACBIO_SMRT",
|
|
19
|
+
"revio": "PACBIO_SMRT",
|
|
20
|
+
"ion torrent": "ION_TORRENT",
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
SRA_TO_ENA_FIELDS = {
|
|
24
|
+
"run_total_bases": "base_count",
|
|
25
|
+
"run_total_spots": "read_count",
|
|
26
|
+
"organism_taxid": "tax_id",
|
|
27
|
+
"organism_name": "scientific_name",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def instrument_to_platform(instrument: str) -> str:
|
|
32
|
+
"""Map an SRA instrument model name to the ENA platform constant.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
instrument: Instrument model name from SRA (e.g. "Illumina MiniSeq").
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
str: Platform constant (e.g. "ILLUMINA") or the original value if unknown.
|
|
39
|
+
"""
|
|
40
|
+
lower = instrument.lower()
|
|
41
|
+
for key, platform in INSTRUMENT_PLATFORM_MAP.items():
|
|
42
|
+
if key in lower:
|
|
43
|
+
return platform
|
|
44
|
+
return instrument
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def normalize_sra_fields(records: list[dict]) -> list[dict]:
|
|
48
|
+
"""Rename SRA fields to their ENA equivalents so parse_accessions() works unchanged.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
records: List of dicts from pysradb search results.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
list[dict]: Records with critical fields renamed in place.
|
|
55
|
+
"""
|
|
56
|
+
for record in records:
|
|
57
|
+
for sra_field, ena_field in SRA_TO_ENA_FIELDS.items():
|
|
58
|
+
if sra_field in record:
|
|
59
|
+
record[ena_field] = record.pop(sra_field)
|
|
60
|
+
|
|
61
|
+
if "instrument" in record and "instrument_model_desc" not in record:
|
|
62
|
+
record["instrument_model_desc"] = instrument_to_platform(
|
|
63
|
+
record["instrument"]
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
layout = record.get("library_layout", "SINGLE").upper()
|
|
67
|
+
record["fastq_bytes"] = "0;0" if layout == "PAIRED" else "0"
|
|
68
|
+
|
|
69
|
+
return records
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_sra_metadata(query: str, is_accession: bool, limit: int) -> list:
|
|
73
|
+
"""Fetch metadata from SRA via pysradb.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
query: The query to search for (accession or NCBI query string).
|
|
77
|
+
is_accession: If the query is an accession or not.
|
|
78
|
+
limit: The maximum number of records to return.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
list: [success: bool, data: list[dict]]
|
|
82
|
+
"""
|
|
83
|
+
try:
|
|
84
|
+
db = SRAweb()
|
|
85
|
+
df = db.search_sra(
|
|
86
|
+
query,
|
|
87
|
+
detailed=True,
|
|
88
|
+
sample_attribute=True,
|
|
89
|
+
expand_sample_attributes=True,
|
|
90
|
+
)
|
|
91
|
+
if df is None or df.empty:
|
|
92
|
+
logging.debug(f"SRA query returned no results for: {query}")
|
|
93
|
+
return [False, []]
|
|
94
|
+
|
|
95
|
+
if len(df) > limit:
|
|
96
|
+
logging.debug(f"SRA returned {len(df)} results, truncating to {limit}")
|
|
97
|
+
df = df.head(limit)
|
|
98
|
+
|
|
99
|
+
records = df.to_dict(orient="records")
|
|
100
|
+
return [True, normalize_sra_fields(records)]
|
|
101
|
+
except Exception as e:
|
|
102
|
+
logging.error(f"Error querying SRA: {e}")
|
|
103
|
+
return [False, []]
|
|
@@ -61,7 +61,7 @@ _NEXTFLOW_INFORMATIONAL_RE = re.compile(
|
|
|
61
61
|
re.IGNORECASE,
|
|
62
62
|
)
|
|
63
63
|
|
|
64
|
-
# nextflow.config: nextflowVersion = '>=
|
|
64
|
+
# nextflow.config: nextflowVersion = '>=26.04.0'
|
|
65
65
|
_NEXTFLOW_CONFIG_RE = re.compile(
|
|
66
66
|
r"nextflowVersion\s*=\s*['\"][^\d]*(\d+\.\d+(?:\.\d+)?)"
|
|
67
67
|
)
|
|
@@ -5,7 +5,7 @@ manifest {
|
|
|
5
5
|
homePage = 'https://github.com/bactopia/bactopia'
|
|
6
6
|
description = 'An extensive workflow for processing sequencing of bacterial genomes.'
|
|
7
7
|
mainScript = 'main.nf'
|
|
8
|
-
version = '4.0.
|
|
8
|
+
version = '4.0.1'
|
|
9
9
|
nextflowVersion = '>=26.04.0'
|
|
10
10
|
}
|
|
11
11
|
|
|
@@ -21,7 +21,7 @@ params {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
// Version
|
|
24
|
-
params.bactopia_version = '4.0.
|
|
24
|
+
params.bactopia_version = '4.0.1'
|
|
25
25
|
manifest.version = "${params.bactopia_version}"
|
|
26
26
|
|
|
27
27
|
// Includes
|
|
@@ -93,7 +93,7 @@ dag {
|
|
|
93
93
|
|
|
94
94
|
// Plugins
|
|
95
95
|
plugins {
|
|
96
|
-
id 'nf-bactopia@2.1.
|
|
96
|
+
id 'nf-bactopia@2.1.5'
|
|
97
97
|
}
|
|
98
98
|
|
|
99
99
|
bactopia {
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nextflow.config.j2
RENAMED
|
@@ -10,7 +10,7 @@ params {
|
|
|
10
10
|
ext = "fna"
|
|
11
11
|
}
|
|
12
12
|
|
|
13
|
-
bactopia_version = '4.0.
|
|
13
|
+
bactopia_version = '4.0.1'
|
|
14
14
|
bactopia_cache = System.getenv("BACTOPIA_CACHEDIR") ?: "${System.getenv('HOME')}/.bactopia"
|
|
15
15
|
condadir = "${params.bactopia_cache}/conda"
|
|
16
16
|
wf = params.workflow.name
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nextflow.config.j2
RENAMED
|
@@ -9,7 +9,7 @@ params {
|
|
|
9
9
|
description = "{{ description }}"
|
|
10
10
|
ext = "fna"
|
|
11
11
|
}
|
|
12
|
-
bactopia_version = '4.0.
|
|
12
|
+
bactopia_version = '4.0.1'
|
|
13
13
|
bactopia_cache = System.getenv("BACTOPIA_CACHEDIR") ?: "${System.getenv('HOME')}/.bactopia"
|
|
14
14
|
condadir = "${params.bactopia_cache}/conda"
|
|
15
15
|
wf = params.workflow.name
|
|
@@ -38,5 +38,5 @@ includeConfig "../../../conf/profiles.config"
|
|
|
38
38
|
|
|
39
39
|
// Plugin
|
|
40
40
|
plugins {
|
|
41
|
-
id 'nf-bactopia@2.
|
|
41
|
+
id 'nf-bactopia@2.1.5'
|
|
42
42
|
}
|
|
@@ -5,8 +5,8 @@ manifest {
|
|
|
5
5
|
homePage = 'https://github.com/bactopia/bactopia'
|
|
6
6
|
description = 'An extensive workflow for processing sequencing of bacterial genomes.'
|
|
7
7
|
mainScript = 'main.nf'
|
|
8
|
-
version = '4.0.
|
|
9
|
-
nextflowVersion = '>=
|
|
8
|
+
version = '4.0.1'
|
|
9
|
+
nextflowVersion = '>=26.04.0'
|
|
10
10
|
}
|
|
11
11
|
|
|
12
12
|
params {
|
|
@@ -19,7 +19,7 @@ params {
|
|
|
19
19
|
}
|
|
20
20
|
|
|
21
21
|
// Version
|
|
22
|
-
params.bactopia_version = '4.0.
|
|
22
|
+
params.bactopia_version = '4.0.1'
|
|
23
23
|
manifest.version = "${params.bactopia_version}"
|
|
24
24
|
|
|
25
25
|
// Includes
|
|
@@ -85,7 +85,7 @@ dag {
|
|
|
85
85
|
|
|
86
86
|
// Plugins
|
|
87
87
|
plugins {
|
|
88
|
-
id 'nf-bactopia@2.
|
|
88
|
+
id 'nf-bactopia@2.1.5'
|
|
89
89
|
}
|
|
90
90
|
|
|
91
91
|
bactopia {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "bactopia"
|
|
3
|
-
version = "2.1.6"
|
|
3
|
+
version = "2.2.0"
|
|
4
4
|
description = "A Python package for working with Bactopia"
|
|
5
5
|
authors = [
|
|
6
6
|
"Robert A. Petit III <robbie.petit@gmail.com>",
|
|
@@ -57,6 +57,7 @@ pyyaml = ">=6.0"
|
|
|
57
57
|
biopython = ">=1.80"
|
|
58
58
|
openpyxl = ">=3.1.0"
|
|
59
59
|
psutil = ">=5.9.0"
|
|
60
|
+
pysradb = ">=2.2.0"
|
|
60
61
|
|
|
61
62
|
[tool.poetry.group.dev.dependencies]
|
|
62
63
|
ruff = "^0.9"
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import sys
|
|
3
|
-
|
|
4
|
-
import requests
|
|
5
|
-
|
|
6
|
-
ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def get_ena_metadata(query: str, is_accession: bool, limit: int):
|
|
10
|
-
"""Fetch metadata from ENA.
|
|
11
|
-
https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
|
|
12
|
-
|
|
13
|
-
Args:
|
|
14
|
-
query (str): The query to search for.
|
|
15
|
-
is_accession (bool): If the query is an accession or not.
|
|
16
|
-
limit (int): The maximum number of records to return.
|
|
17
|
-
|
|
18
|
-
Returns:
|
|
19
|
-
list: Records associated with the accession.
|
|
20
|
-
"""
|
|
21
|
-
data = {
|
|
22
|
-
"dataPortal": "ena",
|
|
23
|
-
"dccDataOnly": "false",
|
|
24
|
-
"download": "false",
|
|
25
|
-
"result": "read_run",
|
|
26
|
-
"format": "tsv",
|
|
27
|
-
"limit": limit,
|
|
28
|
-
"fields": "all",
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
if is_accession:
|
|
32
|
-
data["includeAccessions"] = query
|
|
33
|
-
else:
|
|
34
|
-
data["query"] = (
|
|
35
|
-
f'"{query} AND library_source=GENOMIC AND '
|
|
36
|
-
"(library_strategy=OTHER OR library_strategy=WGS OR "
|
|
37
|
-
"library_strategy=WGA) AND (library_selection=MNase OR "
|
|
38
|
-
"library_selection=RANDOM OR library_selection=unspecified OR "
|
|
39
|
-
'library_selection="size fractionation")"'
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
|
|
43
|
-
|
|
44
|
-
r = requests.post(ENA_URL, headers=headers, data=data)
|
|
45
|
-
if r.status_code == requests.codes.ok:
|
|
46
|
-
data = []
|
|
47
|
-
col_names = None
|
|
48
|
-
for line in r.text.split("\n"):
|
|
49
|
-
cols = line.split("\t")
|
|
50
|
-
if line:
|
|
51
|
-
if col_names:
|
|
52
|
-
data.append(dict(zip(col_names, cols)))
|
|
53
|
-
else:
|
|
54
|
-
col_names = cols
|
|
55
|
-
return [True, data]
|
|
56
|
-
else:
|
|
57
|
-
return [False, [r.status_code, r.text]]
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def get_run_info(
|
|
61
|
-
sra_query: str, ena_query: str, is_accession: bool, limit: int = 1000000
|
|
62
|
-
) -> tuple:
|
|
63
|
-
"""Retrieve a list of samples available from ENA.
|
|
64
|
-
|
|
65
|
-
The first attempt will be against ENA, and if that fails, SRA will be queried. This should
|
|
66
|
-
capture those samples not yet synced between ENA and SRA.
|
|
67
|
-
|
|
68
|
-
Args:
|
|
69
|
-
sra_query (str): A formatted query for SRA searches.
|
|
70
|
-
ena_query (str): A formatted query for ENA searches.
|
|
71
|
-
is_accession (bool): If the query is an accession or not.
|
|
72
|
-
limit (int): The maximum number of records to return.
|
|
73
|
-
|
|
74
|
-
Returns:
|
|
75
|
-
tuple: Records associated with the accession.
|
|
76
|
-
"""
|
|
77
|
-
|
|
78
|
-
logging.debug("Querying ENA for metadata...")
|
|
79
|
-
success, ena_data = get_ena_metadata(ena_query, is_accession, limit=limit)
|
|
80
|
-
if success:
|
|
81
|
-
return success, ena_data
|
|
82
|
-
else:
|
|
83
|
-
logging.error("There was an issue querying ENA, exiting...")
|
|
84
|
-
logging.error(f"STATUS: {ena_data[0]}")
|
|
85
|
-
logging.error(f"TEXT: {ena_data[1]}")
|
|
86
|
-
sys.exit(1)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nf-test.config.j2
RENAMED
|
File without changes
|
|
File without changes
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/main.nf.test.j2
RENAMED
|
File without changes
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nf-test.config.j2
RENAMED
|
File without changes
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nftignore.j2
RENAMED
|
File without changes
|
|
File without changes
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/main.nf.test.j2
RENAMED
|
File without changes
|
{bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/nf-test.config.j2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|