bactopia 2.1.6__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. {bactopia-2.1.6 → bactopia-2.2.0}/PKG-INFO +2 -1
  2. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/common.py +1 -1
  3. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/search.py +83 -14
  4. bactopia-2.2.0/bactopia/databases/ena.py +118 -0
  5. bactopia-2.2.0/bactopia/databases/sra.py +103 -0
  6. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/docs.py +1 -1
  7. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/nextflow/nextflow.config.j2 +3 -3
  8. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nextflow.config.j2 +1 -1
  9. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nextflow.config.j2 +2 -2
  10. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/nextflow.config.j2 +4 -4
  11. {bactopia-2.1.6 → bactopia-2.2.0}/pyproject.toml +2 -1
  12. bactopia-2.1.6/bactopia/databases/ena.py +0 -86
  13. {bactopia-2.1.6 → bactopia-2.2.0}/LICENSE +0 -0
  14. {bactopia-2.1.6 → bactopia-2.2.0}/README.md +0 -0
  15. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/__init__.py +0 -0
  16. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/atb.py +0 -0
  17. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/__init__.py +0 -0
  18. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/atb/__init__.py +0 -0
  19. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/atb/atb_downloader.py +0 -0
  20. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/atb/atb_formatter.py +0 -0
  21. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/catalog.py +0 -0
  22. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/citations.py +0 -0
  23. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/datasets.py +0 -0
  24. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/docs.py +0 -0
  25. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/download.py +0 -0
  26. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/helpers/__init__.py +0 -0
  27. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/helpers/merge_schemas.py +0 -0
  28. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/lint.py +0 -0
  29. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/__init__.py +0 -0
  30. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/bracken_to_excel.py +0 -0
  31. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/check_assembly_accession.py +0 -0
  32. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/check_fastqs.py +0 -0
  33. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/cleanup_coverage.py +0 -0
  34. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/kraken_bracken_summary.py +0 -0
  35. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/mask_consensus.py +0 -0
  36. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/scrubber_summary.py +0 -0
  37. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pipeline/teton_prepare.py +0 -0
  38. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/prepare.py +0 -0
  39. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/prune.py +0 -0
  40. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pubmlst/build.py +0 -0
  41. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/pubmlst/setup.py +0 -0
  42. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/review.py +0 -0
  43. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/scaffold.py +0 -0
  44. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/status.py +0 -0
  45. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/summary.py +0 -0
  46. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/sysinfo.py +0 -0
  47. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/testing.py +0 -0
  48. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/update.py +0 -0
  49. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/cli/workflows.py +0 -0
  50. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/conda.py +0 -0
  51. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/__init__.py +0 -0
  52. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/ncbi.py +0 -0
  53. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/pubmlst/__init__.py +0 -0
  54. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/pubmlst/constants.py +0 -0
  55. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/databases/pubmlst/utils.py +0 -0
  56. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/__init__.py +0 -0
  57. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/citations.py +0 -0
  58. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/models.py +0 -0
  59. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/__init__.py +0 -0
  60. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/module_rules.py +0 -0
  61. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/subworkflow_rules.py +0 -0
  62. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/rules/workflow_rules.py +0 -0
  63. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/lint/runner.py +0 -0
  64. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/nf.py +0 -0
  65. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/outputs.py +0 -0
  66. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parse.py +0 -0
  67. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/__init__.py +0 -0
  68. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/amrfinderplus.py +0 -0
  69. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/annotator.py +0 -0
  70. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/ariba.py +0 -0
  71. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/assembler.py +0 -0
  72. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/blast.py +0 -0
  73. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/citations.py +0 -0
  74. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/coverage.py +0 -0
  75. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/error.py +0 -0
  76. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/gather.py +0 -0
  77. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/generic.py +0 -0
  78. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/kraken.py +0 -0
  79. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/mapping.py +0 -0
  80. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/mlst.py +0 -0
  81. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/nextflow.py +0 -0
  82. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/parsables.py +0 -0
  83. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/qc.py +0 -0
  84. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/sketcher.py +0 -0
  85. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/variants.py +0 -0
  86. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/versions.py +0 -0
  87. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/parsers/workflows.py +0 -0
  88. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/reports/__init__.py +0 -0
  89. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/reports/templates/__init__.py +0 -0
  90. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/scaffold.py +0 -0
  91. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/summary.py +0 -0
  92. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/__init__.py +0 -0
  93. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/bactopia/llms.txt.j2 +0 -0
  94. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/logos.py +0 -0
  95. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/nextflow/params.config.j2 +0 -0
  96. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/nextflow/process.config.j2 +0 -0
  97. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/main.nf.j2 +0 -0
  98. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/module.config.j2 +0 -0
  99. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/schema.json.j2 +0 -0
  100. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/main.nf.test.j2 +0 -0
  101. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nf-test.config.j2 +0 -0
  102. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/main.nf.j2 +0 -0
  103. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/main.nf.test.j2 +0 -0
  104. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nf-test.config.j2 +0 -0
  105. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nftignore.j2 +0 -0
  106. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/main.nf.j2 +0 -0
  107. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/main.nf.test.j2 +0 -0
  108. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/nf-test.config.j2 +0 -0
  109. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/nftignore.j2 +0 -0
  110. {bactopia-2.1.6 → bactopia-2.2.0}/bactopia/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bactopia
3
- Version: 2.1.6
3
+ Version: 2.2.0
4
4
  Summary: A Python package for working with Bactopia
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -20,6 +20,7 @@ Requires-Dist: jinja2 (>=3.1.6)
20
20
  Requires-Dist: openpyxl (>=3.1.0)
21
21
  Requires-Dist: pandas (>=2.2.0)
22
22
  Requires-Dist: psutil (>=5.9.0)
23
+ Requires-Dist: pysradb (>=2.2.0)
23
24
  Requires-Dist: pyyaml (>=6.0)
24
25
  Requires-Dist: rauth (>=0.7.3)
25
26
  Requires-Dist: requests (>=2.28.2)
@@ -26,7 +26,7 @@ def common_options(fn):
26
26
  def setup_logging(verbose: bool, silent: bool) -> None:
27
27
  """Configure root logger with RichHandler at the appropriate level."""
28
28
  logging.basicConfig(
29
- format="%(asctime)s:%(name)s:%(levelname)s - %(message)s",
29
+ format="%(message)s",
30
30
  datefmt="%Y-%m-%d %H:%M:%S",
31
31
  handlers=[
32
32
  RichHandler(rich_tracebacks=True, console=rich.console.Console(stderr=True))
@@ -30,6 +30,8 @@ click.rich_click.OPTION_GROUPS = {
30
30
  "name": "Query Options",
31
31
  "options": [
32
32
  "--exact-taxon",
33
+ "--provider",
34
+ "--only-provider",
33
35
  "--limit",
34
36
  "--accession-limit",
35
37
  "--biosample-subset",
@@ -192,6 +194,9 @@ def parse_query(q, accession_limit, exact_taxon=False):
192
194
  run_accessions = []
193
195
 
194
196
  for query in queries:
197
+ query = query.strip()
198
+ if not query:
199
+ continue
195
200
  try:
196
201
  taxon_id = int(query)
197
202
  if exact_taxon:
@@ -244,6 +249,18 @@ def parse_query(q, accession_limit, exact_taxon=False):
244
249
  help="Taxon ID or Study, BioSample, or Run accession (can also be comma separated or a file of accessions)",
245
250
  )
246
251
  @click.option("--exact-taxon", is_flag=True, help="Exclude Taxon ID descendants")
252
+ @click.option(
253
+ "--provider",
254
+ default="ena",
255
+ show_default=True,
256
+ type=click.Choice(["ena", "sra"], case_sensitive=False),
257
+ help="Provider to query first, falls back to the other",
258
+ )
259
+ @click.option(
260
+ "--only-provider",
261
+ is_flag=True,
262
+ help="Only query the given --provider, skip fallback",
263
+ )
247
264
  @click.option(
248
265
  "--outdir", "-o", default="./", show_default=True, help="Directory to write output"
249
266
  )
@@ -316,6 +333,8 @@ def parse_query(q, accession_limit, exact_taxon=False):
316
333
  def search(
317
334
  query,
318
335
  exact_taxon,
336
+ provider,
337
+ only_provider,
319
338
  outdir,
320
339
  prefix,
321
340
  limit,
@@ -379,12 +398,31 @@ def search(
379
398
  accessions_file = f"{outdir}/{prefix}-accessions.txt".replace("//", "/")
380
399
  filtered_file = f"{outdir}/{prefix}-filtered.txt".replace("//", "/")
381
400
  summary_file = f"{outdir}/{prefix}-search.txt".replace("//", "/")
401
+
402
+ if not force:
403
+ existing = [
404
+ f
405
+ for f in [metadata_file, accessions_file, filtered_file, summary_file]
406
+ if Path(f).exists()
407
+ ]
408
+ if existing:
409
+ logging.error(
410
+ f"Output files already exist: {', '.join(existing)}. "
411
+ "Use --force to overwrite."
412
+ )
413
+ sys.exit(1)
414
+
382
415
  genome_sizes = get_ncbi_genome_size() if use_ncbi_genome_size else None
383
416
  for query_type, ena_query, sra_query in queries:
384
417
  logging.info(f"Submitting query (type - {query_type})")
385
418
  is_accession = True if query_type.endswith("accession") else False
386
- success, query_results = get_run_info(
387
- sra_query, ena_query, is_accession, limit=limit
419
+ success, query_results, source = get_run_info(
420
+ sra_query,
421
+ ena_query,
422
+ is_accession,
423
+ limit=limit,
424
+ provider=provider,
425
+ only_provider=only_provider,
388
426
  )
389
427
  results += query_results
390
428
  if success:
@@ -395,8 +433,35 @@ def search(
395
433
  genome_size=genome_size,
396
434
  genome_sizes=genome_sizes,
397
435
  )
436
+
437
+ # Fallback: provider returned results but none passed filtering
438
+ if not query_accessions and not only_provider:
439
+ fallback = "sra" if source == "ena" else "ena"
440
+ logging.info(
441
+ f"Accession found on {source.upper()}, but missing "
442
+ f"metadata, checking {fallback.upper()}..."
443
+ )
444
+ fb_success, fb_results, fb_source = get_run_info(
445
+ sra_query,
446
+ ena_query,
447
+ is_accession,
448
+ limit=limit,
449
+ provider=fallback,
450
+ only_provider=True,
451
+ )
452
+ if fb_success:
453
+ results += fb_results
454
+ source = fb_source
455
+ query_accessions, query_filtered = parse_accessions(
456
+ fb_results,
457
+ min_read_length=min_read_length,
458
+ min_base_count=min_base_count,
459
+ genome_size=genome_size,
460
+ genome_sizes=genome_sizes,
461
+ )
462
+
463
+ WARNING_MESSAGE = None
398
464
  if len(query_accessions):
399
- WARNING_MESSAGE = None
400
465
  if query_type == "biosample" and biosample_subset > 0:
401
466
  if len(query_accessions) > biosample_subset:
402
467
  WARNING_MESSAGE = f"WARNING: Selected {biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}"
@@ -404,20 +469,19 @@ def search(
404
469
  query_accessions, biosample_subset
405
470
  )
406
471
  accessions = list(set(accessions + query_accessions))
407
- filtered["min_base_count"] += query_filtered["min_base_count"]
408
- filtered["min_read_length"] += query_filtered["min_read_length"]
409
- filtered["technical"] += query_filtered["technical"]
410
- for filtered_sample in query_filtered["filtered"]:
411
- filtered["filtered"][filtered_sample["accession"]] = (
412
- filtered_sample["reason"]
413
- )
414
472
  else:
415
473
  if query_results:
416
- WARNING_MESSAGE = f"WARNING: {query} did not return any Illumina or Ont results from ENA."
474
+ WARNING_MESSAGE = f"WARNING: {query} did not return any Illumina or Ont results from {source.upper()}."
417
475
  else:
418
- WARNING_MESSAGE = (
419
- f"WARNING: {query} did not return any results from ENA."
420
- )
476
+ WARNING_MESSAGE = f"WARNING: {query} did not return any results from {source.upper()}."
477
+
478
+ filtered["min_base_count"] += query_filtered["min_base_count"]
479
+ filtered["min_read_length"] += query_filtered["min_read_length"]
480
+ filtered["technical"] += query_filtered["technical"]
481
+ for filtered_sample in query_filtered["filtered"]:
482
+ filtered["filtered"][filtered_sample["accession"]] = filtered_sample[
483
+ "reason"
484
+ ]
421
485
 
422
486
  # Create Summary
423
487
  query_string = query
@@ -435,6 +499,7 @@ def search(
435
499
  summary.append(
436
500
  f"DATE: {datetime.datetime.now().replace(microsecond=0).isoformat()}"
437
501
  )
502
+ summary.append(f"PROVIDER: {source.upper()}")
438
503
  summary.append(f"LIMIT: {limit}")
439
504
  summary.append(f"RESULTS: {len(results)} ({metadata_file})")
440
505
  summary.append(
@@ -462,6 +527,10 @@ def search(
462
527
  else:
463
528
  logging.error(f"ERROR: Unable to retrieve metadata for query ({query})")
464
529
 
530
+ if not results:
531
+ logging.error("No results found, skipping output files.")
532
+ sys.exit(1)
533
+
465
534
  # Output the results
466
535
  logging.info(f"Writing results to {metadata_file}")
467
536
  with open(metadata_file, "w") as output_fh:
@@ -0,0 +1,118 @@
1
+ import logging
2
+
3
+ import requests
4
+
5
+ ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
6
+
7
+
8
+ def get_ena_metadata(query: str, is_accession: bool, limit: int):
9
+ """Fetch metadata from ENA.
10
+ https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
11
+
12
+ Args:
13
+ query (str): The query to search for.
14
+ is_accession (bool): If the query is an accession or not.
15
+ limit (int): The maximum number of records to return.
16
+
17
+ Returns:
18
+ list: Records associated with the accession.
19
+ """
20
+ data = {
21
+ "dataPortal": "ena",
22
+ "dccDataOnly": "false",
23
+ "download": "false",
24
+ "result": "read_run",
25
+ "format": "tsv",
26
+ "limit": limit,
27
+ "fields": "all",
28
+ }
29
+
30
+ if is_accession:
31
+ data["includeAccessions"] = query
32
+ else:
33
+ data["query"] = (
34
+ f'"{query} AND library_source=GENOMIC AND '
35
+ "(library_strategy=OTHER OR library_strategy=WGS OR "
36
+ "library_strategy=WGA) AND (library_selection=MNase OR "
37
+ "library_selection=RANDOM OR library_selection=unspecified OR "
38
+ 'library_selection="size fractionation")"'
39
+ )
40
+
41
+ headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
42
+
43
+ r = requests.post(ENA_URL, headers=headers, data=data)
44
+ if r.status_code == requests.codes.ok:
45
+ data = []
46
+ col_names = None
47
+ for line in r.text.split("\n"):
48
+ cols = line.split("\t")
49
+ if line:
50
+ if col_names:
51
+ data.append(dict(zip(col_names, cols)))
52
+ else:
53
+ col_names = cols
54
+ return [True, data]
55
+ else:
56
+ return [False, [r.status_code, r.text]]
57
+
58
+
59
+ def get_run_info(
60
+ sra_query: str,
61
+ ena_query: str,
62
+ is_accession: bool,
63
+ limit: int = 1000000,
64
+ provider: str = "ena",
65
+ only_provider: bool = False,
66
+ ) -> tuple:
67
+ """Retrieve a list of samples available from ENA and/or SRA.
68
+
69
+ By default, the provider is queried first and the other is used as fallback. When
70
+ only_provider is True, no fallback is attempted.
71
+
72
+ Args:
73
+ sra_query: A formatted query for SRA searches.
74
+ ena_query: A formatted query for ENA searches.
75
+ is_accession: If the query is an accession or not.
76
+ limit: The maximum number of records to return.
77
+ provider: Which provider to query first ("ena" or "sra").
78
+ only_provider: If True, skip fallback to the other provider.
79
+
80
+ Returns:
81
+ tuple: (success, data, source) where source is "ena", "sra", or "none".
82
+ """
83
+ from bactopia.databases.sra import get_sra_metadata
84
+
85
+ fallback = "sra" if provider == "ena" else "ena"
86
+
87
+ def _query_ena():
88
+ logging.debug("Querying ENA for metadata...")
89
+ success, data = get_ena_metadata(ena_query, is_accession, limit=limit)
90
+ if success and data:
91
+ return True, data
92
+ if not success:
93
+ logging.warning(f"ENA query failed (status {data[0]}).")
94
+ else:
95
+ logging.debug("ENA query returned no results.")
96
+ return False, []
97
+
98
+ def _query_sra():
99
+ logging.debug("Querying SRA for metadata...")
100
+ return get_sra_metadata(sra_query, is_accession, limit=limit)
101
+
102
+ query_fn = {"ena": _query_ena, "sra": _query_sra}
103
+
104
+ success, data = query_fn[provider]()
105
+ if success:
106
+ return True, data, provider
107
+
108
+ if only_provider:
109
+ logging.error(f"{provider.upper()} returned no results (--only-provider).")
110
+ return False, [], "none"
111
+
112
+ logging.info(f"No results from {provider.upper()}, checking {fallback.upper()}...")
113
+ success, data = query_fn[fallback]()
114
+ if success:
115
+ return True, data, fallback
116
+
117
+ logging.error("Both ENA and SRA returned no results.")
118
+ return False, [], "none"
@@ -0,0 +1,103 @@
1
+ import logging
2
+
3
+ from pysradb.sraweb import SRAweb
4
+
5
+ INSTRUMENT_PLATFORM_MAP = {
6
+ "illumina": "ILLUMINA",
7
+ "nextseq": "ILLUMINA",
8
+ "hiseq": "ILLUMINA",
9
+ "miseq": "ILLUMINA",
10
+ "novaseq": "ILLUMINA",
11
+ "miniseq": "ILLUMINA",
12
+ "genome analyzer": "ILLUMINA",
13
+ "minion": "OXFORD_NANOPORE",
14
+ "gridion": "OXFORD_NANOPORE",
15
+ "promethion": "OXFORD_NANOPORE",
16
+ "nanopore": "OXFORD_NANOPORE",
17
+ "pacbio": "PACBIO_SMRT",
18
+ "sequel": "PACBIO_SMRT",
19
+ "revio": "PACBIO_SMRT",
20
+ "ion torrent": "ION_TORRENT",
21
+ }
22
+
23
+ SRA_TO_ENA_FIELDS = {
24
+ "run_total_bases": "base_count",
25
+ "run_total_spots": "read_count",
26
+ "organism_taxid": "tax_id",
27
+ "organism_name": "scientific_name",
28
+ }
29
+
30
+
31
+ def instrument_to_platform(instrument: str) -> str:
32
+ """Map an SRA instrument model name to the ENA platform constant.
33
+
34
+ Args:
35
+ instrument: Instrument model name from SRA (e.g. "Illumina MiniSeq").
36
+
37
+ Returns:
38
+ str: Platform constant (e.g. "ILLUMINA") or the original value if unknown.
39
+ """
40
+ lower = instrument.lower()
41
+ for key, platform in INSTRUMENT_PLATFORM_MAP.items():
42
+ if key in lower:
43
+ return platform
44
+ return instrument
45
+
46
+
47
+ def normalize_sra_fields(records: list[dict]) -> list[dict]:
48
+ """Rename SRA fields to their ENA equivalents so parse_accessions() works unchanged.
49
+
50
+ Args:
51
+ records: List of dicts from pysradb search results.
52
+
53
+ Returns:
54
+ list[dict]: Records with critical fields renamed in place.
55
+ """
56
+ for record in records:
57
+ for sra_field, ena_field in SRA_TO_ENA_FIELDS.items():
58
+ if sra_field in record:
59
+ record[ena_field] = record.pop(sra_field)
60
+
61
+ if "instrument" in record and "instrument_model_desc" not in record:
62
+ record["instrument_model_desc"] = instrument_to_platform(
63
+ record["instrument"]
64
+ )
65
+
66
+ layout = record.get("library_layout", "SINGLE").upper()
67
+ record["fastq_bytes"] = "0;0" if layout == "PAIRED" else "0"
68
+
69
+ return records
70
+
71
+
72
+ def get_sra_metadata(query: str, is_accession: bool, limit: int) -> list:
73
+ """Fetch metadata from SRA via pysradb.
74
+
75
+ Args:
76
+ query: The query to search for (accession or NCBI query string).
77
+ is_accession: If the query is an accession or not.
78
+ limit: The maximum number of records to return.
79
+
80
+ Returns:
81
+ list: [success: bool, data: list[dict]]
82
+ """
83
+ try:
84
+ db = SRAweb()
85
+ df = db.search_sra(
86
+ query,
87
+ detailed=True,
88
+ sample_attribute=True,
89
+ expand_sample_attributes=True,
90
+ )
91
+ if df is None or df.empty:
92
+ logging.debug(f"SRA query returned no results for: {query}")
93
+ return [False, []]
94
+
95
+ if len(df) > limit:
96
+ logging.debug(f"SRA returned {len(df)} results, truncating to {limit}")
97
+ df = df.head(limit)
98
+
99
+ records = df.to_dict(orient="records")
100
+ return [True, normalize_sra_fields(records)]
101
+ except Exception as e:
102
+ logging.error(f"Error querying SRA: {e}")
103
+ return [False, []]
@@ -61,7 +61,7 @@ _NEXTFLOW_INFORMATIONAL_RE = re.compile(
61
61
  re.IGNORECASE,
62
62
  )
63
63
 
64
- # nextflow.config: nextflowVersion = '>=25.04.6'
64
+ # nextflow.config: nextflowVersion = '>=26.04.0'
65
65
  _NEXTFLOW_CONFIG_RE = re.compile(
66
66
  r"nextflowVersion\s*=\s*['\"][^\d]*(\d+\.\d+(?:\.\d+)?)"
67
67
  )
@@ -5,7 +5,7 @@ manifest {
5
5
  homePage = 'https://github.com/bactopia/bactopia'
6
6
  description = 'An extensive workflow for processing sequencing of bacterial genomes.'
7
7
  mainScript = 'main.nf'
8
- version = '4.0.0'
8
+ version = '4.0.1'
9
9
  nextflowVersion = '>=26.04.0'
10
10
  }
11
11
 
@@ -21,7 +21,7 @@ params {
21
21
  }
22
22
 
23
23
  // Version
24
- params.bactopia_version = '4.0.0'
24
+ params.bactopia_version = '4.0.1'
25
25
  manifest.version = "${params.bactopia_version}"
26
26
 
27
27
  // Includes
@@ -93,7 +93,7 @@ dag {
93
93
 
94
94
  // Plugins
95
95
  plugins {
96
- id 'nf-bactopia@2.1.1'
96
+ id 'nf-bactopia@2.1.5'
97
97
  }
98
98
 
99
99
  bactopia {
@@ -10,7 +10,7 @@ params {
10
10
  ext = "fna"
11
11
  }
12
12
 
13
- bactopia_version = '4.0.0'
13
+ bactopia_version = '4.0.1'
14
14
  bactopia_cache = System.getenv("BACTOPIA_CACHEDIR") ?: "${System.getenv('HOME')}/.bactopia"
15
15
  condadir = "${params.bactopia_cache}/conda"
16
16
  wf = params.workflow.name
@@ -9,7 +9,7 @@ params {
9
9
  description = "{{ description }}"
10
10
  ext = "fna"
11
11
  }
12
- bactopia_version = '4.0.0'
12
+ bactopia_version = '4.0.1'
13
13
  bactopia_cache = System.getenv("BACTOPIA_CACHEDIR") ?: "${System.getenv('HOME')}/.bactopia"
14
14
  condadir = "${params.bactopia_cache}/conda"
15
15
  wf = params.workflow.name
@@ -38,5 +38,5 @@ includeConfig "../../../conf/profiles.config"
38
38
 
39
39
  // Plugin
40
40
  plugins {
41
- id 'nf-bactopia@2.0.2'
41
+ id 'nf-bactopia@2.1.5'
42
42
  }
@@ -5,8 +5,8 @@ manifest {
5
5
  homePage = 'https://github.com/bactopia/bactopia'
6
6
  description = 'An extensive workflow for processing sequencing of bacterial genomes.'
7
7
  mainScript = 'main.nf'
8
- version = '4.0.0'
9
- nextflowVersion = '>=25.04.6'
8
+ version = '4.0.1'
9
+ nextflowVersion = '>=26.04.0'
10
10
  }
11
11
 
12
12
  params {
@@ -19,7 +19,7 @@ params {
19
19
  }
20
20
 
21
21
  // Version
22
- params.bactopia_version = '4.0.0'
22
+ params.bactopia_version = '4.0.1'
23
23
  manifest.version = "${params.bactopia_version}"
24
24
 
25
25
  // Includes
@@ -85,7 +85,7 @@ dag {
85
85
 
86
86
  // Plugins
87
87
  plugins {
88
- id 'nf-bactopia@2.0.2'
88
+ id 'nf-bactopia@2.1.5'
89
89
  }
90
90
 
91
91
  bactopia {
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "bactopia"
3
- version = "2.1.6"
3
+ version = "2.2.0"
4
4
  description = "A Python package for working with Bactopia"
5
5
  authors = [
6
6
  "Robert A. Petit III <robbie.petit@gmail.com>",
@@ -57,6 +57,7 @@ pyyaml = ">=6.0"
57
57
  biopython = ">=1.80"
58
58
  openpyxl = ">=3.1.0"
59
59
  psutil = ">=5.9.0"
60
+ pysradb = ">=2.2.0"
60
61
 
61
62
  [tool.poetry.group.dev.dependencies]
62
63
  ruff = "^0.9"
@@ -1,86 +0,0 @@
1
- import logging
2
- import sys
3
-
4
- import requests
5
-
6
- ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
7
-
8
-
9
- def get_ena_metadata(query: str, is_accession: bool, limit: int):
10
- """Fetch metadata from ENA.
11
- https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
12
-
13
- Args:
14
- query (str): The query to search for.
15
- is_accession (bool): If the query is an accession or not.
16
- limit (int): The maximum number of records to return.
17
-
18
- Returns:
19
- list: Records associated with the accession.
20
- """
21
- data = {
22
- "dataPortal": "ena",
23
- "dccDataOnly": "false",
24
- "download": "false",
25
- "result": "read_run",
26
- "format": "tsv",
27
- "limit": limit,
28
- "fields": "all",
29
- }
30
-
31
- if is_accession:
32
- data["includeAccessions"] = query
33
- else:
34
- data["query"] = (
35
- f'"{query} AND library_source=GENOMIC AND '
36
- "(library_strategy=OTHER OR library_strategy=WGS OR "
37
- "library_strategy=WGA) AND (library_selection=MNase OR "
38
- "library_selection=RANDOM OR library_selection=unspecified OR "
39
- 'library_selection="size fractionation")"'
40
- )
41
-
42
- headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
43
-
44
- r = requests.post(ENA_URL, headers=headers, data=data)
45
- if r.status_code == requests.codes.ok:
46
- data = []
47
- col_names = None
48
- for line in r.text.split("\n"):
49
- cols = line.split("\t")
50
- if line:
51
- if col_names:
52
- data.append(dict(zip(col_names, cols)))
53
- else:
54
- col_names = cols
55
- return [True, data]
56
- else:
57
- return [False, [r.status_code, r.text]]
58
-
59
-
60
- def get_run_info(
61
- sra_query: str, ena_query: str, is_accession: bool, limit: int = 1000000
62
- ) -> tuple:
63
- """Retrieve a list of samples available from ENA.
64
-
65
- The first attempt will be against ENA, and if that fails, SRA will be queried. This should
66
- capture those samples not yet synced between ENA and SRA.
67
-
68
- Args:
69
- sra_query (str): A formatted query for SRA searches.
70
- ena_query (str): A formatted query for ENA searches.
71
- is_accession (bool): If the query is an accession or not.
72
- limit (int): The maximum number of records to return.
73
-
74
- Returns:
75
- tuple: Records associated with the accession.
76
- """
77
-
78
- logging.debug("Querying ENA for metadata...")
79
- success, ena_data = get_ena_metadata(ena_query, is_accession, limit=limit)
80
- if success:
81
- return success, ena_data
82
- else:
83
- logging.error("There was an issue querying ENA, exiting...")
84
- logging.error(f"STATUS: {ena_data[0]}")
85
- logging.error(f"TEXT: {ena_data[1]}")
86
- sys.exit(1)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes