bactopia 2.1.5__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. {bactopia-2.1.5 → bactopia-2.2.0}/PKG-INFO +8 -7
  2. {bactopia-2.1.5 → bactopia-2.2.0}/README.md +5 -5
  3. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/common.py +1 -1
  4. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/search.py +83 -14
  5. bactopia-2.2.0/bactopia/databases/ena.py +118 -0
  6. bactopia-2.2.0/bactopia/databases/sra.py +103 -0
  7. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/docs.py +1 -1
  8. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/qc.py +10 -8
  9. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/bactopia/llms.txt.j2 +1 -1
  10. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/nextflow/nextflow.config.j2 +3 -3
  11. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nextflow.config.j2 +1 -1
  12. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nextflow.config.j2 +2 -2
  13. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/nextflow.config.j2 +4 -4
  14. {bactopia-2.1.5 → bactopia-2.2.0}/pyproject.toml +3 -2
  15. bactopia-2.1.5/bactopia/databases/ena.py +0 -86
  16. {bactopia-2.1.5 → bactopia-2.2.0}/LICENSE +0 -0
  17. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/__init__.py +0 -0
  18. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/atb.py +0 -0
  19. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/__init__.py +0 -0
  20. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/atb/__init__.py +0 -0
  21. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/atb/atb_downloader.py +0 -0
  22. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/atb/atb_formatter.py +0 -0
  23. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/catalog.py +0 -0
  24. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/citations.py +0 -0
  25. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/datasets.py +0 -0
  26. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/docs.py +0 -0
  27. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/download.py +0 -0
  28. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/helpers/__init__.py +0 -0
  29. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/helpers/merge_schemas.py +0 -0
  30. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/lint.py +0 -0
  31. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/__init__.py +0 -0
  32. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/bracken_to_excel.py +0 -0
  33. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/check_assembly_accession.py +0 -0
  34. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/check_fastqs.py +0 -0
  35. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/cleanup_coverage.py +0 -0
  36. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/kraken_bracken_summary.py +0 -0
  37. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/mask_consensus.py +0 -0
  38. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/scrubber_summary.py +0 -0
  39. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pipeline/teton_prepare.py +0 -0
  40. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/prepare.py +0 -0
  41. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/prune.py +0 -0
  42. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pubmlst/build.py +0 -0
  43. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/pubmlst/setup.py +0 -0
  44. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/review.py +0 -0
  45. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/scaffold.py +0 -0
  46. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/status.py +0 -0
  47. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/summary.py +0 -0
  48. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/sysinfo.py +0 -0
  49. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/testing.py +0 -0
  50. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/update.py +0 -0
  51. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/cli/workflows.py +0 -0
  52. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/conda.py +0 -0
  53. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/databases/__init__.py +0 -0
  54. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/databases/ncbi.py +0 -0
  55. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/databases/pubmlst/__init__.py +0 -0
  56. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/databases/pubmlst/constants.py +0 -0
  57. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/databases/pubmlst/utils.py +0 -0
  58. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/__init__.py +0 -0
  59. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/citations.py +0 -0
  60. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/models.py +0 -0
  61. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/rules/__init__.py +0 -0
  62. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/rules/module_rules.py +0 -0
  63. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/rules/subworkflow_rules.py +0 -0
  64. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/rules/workflow_rules.py +0 -0
  65. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/lint/runner.py +0 -0
  66. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/nf.py +0 -0
  67. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/outputs.py +0 -0
  68. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parse.py +0 -0
  69. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/__init__.py +0 -0
  70. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/amrfinderplus.py +0 -0
  71. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/annotator.py +0 -0
  72. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/ariba.py +0 -0
  73. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/assembler.py +0 -0
  74. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/blast.py +0 -0
  75. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/citations.py +0 -0
  76. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/coverage.py +0 -0
  77. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/error.py +0 -0
  78. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/gather.py +0 -0
  79. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/generic.py +0 -0
  80. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/kraken.py +0 -0
  81. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/mapping.py +0 -0
  82. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/mlst.py +0 -0
  83. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/nextflow.py +0 -0
  84. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/parsables.py +0 -0
  85. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/sketcher.py +0 -0
  86. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/variants.py +0 -0
  87. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/versions.py +0 -0
  88. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/parsers/workflows.py +0 -0
  89. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/reports/__init__.py +0 -0
  90. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/reports/templates/__init__.py +0 -0
  91. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/scaffold.py +0 -0
  92. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/summary.py +0 -0
  93. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/__init__.py +0 -0
  94. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/logos.py +0 -0
  95. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/nextflow/params.config.j2 +0 -0
  96. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/nextflow/process.config.j2 +0 -0
  97. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/module/main.nf.j2 +0 -0
  98. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/module/module.config.j2 +0 -0
  99. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/module/schema.json.j2 +0 -0
  100. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/main.nf.test.j2 +0 -0
  101. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/module/tests/nf-test.config.j2 +0 -0
  102. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/main.nf.j2 +0 -0
  103. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/main.nf.test.j2 +0 -0
  104. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nf-test.config.j2 +0 -0
  105. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/subworkflow/tests/nftignore.j2 +0 -0
  106. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/main.nf.j2 +0 -0
  107. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/main.nf.test.j2 +0 -0
  108. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/nf-test.config.j2 +0 -0
  109. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/templates/scaffold/workflow/tests/nftignore.j2 +0 -0
  110. {bactopia-2.1.5 → bactopia-2.2.0}/bactopia/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bactopia
3
- Version: 2.1.5
3
+ Version: 2.2.0
4
4
  Summary: A Python package for working with Bactopia
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -20,20 +20,21 @@ Requires-Dist: jinja2 (>=3.1.6)
20
20
  Requires-Dist: openpyxl (>=3.1.0)
21
21
  Requires-Dist: pandas (>=2.2.0)
22
22
  Requires-Dist: psutil (>=5.9.0)
23
+ Requires-Dist: pysradb (>=2.2.0)
23
24
  Requires-Dist: pyyaml (>=6.0)
24
25
  Requires-Dist: rauth (>=0.7.3)
25
26
  Requires-Dist: requests (>=2.28.2)
26
27
  Requires-Dist: rich (>=13.3.1)
27
28
  Requires-Dist: rich-click (>=1.6.1)
28
29
  Requires-Dist: tqdm (>=4.66.5)
29
- Project-URL: Homepage, https://bactopia.github.io/
30
+ Project-URL: Homepage, https://bactopia.io/
30
31
  Project-URL: Repository, https://github.com/bactopia/bactopia-py
31
32
  Description-Content-Type: text/markdown
32
33
 
33
34
  ![Bactopia Logo](https://raw.githubusercontent.com/bactopia/bactopia/master/data/bactopia-logo.png)
34
35
 
35
36
  # bactopia-py
36
- A Python package for working with [Bactopia](https://bactopia.github.io/)
37
+ A Python package for working with [Bactopia](https://bactopia.io/)
37
38
 
38
39
  ## Bactopia Subcommands
39
40
 
@@ -312,7 +313,7 @@ of nearly 2,000,000 bacterial genomes. Using available FASTQ files from the Euro
312
313
  Archive (ENA) and Sequence Read Archive (SRA), the genomes were assembled using [Shovill] and made
313
314
  publicly available from the [Iqbal Lab](https://github.com/iqbal-lab-org/AllTheBacteria).
314
315
 
315
- To make it easy to utilize [Bactopia Tools](https://bactopia.github.io/latest/bactopia-tools/) with
316
+ To make it easy to utilize [Bactopia Tools](https://bactopia.io/bactopia-tools/) with
316
317
  assemblies from AllTheBacteria, `bactopia-atb-formatter` was created. This tool will create a
317
318
  directory structure that resembles output from an actual Bactopia run.
318
319
 
@@ -350,7 +351,7 @@ directory structure that resembles output from an actual Bactopia run.
350
351
  To demonstrate the usage of `bactopia-atb-formatter`, we will use assemblies for
351
352
  _Legionella pneumophila_. The following steps will download the assemblies, build the
352
353
  Bactopia directory structure, and then run [legsta](https://github.com/tseemann/legsta)
353
- via the [Bactopia Tool](https://bactopia.github.io/latest/bactopia-tools/legsta/).
354
+ via the [Bactopia Tool](https://bactopia.io/bactopia-tools/legsta/).
354
355
 
355
356
  #### Download the Assemblies
356
357
 
@@ -394,7 +395,7 @@ created for 5,393 assemblies and is ready for use with Bactopia Tools.
394
395
 
395
396
  As mentioned above, we will use [legsta](https://github.com/tseemann/legsta) to analyze each
396
397
  of the _Legionella pneumophila_ assemblies. To do this, we will use the
397
- [legsta Bactopia Tool](https://bactopia.github.io/latest/bactopia-tools/legsta/).
398
+ [legsta Bactopia Tool](https://bactopia.io/bactopia-tools/legsta/).
398
399
 
399
400
  ```{bash}
400
401
  # Run legsta (please utilize Docker or Singularity only for reproducibility)
@@ -431,7 +432,7 @@ CPU hours : 5.2
431
432
  Succeeded : 5'395
432
433
  ```
433
434
 
434
- That's it! Now you can take advantage of any of the [Bactopia Tools](https://bactopia.github.io/latest/bactopia-tools/)
435
+ That's it! Now you can take advantage of any of the [Bactopia Tools](https://bactopia.io/bactopia-tools/)
435
436
  that utilize assemblies as inputs.
436
437
 
437
438
  # PubMLST DB Builds
@@ -1,7 +1,7 @@
1
1
  ![Bactopia Logo](https://raw.githubusercontent.com/bactopia/bactopia/master/data/bactopia-logo.png)
2
2
 
3
3
  # bactopia-py
4
- A Python package for working with [Bactopia](https://bactopia.github.io/)
4
+ A Python package for working with [Bactopia](https://bactopia.io/)
5
5
 
6
6
  ## Bactopia Subcommands
7
7
 
@@ -280,7 +280,7 @@ of nearly 2,000,000 bacterial genomes. Using available FASTQ files from the Euro
280
280
  Archive (ENA) and Sequence Read Archive (SRA), the genomes were assembled using [Shovill] and made
281
281
  publicly available from the [Iqbal Lab](https://github.com/iqbal-lab-org/AllTheBacteria).
282
282
 
283
- To make it easy to utilize [Bactopia Tools](https://bactopia.github.io/latest/bactopia-tools/) with
283
+ To make it easy to utilize [Bactopia Tools](https://bactopia.io/bactopia-tools/) with
284
284
  assemblies from AllTheBacteria, `bactopia-atb-formatter` was created. This tool will create a
285
285
  directory structure that resembles output from an actual Bactopia run.
286
286
 
@@ -318,7 +318,7 @@ directory structure that resembles output from an actual Bactopia run.
318
318
  To demonstrate the usage of `bactopia-atb-formatter`, we will use assemblies for
319
319
  _Legionella pneumophila_. The following steps will download the assemblies, build the
320
320
  Bactopia directory structure, and then run [legsta](https://github.com/tseemann/legsta)
321
- via the [Bactopia Tool](https://bactopia.github.io/latest/bactopia-tools/legsta/).
321
+ via the [Bactopia Tool](https://bactopia.io/bactopia-tools/legsta/).
322
322
 
323
323
  #### Download the Assemblies
324
324
 
@@ -362,7 +362,7 @@ created for 5,393 assemblies and is ready for use with Bactopia Tools.
362
362
 
363
363
  As mentioned above, we will use [legsta](https://github.com/tseemann/legsta) to analyze each
364
364
  of the _Legionella pneumophila_ assemblies. To do this, we will use the
365
- [legsta Bactopia Tool](https://bactopia.github.io/latest/bactopia-tools/legsta/).
365
+ [legsta Bactopia Tool](https://bactopia.io/bactopia-tools/legsta/).
366
366
 
367
367
  ```{bash}
368
368
  # Run legsta (please utilize Docker or Singularity only for reproducibility)
@@ -399,7 +399,7 @@ CPU hours : 5.2
399
399
  Succeeded : 5'395
400
400
  ```
401
401
 
402
- That's it! Now you can take advantage of any of the [Bactopia Tools](https://bactopia.github.io/latest/bactopia-tools/)
402
+ That's it! Now you can take advantage of any of the [Bactopia Tools](https://bactopia.io/bactopia-tools/)
403
403
  that utilize assemblies as inputs.
404
404
 
405
405
  # PubMLST DB Builds
@@ -26,7 +26,7 @@ def common_options(fn):
26
26
  def setup_logging(verbose: bool, silent: bool) -> None:
27
27
  """Configure root logger with RichHandler at the appropriate level."""
28
28
  logging.basicConfig(
29
- format="%(asctime)s:%(name)s:%(levelname)s - %(message)s",
29
+ format="%(message)s",
30
30
  datefmt="%Y-%m-%d %H:%M:%S",
31
31
  handlers=[
32
32
  RichHandler(rich_tracebacks=True, console=rich.console.Console(stderr=True))
@@ -30,6 +30,8 @@ click.rich_click.OPTION_GROUPS = {
30
30
  "name": "Query Options",
31
31
  "options": [
32
32
  "--exact-taxon",
33
+ "--provider",
34
+ "--only-provider",
33
35
  "--limit",
34
36
  "--accession-limit",
35
37
  "--biosample-subset",
@@ -192,6 +194,9 @@ def parse_query(q, accession_limit, exact_taxon=False):
192
194
  run_accessions = []
193
195
 
194
196
  for query in queries:
197
+ query = query.strip()
198
+ if not query:
199
+ continue
195
200
  try:
196
201
  taxon_id = int(query)
197
202
  if exact_taxon:
@@ -244,6 +249,18 @@ def parse_query(q, accession_limit, exact_taxon=False):
244
249
  help="Taxon ID or Study, BioSample, or Run accession (can also be comma separated or a file of accessions)",
245
250
  )
246
251
  @click.option("--exact-taxon", is_flag=True, help="Exclude Taxon ID descendants")
252
+ @click.option(
253
+ "--provider",
254
+ default="ena",
255
+ show_default=True,
256
+ type=click.Choice(["ena", "sra"], case_sensitive=False),
257
+ help="Provider to query first, falls back to the other",
258
+ )
259
+ @click.option(
260
+ "--only-provider",
261
+ is_flag=True,
262
+ help="Only query the given --provider, skip fallback",
263
+ )
247
264
  @click.option(
248
265
  "--outdir", "-o", default="./", show_default=True, help="Directory to write output"
249
266
  )
@@ -316,6 +333,8 @@ def parse_query(q, accession_limit, exact_taxon=False):
316
333
  def search(
317
334
  query,
318
335
  exact_taxon,
336
+ provider,
337
+ only_provider,
319
338
  outdir,
320
339
  prefix,
321
340
  limit,
@@ -379,12 +398,31 @@ def search(
379
398
  accessions_file = f"{outdir}/{prefix}-accessions.txt".replace("//", "/")
380
399
  filtered_file = f"{outdir}/{prefix}-filtered.txt".replace("//", "/")
381
400
  summary_file = f"{outdir}/{prefix}-search.txt".replace("//", "/")
401
+
402
+ if not force:
403
+ existing = [
404
+ f
405
+ for f in [metadata_file, accessions_file, filtered_file, summary_file]
406
+ if Path(f).exists()
407
+ ]
408
+ if existing:
409
+ logging.error(
410
+ f"Output files already exist: {', '.join(existing)}. "
411
+ "Use --force to overwrite."
412
+ )
413
+ sys.exit(1)
414
+
382
415
  genome_sizes = get_ncbi_genome_size() if use_ncbi_genome_size else None
383
416
  for query_type, ena_query, sra_query in queries:
384
417
  logging.info(f"Submitting query (type - {query_type})")
385
418
  is_accession = True if query_type.endswith("accession") else False
386
- success, query_results = get_run_info(
387
- sra_query, ena_query, is_accession, limit=limit
419
+ success, query_results, source = get_run_info(
420
+ sra_query,
421
+ ena_query,
422
+ is_accession,
423
+ limit=limit,
424
+ provider=provider,
425
+ only_provider=only_provider,
388
426
  )
389
427
  results += query_results
390
428
  if success:
@@ -395,8 +433,35 @@ def search(
395
433
  genome_size=genome_size,
396
434
  genome_sizes=genome_sizes,
397
435
  )
436
+
437
+ # Fallback: provider returned results but none passed filtering
438
+ if not query_accessions and not only_provider:
439
+ fallback = "sra" if source == "ena" else "ena"
440
+ logging.info(
441
+ f"Accession found on {source.upper()}, but missing "
442
+ f"metadata, checking {fallback.upper()}..."
443
+ )
444
+ fb_success, fb_results, fb_source = get_run_info(
445
+ sra_query,
446
+ ena_query,
447
+ is_accession,
448
+ limit=limit,
449
+ provider=fallback,
450
+ only_provider=True,
451
+ )
452
+ if fb_success:
453
+ results += fb_results
454
+ source = fb_source
455
+ query_accessions, query_filtered = parse_accessions(
456
+ fb_results,
457
+ min_read_length=min_read_length,
458
+ min_base_count=min_base_count,
459
+ genome_size=genome_size,
460
+ genome_sizes=genome_sizes,
461
+ )
462
+
463
+ WARNING_MESSAGE = None
398
464
  if len(query_accessions):
399
- WARNING_MESSAGE = None
400
465
  if query_type == "biosample" and biosample_subset > 0:
401
466
  if len(query_accessions) > biosample_subset:
402
467
  WARNING_MESSAGE = f"WARNING: Selected {biosample_subset} Experiment accession(s) from a total of {len(query_accessions)}"
@@ -404,20 +469,19 @@ def search(
404
469
  query_accessions, biosample_subset
405
470
  )
406
471
  accessions = list(set(accessions + query_accessions))
407
- filtered["min_base_count"] += query_filtered["min_base_count"]
408
- filtered["min_read_length"] += query_filtered["min_read_length"]
409
- filtered["technical"] += query_filtered["technical"]
410
- for filtered_sample in query_filtered["filtered"]:
411
- filtered["filtered"][filtered_sample["accession"]] = (
412
- filtered_sample["reason"]
413
- )
414
472
  else:
415
473
  if query_results:
416
- WARNING_MESSAGE = f"WARNING: {query} did not return any Illumina or Ont results from ENA."
474
+ WARNING_MESSAGE = f"WARNING: {query} did not return any Illumina or Ont results from {source.upper()}."
417
475
  else:
418
- WARNING_MESSAGE = (
419
- f"WARNING: {query} did not return any results from ENA."
420
- )
476
+ WARNING_MESSAGE = f"WARNING: {query} did not return any results from {source.upper()}."
477
+
478
+ filtered["min_base_count"] += query_filtered["min_base_count"]
479
+ filtered["min_read_length"] += query_filtered["min_read_length"]
480
+ filtered["technical"] += query_filtered["technical"]
481
+ for filtered_sample in query_filtered["filtered"]:
482
+ filtered["filtered"][filtered_sample["accession"]] = filtered_sample[
483
+ "reason"
484
+ ]
421
485
 
422
486
  # Create Summary
423
487
  query_string = query
@@ -435,6 +499,7 @@ def search(
435
499
  summary.append(
436
500
  f"DATE: {datetime.datetime.now().replace(microsecond=0).isoformat()}"
437
501
  )
502
+ summary.append(f"PROVIDER: {source.upper()}")
438
503
  summary.append(f"LIMIT: {limit}")
439
504
  summary.append(f"RESULTS: {len(results)} ({metadata_file})")
440
505
  summary.append(
@@ -462,6 +527,10 @@ def search(
462
527
  else:
463
528
  logging.error(f"ERROR: Unable to retrieve metadata for query ({query})")
464
529
 
530
+ if not results:
531
+ logging.error("No results found, skipping output files.")
532
+ sys.exit(1)
533
+
465
534
  # Output the results
466
535
  logging.info(f"Writing results to {metadata_file}")
467
536
  with open(metadata_file, "w") as output_fh:
@@ -0,0 +1,118 @@
1
+ import logging
2
+
3
+ import requests
4
+
5
+ ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
6
+
7
+
8
+ def get_ena_metadata(query: str, is_accession: bool, limit: int):
9
+ """Fetch metadata from ENA.
10
+ https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
11
+
12
+ Args:
13
+ query (str): The query to search for.
14
+ is_accession (bool): If the query is an accession or not.
15
+ limit (int): The maximum number of records to return.
16
+
17
+ Returns:
18
+ list: Records associated with the accession.
19
+ """
20
+ data = {
21
+ "dataPortal": "ena",
22
+ "dccDataOnly": "false",
23
+ "download": "false",
24
+ "result": "read_run",
25
+ "format": "tsv",
26
+ "limit": limit,
27
+ "fields": "all",
28
+ }
29
+
30
+ if is_accession:
31
+ data["includeAccessions"] = query
32
+ else:
33
+ data["query"] = (
34
+ f'"{query} AND library_source=GENOMIC AND '
35
+ "(library_strategy=OTHER OR library_strategy=WGS OR "
36
+ "library_strategy=WGA) AND (library_selection=MNase OR "
37
+ "library_selection=RANDOM OR library_selection=unspecified OR "
38
+ 'library_selection="size fractionation")"'
39
+ )
40
+
41
+ headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
42
+
43
+ r = requests.post(ENA_URL, headers=headers, data=data)
44
+ if r.status_code == requests.codes.ok:
45
+ data = []
46
+ col_names = None
47
+ for line in r.text.split("\n"):
48
+ cols = line.split("\t")
49
+ if line:
50
+ if col_names:
51
+ data.append(dict(zip(col_names, cols)))
52
+ else:
53
+ col_names = cols
54
+ return [True, data]
55
+ else:
56
+ return [False, [r.status_code, r.text]]
57
+
58
+
59
+ def get_run_info(
60
+ sra_query: str,
61
+ ena_query: str,
62
+ is_accession: bool,
63
+ limit: int = 1000000,
64
+ provider: str = "ena",
65
+ only_provider: bool = False,
66
+ ) -> tuple:
67
+ """Retrieve a list of samples available from ENA and/or SRA.
68
+
69
+ By default, the provider is queried first and the other is used as fallback. When
70
+ only_provider is True, no fallback is attempted.
71
+
72
+ Args:
73
+ sra_query: A formatted query for SRA searches.
74
+ ena_query: A formatted query for ENA searches.
75
+ is_accession: If the query is an accession or not.
76
+ limit: The maximum number of records to return.
77
+ provider: Which provider to query first ("ena" or "sra").
78
+ only_provider: If True, skip fallback to the other provider.
79
+
80
+ Returns:
81
+ tuple: (success, data, source) where source is "ena", "sra", or "none".
82
+ """
83
+ from bactopia.databases.sra import get_sra_metadata
84
+
85
+ fallback = "sra" if provider == "ena" else "ena"
86
+
87
+ def _query_ena():
88
+ logging.debug("Querying ENA for metadata...")
89
+ success, data = get_ena_metadata(ena_query, is_accession, limit=limit)
90
+ if success and data:
91
+ return True, data
92
+ if not success:
93
+ logging.warning(f"ENA query failed (status {data[0]}).")
94
+ else:
95
+ logging.debug("ENA query returned no results.")
96
+ return False, []
97
+
98
+ def _query_sra():
99
+ logging.debug("Querying SRA for metadata...")
100
+ return get_sra_metadata(sra_query, is_accession, limit=limit)
101
+
102
+ query_fn = {"ena": _query_ena, "sra": _query_sra}
103
+
104
+ success, data = query_fn[provider]()
105
+ if success:
106
+ return True, data, provider
107
+
108
+ if only_provider:
109
+ logging.error(f"{provider.upper()} returned no results (--only-provider).")
110
+ return False, [], "none"
111
+
112
+ logging.info(f"No results from {provider.upper()}, checking {fallback.upper()}...")
113
+ success, data = query_fn[fallback]()
114
+ if success:
115
+ return True, data, fallback
116
+
117
+ logging.error("Both ENA and SRA returned no results.")
118
+ return False, [], "none"
@@ -0,0 +1,103 @@
1
+ import logging
2
+
3
+ from pysradb.sraweb import SRAweb
4
+
5
+ INSTRUMENT_PLATFORM_MAP = {
6
+ "illumina": "ILLUMINA",
7
+ "nextseq": "ILLUMINA",
8
+ "hiseq": "ILLUMINA",
9
+ "miseq": "ILLUMINA",
10
+ "novaseq": "ILLUMINA",
11
+ "miniseq": "ILLUMINA",
12
+ "genome analyzer": "ILLUMINA",
13
+ "minion": "OXFORD_NANOPORE",
14
+ "gridion": "OXFORD_NANOPORE",
15
+ "promethion": "OXFORD_NANOPORE",
16
+ "nanopore": "OXFORD_NANOPORE",
17
+ "pacbio": "PACBIO_SMRT",
18
+ "sequel": "PACBIO_SMRT",
19
+ "revio": "PACBIO_SMRT",
20
+ "ion torrent": "ION_TORRENT",
21
+ }
22
+
23
+ SRA_TO_ENA_FIELDS = {
24
+ "run_total_bases": "base_count",
25
+ "run_total_spots": "read_count",
26
+ "organism_taxid": "tax_id",
27
+ "organism_name": "scientific_name",
28
+ }
29
+
30
+
31
+ def instrument_to_platform(instrument: str) -> str:
32
+ """Map an SRA instrument model name to the ENA platform constant.
33
+
34
+ Args:
35
+ instrument: Instrument model name from SRA (e.g. "Illumina MiniSeq").
36
+
37
+ Returns:
38
+ str: Platform constant (e.g. "ILLUMINA") or the original value if unknown.
39
+ """
40
+ lower = instrument.lower()
41
+ for key, platform in INSTRUMENT_PLATFORM_MAP.items():
42
+ if key in lower:
43
+ return platform
44
+ return instrument
45
+
46
+
47
+ def normalize_sra_fields(records: list[dict]) -> list[dict]:
48
+ """Rename SRA fields to their ENA equivalents so parse_accessions() works unchanged.
49
+
50
+ Args:
51
+ records: List of dicts from pysradb search results.
52
+
53
+ Returns:
54
+ list[dict]: Records with critical fields renamed in place.
55
+ """
56
+ for record in records:
57
+ for sra_field, ena_field in SRA_TO_ENA_FIELDS.items():
58
+ if sra_field in record:
59
+ record[ena_field] = record.pop(sra_field)
60
+
61
+ if "instrument" in record and "instrument_model_desc" not in record:
62
+ record["instrument_model_desc"] = instrument_to_platform(
63
+ record["instrument"]
64
+ )
65
+
66
+ layout = record.get("library_layout", "SINGLE").upper()
67
+ record["fastq_bytes"] = "0;0" if layout == "PAIRED" else "0"
68
+
69
+ return records
70
+
71
+
72
+ def get_sra_metadata(query: str, is_accession: bool, limit: int) -> list:
73
+ """Fetch metadata from SRA via pysradb.
74
+
75
+ Args:
76
+ query: The query to search for (accession or NCBI query string).
77
+ is_accession: If the query is an accession or not.
78
+ limit: The maximum number of records to return.
79
+
80
+ Returns:
81
+ list: [success: bool, data: list[dict]]
82
+ """
83
+ try:
84
+ db = SRAweb()
85
+ df = db.search_sra(
86
+ query,
87
+ detailed=True,
88
+ sample_attribute=True,
89
+ expand_sample_attributes=True,
90
+ )
91
+ if df is None or df.empty:
92
+ logging.debug(f"SRA query returned no results for: {query}")
93
+ return [False, []]
94
+
95
+ if len(df) > limit:
96
+ logging.debug(f"SRA returned {len(df)} results, truncating to {limit}")
97
+ df = df.head(limit)
98
+
99
+ records = df.to_dict(orient="records")
100
+ return [True, normalize_sra_fields(records)]
101
+ except Exception as e:
102
+ logging.error(f"Error querying SRA: {e}")
103
+ return [False, []]
@@ -61,7 +61,7 @@ _NEXTFLOW_INFORMATIONAL_RE = re.compile(
61
61
  re.IGNORECASE,
62
62
  )
63
63
 
64
- # nextflow.config: nextflowVersion = '>=25.04.6'
64
+ # nextflow.config: nextflowVersion = '>=26.04.0'
65
65
  _NEXTFLOW_CONFIG_RE = re.compile(
66
66
  r"nextflowVersion\s*=\s*['\"][^\d]*(\d+\.\d+(?:\.\d+)?)"
67
67
  )
@@ -28,18 +28,20 @@ def parse(path: str, name: str) -> dict:
28
28
  # Single end
29
29
  r1 = Path(path)
30
30
  else:
31
- r1_path = None
32
- r2_path = None
33
- if "original" in path:
34
- r1_path = path.replace("-original.json", "_R1-original.json")
35
- r2_path = path.replace("-original.json", "_R2-original.json")
36
- else:
37
- r1_path = path.replace("-final.json", "_R1-final.json")
38
- r2_path = path.replace("-final.json", "_R2-final.json")
31
+ suffix = "-original.json" if "original" in path else "-final.json"
32
+
33
+ r1_path = path.replace(suffix, f"_R1{suffix}")
34
+ r2_path = path.replace(suffix, f"_R2{suffix}")
39
35
 
40
36
  if Path(r1_path).exists() and Path(r2_path).exists():
41
37
  r1 = Path(r1_path)
42
38
  r2 = Path(r2_path)
39
+ else:
40
+ for tag in ("_SE", "_ONT"):
41
+ alt_path = path.replace(suffix, f"{tag}{suffix}")
42
+ if Path(alt_path).exists():
43
+ r1 = Path(alt_path)
44
+ break
43
45
 
44
46
  final_results = {"sample": name}
45
47
  if r1:
@@ -79,6 +79,6 @@ Key module categories:
79
79
  - [README.md](README.md): Project overview and quick start
80
80
  - [CONTRIBUTING.md](CONTRIBUTING.md): Human contributor guide
81
81
  - [CHANGELOG.md](CHANGELOG.md): Version history
82
- - Full docs: https://bactopia.github.io/
82
+ - Full docs: https://bactopia.io/
83
83
 
84
84
  <!-- Auto-generated by bactopia-catalog. Do not edit directly; edit the template at bactopia-py/bactopia/templates/bactopia/llms.txt.j2. -->
@@ -5,7 +5,7 @@ manifest {
5
5
  homePage = 'https://github.com/bactopia/bactopia'
6
6
  description = 'An extensive workflow for processing sequencing of bacterial genomes.'
7
7
  mainScript = 'main.nf'
8
- version = '4.0.0'
8
+ version = '4.0.1'
9
9
  nextflowVersion = '>=26.04.0'
10
10
  }
11
11
 
@@ -21,7 +21,7 @@ params {
21
21
  }
22
22
 
23
23
  // Version
24
- params.bactopia_version = '4.0.0'
24
+ params.bactopia_version = '4.0.1'
25
25
  manifest.version = "${params.bactopia_version}"
26
26
 
27
27
  // Includes
@@ -93,7 +93,7 @@ dag {
93
93
 
94
94
  // Plugins
95
95
  plugins {
96
- id 'nf-bactopia@2.1.0'
96
+ id 'nf-bactopia@2.1.5'
97
97
  }
98
98
 
99
99
  bactopia {
@@ -10,7 +10,7 @@ params {
10
10
  ext = "fna"
11
11
  }
12
12
 
13
- bactopia_version = '4.0.0'
13
+ bactopia_version = '4.0.1'
14
14
  bactopia_cache = System.getenv("BACTOPIA_CACHEDIR") ?: "${System.getenv('HOME')}/.bactopia"
15
15
  condadir = "${params.bactopia_cache}/conda"
16
16
  wf = params.workflow.name
@@ -9,7 +9,7 @@ params {
9
9
  description = "{{ description }}"
10
10
  ext = "fna"
11
11
  }
12
- bactopia_version = '4.0.0'
12
+ bactopia_version = '4.0.1'
13
13
  bactopia_cache = System.getenv("BACTOPIA_CACHEDIR") ?: "${System.getenv('HOME')}/.bactopia"
14
14
  condadir = "${params.bactopia_cache}/conda"
15
15
  wf = params.workflow.name
@@ -38,5 +38,5 @@ includeConfig "../../../conf/profiles.config"
38
38
 
39
39
  // Plugin
40
40
  plugins {
41
- id 'nf-bactopia@2.0.2'
41
+ id 'nf-bactopia@2.1.5'
42
42
  }
@@ -5,8 +5,8 @@ manifest {
5
5
  homePage = 'https://github.com/bactopia/bactopia'
6
6
  description = 'An extensive workflow for processing sequencing of bacterial genomes.'
7
7
  mainScript = 'main.nf'
8
- version = '4.0.0'
9
- nextflowVersion = '>=25.04.6'
8
+ version = '4.0.1'
9
+ nextflowVersion = '>=26.04.0'
10
10
  }
11
11
 
12
12
  params {
@@ -19,7 +19,7 @@ params {
19
19
  }
20
20
 
21
21
  // Version
22
- params.bactopia_version = '4.0.0'
22
+ params.bactopia_version = '4.0.1'
23
23
  manifest.version = "${params.bactopia_version}"
24
24
 
25
25
  // Includes
@@ -85,7 +85,7 @@ dag {
85
85
 
86
86
  // Plugins
87
87
  plugins {
88
- id 'nf-bactopia@2.0.2'
88
+ id 'nf-bactopia@2.1.5'
89
89
  }
90
90
 
91
91
  bactopia {
@@ -1,13 +1,13 @@
1
1
  [tool.poetry]
2
2
  name = "bactopia"
3
- version = "2.1.5"
3
+ version = "2.2.0"
4
4
  description = "A Python package for working with Bactopia"
5
5
  authors = [
6
6
  "Robert A. Petit III <robbie.petit@gmail.com>",
7
7
  ]
8
8
  license = "MIT"
9
9
  readme = "README.md"
10
- homepage = "https://bactopia.github.io/"
10
+ homepage = "https://bactopia.io/"
11
11
  repository = "https://github.com/bactopia/bactopia-py"
12
12
  keywords = ["bioinformatics", "bacteria", "bactopia", "SRA", "ENA"]
13
13
 
@@ -57,6 +57,7 @@ pyyaml = ">=6.0"
57
57
  biopython = ">=1.80"
58
58
  openpyxl = ">=3.1.0"
59
59
  psutil = ">=5.9.0"
60
+ pysradb = ">=2.2.0"
60
61
 
61
62
  [tool.poetry.group.dev.dependencies]
62
63
  ruff = "^0.9"
@@ -1,86 +0,0 @@
1
- import logging
2
- import sys
3
-
4
- import requests
5
-
6
- ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
7
-
8
-
9
- def get_ena_metadata(query: str, is_accession: bool, limit: int):
10
- """Fetch metadata from ENA.
11
- https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
12
-
13
- Args:
14
- query (str): The query to search for.
15
- is_accession (bool): If the query is an accession or not.
16
- limit (int): The maximum number of records to return.
17
-
18
- Returns:
19
- list: Records associated with the accession.
20
- """
21
- data = {
22
- "dataPortal": "ena",
23
- "dccDataOnly": "false",
24
- "download": "false",
25
- "result": "read_run",
26
- "format": "tsv",
27
- "limit": limit,
28
- "fields": "all",
29
- }
30
-
31
- if is_accession:
32
- data["includeAccessions"] = query
33
- else:
34
- data["query"] = (
35
- f'"{query} AND library_source=GENOMIC AND '
36
- "(library_strategy=OTHER OR library_strategy=WGS OR "
37
- "library_strategy=WGA) AND (library_selection=MNase OR "
38
- "library_selection=RANDOM OR library_selection=unspecified OR "
39
- 'library_selection="size fractionation")"'
40
- )
41
-
42
- headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
43
-
44
- r = requests.post(ENA_URL, headers=headers, data=data)
45
- if r.status_code == requests.codes.ok:
46
- data = []
47
- col_names = None
48
- for line in r.text.split("\n"):
49
- cols = line.split("\t")
50
- if line:
51
- if col_names:
52
- data.append(dict(zip(col_names, cols)))
53
- else:
54
- col_names = cols
55
- return [True, data]
56
- else:
57
- return [False, [r.status_code, r.text]]
58
-
59
-
60
- def get_run_info(
61
- sra_query: str, ena_query: str, is_accession: bool, limit: int = 1000000
62
- ) -> tuple:
63
- """Retrieve a list of samples available from ENA.
64
-
65
- The first attempt will be against ENA, and if that fails, SRA will be queried. This should
66
- capture those samples not yet synced between ENA and SRA.
67
-
68
- Args:
69
- sra_query (str): A formatted query for SRA searches.
70
- ena_query (str): A formatted query for ENA searches.
71
- is_accession (bool): If the query is an accession or not.
72
- limit (int): The maximum number of records to return.
73
-
74
- Returns:
75
- tuple: Records associated with the accession.
76
- """
77
-
78
- logging.debug("Querying ENA for metadata...")
79
- success, ena_data = get_ena_metadata(ena_query, is_accession, limit=limit)
80
- if success:
81
- return success, ena_data
82
- else:
83
- logging.error("There was an issue querying ENA, exiting...")
84
- logging.error(f"STATUS: {ena_data[0]}")
85
- logging.error(f"TEXT: {ena_data[1]}")
86
- sys.exit(1)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes