umierrorcorrect2 0.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. umierrorcorrect2-0.30.0/.gitignore +227 -0
  2. umierrorcorrect2-0.30.0/LICENSE.txt +8 -0
  3. umierrorcorrect2-0.30.0/PKG-INFO +104 -0
  4. umierrorcorrect2-0.30.0/README.md +68 -0
  5. umierrorcorrect2-0.30.0/pyproject.toml +164 -0
  6. umierrorcorrect2-0.30.0/umierrorcorrect/__init__.py +0 -0
  7. umierrorcorrect2-0.30.0/umierrorcorrect/align.py +104 -0
  8. umierrorcorrect2-0.30.0/umierrorcorrect/batch.py +517 -0
  9. umierrorcorrect2-0.30.0/umierrorcorrect/call_variants.py +165 -0
  10. umierrorcorrect2-0.30.0/umierrorcorrect/cli.py +610 -0
  11. umierrorcorrect2-0.30.0/umierrorcorrect/core/__init__.py +0 -0
  12. umierrorcorrect2-0.30.0/umierrorcorrect/core/check_args.py +105 -0
  13. umierrorcorrect2-0.30.0/umierrorcorrect/core/consensus.py +749 -0
  14. umierrorcorrect2-0.30.0/umierrorcorrect/core/constants.py +128 -0
  15. umierrorcorrect2-0.30.0/umierrorcorrect/core/filter.py +39 -0
  16. umierrorcorrect2-0.30.0/umierrorcorrect/core/fit_background_model.py +90 -0
  17. umierrorcorrect2-0.30.0/umierrorcorrect/core/get_cons_info.py +245 -0
  18. umierrorcorrect2-0.30.0/umierrorcorrect/core/get_regions_from_bed.py +130 -0
  19. umierrorcorrect2-0.30.0/umierrorcorrect/core/group.py +153 -0
  20. umierrorcorrect2-0.30.0/umierrorcorrect/core/logging_config.py +150 -0
  21. umierrorcorrect2-0.30.0/umierrorcorrect/core/read_fastq_records.py +43 -0
  22. umierrorcorrect2-0.30.0/umierrorcorrect/core/umi_cluster.py +223 -0
  23. umierrorcorrect2-0.30.0/umierrorcorrect/core/utils.py +135 -0
  24. umierrorcorrect2-0.30.0/umierrorcorrect/downsampling.py +158 -0
  25. umierrorcorrect2-0.30.0/umierrorcorrect/get_consensus_statistics.py +267 -0
  26. umierrorcorrect2-0.30.0/umierrorcorrect/models/__init__.py +0 -0
  27. umierrorcorrect2-0.30.0/umierrorcorrect/models/models.py +515 -0
  28. umierrorcorrect2-0.30.0/umierrorcorrect/pipeline.py +142 -0
  29. umierrorcorrect2-0.30.0/umierrorcorrect/preprocess.py +536 -0
  30. umierrorcorrect2-0.30.0/umierrorcorrect/qc.py +78 -0
  31. umierrorcorrect2-0.30.0/umierrorcorrect/umi_error_correct.py +726 -0
  32. umierrorcorrect2-0.30.0/umierrorcorrect/version.py +1 -0
@@ -0,0 +1,227 @@
1
+ *.pyc
2
+ .DS_Store
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[codz]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ *.sam
13
+ results*/
14
+ .vscode/
15
+ .claude/
16
+ CLAUDE.md
17
+ test_data/
18
+
19
+ # Distribution / packaging
20
+ .Python
21
+ build/
22
+ develop-eggs/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py.cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+ cover/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # Sphinx documentation
82
+ docs/_build/
83
+
84
+ # PyBuilder
85
+ .pybuilder/
86
+ target/
87
+
88
+ # Jupyter Notebook
89
+ .ipynb_checkpoints
90
+
91
+ # IPython
92
+ profile_default/
93
+ ipython_config.py
94
+
95
+ # pyenv
96
+ # For a library or package, you might want to ignore these files since the code is
97
+ # intended to run in multiple environments; otherwise, check them in:
98
+ # .python-version
99
+
100
+ # pipenv
101
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
103
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
104
+ # install all needed dependencies.
105
+ # Pipfile.lock
106
+
107
+ # UV
108
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
109
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
110
+ # commonly ignored for libraries.
111
+ # uv.lock
112
+
113
+ # poetry
114
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
115
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
116
+ # commonly ignored for libraries.
117
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
118
+ # poetry.lock
119
+ # poetry.toml
120
+
121
+ # pdm
122
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
123
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
124
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
125
+ # pdm.lock
126
+ # pdm.toml
127
+ .pdm-python
128
+ .pdm-build/
129
+
130
+ # pixi
131
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
132
+ # pixi.lock
133
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
134
+ # in the .venv directory. It is recommended not to include this directory in version control.
135
+ .pixi
136
+
137
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
138
+ __pypackages__/
139
+
140
+ # Celery stuff
141
+ celerybeat-schedule
142
+ celerybeat.pid
143
+
144
+ # Redis
145
+ *.rdb
146
+ *.aof
147
+ *.pid
148
+
149
+ # RabbitMQ
150
+ mnesia/
151
+ rabbitmq/
152
+ rabbitmq-data/
153
+
154
+ # ActiveMQ
155
+ activemq-data/
156
+
157
+ # SageMath parsed files
158
+ *.sage.py
159
+
160
+ # Environments
161
+ .env
162
+ .envrc
163
+ .venv
164
+ env/
165
+ venv/
166
+ ENV/
167
+ env.bak/
168
+ venv.bak/
169
+
170
+ # Spyder project settings
171
+ .spyderproject
172
+ .spyproject
173
+
174
+ # Rope project settings
175
+ .ropeproject
176
+
177
+ # mkdocs documentation
178
+ /site
179
+
180
+ # mypy
181
+ .mypy_cache/
182
+ .dmypy.json
183
+ dmypy.json
184
+
185
+ # Pyre type checker
186
+ .pyre/
187
+
188
+ # pytype static type analyzer
189
+ .pytype/
190
+
191
+ # Cython debug symbols
192
+ cython_debug/
193
+
194
+ # PyCharm
195
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
196
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
197
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
198
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
199
+ # .idea/
200
+
201
+ # Abstra
202
+ # Abstra is an AI-powered process automation framework.
203
+ # Ignore directories containing user credentials, local state, and settings.
204
+ # Learn more at https://abstra.io/docs
205
+ .abstra/
206
+
207
+ # Visual Studio Code
208
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
209
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
210
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
211
+ # you could uncomment the following to ignore the entire vscode folder
212
+ # .vscode/
213
+
214
+ # Ruff stuff:
215
+ .ruff_cache/
216
+
217
+ # PyPI configuration file
218
+ .pypirc
219
+
220
+ # Marimo
221
+ marimo/_static/
222
+ marimo/_lsp/
223
+ __marimo__/
224
+
225
+ # Streamlit
226
+ .streamlit/secrets.toml
227
+ *.fastq.gz
@@ -0,0 +1,8 @@
1
+ Copyright (c) 2019-2023 Tobias Osterlund
2
+ Copyright (c) 2026 Stefan Filges
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5
+
6
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7
+
8
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: umierrorcorrect2
3
+ Version: 0.30.0
4
+ Summary: Pipeline for analyzing barcoded amplicon sequencing data with Unique Molecular Identifiers (UMI)
5
+ Project-URL: Homepage, https://github.com/sfilges/umierrorcorrect
6
+ Project-URL: Documentation, https://github.com/sfilges/umierrorcorrect/wiki
7
+ Project-URL: Repository, https://github.com/sfilges/umierrorcorrect
8
+ Author-email: Stefan Filges <stefan.filges@pm.me>, Tobias Osterlund <tobias.osterlund@gu.se>
9
+ License-Expression: MIT
10
+ License-File: LICENSE.txt
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: loguru>=0.7.0
19
+ Requires-Dist: matplotlib
20
+ Requires-Dist: pydantic>=2.0.0
21
+ Requires-Dist: pysam>=0.8.4
22
+ Requires-Dist: scipy
23
+ Requires-Dist: typer[all]>=0.9.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: mypy; extra == 'dev'
26
+ Requires-Dist: pre-commit; extra == 'dev'
27
+ Requires-Dist: pytest; extra == 'dev'
28
+ Requires-Dist: pytest-cov; extra == 'dev'
29
+ Requires-Dist: ruff; extra == 'dev'
30
+ Provides-Extra: docs
31
+ Requires-Dist: sphinx; extra == 'docs'
32
+ Requires-Dist: sphinx-rtd-theme; extra == 'docs'
33
+ Provides-Extra: fast
34
+ Requires-Dist: numba>=0.57.0; extra == 'fast'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # UMIErrorCorrect2
38
+
39
+ [![PyPI version](https://badge.fury.io/py/umierrorcorrect2.svg)](https://badge.fury.io/py/umierrorcorrect2)
40
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
41
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
42
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
43
+
44
+ A modern, high-performance pipeline for analyzing barcoded amplicon sequencing data with Unique Molecular Identifiers (UMI).
45
+
46
+ This package is a **complete modernization** of the original [UMIErrorCorrect](https://github.com/stahlberggroup/umierrorcorrect) published in *Clinical Chemistry* (2022).
47
+
48
+ ## Key Features
49
+
50
+ - **High Performance**: Parallel processing of genomic regions and fastp-based preprocessing.
51
+ - **Modern Tooling**: Built with `typer`, `pydantic`, `loguru`, and `hatch`.
52
+ - **Easy Installation**: Fully PEP 621 compliant, installable via `pip` or `uv`.
53
+ - **Comprehensive**: From raw FASTQ to error-corrected VCFs and consensus statistics.
54
+ - **Robust**: Extensive test suite and type safety.
55
+
56
+ ## Dependencies
57
+
58
+ - `fastp` for preprocessing
59
+ - `bwa` for alignment
60
+
61
+ ## Installation
62
+
63
+ Use [uv](https://github.com/astral-sh/uv) for lightning-fast installation:
64
+
65
+ ```bash
66
+ uv pip install umierrorcorrect2
67
+ ```
68
+
69
+ Or standard pip:
70
+
71
+ ```bash
72
+ pip install umierrorcorrect2
73
+ ```
74
+
75
+ ## Quick Start
76
+
77
+ The command-line tool is named `umierrorcorrect`. Run the full pipeline on a single sample:
78
+
79
+ ```bash
80
+ umierrorcorrect batch \
81
+ -r1 sample_R1.fastq.gz \
82
+ -r2 sample_R2.fastq.gz \
83
+ -r hg38.fa \
84
+ -o results/ \
85
+ -ul 12 \
86
+ -sl 16 \
87
+ --fastp
88
+ ```
89
+
90
+ For detailed instructions, see the **[User Guide](docs/USER_GUIDE.md)** or run:
91
+
92
+ ```bash
93
+ umierrorcorrect --help
94
+ ```
95
+
96
+ ## Documentation
97
+
98
+ - [User Guide](docs/USER_GUIDE.md): Detailed usage instructions for all commands.
99
+ - [Implementation Details](docs/IMPLEMENTATION.md): Architecture and design overview.
100
+ - [Docker Guide](docs/docker.md): Running with containers.
101
+
102
+ ## Citation
103
+
104
+ > Osterlund T., Filges S., Johansson G., Stahlberg A. *UMIErrorCorrect and UMIAnalyzer: Software for Consensus Read Generation, Error Correction, and Visualization Using Unique Molecular Identifiers*, Clinical Chemistry, 2022. [doi:10.1093/clinchem/hvac136](https://doi.org/10.1093/clinchem/hvac136)
@@ -0,0 +1,68 @@
1
+ # UMIErrorCorrect2
2
+
3
+ [![PyPI version](https://badge.fury.io/py/umierrorcorrect2.svg)](https://badge.fury.io/py/umierrorcorrect2)
4
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
7
+
8
+ A modern, high-performance pipeline for analyzing barcoded amplicon sequencing data with Unique Molecular Identifiers (UMI).
9
+
10
+ This package is a **complete modernization** of the original [UMIErrorCorrect](https://github.com/stahlberggroup/umierrorcorrect) published in *Clinical Chemistry* (2022).
11
+
12
+ ## Key Features
13
+
14
+ - **High Performance**: Parallel processing of genomic regions and fastp-based preprocessing.
15
+ - **Modern Tooling**: Built with `typer`, `pydantic`, `loguru`, and `hatch`.
16
+ - **Easy Installation**: Fully PEP 621 compliant, installable via `pip` or `uv`.
17
+ - **Comprehensive**: From raw FASTQ to error-corrected VCFs and consensus statistics.
18
+ - **Robust**: Extensive test suite and type safety.
19
+
20
+ ## Dependencies
21
+
22
+ - `fastp` for preprocessing
23
+ - `bwa` for alignment
24
+
25
+ ## Installation
26
+
27
+ Use [uv](https://github.com/astral-sh/uv) for lightning-fast installation:
28
+
29
+ ```bash
30
+ uv pip install umierrorcorrect2
31
+ ```
32
+
33
+ Or standard pip:
34
+
35
+ ```bash
36
+ pip install umierrorcorrect2
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ The command-line tool is named `umierrorcorrect`. Run the full pipeline on a single sample:
42
+
43
+ ```bash
44
+ umierrorcorrect batch \
45
+ -r1 sample_R1.fastq.gz \
46
+ -r2 sample_R2.fastq.gz \
47
+ -r hg38.fa \
48
+ -o results/ \
49
+ -ul 12 \
50
+ -sl 16 \
51
+ --fastp
52
+ ```
53
+
54
+ For detailed instructions, see the **[User Guide](docs/USER_GUIDE.md)** or run:
55
+
56
+ ```bash
57
+ umierrorcorrect --help
58
+ ```
59
+
60
+ ## Documentation
61
+
62
+ - [User Guide](docs/USER_GUIDE.md): Detailed usage instructions for all commands.
63
+ - [Implementation Details](docs/IMPLEMENTATION.md): Architecture and design overview.
64
+ - [Docker Guide](docs/docker.md): Running with containers.
65
+
66
+ ## Citation
67
+
68
+ > Osterlund T., Filges S., Johansson G., Stahlberg A. *UMIErrorCorrect and UMIAnalyzer: Software for Consensus Read Generation, Error Correction, and Visualization Using Unique Molecular Identifiers*, Clinical Chemistry, 2022. [doi:10.1093/clinchem/hvac136](https://doi.org/10.1093/clinchem/hvac136)
@@ -0,0 +1,164 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+
7
+
8
+ name = "umierrorcorrect2"
9
+ dynamic = ["version"]
10
+ description = "Pipeline for analyzing barcoded amplicon sequencing data with Unique Molecular Identifiers (UMI)"
11
+ readme = "README.md"
12
+ license = "MIT"
13
+ requires-python = ">=3.10"
14
+ authors = [
15
+ { name = "Stefan Filges", email = "stefan.filges@pm.me" },
16
+ { name = "Tobias Osterlund", email = "tobias.osterlund@gu.se" },
17
+ ]
18
+ classifiers = [
19
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ ]
26
+ dependencies = [
27
+ "pysam>=0.8.4",
28
+ "scipy",
29
+ "matplotlib",
30
+ "loguru>=0.7.0",
31
+ "typer[all]>=0.9.0",
32
+ "pydantic>=2.0.0",
33
+ ]
34
+ [project.optional-dependencies]
35
+ dev = ["ruff", "pytest", "pytest-cov", "mypy", "pre-commit"]
36
+ docs = ["sphinx", "sphinx-rtd-theme"]
37
+ fast = ["numba>=0.57.0"]
38
+
39
+ [project.scripts]
40
+ umierrorcorrect = "umierrorcorrect.cli:main_cli"
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/sfilges/umierrorcorrect"
44
+ Documentation = "https://github.com/sfilges/umierrorcorrect/wiki"
45
+ Repository = "https://github.com/sfilges/umierrorcorrect"
46
+
47
+ [tool.hatch.version]
48
+ path = "umierrorcorrect/version.py"
49
+
50
+ [tool.hatch.build.targets.sdist]
51
+ include = ["/umierrorcorrect", "/test_data", "/doc"]
52
+
53
+ [tool.hatch.build.targets.wheel]
54
+ packages = ["umierrorcorrect"]
55
+
56
+ # Ruff configuration
57
+ [tool.ruff]
58
+ target-version = "py310"
59
+ line-length = 120
60
+ src = ["umierrorcorrect"]
61
+
62
+ [tool.ruff.lint]
63
+ select = [
64
+ "E", # pycodestyle errors
65
+ "W", # pycodestyle warnings
66
+ "F", # Pyflakes
67
+ "I", # isort
68
+ "B", # flake8-bugbear
69
+ "C4", # flake8-comprehensions
70
+ "UP", # pyupgrade
71
+ "ARG", # flake8-unused-arguments
72
+ "SIM", # flake8-simplify
73
+ "S", # flake8-bandit (security)
74
+ "PTH", # flake8-use-pathlib
75
+ ]
76
+ ignore = [
77
+ "E501", # line too long (handled by formatter)
78
+ "S101", # assert usage (ok in tests)
79
+ "S603", # subprocess calls - expected for CLI tool that runs bwa, pigz, gzip, cutadapt
80
+ "S607", # partial executable path - expected for CLI tool (e.g., ["bwa", "mem", ...])
81
+ "S301", # pickle usage - used intentionally for caching in MSA alignment
82
+ ]
83
+ fixable = ["ALL"]
84
+
85
+ [tool.ruff.lint.per-file-ignores]
86
+ "tests/*" = ["S101", "ARG", "S311"] # S311: random ok for test data
87
+ "umierrorcorrect/cli.py" = ["ARG001"] # version arg used by typer callback
88
+ "umierrorcorrect/core/consensus.py" = [
89
+ "ARG001",
90
+ "SIM108",
91
+ "SIM201",
92
+ "B007",
93
+ ] # ref arg part of function signature; ternary would reduce readability
94
+ "umierrorcorrect/core/get_cons_info.py" = [
95
+ "ARG001",
96
+ "SIM201",
97
+ "SIM108",
98
+ "PTH123",
99
+ ] # indel_freq args part of API; debug code in __main__
100
+ "umierrorcorrect/core/check_args.py" = [
101
+ "SIM102",
102
+ ] # nested if is clearer for validation errors
103
+ "umierrorcorrect/core/umi_cluster.py" = [
104
+ "PTH123",
105
+ ] # hardcoded debug paths in __main__ block
106
+ "umierrorcorrect/umi_error_correct.py" = [
107
+ "SIM108",
108
+ ] # ternary would reduce readability for complex expressions
109
+ "umierrorcorrect/preprocess.py" = ["SIM108"] # ternary would reduce readability
110
+
111
+ [tool.ruff.format]
112
+ quote-style = "double"
113
+ indent-style = "space"
114
+
115
+ # Pytest configuration
116
+ [tool.pytest.ini_options]
117
+ testpaths = ["tests"]
118
+ python_files = ["test_*.py"]
119
+ python_functions = ["test_*"]
120
+ addopts = ["-v", "--tb=short", "--strict-markers"]
121
+ markers = [
122
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
123
+ "integration: marks tests as integration tests",
124
+ "requires_bwa: marks tests that require bwa installed",
125
+ ]
126
+ filterwarnings = ["ignore::DeprecationWarning:pysam.*"]
127
+
128
+ # Coverage configuration
129
+ [tool.coverage.run]
130
+ source = ["umierrorcorrect"]
131
+ branch = true
132
+ omit = ["*/test_*.py", "*/__init__.py"]
133
+
134
+ [tool.coverage.report]
135
+ exclude_lines = [
136
+ "pragma: no cover",
137
+ "if __name__ == .__main__.:",
138
+ "raise NotImplementedError",
139
+ ]
140
+ show_missing = true
141
+
142
+ # Mypy configuration
143
+ [tool.mypy]
144
+ python_version = "3.10"
145
+ warn_return_any = false
146
+ warn_unused_configs = true
147
+ allow_redefinition = true
148
+ ignore_missing_imports = true
149
+ exclude = ["tests/", "build/"]
150
+
151
+ [[tool.mypy.overrides]]
152
+ module = "pysam.*"
153
+ ignore_missing_imports = true
154
+
155
+ [[tool.mypy.overrides]]
156
+ module = [
157
+ "umierrorcorrect.core.consensus",
158
+ "umierrorcorrect.core.umi_cluster",
159
+ "umierrorcorrect.umi_error_correct",
160
+ "umierrorcorrect.get_consensus_statistics",
161
+ "umierrorcorrect.cli",
162
+ "umierrorcorrect.core.logging_config"
163
+ ]
164
+ ignore_errors = true
File without changes
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env python3
2
+ import subprocess
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import pysam
7
+
8
+ from umierrorcorrect.core.logging_config import get_logger, log_subprocess_stderr
9
+
10
+ logger = get_logger(__name__)
11
+
12
+
13
def _cleanup_files(*files: Path) -> None:
    """Remove each of the given files, skipping any that do not exist.

    Args:
        *files: Paths of the files to delete.
    """
    for f in files:
        # missing_ok=True avoids the check-then-delete race of
        # ``if f.exists(): f.unlink()`` when another process removes the
        # file between the two calls.
        f.unlink(missing_ok=True)
18
+
19
+
20
def check_bwa_index(reference_file: str | Path) -> None:
    """Check that a BWA index exists for the reference, offering to create it.

    The index is considered present when ``<reference_file>.bwt`` exists. If
    it is missing, the user is asked on standard input whether to build it
    with ``bwa index``.

    Args:
        reference_file: Path to the reference genome FASTA file.

    Raises:
        SystemExit: With exit code 1 if the reference file does not exist,
            the user declines to build the index, standard input cannot be
            read (non-interactive run), or ``bwa index`` cannot be run or
            fails.
    """
    ref_path = Path(reference_file)
    if not ref_path.is_file():
        logger.error(f"Reference genome file {reference_file} does not exist, exiting")
        sys.exit(1)

    # "<reference>.bwt" is used as the marker for an existing index.
    if Path(str(reference_file) + ".bwt").is_file():
        return

    logger.warning(f"BWA index for reference genome file {reference_file} does not exist")
    prompt = "Do you want to create a BWA index now? (y/n) "
    try:
        answer = input(prompt).lower().strip()
        while answer not in ("y", "yes", "n", "no"):
            logger.warning("Answer yes or no")
            answer = input(prompt).lower().strip()
    except EOFError:
        # input() raises EOFError when stdin is closed or not interactive
        # (e.g. batch/pipeline runs); exit cleanly instead of crashing.
        logger.error("Cannot ask whether to create a BWA index: standard input is not available, exiting")
        sys.exit(1)
    if answer[0] != "y":
        sys.exit(1)

    logger.info("Creating BWA index...")
    try:
        result = subprocess.run(
            ["bwa", "index", str(reference_file)],
            capture_output=True,
            check=True,
        )
    except FileNotFoundError:
        # Raised by subprocess when the bwa executable is not on PATH.
        logger.error("bwa index failed: bwa executable not found")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        # errors="replace" keeps a non-UTF-8 byte in stderr from masking the real failure.
        detail = e.stderr.decode(errors="replace") if e.stderr else "Unknown error"
        logger.error(f"bwa index failed: {detail}")
        sys.exit(1)
    log_subprocess_stderr(result.stderr, "bwa-index")
46
+
47
+
48
def align_bwa(
    num_threads: int,
    reference_file: str | Path,
    fastq_files: list[str | Path],
    output_path: str | Path,
    sample_name: str,
    remove_large_files: bool,
) -> str | None:
    """Align reads with BWA, then convert to BAM, sort and index the result.

    ``bwa mem`` output is written to ``<sample_name>.sam`` in ``output_path``,
    converted to ``<sample_name>.bam`` and sorted into
    ``<sample_name>.sorted.bam``, which is then indexed. The intermediate SAM
    and unsorted BAM files are always removed.

    Args:
        num_threads: Number of threads passed to ``bwa mem`` and samtools.
        reference_file: Path to the reference genome FASTA file.
        fastq_files: One (single-end) or two (paired-end) FASTQ files.
        output_path: Existing directory in which output files are created.
        sample_name: Base name used for all output files.
        remove_large_files: If true, delete the input FASTQ files after a
            successful alignment.

    Returns:
        Path of the sorted, indexed BAM file as a string, or ``None`` if the
        inputs are invalid or any alignment/conversion step fails.
    """
    logger.info("Starting alignment with BWA")

    # Validate inputs
    output_dir = Path(output_path)
    if not output_dir.is_dir():
        logger.error(f"Output directory {output_path} does not exist")
        return None

    if not 1 <= len(fastq_files) <= 2:
        logger.error(f"Expected 1 or 2 FASTQ files, got {len(fastq_files)}")
        return None

    check_bwa_index(reference_file)

    # Append the extensions to the full sample name. Path.with_suffix() would
    # replace everything after the last dot, so a sample called "S1.rep2"
    # would be written to "S1.sam" and could collide with "S1.rep1".
    sam_file = output_dir / f"{sample_name}.sam"
    bam_file = output_dir / f"{sample_name}.bam"
    sorted_bam = output_dir / f"{sample_name}.sorted.bam"
    logger.info(f"Creating output file: {sorted_bam}")

    bwacommand = ["bwa", "mem", "-t", str(num_threads), str(reference_file), *[str(f) for f in fastq_files]]

    try:
        with sam_file.open("w") as g:
            result = subprocess.run(bwacommand, stdout=g, stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as e:
        # errors="replace" keeps a non-UTF-8 byte in stderr from masking the real failure.
        detail = e.stderr.decode(errors="replace") if e.stderr else "Unknown error"
        logger.error(f"bwa mem failed: {detail}")
        _cleanup_files(sam_file)
        return None
    except OSError as e:
        # E.g. the bwa executable is missing or the SAM file cannot be
        # created; do not leave an empty SAM file behind.
        logger.error(f"bwa mem failed: {e}")
        _cleanup_files(sam_file)
        return None
    log_subprocess_stderr(result.stderr, "bwa-mem")

    try:
        pysam.view("-Sb", "-@", str(num_threads), str(sam_file), "-o", str(bam_file), catch_stdout=False)
        pysam.sort("-@", str(num_threads), str(bam_file), "-o", str(sorted_bam), catch_stdout=False)
        pysam.index(str(sorted_bam), catch_stdout=False)
    except pysam.SamtoolsError as e:
        logger.error(f"SAM/BAM processing failed: {e}")
        return None
    finally:
        # Runs on both success and failure, so no separate cleanup is needed
        # in the except branch.
        _cleanup_files(sam_file, bam_file)

    if remove_large_files:
        for fastq in fastq_files:
            Path(fastq).unlink()

    logger.info("Finished alignment")
    return str(sorted_bam)