sniffcell 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sniffcell-0.5.0/.github/workflows/python-publish.yml +70 -0
- sniffcell-0.5.0/.gitignore +23 -0
- sniffcell-0.5.0/LICENSE +21 -0
- sniffcell-0.5.0/PKG-INFO +48 -0
- sniffcell-0.5.0/README.md +16 -0
- sniffcell-0.5.0/img/workflow.png +0 -0
- sniffcell-0.5.0/pyproject.toml +8 -0
- sniffcell-0.5.0/setup.cfg +42 -0
- sniffcell-0.5.0/setup.py +14 -0
- sniffcell-0.5.0/src/sniffcell/__init__.py +1 -0
- sniffcell-0.5.0/src/sniffcell/anno/__init__.py +0 -0
- sniffcell-0.5.0/src/sniffcell/anno/anno.py +230 -0
- sniffcell-0.5.0/src/sniffcell/anno/filter_bed_based_on_variants.py +85 -0
- sniffcell-0.5.0/src/sniffcell/anno/kmeans.py +120 -0
- sniffcell-0.5.0/src/sniffcell/anno/methyl_matrix.py +138 -0
- sniffcell-0.5.0/src/sniffcell/anno/variant_assignment.py +223 -0
- sniffcell-0.5.0/src/sniffcell/anno/vcf_to_df.py +56 -0
- sniffcell-0.5.0/src/sniffcell/deconv/__init__.py +0 -0
- sniffcell-0.5.0/src/sniffcell/deconv/deconv.py +3 -0
- sniffcell-0.5.0/src/sniffcell/dmsv/__init__.py +0 -0
- sniffcell-0.5.0/src/sniffcell/dmsv/dmsv.py +168 -0
- sniffcell-0.5.0/src/sniffcell/dmsv/statistical_test_around_sv.py +182 -0
- sniffcell-0.5.0/src/sniffcell/dmsv/sv_methylation.py +276 -0
- sniffcell-0.5.0/src/sniffcell/find/__init__.py +0 -0
- sniffcell-0.5.0/src/sniffcell/find/ctdmr.py +216 -0
- sniffcell-0.5.0/src/sniffcell/find/find.py +70 -0
- sniffcell-0.5.0/src/sniffcell/main.py +30 -0
- sniffcell-0.5.0/src/sniffcell/parse_args.py +103 -0
- sniffcell-0.5.0/src/sniffcell.egg-info/PKG-INFO +48 -0
- sniffcell-0.5.0/src/sniffcell.egg-info/SOURCES.txt +33 -0
- sniffcell-0.5.0/src/sniffcell.egg-info/dependency_links.txt +1 -0
- sniffcell-0.5.0/src/sniffcell.egg-info/entry_points.txt +2 -0
- sniffcell-0.5.0/src/sniffcell.egg-info/requires.txt +8 -0
- sniffcell-0.5.0/src/sniffcell.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# This workflow will upload a Python Package to PyPI when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  release-build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Build release distributions
        run: |
          # NOTE: put your own distribution build steps here.
          python -m pip install build
          # FIX: this line was commented out, so dist/ was never populated and
          # the upload/publish steps had nothing to ship.
          python -m build

      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/

  pypi-publish:
    runs-on: ubuntu-latest
    needs:
      - release-build
    permissions:
      # IMPORTANT: this permission is mandatory for trusted publishing
      id-token: write

    # Dedicated environments with protections for publishing are strongly recommended.
    # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
    environment:
      name: pypi
      # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
      url: https://pypi.org/p/sniffcell
      #
      # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
      # ALTERNATIVE: exactly, uncomment the following line instead:
      # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}

    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          name: release-dists
          path: dist/

      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
.ipynb_checkpoints/*
|
|
2
|
+
|
|
3
|
+
# Ignore Python cache files
|
|
4
|
+
__pycache__/
|
|
5
|
+
src/__pycache__/*
|
|
6
|
+
*.py[cod]
|
|
7
|
+
|
|
8
|
+
# Ignore Jupyter Notebook checkpoints
|
|
9
|
+
.ipynb_checkpoints/
|
|
10
|
+
|
|
11
|
+
# Ignore virtual environment directories
|
|
12
|
+
venv/
|
|
13
|
+
env/
|
|
14
|
+
|
|
15
|
+
dataset/
|
|
16
|
+
sniffmeth_deconv_output/
|
|
17
|
+
notebooks/
|
|
18
|
+
.vscode/
|
|
19
|
+
|
|
20
|
+
src/archived_src/*
|
|
21
|
+
test/
|
|
22
|
+
|
|
23
|
+
atlas/
|
sniffcell-0.5.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Yilei Fu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
sniffcell-0.5.0/PKG-INFO
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sniffcell
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: SniffCell: Annotate SVs cell type based on CpG methylation
|
|
5
|
+
Home-page: https://github.com/Fu-Yilei/SniffCell
|
|
6
|
+
Author: Yilei Fu
|
|
7
|
+
Author-email: yilei.fu@bcm.edu
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/Fu-Yilei/SniffCell/issues
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: pysam>=0.21.0
|
|
17
|
+
Requires-Dist: edlib>=1.3.9
|
|
18
|
+
Requires-Dist: psutil>=5.9.4
|
|
19
|
+
Requires-Dist: numpy>=2.2.0
|
|
20
|
+
Requires-Dist: pandas>=2.3.0
|
|
21
|
+
Requires-Dist: scipy
|
|
22
|
+
Requires-Dist: tqdm
|
|
23
|
+
Requires-Dist: scikit-learn
|
|
24
|
+
Dynamic: author
|
|
25
|
+
Dynamic: author-email
|
|
26
|
+
Dynamic: description
|
|
27
|
+
Dynamic: description-content-type
|
|
28
|
+
Dynamic: home-page
|
|
29
|
+
Dynamic: license
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
Dynamic: summary
|
|
32
|
+
|
|
33
|
+
# SniffCell - Identifying cell type specific SV from long-read bulk sequenced tissue only
|
|
34
|
+
|
|
35
|
+
SniffCell is a tool designed to analyze DNA methylation changes associated with structural variations (SVs), including mosaic SVs. It processes primary alignments from BAM files and provides detailed outputs for visualization and analysis.
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
positional arguments:
|
|
39
|
+
{find,deconv,anno,svanno,dmsv}
|
|
40
|
+
find Find cell type-specific DMRs.
|
|
41
|
+
deconv Deconvolve cell-type composition from methylation data.
|
|
42
|
+
anno Annotate variants with cell-type-specific methylation.
|
|
43
|
+
svanno Use pre-annotated reads csv to annotate variants' cell types
|
|
44
|
+
dmsv Find out which SV's supporting reads have differential methylation compared to non-supporting reads.
|
|
45
|
+
|
|
46
|
+
options:
|
|
47
|
+
-h, --help show this help message and exit
|
|
48
|
+
-v, --version show program's version number and exit
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# SniffCell - Identifying cell type specific SV from long-read bulk sequenced tissue only
|
|
2
|
+
|
|
3
|
+
SniffCell is a tool designed to analyze DNA methylation changes associated with structural variations (SVs), including mosaic SVs. It processes primary alignments from BAM files and provides detailed outputs for visualization and analysis.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
positional arguments:
|
|
7
|
+
{find,deconv,anno,svanno,dmsv}
|
|
8
|
+
find Find cell type-specific DMRs.
|
|
9
|
+
deconv Deconvolve cell-type composition from methylation data.
|
|
10
|
+
anno Annotate variants with cell-type-specific methylation.
|
|
11
|
+
svanno Use pre-annotated reads csv to annotate variants' cell types
|
|
12
|
+
dmsv Find out which SV's supporting reads have differential methylation compared to non-supporting reads.
|
|
13
|
+
|
|
14
|
+
options:
|
|
15
|
+
-h, --help show this help message and exit
|
|
16
|
+
-v, --version show program's version number and exit
|
|
Binary file
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[metadata]
|
|
2
|
+
name = sniffcell
|
|
3
|
+
version = 0.5.0
|
|
4
|
+
author = Yilei Fu
|
|
5
|
+
author_email = yilei.fu@bcm.edu
|
|
6
|
+
long_description = file: README.md
|
|
7
|
+
long_description_content_type = text/markdown
|
|
8
|
+
url = https://github.com/Fu-Yilei/SniffCell
|
|
9
|
+
project_urls =
|
|
10
|
+
Bug Tracker = https://github.com/Fu-Yilei/SniffCell/issues
|
|
11
|
+
classifiers =
|
|
12
|
+
Programming Language :: Python :: 3
|
|
13
|
+
License :: OSI Approved :: MIT License
|
|
14
|
+
Operating System :: OS Independent
|
|
15
|
+
|
|
16
|
+
[options]
|
|
17
|
+
package_dir =
|
|
18
|
+
= src
|
|
19
|
+
packages = find:
|
|
20
|
+
python_requires = >=3.10
|
|
21
|
+
install_requires =
|
|
22
|
+
pysam>=0.21.0
|
|
23
|
+
edlib>=1.3.9
|
|
24
|
+
psutil>=5.9.4
|
|
25
|
+
numpy>=2.2.0
|
|
26
|
+
pandas>=2.3.0
|
|
27
|
+
scipy
|
|
28
|
+
tqdm
|
|
29
|
+
scikit-learn
|
|
30
|
+
include_package_data = True
|
|
31
|
+
|
|
32
|
+
[options.packages.find]
|
|
33
|
+
where = src
|
|
34
|
+
|
|
35
|
+
[options.entry_points]
|
|
36
|
+
console_scripts =
|
|
37
|
+
sniffcell = sniffcell.main:main
|
|
38
|
+
|
|
39
|
+
[egg_info]
|
|
40
|
+
tag_build =
|
|
41
|
+
tag_date = 0
|
|
42
|
+
|
sniffcell-0.5.0/setup.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Packaging entry point for sniffcell.

Most static metadata lives in setup.cfg; this script supplies the long
description read from README.md.
"""
from pathlib import Path

from setuptools import setup, find_packages

setup(
    name='sniffcell',
    version='0.5.0',
    # NOTE(review): setup.cfg declares a src/ layout (package_dir = src);
    # find_packages() here scans the repo root — confirm which one the build
    # actually honors before changing either.
    packages=find_packages(),
    url='https://github.com/Fu-Yilei/SniffCell',
    license='MIT',
    author='Yilei Fu',
    author_email='yilei.fu@bcm.edu',
    description='SniffCell: Annotate SVs cell type based on CpG methylation',
    # read_text closes the file and pins the encoding; the previous
    # open('README.md').read() leaked the handle and used the platform default.
    long_description=Path('README.md').read_text(encoding='utf-8'),
    long_description_content_type='text/markdown',
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.5.0"
|
|
File without changes
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sniffcell.anno.kmeans import kmeans_cluster_cells
|
|
4
|
+
from sniffcell.anno.methyl_matrix import methyl_matrix_from_bam
|
|
5
|
+
from sniffcell.anno.filter_bed_based_on_variants import filter_bed_based_on_variants
|
|
6
|
+
from sniffcell.anno.vcf_to_df import read_vcf_to_df
|
|
7
|
+
from sniffcell.anno.variant_assignment import assign_sv_celltypes
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
import multiprocessing as mp
|
|
10
|
+
import numpy as np
|
|
11
|
+
import logging
|
|
12
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(processName)s] %(levelname)s: %(message)s")
|
|
13
|
+
|
|
14
|
+
def _one_dmr(args):
    """Process one DMR (differentially methylated region) in a worker process.

    Unpacks ``args`` as ``(row_dict, input_bam, reference_fasta)``, builds a
    per-read methylation matrix over the region, clusters reads into the DMR's
    target cell type vs "Other", and returns two DataFrames:

    - ``assign_df``: one row per read (index ``readname``) with the region
      coordinates, CpG bounds, the cell-type order string, and a per-read
      one-hot/complement code string.
    - ``state_df``: one summary row with per-cell-type mean methylation.

    Returns ``None`` when the region has no CpGs, fewer than 2 usable reads,
    or any exception occurs (logged, never raised — safe for ``pool.imap``).
    """
    # args: (row_dict, input_bam, reference_fasta)
    logger = logging.getLogger("anno._one_dmr")

    row, input_file, reference = args
    chrom = str(row["chr"])
    start = int(row["start"])
    end = int(row["end"])

    best_group = str(row["best_group"])
    best_dir = row.get("best_dir", None)

    # collect arbitrary cell types from mean_* columns in THIS row
    # (summary columns like mean_best_value are excluded; best_group is added
    # even if it has no mean_* column so the code string always covers it)
    cell_types = []
    for k, v in row.items():
        if isinstance(k, str) and k.startswith("mean_") and k not in ("mean_best_value", "mean_rest_value", "mean_margin"):
            cell_types.append(k[len("mean_"):])
    if best_group not in cell_types:
        cell_types.append(best_group)
    cell_types = sorted(dict.fromkeys(cell_types))  # stable order

    # logger.info(f"[{chrom}:{start}-{end}] best_group={best_group} best_dir={best_dir} "
    #             f"cell_types={cell_types} (n={len(cell_types)})")

    try:
        # load methylation matrix + CpG positions
        # (rows = reads, columns = CpG sites; values presumably methylation
        # calls — exact scale defined by methyl_matrix_from_bam)
        mm, cpgs = methyl_matrix_from_bam(
            input_file, reference, chrom=chrom, start=start, end=end, return_positions=True
        )
        n_reads_raw = 0 if mm is None else mm.shape[0]
        n_cpgs = len(cpgs)
        if n_cpgs == 0:
            logger.warning(f"[{chrom}:{start}-{end}] no CpGs found; skipping")
            return None

        # drop rows that are entirely NaN across CpGs
        mm = mm.dropna(how="all")
        if mm.empty or mm.shape[0] < 2:
            # clustering needs at least two reads
            logger.warning(f"[{chrom}:{start}-{end}] usable_reads={mm.shape[0] if not mm.empty else 0} "
                           f"(raw={n_reads_raw}) < 2; skipping")
            return None

        # call your untouched kmeans wrapper to assign target vs Other
        dmr_row = {
            "best_group": best_group,
            "best_dir": best_dir,
            "mean_best_value": row.get("mean_best_value", np.nan),
            "mean_rest_value": row.get("mean_rest_value", np.nan),
        }
        out = kmeans_cluster_cells(mm, dmr_row=dmr_row)

        # CpG bounds from cpgs
        cpgstart = int(cpgs[0])
        cpgend = int(cpgs[-1])

        # read names from MultiIndex level 0 if present; otherwise fall back
        # to the plain index, or synthesize read_<i> names for numeric indexes
        if isinstance(mm.index, pd.MultiIndex) and "read_name" in mm.index.names:
            readnames = mm.index.get_level_values("read_name").astype(str).values
        else:
            readnames = (mm.index.astype(str).values if mm.index.dtype == object
                         else np.array([f"read_{i}" for i in range(len(mm))], dtype=str))

        # target mask from your output column (case-insensitive match against
        # the DMR's best_group label)
        mask_target = (out["celltype_or_other"].astype(str).str.lower()
                       == best_group.strip().lower()).values
        # build variable-length code strings: target reads get a one-hot at
        # best_group's position; "other" reads get the bitwise complement
        pos = {ct: i for i, ct in enumerate(cell_types)}
        t_idx = pos[best_group]
        target_bits = ["0"] * len(cell_types); target_bits[t_idx] = "1"
        other_bits = ["1"] * len(cell_types); other_bits[t_idx] = "0"
        target_code = "".join(target_bits)
        other_code = "".join(other_bits)
        code_col = np.where(mask_target, target_code, other_code)

        # --- per-read assignments (each read = one row / index) ---
        assign_df = pd.DataFrame({
            "chr": chrom,
            "start": start,
            "end": end,
            "cpgstart": cpgstart,
            "cpgend": cpgend,
            "code_order": "|".join(cell_types),
            "code": code_col,
        }, index=pd.Index(readnames, name="readname"))

        # per-block means (per-read mean methylation, then avg by target vs other)
        # NaNs are imputed with the per-CpG column mean before averaging
        X_imp = mm.astype(float).copy().fillna(mm.astype(float).mean())
        read_mean = X_imp.mean(axis=1).values
        tgt_mean = float(np.nanmean(read_mean[mask_target])) if mask_target.any() else np.nan
        oth_mean = float(np.nanmean(read_mean[~mask_target])) if (~mask_target).any() else np.nan

        # logger.info(f"[{chrom}:{start}-{end}] target_mean={tgt_mean:.4f} other_mean={oth_mean:.4f} "
        #             f"cpg_bounds={cpgstart}-{cpgend}")

        # one-row summary: every non-target cell type shares the "other" mean
        state_payload = {
            "chr": chrom, "start": start, "end": end,
            "cpgstart": cpgstart, "cpgend": cpgend,
        }
        for ct in cell_types:
            state_payload[f"{ct}_methylation"] = tgt_mean if ct == best_group else oth_mean
        state_df = pd.DataFrame([state_payload])

        return assign_df, state_df

    except Exception as e:
        # swallow per-region failures so one bad DMR doesn't kill the pool
        logger.exception(f"[{chrom}:{start}-{end}] failed with error")
        return None
|
|
121
|
+
|
|
122
|
+
def sv_anno(args):
    """Annotate SV cell types from a pre-computed per-read classification table.

    Reads the SV VCF (optionally remapping read names via kanpig), joins it
    with the reads-classification TSV, and writes ``sv_assignment.tsv`` into
    ``args.output``.
    """
    logger = logging.getLogger("anno.sv_anno")
    logger.info("Starting SV annotation from pre-annotated reads")

    # Standalone "svanno" runs take an explicit reads table; the combined
    # pipeline consumes the table written earlier into the output directory.
    reads_path = (
        args.input
        if args.command == "svanno"
        else os.path.join(args.output, "reads_classification.tsv")
    )

    if args.kanpig_read_names is not None:
        logger.info(f"Using kanpig read names from: {args.kanpig_read_names}")
    else:
        logger.info("No kanpig read names provided; using Sniffles read names from VCF")

    variants = read_vcf_to_df(args.vcf, kanpig_read_names=args.kanpig_read_names)
    reads = pd.read_csv(reads_path, sep="\t", index_col=0)
    assignments = assign_sv_celltypes(variants, reads)
    assignments.to_csv(os.path.join(args.output, "sv_assignment.tsv"), sep="\t", index=False)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def anno_main(args):
    """Entry point for the ``anno`` subcommand.

    Pipeline: load DMR BED -> keep DMRs near (but not overlapping) SV
    breakpoints -> cluster reads per DMR across a process pool -> stream
    per-read and per-block results to two TSVs -> run SV cell-type assignment.

    Writes ``reads_classification.tsv`` and ``blocks_classification.tsv``
    into ``args.output``, then delegates to :func:`sv_anno`.

    Raises:
        ValueError: if the (filtered) BED lacks a required column.
    """
    # print(args)
    # return
    logger = logging.getLogger("anno.main")

    bed_file = args.bed
    base_out = args.output  # writes <output>.reads.tsv and <output>.blocks.tsv
    input_file = args.input
    reference = args.reference
    threads = int(args.threads)
    window = int(args.window)
    logger.info(f"Starting annotation: bed={bed_file} bam={input_file} ref={reference} "
                f"threads={threads} out_base={base_out}")

    # Output paths
    reads_out = os.path.join(base_out, "reads_classification.tsv")
    blocks_out = os.path.join(base_out, "blocks_classification.tsv")

    # Load and (optionally) filter BED
    bed = pd.read_csv(bed_file, sep="\t")
    logger.info(f"Loaded BED with {len(bed)} DMR rows")

    # Keep only DMRs whose +/- window padding contains an SV breakpoint
    sv_df = read_vcf_to_df(args.vcf)
    filtered_bed = filter_bed_based_on_variants(bed, sv_df=sv_df, window=window)

    # Fail fast on schema problems before spinning up workers
    for col in ["chr", "start", "end", "best_group", "best_dir"]:
        if col not in filtered_bed.columns:
            logger.error(f"BED missing required column: {col}")
            raise ValueError(f"BED missing required column: {col}")

    n_tasks = len(filtered_bed)
    logger.info(f"Filtered BED to {n_tasks} DMRs after variant overlap filtering, window size = {window}")

    # One task per DMR row; dict(row) makes the row picklable for the pool
    tasks = [(dict(row), input_file, reference) for _, row in filtered_bed.iterrows()]

    # --- Prepare outputs: truncate files and reset header flags ---
    # We'll only write headers on the first real chunk for each file.
    open(reads_out, "w").close()
    open(blocks_out, "w").close()
    reads_header_written = False
    blocks_header_written = False
    blocks_cols_locked: list[str] | None = None  # we lock schema to the first block we see

    # Stream results and append immediately (keeps memory flat regardless of
    # how many DMRs there are); imap preserves task order
    with mp.Pool(threads) as pool:
        for res in tqdm(pool.imap(_one_dmr, tasks, chunksize=1),
                        total=n_tasks, desc="Processing DMRs"):
            if res is None:
                # worker skipped or failed this DMR (already logged there)
                continue

            a_df, s_df = res

            # --- APPEND READS ---
            if a_df is not None and not a_df.empty:
                if not reads_header_written:
                    # first write: include header and index (readname)
                    a_df.to_csv(reads_out, sep="\t", index=True, mode="a", header=True)
                    reads_header_written = True
                else:
                    a_df.to_csv(reads_out, sep="\t", index=True, mode="a", header=False)

            # --- APPEND BLOCKS (variable columns across DMRs) ---
            if s_df is not None and not s_df.empty:
                if not blocks_header_written:
                    # lock the schema to the first encountered block columns
                    blocks_cols_locked = list(s_df.columns)
                    s_df.to_csv(blocks_out, sep="\t", index=False, mode="a", header=True)
                    blocks_header_written = True
                else:
                    # align columns to locked header; drop extras, add missing as NaN
                    assert blocks_cols_locked is not None
                    s_df_aligned = s_df.reindex(columns=blocks_cols_locked)
                    s_df_aligned.to_csv(blocks_out, sep="\t", index=False, mode="a", header=False)

    # If nothing was written, emit empty files with headers to be friendly downstream
    if not reads_header_written:
        empty_reads = pd.DataFrame(
            columns=["chr","start","end","cpgstart","cpgend","code_order","code"]
        )
        empty_reads.index.name = "readname"
        empty_reads.to_csv(reads_out, sep="\t", index=True, header=True)
        logger.warning("No per-read assignments generated; wrote empty reads header only")

    if not blocks_header_written:
        pd.DataFrame(columns=["chr","start","end","cpgstart","cpgend"]).to_csv(
            blocks_out, sep="\t", index=False, header=True
        )
        logger.warning("No block states generated; wrote empty blocks header only")
    # Finally, assign SV cell types from the reads table just written
    sv_anno(args)
    logger.info("Annotation complete")
|
|
229
|
+
|
|
230
|
+
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
def filter_bed_based_on_variants(bed_df: pd.DataFrame, sv_df: pd.DataFrame, window: int = 5000) -> pd.DataFrame:
|
|
5
|
+
# 1) Normalize column names
|
|
6
|
+
bed = bed_df.copy()
|
|
7
|
+
sv = sv_df.rename(columns={'chr': 'chr'}).copy()
|
|
8
|
+
if 'supporting_reads' not in sv.columns:
|
|
9
|
+
exit("VCF DataFrame missing 'supporting_reads' column required for SV assignment.")
|
|
10
|
+
# 2) Normalize chromosome naming (strip or add 'chr' to match)
|
|
11
|
+
def _norm_chr(x):
|
|
12
|
+
x = str(x)
|
|
13
|
+
return x[3:] if x.startswith('chr') else x
|
|
14
|
+
bed['chr'] = bed['chr'].map(_norm_chr)
|
|
15
|
+
sv['chr'] = sv['chr'].map(_norm_chr)
|
|
16
|
+
|
|
17
|
+
# 3) Enforce integer dtype
|
|
18
|
+
bed[['start','end']] = bed[['start','end']].astype(np.int64)
|
|
19
|
+
sv[['ref_start','ref_end']] = sv[['ref_start','ref_end']].astype(np.int64)
|
|
20
|
+
|
|
21
|
+
# 4) Convert VCF (1-based inclusive) -> BED-style half-open
|
|
22
|
+
# [ref_start-1, ref_end) in 0-based half-open. Equivalently:
|
|
23
|
+
# start0 = ref_start - 1; end0_exclusive = ref_end
|
|
24
|
+
sv_start0 = (sv['ref_start'].to_numpy(np.int64) - 1)
|
|
25
|
+
sv_end0 = sv['ref_end'].to_numpy(np.int64) # already exclusive after conversion
|
|
26
|
+
|
|
27
|
+
# Handle insertions robustly: if END < POS (some callers), clamp end to POS.
|
|
28
|
+
bad = sv_end0 < (sv_start0 + 1)
|
|
29
|
+
if np.any(bad):
|
|
30
|
+
sv_end0[bad] = sv_start0[bad] + 1
|
|
31
|
+
|
|
32
|
+
sv = sv.assign(_start=sv_start0, _end=sv_end0)
|
|
33
|
+
|
|
34
|
+
# 5) Optional: make chr categorical for faster groupbys
|
|
35
|
+
bed['chr'] = bed['chr'].astype('category')
|
|
36
|
+
sv['chr'] = sv['chr'].astype('category')
|
|
37
|
+
|
|
38
|
+
out_mask = np.zeros(len(bed), dtype=bool)
|
|
39
|
+
|
|
40
|
+
# Build per-chrom sorted arrays of SV starts/ends (already sorted? we still ensure monotonic)
|
|
41
|
+
sv_idx = {}
|
|
42
|
+
for chrom, sdf in sv.groupby('chr', sort=False, observed=False):
|
|
43
|
+
starts = sdf['_start'].to_numpy(np.int64)
|
|
44
|
+
ends = sdf['_end'].to_numpy(np.int64)
|
|
45
|
+
# Ensure sorted (cheap if already sorted)
|
|
46
|
+
order = np.argsort(starts, kind='mergesort')
|
|
47
|
+
starts = starts[order]
|
|
48
|
+
ends = ends[order] # maintain pairing order
|
|
49
|
+
# We also want an array of ends sorted to use searchsorted independently.
|
|
50
|
+
ends_sorted = np.sort(ends, kind='mergesort')
|
|
51
|
+
sv_idx[chrom] = (starts, ends, ends_sorted)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
for chrom, bdf in bed.groupby('chr', sort=False, observed=False):
|
|
55
|
+
if chrom not in sv_idx:
|
|
56
|
+
continue
|
|
57
|
+
starts, _, ends_sorted = sv_idx[chrom]
|
|
58
|
+
if starts.size == 0:
|
|
59
|
+
continue
|
|
60
|
+
|
|
61
|
+
# Core (original) BED interval
|
|
62
|
+
core_start = bdf['start'].to_numpy(np.int64)
|
|
63
|
+
core_end = bdf['end'].to_numpy(np.int64)
|
|
64
|
+
|
|
65
|
+
# Padded interval
|
|
66
|
+
bed_starts = core_start - window
|
|
67
|
+
bed_ends = core_end + window
|
|
68
|
+
|
|
69
|
+
# ---------- 1) Overlap with *padded* interval (half-open, as before) ----------
|
|
70
|
+
n_start_lt_end_pad = np.searchsorted(starts, bed_ends, side='left')
|
|
71
|
+
n_end_le_start_pad = np.searchsorted(ends_sorted, bed_starts, side='right')
|
|
72
|
+
overlap_padded = (n_start_lt_end_pad - n_end_le_start_pad) > 0
|
|
73
|
+
|
|
74
|
+
# ---------- 2) Overlap with *core* interval (treat breakpoint as overlap) ------
|
|
75
|
+
# Closed-interval style: sv_start <= core_end AND sv_end >= core_start
|
|
76
|
+
n_start_le_end_core = np.searchsorted(starts, core_end, side='right')
|
|
77
|
+
n_end_lt_start_core = np.searchsorted(ends_sorted, core_start, side='left')
|
|
78
|
+
overlap_core = (n_start_le_end_core - n_end_lt_start_core) > 0
|
|
79
|
+
|
|
80
|
+
# ---------- 3) We want SV in padding, but NOT in core --------------------------
|
|
81
|
+
keep = overlap_padded & (~overlap_core)
|
|
82
|
+
out_mask[bdf.index] = keep
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
return bed_df.loc[out_mask]
|