sniffcell 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. sniffcell-0.5.0/.github/workflows/python-publish.yml +70 -0
  2. sniffcell-0.5.0/.gitignore +23 -0
  3. sniffcell-0.5.0/LICENSE +21 -0
  4. sniffcell-0.5.0/PKG-INFO +48 -0
  5. sniffcell-0.5.0/README.md +16 -0
  6. sniffcell-0.5.0/img/workflow.png +0 -0
  7. sniffcell-0.5.0/pyproject.toml +8 -0
  8. sniffcell-0.5.0/setup.cfg +42 -0
  9. sniffcell-0.5.0/setup.py +14 -0
  10. sniffcell-0.5.0/src/sniffcell/__init__.py +1 -0
  11. sniffcell-0.5.0/src/sniffcell/anno/__init__.py +0 -0
  12. sniffcell-0.5.0/src/sniffcell/anno/anno.py +230 -0
  13. sniffcell-0.5.0/src/sniffcell/anno/filter_bed_based_on_variants.py +85 -0
  14. sniffcell-0.5.0/src/sniffcell/anno/kmeans.py +120 -0
  15. sniffcell-0.5.0/src/sniffcell/anno/methyl_matrix.py +138 -0
  16. sniffcell-0.5.0/src/sniffcell/anno/variant_assignment.py +223 -0
  17. sniffcell-0.5.0/src/sniffcell/anno/vcf_to_df.py +56 -0
  18. sniffcell-0.5.0/src/sniffcell/deconv/__init__.py +0 -0
  19. sniffcell-0.5.0/src/sniffcell/deconv/deconv.py +3 -0
  20. sniffcell-0.5.0/src/sniffcell/dmsv/__init__.py +0 -0
  21. sniffcell-0.5.0/src/sniffcell/dmsv/dmsv.py +168 -0
  22. sniffcell-0.5.0/src/sniffcell/dmsv/statistical_test_around_sv.py +182 -0
  23. sniffcell-0.5.0/src/sniffcell/dmsv/sv_methylation.py +276 -0
  24. sniffcell-0.5.0/src/sniffcell/find/__init__.py +0 -0
  25. sniffcell-0.5.0/src/sniffcell/find/ctdmr.py +216 -0
  26. sniffcell-0.5.0/src/sniffcell/find/find.py +70 -0
  27. sniffcell-0.5.0/src/sniffcell/main.py +30 -0
  28. sniffcell-0.5.0/src/sniffcell/parse_args.py +103 -0
  29. sniffcell-0.5.0/src/sniffcell.egg-info/PKG-INFO +48 -0
  30. sniffcell-0.5.0/src/sniffcell.egg-info/SOURCES.txt +33 -0
  31. sniffcell-0.5.0/src/sniffcell.egg-info/dependency_links.txt +1 -0
  32. sniffcell-0.5.0/src/sniffcell.egg-info/entry_points.txt +2 -0
  33. sniffcell-0.5.0/src/sniffcell.egg-info/requires.txt +8 -0
  34. sniffcell-0.5.0/src/sniffcell.egg-info/top_level.txt +1 -0
@@ -0,0 +1,70 @@
1
# This workflow will upload a Python Package to PyPI when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  release-build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Build release distributions
        run: |
          python -m pip install build
          # Build sdist + wheel into dist/ so the upload/publish steps below
          # have artifacts to work with (previously this line was commented
          # out, leaving dist/ empty and breaking the release).
          python -m build

      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/

  pypi-publish:
    runs-on: ubuntu-latest
    needs:
      - release-build
    permissions:
      # IMPORTANT: this permission is mandatory for trusted publishing
      id-token: write

    # Dedicated environments with protections for publishing are strongly recommended.
    # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
    environment:
      name: pypi
      url: https://pypi.org/p/sniffcell

    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          name: release-dists
          path: dist/

      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/
@@ -0,0 +1,23 @@
1
+ .ipynb_checkpoints/*
2
+
3
+ # Ignore Python cache files
4
+ __pycache__/
5
+ src/__pycache__/*
6
+ *.py[cod]
7
+
8
+ # Ignore Jupyter Notebook checkpoints
9
+ .ipynb_checkpoints/
10
+
11
+ # Ignore virtual environment directories
12
+ venv/
13
+ env/
14
+
15
+ dataset/
16
+ sniffmeth_deconv_output/
17
+ notebooks/
18
+ .vscode/
19
+
20
+ src/archived_src/*
21
+ test/
22
+
23
+ atlas/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Yilei Fu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,48 @@
1
+ Metadata-Version: 2.4
2
+ Name: sniffcell
3
+ Version: 0.5.0
4
+ Summary: SniffCell: Annotate SVs cell type based on CpG methylation
5
+ Home-page: https://github.com/Fu-Yilei/SniffCell
6
+ Author: Yilei Fu
7
+ Author-email: yilei.fu@bcm.edu
8
+ License: MIT
9
+ Project-URL: Bug Tracker, https://github.com/Fu-Yilei/SniffCell/issues
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: pysam>=0.21.0
17
+ Requires-Dist: edlib>=1.3.9
18
+ Requires-Dist: psutil>=5.9.4
19
+ Requires-Dist: numpy>=2.2.0
20
+ Requires-Dist: pandas>=2.3.0
21
+ Requires-Dist: scipy
22
+ Requires-Dist: tqdm
23
+ Requires-Dist: scikit-learn
24
+ Dynamic: author
25
+ Dynamic: author-email
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license
30
+ Dynamic: license-file
31
+ Dynamic: summary
32
+
33
+ # SniffCell - Identifying cell type specific SV from long-read bulk sequenced tissue only
34
+
35
+ SniffCell is a tool designed to analyze DNA methylation changes associated with structural variations (SVs), including mosaic SVs. It processes primary alignments from BAM files and provides detailed outputs for visualization and analysis.
36
+
37
+
38
+ positional arguments:
39
+ {find,deconv,anno,svanno,dmsv}
40
+ find Find cell type-specific DMRs.
41
+ deconv Deconvolve cell-type composition from methylation data.
42
+ anno Annotate variants with cell-type-specific methylation.
43
+ svanno Use pre-annotated reads csv to annotate variants' cell types
44
+ dmsv Find out which SV's supporting reads have differential methylation compared to non-supporting reads.
45
+
46
+ options:
47
+ -h, --help show this help message and exit
48
+ -v, --version show program's version number and exit
@@ -0,0 +1,16 @@
1
+ # SniffCell - Identifying cell type specific SV from long-read bulk sequenced tissue only
2
+
3
+ SniffCell is a tool designed to analyze DNA methylation changes associated with structural variations (SVs), including mosaic SVs. It processes primary alignments from BAM files and provides detailed outputs for visualization and analysis.
4
+
5
+
6
+ positional arguments:
7
+ {find,deconv,anno,svanno,dmsv}
8
+ find Find cell type-specific DMRs.
9
+ deconv Deconvolve cell-type composition from methylation data.
10
+ anno Annotate variants with cell-type-specific methylation.
11
+ svanno Use pre-annotated reads csv to annotate variants' cell types
12
+ dmsv Find out which SV's supporting reads have differential methylation compared to non-supporting reads.
13
+
14
+ options:
15
+ -h, --help show this help message and exit
16
+ -v, --version show program's version number and exit
Binary file
@@ -0,0 +1,8 @@
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=42",
4
+ "wheel"
5
+ ]
6
+ build-backend = "setuptools.build_meta"
7
+
8
+ [tool.setuptools_scm]
@@ -0,0 +1,42 @@
1
+ [metadata]
2
+ name = sniffcell
3
+ version = 0.5.0
4
+ author = Yilei Fu
5
+ author_email = yilei.fu@bcm.edu
6
+ long_description = file: README.md
7
+ long_description_content_type = text/markdown
8
+ url = https://github.com/Fu-Yilei/SniffCell
9
+ project_urls =
10
+ Bug Tracker = https://github.com/Fu-Yilei/SniffCell/issues
11
+ classifiers =
12
+ Programming Language :: Python :: 3
13
+ License :: OSI Approved :: MIT License
14
+ Operating System :: OS Independent
15
+
16
+ [options]
17
+ package_dir =
18
+ = src
19
+ packages = find:
20
+ python_requires = >=3.10
21
+ install_requires =
22
+ pysam>=0.21.0
23
+ edlib>=1.3.9
24
+ psutil>=5.9.4
25
+ numpy>=2.2.0
26
+ pandas>=2.3.0
27
+ scipy
28
+ tqdm
29
+ scikit-learn
30
+ include_package_data = True
31
+
32
+ [options.packages.find]
33
+ where = src
34
+
35
+ [options.entry_points]
36
+ console_scripts =
37
+ sniffcell = sniffcell.main:main
38
+
39
+ [egg_info]
40
+ tag_build =
41
+ tag_date = 0
42
+
@@ -0,0 +1,14 @@
1
from setuptools import setup, find_packages


def _read_long_description():
    # Read README.md with an explicit encoding and a context manager so the
    # file handle is closed deterministically (the original used a bare
    # open(...).read(), leaking the handle and relying on the locale encoding).
    with open('README.md', encoding='utf-8') as fh:
        return fh.read()


setup(
    name='sniffcell',
    version='0.5.0',
    packages=find_packages(),
    url='https://github.com/Fu-Yilei/SniffCell',
    license='MIT',
    author='Yilei Fu',
    author_email='yilei.fu@bcm.edu',
    description='SniffCell: Annotate SVs cell type based on CpG methylation',
    long_description=_read_long_description(),
    long_description_content_type='text/markdown',
)
@@ -0,0 +1 @@
1
+ __version__ = "0.5.0"
File without changes
@@ -0,0 +1,230 @@
1
+ import os
2
+ import pandas as pd
3
+ from sniffcell.anno.kmeans import kmeans_cluster_cells
4
+ from sniffcell.anno.methyl_matrix import methyl_matrix_from_bam
5
+ from sniffcell.anno.filter_bed_based_on_variants import filter_bed_based_on_variants
6
+ from sniffcell.anno.vcf_to_df import read_vcf_to_df
7
+ from sniffcell.anno.variant_assignment import assign_sv_celltypes
8
+ from tqdm import tqdm
9
+ import multiprocessing as mp
10
+ import numpy as np
11
+ import logging
12
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(processName)s] %(levelname)s: %(message)s")
13
+
14
def _one_dmr(args):
    """Classify the reads of a single DMR as the target cell type vs. "Other".

    Parameters
    ----------
    args : tuple
        ``(row_dict, input_bam, reference_fasta)`` where ``row_dict`` is one
        filtered BED row carrying ``chr``/``start``/``end``/``best_group``
        and, optionally, ``best_dir`` plus per-cell-type ``mean_*`` columns.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame] | None
        ``(assign_df, state_df)`` — per-read assignments (indexed by read
        name) and a one-row per-block methylation summary — or ``None`` when
        the region yields no CpGs, fewer than two usable reads, or any
        processing error occurs.
    """
    logger = logging.getLogger("anno._one_dmr")

    row, input_file, reference = args
    chrom = str(row["chr"])
    start = int(row["start"])
    end = int(row["end"])

    best_group = str(row["best_group"])
    best_dir = row.get("best_dir", None)

    # Collect arbitrary cell types from mean_* columns in THIS row;
    # the aggregate columns (mean_best_value etc.) are not cell types.
    cell_types = []
    for k in row:
        if isinstance(k, str) and k.startswith("mean_") and k not in ("mean_best_value", "mean_rest_value", "mean_margin"):
            cell_types.append(k[len("mean_"):])
    if best_group not in cell_types:
        cell_types.append(best_group)
    cell_types = sorted(dict.fromkeys(cell_types))  # de-duplicate, stable order

    try:
        # Load methylation matrix (reads x CpGs) + CpG positions.
        mm, cpgs = methyl_matrix_from_bam(
            input_file, reference, chrom=chrom, start=start, end=end, return_positions=True
        )
        n_reads_raw = 0 if mm is None else mm.shape[0]
        n_cpgs = len(cpgs)
        if n_cpgs == 0:
            logger.warning(f"[{chrom}:{start}-{end}] no CpGs found; skipping")
            return None
        if mm is None:
            # BUGFIX: previously fell through to mm.dropna() and raised
            # AttributeError (masked by the broad except below).
            logger.warning(f"[{chrom}:{start}-{end}] no methylation matrix returned; skipping")
            return None

        # Drop reads that are entirely NaN across CpGs; need >= 2 reads to cluster.
        mm = mm.dropna(how="all")
        if mm.empty or mm.shape[0] < 2:
            logger.warning(f"[{chrom}:{start}-{end}] usable_reads={mm.shape[0] if not mm.empty else 0} "
                           f"(raw={n_reads_raw}) < 2; skipping")
            return None

        # K-means wrapper assigns each read to the target cell type vs "Other".
        dmr_row = {
            "best_group": best_group,
            "best_dir": best_dir,
            "mean_best_value": row.get("mean_best_value", np.nan),
            "mean_rest_value": row.get("mean_rest_value", np.nan),
        }
        out = kmeans_cluster_cells(mm, dmr_row=dmr_row)

        # CpG bounds actually covered by the matrix.
        cpgstart = int(cpgs[0])
        cpgend = int(cpgs[-1])

        # Read names from MultiIndex level "read_name" if present; otherwise
        # fall back to the plain index (or synthetic names for numeric indices).
        if isinstance(mm.index, pd.MultiIndex) and "read_name" in mm.index.names:
            readnames = mm.index.get_level_values("read_name").astype(str).values
        else:
            readnames = (mm.index.astype(str).values if mm.index.dtype == object
                         else np.array([f"read_{i}" for i in range(len(mm))], dtype=str))

        # Boolean mask of reads assigned to the target cell type.
        mask_target = (out["celltype_or_other"].astype(str).str.lower()
                       == best_group.strip().lower()).values

        # Build one-hot-style code strings over the sorted cell types:
        # target reads get a single 1 at the target position, others the inverse.
        pos = {ct: i for i, ct in enumerate(cell_types)}
        t_idx = pos[best_group]
        target_bits = ["0"] * len(cell_types)
        target_bits[t_idx] = "1"
        other_bits = ["1"] * len(cell_types)
        other_bits[t_idx] = "0"
        target_code = "".join(target_bits)
        other_code = "".join(other_bits)
        code_col = np.where(mask_target, target_code, other_code)

        # --- per-read assignments (each read = one row / index) ---
        assign_df = pd.DataFrame({
            "chr": chrom,
            "start": start,
            "end": end,
            "cpgstart": cpgstart,
            "cpgend": cpgend,
            "code_order": "|".join(cell_types),
            "code": code_col,
        }, index=pd.Index(readnames, name="readname"))

        # Per-block means: per-read mean methylation (NaNs imputed with the
        # column means) averaged within the target vs other partitions.
        X_imp = mm.astype(float).fillna(mm.astype(float).mean())
        read_mean = X_imp.mean(axis=1).values
        tgt_mean = float(np.nanmean(read_mean[mask_target])) if mask_target.any() else np.nan
        oth_mean = float(np.nanmean(read_mean[~mask_target])) if (~mask_target).any() else np.nan

        state_payload = {
            "chr": chrom, "start": start, "end": end,
            "cpgstart": cpgstart, "cpgend": cpgend,
        }
        for ct in cell_types:
            state_payload[f"{ct}_methylation"] = tgt_mean if ct == best_group else oth_mean
        state_df = pd.DataFrame([state_payload])

        return assign_df, state_df

    except Exception:
        # logger.exception records the traceback; any failure skips this DMR.
        logger.exception(f"[{chrom}:{start}-{end}] failed with error")
        return None
121
+
122
def sv_anno(args):
    """Assign cell types to structural variants from per-read classifications.

    Loads the SV VCF and the reads-classification table, delegates the
    matching to ``assign_sv_celltypes``, and writes ``sv_assignment.tsv``
    under ``args.output``.
    """
    logger = logging.getLogger("anno.sv_anno")
    logger.info("Starting SV annotation from pre-annotated reads")

    # In "svanno" mode the classification table is supplied directly by the
    # user; otherwise it is the table produced earlier in the anno pipeline.
    input_file = (args.input if args.command == "svanno"
                  else os.path.join(args.output, "reads_classification.tsv"))

    if args.kanpig_read_names is None:
        logger.info("No kanpig read names provided; using Sniffles read names from VCF")
    else:
        logger.info(f"Using kanpig read names from: {args.kanpig_read_names}")

    variants_df = read_vcf_to_df(args.vcf, kanpig_read_names=args.kanpig_read_names)
    reads_df = pd.read_csv(input_file, sep="\t", index_col=0)
    sv_assignment_df = assign_sv_celltypes(variants_df, reads_df)
    sv_assignment_df.to_csv(os.path.join(args.output, "sv_assignment.tsv"), sep="\t", index=False)
135
+
136
+
137
+
138
+
139
def anno_main(args):
    """Entry point for the `anno` subcommand.

    Filters DMRs near SV breakpoints, classifies each DMR's reads in a
    multiprocessing pool (via ``_one_dmr``), streams the per-read and
    per-block results to TSV files under ``args.output``, then runs
    ``sv_anno`` on the per-read table to annotate the SVs themselves.

    Expects ``args`` to provide: bed, output, input (BAM), reference,
    threads, window, vcf (and the attributes consumed by ``sv_anno``).
    Raises ValueError when the BED lacks a required column.
    """
    # print(args)
    # return
    logger = logging.getLogger("anno.main")

    bed_file = args.bed
    base_out = args.output  # writes <output>.reads.tsv and <output>.blocks.tsv
    input_file = args.input
    reference = args.reference
    threads = int(args.threads)
    window = int(args.window)
    logger.info(f"Starting annotation: bed={bed_file} bam={input_file} ref={reference} "
                f"threads={threads} out_base={base_out}")

    # Output paths
    reads_out = os.path.join(base_out, "reads_classification.tsv")
    blocks_out = os.path.join(base_out, "blocks_classification.tsv")

    # Load and (optionally) filter BED
    bed = pd.read_csv(bed_file, sep="\t")
    logger.info(f"Loaded BED with {len(bed)} DMR rows")

    # Keep only DMRs with an SV breakpoint inside the +/-window padding
    # (but not inside the DMR itself) — see filter_bed_based_on_variants.
    sv_df = read_vcf_to_df(args.vcf)
    filtered_bed = filter_bed_based_on_variants(bed, sv_df=sv_df, window=window)

    # Validate the schema _one_dmr relies on before spawning workers.
    for col in ["chr", "start", "end", "best_group", "best_dir"]:
        if col not in filtered_bed.columns:
            logger.error(f"BED missing required column: {col}")
            raise ValueError(f"BED missing required column: {col}")

    n_tasks = len(filtered_bed)
    logger.info(f"Filtered BED to {n_tasks} DMRs after variant overlap filtering, window size = {window}")

    # One task per DMR row; rows are passed as plain dicts so they pickle cheaply.
    tasks = [(dict(row), input_file, reference) for _, row in filtered_bed.iterrows()]

    # --- Prepare outputs: truncate files and reset header flags ---
    # We'll only write headers on the first real chunk for each file.
    open(reads_out, "w").close()
    open(blocks_out, "w").close()
    reads_header_written = False
    blocks_header_written = False
    blocks_cols_locked: list[str] | None = None  # we lock schema to the first block we see

    # Stream results and append immediately (keeps memory flat regardless of
    # how many DMRs there are; imap preserves task order).
    with mp.Pool(threads) as pool:
        for res in tqdm(pool.imap(_one_dmr, tasks, chunksize=1),
                        total=n_tasks, desc="Processing DMRs"):
            if res is None:
                # _one_dmr skipped or failed on this DMR; nothing to write.
                continue

            a_df, s_df = res

            # --- APPEND READS ---
            if a_df is not None and not a_df.empty:
                if not reads_header_written:
                    # first write: include header and index (readname)
                    a_df.to_csv(reads_out, sep="\t", index=True, mode="a", header=True)
                    reads_header_written = True
                else:
                    a_df.to_csv(reads_out, sep="\t", index=True, mode="a", header=False)

            # --- APPEND BLOCKS (variable columns across DMRs) ---
            if s_df is not None and not s_df.empty:
                if not blocks_header_written:
                    # lock the schema to the first encountered block columns
                    blocks_cols_locked = list(s_df.columns)
                    s_df.to_csv(blocks_out, sep="\t", index=False, mode="a", header=True)
                    blocks_header_written = True
                else:
                    # align columns to locked header; drop extras, add missing as NaN
                    assert blocks_cols_locked is not None
                    s_df_aligned = s_df.reindex(columns=blocks_cols_locked)
                    s_df_aligned.to_csv(blocks_out, sep="\t", index=False, mode="a", header=False)

    # If nothing was written, emit empty files with headers to be friendly downstream
    if not reads_header_written:
        empty_reads = pd.DataFrame(
            columns=["chr","start","end","cpgstart","cpgend","code_order","code"]
        )
        empty_reads.index.name = "readname"
        empty_reads.to_csv(reads_out, sep="\t", index=True, header=True)
        logger.warning("No per-read assignments generated; wrote empty reads header only")

    if not blocks_header_written:
        pd.DataFrame(columns=["chr","start","end","cpgstart","cpgend"]).to_csv(
            blocks_out, sep="\t", index=False, header=True
        )
        logger.warning("No block states generated; wrote empty blocks header only")
    # Second pipeline stage: annotate SV cell types from the reads table just written.
    sv_anno(args)
    logger.info("Annotation complete")
229
+
230
+
@@ -0,0 +1,85 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
def filter_bed_based_on_variants(bed_df: pd.DataFrame, sv_df: pd.DataFrame, window: int = 5000) -> pd.DataFrame:
    """Keep BED rows with an SV breakpoint in the +/-window padding but NOT inside the core interval.

    Parameters
    ----------
    bed_df : pd.DataFrame
        DMR table with at least ``chr``/``start``/``end`` (0-based, half-open).
    sv_df : pd.DataFrame
        SV table with ``chr``/``ref_start``/``ref_end`` (1-based, inclusive,
        VCF-style) and a ``supporting_reads`` column.
    window : int
        Padding in bp added on each side of every BED interval.

    Returns
    -------
    pd.DataFrame
        The subset of the ORIGINAL ``bed_df`` rows (original index preserved)
        where at least one SV overlaps the padded interval while no SV
        overlaps the core interval (closed-interval test, so a breakpoint
        touching the edge counts as inside the core).

    Raises
    ------
    ValueError
        If ``sv_df`` lacks the ``supporting_reads`` column.
        (BUGFIX: previously called exit(), raising SystemExit from a library
        function.)
    """
    if 'supporting_reads' not in sv_df.columns:
        raise ValueError("VCF DataFrame missing 'supporting_reads' column required for SV assignment.")

    # Work on positional copies. BUGFIX: the original indexed the positional
    # out_mask with bed_df's labels (out_mask[bdf.index]), which is wrong for
    # any caller whose BED does not have a default RangeIndex.
    bed = bed_df.reset_index(drop=True).copy()
    sv = sv_df.copy()

    # Normalize chromosome naming (strip 'chr' prefix on both sides).
    def _norm_chr(x):
        x = str(x)
        return x[3:] if x.startswith('chr') else x
    bed['chr'] = bed['chr'].map(_norm_chr)
    sv['chr'] = sv['chr'].map(_norm_chr)

    # Enforce integer dtype for the coordinate arithmetic below.
    bed[['start', 'end']] = bed[['start', 'end']].astype(np.int64)
    sv[['ref_start', 'ref_end']] = sv[['ref_start', 'ref_end']].astype(np.int64)

    # Convert VCF (1-based inclusive) -> BED-style 0-based half-open:
    # start0 = ref_start - 1; end0_exclusive = ref_end.
    sv_start0 = sv['ref_start'].to_numpy(np.int64) - 1
    sv_end0 = sv['ref_end'].to_numpy(np.int64).copy()  # copy before in-place clamp

    # Handle insertions robustly: if END < POS (some callers), clamp to a
    # 1-bp interval at POS.
    bad = sv_end0 < (sv_start0 + 1)
    if np.any(bad):
        sv_end0[bad] = sv_start0[bad] + 1

    sv = sv.assign(_start=sv_start0, _end=sv_end0)

    # Categorical chr speeds up the per-chromosome groupbys.
    bed['chr'] = bed['chr'].astype('category')
    sv['chr'] = sv['chr'].astype('category')

    out_mask = np.zeros(len(bed), dtype=bool)

    # Per-chromosome sorted start/end arrays for the searchsorted overlap trick.
    sv_idx = {}
    for chrom, sdf in sv.groupby('chr', sort=False, observed=False):
        starts = np.sort(sdf['_start'].to_numpy(np.int64), kind='mergesort')
        # Ends sorted independently: the counting argument below only needs
        # the two marginal distributions, not paired intervals.
        ends_sorted = np.sort(sdf['_end'].to_numpy(np.int64), kind='mergesort')
        sv_idx[chrom] = (starts, ends_sorted)

    for chrom, bdf in bed.groupby('chr', sort=False, observed=False):
        if chrom not in sv_idx:
            continue
        starts, ends_sorted = sv_idx[chrom]
        if starts.size == 0:
            continue

        # Core (original) BED interval
        core_start = bdf['start'].to_numpy(np.int64)
        core_end = bdf['end'].to_numpy(np.int64)

        # Padded interval
        bed_starts = core_start - window
        bed_ends = core_end + window

        # 1) Overlap with the *padded* interval (half-open):
        #    count SVs with start < padded_end minus SVs with end <= padded_start.
        n_start_lt_end_pad = np.searchsorted(starts, bed_ends, side='left')
        n_end_le_start_pad = np.searchsorted(ends_sorted, bed_starts, side='right')
        overlap_padded = (n_start_lt_end_pad - n_end_le_start_pad) > 0

        # 2) Overlap with the *core* interval, closed-interval style so a
        #    breakpoint touching the edge counts as inside:
        #    sv_start <= core_end AND sv_end >= core_start.
        n_start_le_end_core = np.searchsorted(starts, core_end, side='right')
        n_end_lt_start_core = np.searchsorted(ends_sorted, core_start, side='left')
        overlap_core = (n_start_le_end_core - n_end_lt_start_core) > 0

        # 3) Keep rows with an SV in the padding but NOT in the core.
        #    bdf.index is positional here thanks to the reset_index above.
        out_mask[bdf.index.to_numpy()] = overlap_padded & (~overlap_core)

    # Boolean-mask selection on the ORIGINAL frame preserves its index/labels.
    return bed_df.loc[out_mask]