lanctools 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lanctools-0.0.0/.gitignore +13 -0
- lanctools-0.0.0/.python-version +1 -0
- lanctools-0.0.0/CMakeLists.txt +19 -0
- lanctools-0.0.0/Makefile +20 -0
- lanctools-0.0.0/PKG-INFO +69 -0
- lanctools-0.0.0/README.md +56 -0
- lanctools-0.0.0/make.bat +35 -0
- lanctools-0.0.0/pyproject.toml +61 -0
- lanctools-0.0.0/source/conf.py +38 -0
- lanctools-0.0.0/source/index.rst +17 -0
- lanctools-0.0.0/source/lanctools.rst +8 -0
- lanctools-0.0.0/src/lanctools/__init__.py +4 -0
- lanctools-0.0.0/src/lanctools/_cpp/bindings.cpp +11 -0
- lanctools-0.0.0/src/lanctools/_cpp/flare.cpp +261 -0
- lanctools-0.0.0/src/lanctools/_cpp/flare.hpp +7 -0
- lanctools-0.0.0/src/lanctools/_cpp/rfmix.cpp +174 -0
- lanctools-0.0.0/src/lanctools/_cpp/rfmix.hpp +7 -0
- lanctools-0.0.0/src/lanctools/cli.py +93 -0
- lanctools-0.0.0/src/lanctools/core.py +372 -0
- lanctools-0.0.0/tests/test_lanctools.py +195 -0
- lanctools-0.0.0/uv.lock +1086 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.15)
|
|
2
|
+
project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX)
|
|
3
|
+
|
|
4
|
+
set(CMAKE_CXX_STANDARD 17)
|
|
5
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
6
|
+
|
|
7
|
+
set(PYBIND11_FINDPYTHON ON)
|
|
8
|
+
find_package(pybind11 CONFIG REQUIRED)
|
|
9
|
+
find_package(ZLIB REQUIRED)
|
|
10
|
+
|
|
11
|
+
pybind11_add_module(_cpp
|
|
12
|
+
src/lanctools/_cpp/bindings.cpp
|
|
13
|
+
src/lanctools/_cpp/flare.cpp
|
|
14
|
+
src/lanctools/_cpp/rfmix.cpp
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
install(TARGETS _cpp DESTINATION ${SKBUILD_PROJECT_NAME})
|
|
18
|
+
|
|
19
|
+
target_link_libraries(_cpp PRIVATE ZLIB::ZLIB)
|
lanctools-0.0.0/Makefile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Minimal makefile for Sphinx documentation
|
|
2
|
+
#
|
|
3
|
+
|
|
4
|
+
# You can set these variables from the command line, and also
|
|
5
|
+
# from the environment for the first two.
|
|
6
|
+
SPHINXOPTS ?=
|
|
7
|
+
SPHINXBUILD ?= sphinx-build
|
|
8
|
+
SOURCEDIR = source
|
|
9
|
+
BUILDDIR = build
|
|
10
|
+
|
|
11
|
+
# Put it first so that "make" without argument is like "make help".
|
|
12
|
+
help:
|
|
13
|
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
14
|
+
|
|
15
|
+
.PHONY: help Makefile
|
|
16
|
+
|
|
17
|
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
|
18
|
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
|
19
|
+
%: Makefile
|
|
20
|
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
lanctools-0.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: lanctools
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
Author-Email: Frank Ockerman <frank.ockerman@gmail.com>
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: numba>=0.63.1
|
|
8
|
+
Requires-Dist: numpy>=2.3.5
|
|
9
|
+
Requires-Dist: pandas>=2.3.3
|
|
10
|
+
Requires-Dist: pgenlib>=0.93.0
|
|
11
|
+
Requires-Dist: typer>=0.20.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# lanctools
|
|
15
|
+
|
|
16
|
+
Tools for working with local ancestry data in the `.lanc` file format.
|
|
17
|
+
This package contains two main components:
|
|
18
|
+
|
|
19
|
+
1. A function (and corresponding CLI) for converting FLARE and RFMix files to
|
|
20
|
+
the `.lanc` format
|
|
21
|
+
2. A `LancData` class with methods for fast querying of local ancestry and
|
|
22
|
+
local ancestry-masked genotypes
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
pip install lanctools
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quickstart
|
|
31
|
+
|
|
32
|
+
To load and query local ancestry data for a set of variants:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
import numpy as np
|
|
36
|
+
|
|
37
|
+
from lanctools import LancData
|
|
38
|
+
|
|
39
|
+
ld = LancData(
|
|
40
|
+
plink_prefix="chr1",
|
|
41
|
+
lanc_file="chr1.lanc",
|
|
42
|
+
ancestries=["YRI", "CEU"]
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
idx_var = np.arange(100, dtype=np.uint32)
|
|
46
|
+
|
|
47
|
+
lanc = ld.get_lanc(idx_var) # (N, 100, 2): phased local ancestry
|
|
48
|
+
geno = ld.get_geno(idx_var) # (N, 100, 2): phased genotypes
|
|
49
|
+
lanc_geno = ld.get_lanc_geno(idx_var) # (N, 100, len(ancestries))
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
To convert a FLARE (or RFMix) local ancestry file to `.lanc`:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
from lanctools import convert_to_lanc
|
|
56
|
+
|
|
57
|
+
convert_to_lanc(
|
|
58
|
+
file="chr1.anc.vcf.gz",
|
|
59
|
+
file_fmt="FLARE",
|
|
60
|
+
plink_prefix="chr1",
|
|
61
|
+
output="chr1.lanc"
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
To perform the above conversion using the CLI tool:
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
lanctools convert-flare --file chr1.anc.vcf.gz --plink_prefix chr1 --output chr1.lanc
|
|
69
|
+
```
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# lanctools
|
|
2
|
+
|
|
3
|
+
Tools for working with local ancestry data in the `.lanc` file format.
|
|
4
|
+
This package contains two main components:
|
|
5
|
+
|
|
6
|
+
1. A function (and corresponding CLI) for converting FLARE and RFMix files to
|
|
7
|
+
the `.lanc` format
|
|
8
|
+
2. A `LancData` class with methods for fast querying of local ancestry and
|
|
9
|
+
local ancestry-masked genotypes
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
pip install lanctools
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quickstart
|
|
18
|
+
|
|
19
|
+
To load and query local ancestry data for a set of variants:
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
from lanctools import LancData
|
|
25
|
+
|
|
26
|
+
ld = LancData(
|
|
27
|
+
plink_prefix="chr1",
|
|
28
|
+
lanc_file="chr1.lanc",
|
|
29
|
+
ancestries=["YRI", "CEU"]
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
idx_var = np.arange(100, dtype=np.uint32)
|
|
33
|
+
|
|
34
|
+
lanc = ld.get_lanc(idx_var) # (N, 100, 2): phased local ancestry
|
|
35
|
+
geno = ld.get_geno(idx_var) # (N, 100, 2): phased genotypes
|
|
36
|
+
lanc_geno = ld.get_lanc_geno(idx_var) # (N, 100, len(ancestries))
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
To convert a FLARE (or RFMix) local ancestry file to `.lanc`:
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
from lanctools import convert_to_lanc
|
|
43
|
+
|
|
44
|
+
convert_to_lanc(
|
|
45
|
+
file="chr1.anc.vcf.gz",
|
|
46
|
+
file_fmt="FLARE",
|
|
47
|
+
plink_prefix="chr1",
|
|
48
|
+
output="chr1.lanc"
|
|
49
|
+
)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
To perform the above conversion using the CLI tool:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
lanctools convert-flare --file chr1.anc.vcf.gz --plink_prefix chr1 --output chr1.lanc
|
|
56
|
+
```
|
lanctools-0.0.0/make.bat
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
@ECHO OFF
|
|
2
|
+
|
|
3
|
+
pushd %~dp0
|
|
4
|
+
|
|
5
|
+
REM Command file for Sphinx documentation
|
|
6
|
+
|
|
7
|
+
if "%SPHINXBUILD%" == "" (
|
|
8
|
+
set SPHINXBUILD=sphinx-build
|
|
9
|
+
)
|
|
10
|
+
set SOURCEDIR=source
|
|
11
|
+
set BUILDDIR=build
|
|
12
|
+
|
|
13
|
+
%SPHINXBUILD% >NUL 2>NUL
|
|
14
|
+
if errorlevel 9009 (
|
|
15
|
+
echo.
|
|
16
|
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
|
17
|
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
|
18
|
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
|
19
|
+
echo.may add the Sphinx directory to PATH.
|
|
20
|
+
echo.
|
|
21
|
+
echo.If you don't have Sphinx installed, grab it from
|
|
22
|
+
echo.https://www.sphinx-doc.org/
|
|
23
|
+
exit /b 1
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
if "%1" == "" goto help
|
|
27
|
+
|
|
28
|
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
|
29
|
+
goto end
|
|
30
|
+
|
|
31
|
+
:help
|
|
32
|
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
|
33
|
+
|
|
34
|
+
:end
|
|
35
|
+
popd
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "lanctools"
|
|
3
|
+
version = "0.0.0"
|
|
4
|
+
description = "Add your description here"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [{ name = "Frank Ockerman", email = "frank.ockerman@gmail.com" }]
|
|
7
|
+
requires-python = ">=3.12"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"numba>=0.63.1",
|
|
10
|
+
"numpy>=2.3.5",
|
|
11
|
+
"pandas>=2.3.3",
|
|
12
|
+
"pgenlib>=0.93.0",
|
|
13
|
+
"typer>=0.20.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[tool.scikit-build]
|
|
17
|
+
minimum-version = "build-system.requires"
|
|
18
|
+
build-dir = "build/{wheel_tag}"
|
|
19
|
+
|
|
20
|
+
wheel.exclude = ["data", "result"]
|
|
21
|
+
sdist.exclude = ["data", "result"]
|
|
22
|
+
|
|
23
|
+
[tool.uv]
|
|
24
|
+
cache-keys = [
|
|
25
|
+
{ file = "pyproject.toml" },
|
|
26
|
+
{ file = "src/**/*.{h,c,hpp,cpp}" },
|
|
27
|
+
{ file = "CMakeLists.txt" },
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[build-system]
|
|
31
|
+
requires = ["scikit-build-core>=0.10", "pybind11"]
|
|
32
|
+
build-backend = "scikit_build_core.build"
|
|
33
|
+
|
|
34
|
+
[dependency-groups]
|
|
35
|
+
dev = [
|
|
36
|
+
"ipython>=9.6.0",
|
|
37
|
+
"pytest>=9.0.2",
|
|
38
|
+
"pytest-cov>=7.0.0",
|
|
39
|
+
"sphinx>=7,<9",
|
|
40
|
+
"myst-parser>=2.0.0",
|
|
41
|
+
"sphinx-autodoc-typehints>=3.5.2",
|
|
42
|
+
"sphinx-rtd-theme>=3.0.2",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["src"]
|
|
47
|
+
|
|
48
|
+
[project.scripts]
|
|
49
|
+
lanctools = "lanctools.cli:main_entry"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
python_files = ["test_*.py"]
|
|
55
|
+
python_functions = ["test_*"]
|
|
56
|
+
addopts = [
|
|
57
|
+
"--strict-markers",
|
|
58
|
+
"--strict-config",
|
|
59
|
+
"--cov=src",
|
|
60
|
+
"--cov-report=term-missing",
|
|
61
|
+
]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Configuration file for the Sphinx documentation builder.
|
|
2
|
+
#
|
|
3
|
+
# For the full list of built-in configuration values, see the documentation:
|
|
4
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
|
5
|
+
|
|
6
|
+
# -- Project information -----------------------------------------------------
|
|
7
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
|
|
11
|
+
sys.path.insert(0, os.path.abspath(".."))
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
project = "lanctools"
|
|
15
|
+
copyright = "2025, Franklin Ockerman"
|
|
16
|
+
author = "Franklin Ockerman"
|
|
17
|
+
release = "0.0.0"
|
|
18
|
+
|
|
19
|
+
# -- General configuration ---------------------------------------------------
|
|
20
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
|
21
|
+
|
|
22
|
+
extensions = [
|
|
23
|
+
"sphinx.ext.autodoc",
|
|
24
|
+
"sphinx.ext.napoleon",
|
|
25
|
+
"sphinx_autodoc_typehints",
|
|
26
|
+
"myst_parser",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
templates_path = ["_templates"]
|
|
31
|
+
exclude_patterns = []
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# -- Options for HTML output -------------------------------------------------
|
|
35
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
|
36
|
+
|
|
37
|
+
html_theme = "sphinx_rtd_theme"
|
|
38
|
+
html_static_path = ["_static"]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
.. lanctools documentation master file, created by
|
|
2
|
+
sphinx-quickstart on Sat Dec 20 23:18:55 2025.
|
|
3
|
+
You can adapt this file completely to your liking, but it should at least
|
|
4
|
+
contain the root `toctree` directive.
|
|
5
|
+
|
|
6
|
+
lanctools
|
|
7
|
+
=========
|
|
8
|
+
|
|
9
|
+
.. include:: ../README.md
|
|
10
|
+
:parser: myst_parser.sphinx_
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
.. toctree::
|
|
14
|
+
:maxdepth: 2
|
|
15
|
+
:caption: Contents:
|
|
16
|
+
|
|
17
|
+
lanctools
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
#include "flare.hpp"
|
|
2
|
+
#include <cstdint>
|
|
3
|
+
#include <pybind11/stl.h>
|
|
4
|
+
#include <sstream>
|
|
5
|
+
#include <stdexcept>
|
|
6
|
+
#include <string>
|
|
7
|
+
#include <unordered_map>
|
|
8
|
+
#include <vector>
|
|
9
|
+
#include <zlib.h>
|
|
10
|
+
|
|
11
|
+
namespace py = pybind11;
|
|
12
|
+
|
|
13
|
+
/// One contiguous stretch of a chromosome over which a sample carries a
/// constant pair of per-haplotype ancestry codes.
///
/// The type stays an aggregate, so `{chrom, spos, epos, anc0, anc1}` brace
/// initialization keeps working. The default member initializers only affect
/// default-constructed instances, which previously held indeterminate values.
struct AncestryTract {
  std::string chrom;  ///< Chromosome name.
  uint32_t spos = 0;  ///< First position covered by the tract.
  uint32_t epos = 0;  ///< Last position covered by the tract.
  uint8_t anc0 = 255; ///< Ancestry code of the first haplotype (255 = missing).
  uint8_t anc1 = 255; ///< Ancestry code of the second haplotype (255 = missing).
};
|
|
20
|
+
|
|
21
|
+
/// Per-sample ancestry calls parsed from a single VCF data line.
///
/// `anc0[i]` / `anc1[i]` hold the two haplotype ancestry codes of sample `i`;
/// 255 marks a missing call.
struct VCFRecord {
  std::string chrom;         ///< Chromosome name of the record.
  uint32_t pos = 0;          ///< Position of the record.
  std::vector<uint8_t> anc0; ///< First-haplotype ancestry code per sample.
  std::vector<uint8_t> anc1; ///< Second-haplotype ancestry code per sample.

  /// Create a record for `n_samples` samples with every call set to missing.
  /// `explicit` so that an integer is never silently converted to a record.
  explicit VCFRecord(size_t n_samples)
      : anc0(n_samples, 255), anc1(n_samples, 255) {}
};
|
|
29
|
+
|
|
30
|
+
/// Read one line from an open gzip stream and return it without its trailing
/// line terminator (any run of '\n' and '\r' at the end is removed).
///
/// Lines longer than the internal chunk are assembled from several gzgets()
/// calls. An empty string is returned both when gzgets() yields nothing
/// (e.g. at end of input) and for a blank line, so callers cannot tell those
/// cases apart.
std::string gz_readline(gzFile file) {
  constexpr size_t kChunkBytes = 65536;
  char chunk[kChunkBytes];
  std::string text;

  for (;;) {
    if (!gzgets(file, chunk, kChunkBytes))
      break;
    text.append(chunk);
    const bool line_complete = !text.empty() && text.back() == '\n';
    if (line_complete)
      break;
  }

  // find_last_not_of() returns npos when the string holds only terminators
  // (or is empty); npos + 1 wraps to 0, so the whole string is cleared.
  text.erase(text.find_last_not_of("\r\n") + 1);
  return text;
}
|
|
46
|
+
|
|
47
|
+
/// Split `s` on every occurrence of `delim`.
///
/// Empty fields between delimiters are kept, but a trailing empty field
/// (input ending in `delim`) is not reported, and an empty input yields an
/// empty vector.
std::vector<std::string> split(const std::string &s, char delim) {
  std::vector<std::string> parts;
  std::string::size_type start = 0;

  // Stop as soon as nothing is left after the last delimiter, which is what
  // drops the trailing empty field.
  while (start < s.size()) {
    const std::string::size_type stop = s.find(delim, start);
    if (stop == std::string::npos) {
      parts.emplace_back(s, start);
      break;
    }
    parts.emplace_back(s, start, stop - start);
    start = stop + 1;
  }
  return parts;
}
|
|
56
|
+
|
|
57
|
+
/// Parse the `field_idx`-th ':'-separated sub-field of `sample_field` as an
/// ancestry code.
///
/// Returns 255 ("missing") when the index is negative, the sub-field does not
/// exist or is empty, the text is not an integer, or the value lies outside
/// [0, 255].
static uint8_t parse_ancestry_subfield(const std::string &sample_field,
                                       int field_idx) {
  constexpr uint8_t kMissing = 255;
  if (field_idx < 0)
    return kMissing;

  // Walk to the requested sub-field without materialising every token.
  std::string::size_type begin = 0;
  for (int skipped = 0; skipped < field_idx; ++skipped) {
    const std::string::size_type colon = sample_field.find(':', begin);
    if (colon == std::string::npos)
      return kMissing;
    begin = colon + 1;
  }
  const std::string::size_type end = sample_field.find(':', begin);
  const std::string token = sample_field.substr(
      begin, end == std::string::npos ? std::string::npos : end - begin);

  // Fast path: an empty token or one starting with '.' can never be parsed by
  // std::stoi, so skip the throw/catch round trip for missing calls.
  if (token.empty() || token.front() == '.')
    return kMissing;

  try {
    const int value = std::stoi(token);
    if (value >= 0 && value <= 255)
      return static_cast<uint8_t>(value);
  } catch (const std::exception &) {
    // Non-numeric or out-of-range text is treated as a missing call.
  }
  return kMissing;
}

/// Extract the two haplotype ancestry codes from one per-sample VCF field.
///
/// @param sample_field  The sample column, e.g. "0|1:2:3".
/// @param format_fields Parsed FORMAT keys; unused here, kept for interface
///                      compatibility.
/// @param an1_idx       Index of the AN1 sub-field, or -1 if absent.
/// @param an2_idx       Index of the AN2 sub-field, or -1 if absent.
/// @return {AN1, AN2}; each is 255 when it is missing or cannot be parsed as
///         an integer in [0, 255].
std::pair<uint8_t, uint8_t>
extract_AN1_AN2(const std::string &sample_field,
                [[maybe_unused]] const std::vector<std::string> &format_fields,
                int an1_idx, int an2_idx) {
  return {parse_ancestry_subfield(sample_field, an1_idx),
          parse_ancestry_subfield(sample_field, an2_idx)};
}
|
|
83
|
+
|
|
84
|
+
/// Build a VCFRecord for one data line.
///
/// Sample columns are read from `fields` starting right after `format_idx`;
/// the caller must ensure `fields` holds at least
/// `format_idx + 1 + n_samples` entries. Each sample's AN1/AN2 codes come
/// from extract_AN1_AN2().
VCFRecord construct_vcf_record(const std::vector<std::string> &fields,
                               const std::vector<std::string> &format_fields,
                               int an1_idx, int an2_idx, int format_idx,
                               int n_samples, const std::string &chrom,
                               uint32_t pos) {
  VCFRecord rec(n_samples);
  rec.chrom = chrom;
  rec.pos = pos;

  const int first_sample_col = format_idx + 1;
  for (int s = 0; s < n_samples; ++s) {
    const std::pair<uint8_t, uint8_t> codes = extract_AN1_AN2(
        fields[first_sample_col + s], format_fields, an1_idx, an2_idx);
    rec.anc0[s] = codes.first;
    rec.anc1[s] = codes.second;
  }
  return rec;
}
|
|
100
|
+
|
|
101
|
+
void finalize_open_tracts(
|
|
102
|
+
const std::vector<std::string> &sample_ids,
|
|
103
|
+
const std::vector<uint8_t> &prev_anc,
|
|
104
|
+
const std::vector<uint32_t> &prev_spos,
|
|
105
|
+
std::unordered_map<std::string, std::vector<AncestryTract>> &sample_tracts,
|
|
106
|
+
const std::string &chrom, uint32_t final_pos) {
|
|
107
|
+
for (size_t i = 0; i < sample_ids.size(); ++i) {
|
|
108
|
+
const std::string &sample = sample_ids[i];
|
|
109
|
+
uint8_t anc0 = prev_anc[i * 2];
|
|
110
|
+
uint8_t anc1 = prev_anc[i * 2 + 1];
|
|
111
|
+
if (anc0 != 255 || anc1 != 255) {
|
|
112
|
+
sample_tracts[sample].push_back(
|
|
113
|
+
{chrom, prev_spos[i * 2], final_pos, anc0, anc1});
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
/// Read a (gzip-compressed) FLARE ancestry VCF and collapse the per-variant
/// AN1/AN2 calls of every sample into ancestry tracts.
///
/// A tract is closed whenever either haplotype's ancestry code changes
/// between two consecutive records; the break point is placed at the midpoint
/// between the two positions. Tracts still open when the chromosome changes
/// are closed at the last position seen on that chromosome, and tracts open
/// at the end of the input are closed at the last position read.
///
/// NOTE: gz_readline() returns an empty string for a blank line as well as at
/// end of input, so reading stops at the first blank line.
///
/// @param flare_file Path of the FLARE VCF.
/// @return A dict of equally long lists under the keys "sample", "chrom",
///         "spos", "epos", "anc0" and "anc1", one entry per tract. Tracts of
///         one sample are contiguous and in file order; the order of samples
///         follows the iteration order of an unordered_map.
/// @throws std::runtime_error if the file cannot be opened, the "#CHROM"
///         header line is missing, or the #CHROM/POS/FORMAT columns are not
///         all present. Exceptions thrown by std::stoul on a malformed POS
///         field propagate to the caller.
py::dict read_flare(const std::string &flare_file) {
  // Owns the gz handle so it is closed on every exit path, including
  // exceptions thrown while parsing a record.
  struct GzHandle {
    gzFile handle;
    explicit GzHandle(gzFile h) : handle(h) {}
    GzHandle(const GzHandle &) = delete;
    GzHandle &operator=(const GzHandle &) = delete;
    ~GzHandle() { close(); }
    void close() {
      if (handle) {
        gzclose(handle);
        handle = nullptr;
      }
    }
  };

  GzHandle input(gzopen(flare_file.c_str(), "rb"));
  if (!input.handle)
    throw std::runtime_error("Failed to open input VCF file");

  // Skip the meta-information lines up to the column header.
  std::string line;
  bool found_header = false;
  while (!(line = gz_readline(input.handle)).empty()) {
    if (line.compare(0, 6, "#CHROM") == 0) {
      found_header = true;
      break;
    }
  }
  if (!found_header)
    throw std::runtime_error("Missing #CHROM header line");

  const std::vector<std::string> header_fields = split(line, '\t');
  int chrom_idx = -1, pos_idx = -1, format_idx = -1;
  for (size_t i = 0; i < header_fields.size(); ++i) {
    if (header_fields[i] == "#CHROM")
      chrom_idx = static_cast<int>(i);
    else if (header_fields[i] == "POS")
      pos_idx = static_cast<int>(i);
    else if (header_fields[i] == "FORMAT")
      format_idx = static_cast<int>(i);
  }
  if (chrom_idx == -1 || pos_idx == -1 || format_idx == -1)
    throw std::runtime_error("Missing essential VCF columns");

  // Every column after FORMAT is a sample.
  const std::vector<std::string> sample_ids(
      header_fields.begin() + format_idx + 1, header_fields.end());
  const size_t n_samples = sample_ids.size();
  const size_t min_fields = static_cast<size_t>(format_idx) + 1 + n_samples;

  std::unordered_map<std::string, std::vector<AncestryTract>> sample_tracts;
  // Two slots per sample: [2 * i] and [2 * i + 1].
  std::vector<uint8_t> prev_anc(n_samples * 2, 255);
  std::vector<uint32_t> prev_spos(n_samples * 2, 0);
  uint32_t cur_pos = 0, prev_pos = 0;
  std::string cur_chrom = "chr0";
  bool is_first_record = true;

  // (Re)start the open tract of every sample at `pos` with the calls of
  // `rec`; used for the first record and after each chromosome change.
  const auto open_tracts_at = [&](const VCFRecord &rec, uint32_t pos) {
    for (size_t i = 0; i < n_samples; ++i) {
      prev_spos[i * 2] = pos;
      prev_spos[i * 2 + 1] = pos;
      prev_anc[i * 2] = rec.anc0[i];
      prev_anc[i * 2 + 1] = rec.anc1[i];
    }
  };

  while (!(line = gz_readline(input.handle)).empty()) {
    if (line[0] == '#')
      continue;

    const std::vector<std::string> fields = split(line, '\t');
    if (fields.size() < min_fields)
      continue; // truncated line: not enough sample columns

    const std::string &chrom = fields[chrom_idx];
    const uint32_t pos = static_cast<uint32_t>(std::stoul(fields[pos_idx]));

    // FORMAT is re-read per record, so AN1/AN2 may sit at different indices
    // on different lines.
    const std::vector<std::string> format_fields =
        split(fields[format_idx], ':');
    int an1_idx = -1, an2_idx = -1;
    for (size_t i = 0; i < format_fields.size(); ++i) {
      if (format_fields[i] == "AN1")
        an1_idx = static_cast<int>(i);
      else if (format_fields[i] == "AN2")
        an2_idx = static_cast<int>(i);
    }

    const VCFRecord record = construct_vcf_record(
        fields, format_fields, an1_idx, an2_idx, format_idx,
        static_cast<int>(n_samples), chrom, pos);

    prev_pos = cur_pos;
    cur_pos = pos;

    if (is_first_record) {
      open_tracts_at(record, pos);
      cur_chrom = chrom;
      is_first_record = false;
      continue;
    }

    if (chrom != cur_chrom) {
      // Close the old chromosome's tracts at the last position seen on it.
      // `pos` belongs to the new chromosome and is unrelated to the old
      // chromosome's coordinates (and `pos - 1` would wrap around for 0).
      finalize_open_tracts(sample_ids, prev_anc, prev_spos, sample_tracts,
                           cur_chrom, prev_pos);
      open_tracts_at(record, pos);
      cur_chrom = chrom;
      continue;
    }

    for (size_t i = 0; i < n_samples; ++i) {
      const size_t idx0 = i * 2, idx1 = i * 2 + 1;
      const uint8_t new_anc0 = record.anc0[i];
      const uint8_t new_anc1 = record.anc1[i];
      if (new_anc0 != prev_anc[idx0] || new_anc1 != prev_anc[idx1]) {
        // Place the switch halfway between the two flanking records.
        const uint32_t midpoint = prev_pos + (cur_pos - prev_pos) / 2;
        sample_tracts[sample_ids[i]].push_back(AncestryTract{
            cur_chrom, prev_spos[idx0], midpoint, prev_anc[idx0],
            prev_anc[idx1]});
        prev_spos[idx0] = midpoint + 1;
        prev_anc[idx0] = new_anc0;
        prev_anc[idx1] = new_anc1;
      }
    }
  }
  input.close();

  finalize_open_tracts(sample_ids, prev_anc, prev_spos, sample_tracts,
                       cur_chrom, cur_pos);

  // Flatten the per-sample tracts into parallel columns.
  size_t n_tracts = 0;
  for (const auto &entry : sample_tracts)
    n_tracts += entry.second.size();

  std::vector<std::string> samples, chroms;
  std::vector<uint32_t> spos_vec, epos_vec;
  std::vector<int> anc0_vec, anc1_vec;
  samples.reserve(n_tracts);
  chroms.reserve(n_tracts);
  spos_vec.reserve(n_tracts);
  epos_vec.reserve(n_tracts);
  anc0_vec.reserve(n_tracts);
  anc1_vec.reserve(n_tracts);

  for (const auto &[sample, tracts] : sample_tracts) {
    for (const auto &tract : tracts) {
      samples.push_back(sample);
      chroms.push_back(tract.chrom);
      spos_vec.push_back(tract.spos);
      epos_vec.push_back(tract.epos);
      anc0_vec.push_back(static_cast<int>(tract.anc0));
      anc1_vec.push_back(static_cast<int>(tract.anc1));
    }
  }

  py::dict result;
  result["sample"] = samples;
  result["chrom"] = chroms;
  result["spos"] = spos_vec;
  result["epos"] = epos_vec;
  result["anc0"] = anc0_vec;
  result["anc1"] = anc1_vec;

  return result;
}
|
|
258
|
+
|
|
259
|
+
/// Register the FLARE reader on the given pybind11 module under the Python
/// name `read_flare`.
void bind_flare(py::module_ &m) {
  const char *docstring = "Read FLARE VCF and return ancestry tracts";
  m.def("read_flare", &read_flare, docstring);
}
|