lanctools 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Misc
13
+ .coverage
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,19 @@
1
# Build configuration for the lanctools C++ extension module (`_cpp`).
#
# The build is normally driven by scikit-build-core, which defines
# SKBUILD_PROJECT_NAME. The version range keeps 3.15 as the minimum while
# opting in to policy behaviour up to the newest tested release.
cmake_minimum_required(VERSION 3.15...3.29)

# When configured directly with CMake (IDE, manual debugging) the variable is
# undefined and would expand to nothing inside project() and install(), so
# fall back to the package name.
if(NOT DEFINED SKBUILD_PROJECT_NAME)
  set(SKBUILD_PROJECT_NAME lanctools)
endif()

project("${SKBUILD_PROJECT_NAME}" LANGUAGES CXX)

# Plain ISO C++17 (-std=c++17 rather than -std=gnu++17).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Let pybind11 locate Python through the modern FindPython module.
set(PYBIND11_FINDPYTHON ON)
find_package(pybind11 CONFIG REQUIRED)
find_package(ZLIB REQUIRED)

pybind11_add_module(_cpp
  src/lanctools/_cpp/bindings.cpp
  src/lanctools/_cpp/flare.cpp
  src/lanctools/_cpp/rfmix.cpp
)

# zlib is only used inside the extension's own sources.
target_link_libraries(_cpp PRIVATE ZLIB::ZLIB)

# Place the compiled module inside the Python package directory of the wheel.
install(TARGETS _cpp DESTINATION "${SKBUILD_PROJECT_NAME}")
@@ -0,0 +1,20 @@
1
+ # Minimal makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line, and also
5
+ # from the environment for the first two.
6
+ SPHINXOPTS ?=
7
+ SPHINXBUILD ?= sphinx-build
8
+ SOURCEDIR = source
9
+ BUILDDIR = build
10
+
11
+ # Put it first so that "make" without argument is like "make help".
12
+ help:
13
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
+
15
+ .PHONY: help Makefile
16
+
17
+ # Catch-all target: route all unknown targets to Sphinx using the new
18
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
+ %: Makefile
20
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,69 @@
1
+ Metadata-Version: 2.1
2
+ Name: lanctools
3
+ Version: 0.0.0
4
+ Summary: Add your description here
5
+ Author-Email: Frank Ockerman <frank.ockerman@gmail.com>
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: numba>=0.63.1
8
+ Requires-Dist: numpy>=2.3.5
9
+ Requires-Dist: pandas>=2.3.3
10
+ Requires-Dist: pgenlib>=0.93.0
11
+ Requires-Dist: typer>=0.20.0
12
+ Description-Content-Type: text/markdown
13
+
14
+ # lanctools
15
+
16
+ Tools for working with local ancestry data in the `.lanc` file format.
17
+ This package contains two main components:
18
+
19
+ 1. A function (and corresponding CLI) for converting FLARE and RFMix files to
20
+ the .lanc format
21
+ 2. A `LancData` class with methods for fast querying of local ancestry and
22
+ local ancestry-masked genotypes
23
+
24
+ ## Installation
25
+
26
+ ```
27
+ pip install lanctools
28
+ ```
29
+
30
+ ## Quickstart
31
+
32
+ To load and query local ancestry data for a set of variants:
33
+
34
+ ```
35
+ import numpy as np
36
+
37
+ from lanctools import LancData
38
+
39
+ ld = LancData(
40
+ plink_prefix="chr1",
41
+ lanc_file="chr1.lanc",
42
+ ancestries=["YRI", "CEU"]
43
+ )
44
+
45
+ idx_var = np.arange(100, dtype=np.uint32)
46
+
47
+ lanc = ld.get_lanc(idx_var) # (N, 100, 2): phased local ancestry
48
+ geno = ld.get_geno(idx_var) # (N, 100, 2): phased genotypes
49
+ lanc_geno = ld.get_lanc_geno(idx_var) # (N, 100, len(ancestries))
50
+ ```
51
+
52
+ To convert a FLARE (or RFMix) local ancestry file to `.lanc`:
53
+
54
+ ```
55
+ from lanctools import convert_to_lanc
56
+
57
+ convert_to_lanc(
58
+ file="chr1.anc.vcf.gz",
59
+ file_fmt="FLARE",
60
+ plink_prefix="chr1",
61
+ output="chr1.lanc"
62
+ )
63
+ ```
64
+
65
+ To perform the above conversion using the CLI tool:
66
+
67
+ ```
68
+ lanctools convert-flare --file chr1.anc.vcf.gz --plink_prefix chr1 --output chr1.lanc
69
+ ```
@@ -0,0 +1,56 @@
1
+ # lanctools
2
+
3
+ Tools for working with local ancestry data in the `.lanc` file format.
4
+ This package contains two main components:
5
+
6
+ 1. A function (and corresponding CLI) for converting FLARE and RFMix files to
7
+ the .lanc format
8
+ 2. A `LancData` class with methods for fast querying of local ancestry and
9
+ local ancestry-masked genotypes
10
+
11
+ ## Installation
12
+
13
+ ```
14
+ pip install lanctools
15
+ ```
16
+
17
+ ## Quickstart
18
+
19
+ To load and query local ancestry data for a set of variants:
20
+
21
+ ```
22
+ import numpy as np
23
+
24
+ from lanctools import LancData
25
+
26
+ ld = LancData(
27
+ plink_prefix="chr1",
28
+ lanc_file="chr1.lanc",
29
+ ancestries=["YRI", "CEU"]
30
+ )
31
+
32
+ idx_var = np.arange(100, dtype=np.uint32)
33
+
34
+ lanc = ld.get_lanc(idx_var) # (N, 100, 2): phased local ancestry
35
+ geno = ld.get_geno(idx_var) # (N, 100, 2): phased genotypes
36
+ lanc_geno = ld.get_lanc_geno(idx_var) # (N, 100, len(ancestries))
37
+ ```
38
+
39
+ To convert a FLARE (or RFMix) local ancestry file to `.lanc`:
40
+
41
+ ```
42
+ from lanctools import convert_to_lanc
43
+
44
+ convert_to_lanc(
45
+ file="chr1.anc.vcf.gz",
46
+ file_fmt="FLARE",
47
+ plink_prefix="chr1",
48
+ output="chr1.lanc"
49
+ )
50
+ ```
51
+
52
+ To perform the above conversion using the CLI tool:
53
+
54
+ ```
55
+ lanctools convert-flare --file chr1.anc.vcf.gz --plink_prefix chr1 --output chr1.lanc
56
+ ```
@@ -0,0 +1,35 @@
1
+ @ECHO OFF
2
+
3
+ pushd %~dp0
4
+
5
+ REM Command file for Sphinx documentation
6
+
7
+ if "%SPHINXBUILD%" == "" (
8
+ set SPHINXBUILD=sphinx-build
9
+ )
10
+ set SOURCEDIR=source
11
+ set BUILDDIR=build
12
+
13
+ %SPHINXBUILD% >NUL 2>NUL
14
+ if errorlevel 9009 (
15
+ echo.
16
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17
+ echo.installed, then set the SPHINXBUILD environment variable to point
18
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
19
+ echo.may add the Sphinx directory to PATH.
20
+ echo.
21
+ echo.If you don't have Sphinx installed, grab it from
22
+ echo.https://www.sphinx-doc.org/
23
+ exit /b 1
24
+ )
25
+
26
+ if "%1" == "" goto help
27
+
28
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29
+ goto end
30
+
31
+ :help
32
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33
+
34
+ :end
35
+ popd
@@ -0,0 +1,61 @@
1
+ [project]
2
+ name = "lanctools"
3
+ version = "0.0.0"
4
+ description = "Tools for working with local ancestry data in the .lanc file format"
5
+ readme = "README.md"
6
+ authors = [{ name = "Frank Ockerman", email = "frank.ockerman@gmail.com" }]
7
+ requires-python = ">=3.12"
8
+ dependencies = [
9
+ "numba>=0.63.1",
10
+ "numpy>=2.3.5",
11
+ "pandas>=2.3.3",
12
+ "pgenlib>=0.93.0",
13
+ "typer>=0.20.0",
14
+ ]
15
+
16
+ [tool.scikit-build]
17
+ minimum-version = "build-system.requires"
18
+ build-dir = "build/{wheel_tag}"
19
+
20
+ wheel.exclude = ["data", "result"]
21
+ sdist.exclude = ["data", "result"]
22
+
23
+ [tool.uv]
24
+ cache-keys = [
25
+ { file = "pyproject.toml" },
26
+ { file = "src/**/*.{h,c,hpp,cpp}" },
27
+ { file = "CMakeLists.txt" },
28
+ ]
29
+
30
+ [build-system]
31
+ requires = ["scikit-build-core>=0.10", "pybind11"]
32
+ build-backend = "scikit_build_core.build"
33
+
34
+ [dependency-groups]
35
+ dev = [
36
+ "ipython>=9.6.0",
37
+ "pytest>=9.0.2",
38
+ "pytest-cov>=7.0.0",
39
+ "sphinx>=7,<9",
40
+ "myst-parser>=2.0.0",
41
+ "sphinx-autodoc-typehints>=3.5.2",
42
+ "sphinx-rtd-theme>=3.0.2",
43
+ ]
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["src"]
47
+
48
+ [project.scripts]
49
+ lanctools = "lanctools.cli:main_entry"
50
+
51
+
52
+ [tool.pytest.ini_options]
53
+ testpaths = ["tests"]
54
+ python_files = ["test_*.py"]
55
+ python_functions = ["test_*"]
56
+ addopts = [
57
+ "--strict-markers",
58
+ "--strict-config",
59
+ "--cov=src",
60
+ "--cov-report=term-missing",
61
+ ]
@@ -0,0 +1,38 @@
1
# Sphinx configuration for the lanctools documentation.
#
# Reference for all built-in configuration values:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import sys

# Put the parent of the current working directory at the front of the import
# path so that autodoc can import modules from there.
sys.path[:0] = [os.path.abspath(os.pardir)]

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

author = "Franklin Ockerman"
copyright = "2025, Franklin Ockerman"
project = "lanctools"
release = "0.0.0"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# Built-in Sphinx extensions first, then the third-party ones.
_builtin_extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"]
_third_party_extensions = ["sphinx_autodoc_typehints", "myst_parser"]
extensions = _builtin_extensions + _third_party_extensions

exclude_patterns = list()
templates_path = ["_templates"]

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_static_path = ["_static"]
html_theme = "sphinx_rtd_theme"
@@ -0,0 +1,17 @@
1
+ .. lanctools documentation master file, created by
2
+ sphinx-quickstart on Sat Dec 20 23:18:55 2025.
3
+ You can adapt this file completely to your liking, but it should at least
4
+ contain the root `toctree` directive.
5
+
6
+ lanctools
7
+ =========
8
+
9
+ .. include:: ../README.md
10
+ :parser: myst_parser.sphinx_
11
+
12
+
13
+ .. toctree::
14
+ :maxdepth: 2
15
+ :caption: Contents:
16
+
17
+ lanctools
@@ -0,0 +1,8 @@
1
+ lanctools Package
2
+ ====================
3
+
4
+ .. automodule:: lanctools.core
5
+ :members:
6
+ :undoc-members:
7
+ :show-inheritance:
8
+
@@ -0,0 +1,4 @@
1
+ from .core import LancData, FlatLanc, convert_to_lanc
2
+
3
+ __all__ = ["LancData", "FlatLanc", "convert_to_lanc"]
4
+ __version__ = "0.0.0"
@@ -0,0 +1,11 @@
1
+ #include "flare.hpp"
2
+ #include "rfmix.hpp"
3
+ #include <pybind11/pybind11.h>
4
+
5
+ namespace py = pybind11;
6
+
7
+ PYBIND11_MODULE(_cpp, m) {
8
+ m.doc() = "lanctools C++ backend";
9
+ bind_rfmix(m);
10
+ bind_flare(m);
11
+ }
@@ -0,0 +1,261 @@
1
+ #include "flare.hpp"
2
+ #include <cstdint>
3
+ #include <pybind11/stl.h>
4
+ #include <sstream>
5
+ #include <stdexcept>
6
+ #include <string>
7
+ #include <unordered_map>
8
+ #include <vector>
9
+ #include <zlib.h>
10
+
11
+ namespace py = pybind11;
12
+
13
// One ancestry tract: a stretch of a chromosome running from `spos` to `epos`
// together with the ancestry codes recorded for the two haplotypes.
struct AncestryTract {
  std::string chrom;      // chromosome name as it appears in the VCF
  std::uint32_t spos;     // start position of the tract
  std::uint32_t epos;     // end position of the tract
  std::uint8_t anc0;      // ancestry code of the first haplotype
  std::uint8_t anc1;      // ancestry code of the second haplotype
};
20
+
21
// Ancestry calls of every sample at a single VCF position.
//
// `anc0[i]` / `anc1[i]` hold the ancestry codes of the two haplotypes of
// sample `i`; 255 marks a missing value.
struct VCFRecord {
  std::string chrom;
  uint32_t pos = 0; // initialised so a fresh record never carries garbage
  std::vector<uint8_t> anc0;
  std::vector<uint8_t> anc1;

  // Creates a record for `n_samples` samples with every call set to missing
  // (255). `explicit` prevents accidental integer -> VCFRecord conversions.
  explicit VCFRecord(size_t n_samples)
      : anc0(n_samples, 255), anc1(n_samples, 255) {}
};
29
+
30
+ std::string gz_readline(gzFile file) {
31
+ const size_t chunk_size = 65536;
32
+ char buffer[chunk_size];
33
+ std::string line;
34
+
35
+ while (gzgets(file, buffer, chunk_size)) {
36
+ line += buffer;
37
+ if (!line.empty() && line.back() == '\n')
38
+ break;
39
+ }
40
+
41
+ while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
42
+ line.pop_back();
43
+ }
44
+ return line;
45
+ }
46
+
47
// Splits `s` on every occurrence of `delim`.
//
// Empty fields between delimiters are kept, but text after the last delimiter
// only counts as a field when it is non-empty: "a:b:" yields {"a", "b"} and
// an empty input yields no fields at all.
std::vector<std::string> split(const std::string &s, char delim) {
  std::vector<std::string> pieces;
  std::string::size_type begin = 0;

  for (;;) {
    const std::string::size_type cut = s.find(delim, begin);
    if (cut == std::string::npos)
      break;
    pieces.emplace_back(s, begin, cut - begin);
    begin = cut + 1;
  }

  // Remainder after the final delimiter (or the whole string if none).
  if (begin < s.size())
    pieces.emplace_back(s, begin, std::string::npos);
  return pieces;
}
56
+
57
// Parses the `idx`-th ':'-separated token of `sample_field` as an ancestry
// code. Returns 255 ("missing") when `idx` is negative, the field has too few
// tokens, the token is not an integer, or the value lies outside [0, 255].
//
// The token is located in place, so no per-sample token vector is allocated.
static uint8_t parse_ancestry_code(const std::string &sample_field, int idx) {
  const uint8_t missing = 255;
  if (idx < 0)
    return missing;

  // Skip the first `idx` tokens.
  std::string::size_type start = 0;
  for (int skipped = 0; skipped < idx; ++skipped) {
    const std::string::size_type colon = sample_field.find(':', start);
    if (colon == std::string::npos)
      return missing; // fewer than idx + 1 tokens
    start = colon + 1;
  }

  const std::string::size_type end = sample_field.find(':', start);
  const std::string token = sample_field.substr(
      start, end == std::string::npos ? std::string::npos : end - start);

  // Only the two exceptions std::stoi can raise are treated as "missing";
  // anything else (e.g. std::bad_alloc) propagates to the caller.
  try {
    const int val = std::stoi(token);
    if (val >= 0 && val <= 255)
      return static_cast<uint8_t>(val);
  } catch (const std::invalid_argument &) {
  } catch (const std::out_of_range &) {
  }
  return missing;
}

// Extracts the AN1 / AN2 ancestry codes from one sample column of a VCF
// record.
//
// sample_field   ':'-separated per-sample value string
// format_fields  parsed FORMAT column; kept for interface compatibility and
//                not consulted
// an1_idx        token index of AN1 inside sample_field, or -1 if absent
// an2_idx        token index of AN2 inside sample_field, or -1 if absent
//
// Returns {AN1, AN2}; each element is 255 when it is absent or unparsable.
std::pair<uint8_t, uint8_t>
extract_AN1_AN2(const std::string &sample_field,
                const std::vector<std::string> &format_fields, int an1_idx,
                int an2_idx) {
  (void)format_fields;
  return {parse_ancestry_code(sample_field, an1_idx),
          parse_ancestry_code(sample_field, an2_idx)};
}
83
+
84
+ VCFRecord construct_vcf_record(const std::vector<std::string> &fields,
85
+ const std::vector<std::string> &format_fields,
86
+ int an1_idx, int an2_idx, int format_idx,
87
+ int n_samples, const std::string &chrom,
88
+ uint32_t pos) {
89
+ VCFRecord record(n_samples);
90
+ record.chrom = chrom;
91
+ record.pos = pos;
92
+ for (int i = 0; i < n_samples; ++i) {
93
+ auto [an1, an2] = extract_AN1_AN2(fields[format_idx + 1 + i], format_fields,
94
+ an1_idx, an2_idx);
95
+ record.anc0[i] = an1;
96
+ record.anc1[i] = an2;
97
+ }
98
+ return record;
99
+ }
100
+
101
+ void finalize_open_tracts(
102
+ const std::vector<std::string> &sample_ids,
103
+ const std::vector<uint8_t> &prev_anc,
104
+ const std::vector<uint32_t> &prev_spos,
105
+ std::unordered_map<std::string, std::vector<AncestryTract>> &sample_tracts,
106
+ const std::string &chrom, uint32_t final_pos) {
107
+ for (size_t i = 0; i < sample_ids.size(); ++i) {
108
+ const std::string &sample = sample_ids[i];
109
+ uint8_t anc0 = prev_anc[i * 2];
110
+ uint8_t anc1 = prev_anc[i * 2 + 1];
111
+ if (anc0 != 255 || anc1 != 255) {
112
+ sample_tracts[sample].push_back(
113
+ {chrom, prev_spos[i * 2], final_pos, anc0, anc1});
114
+ }
115
+ }
116
+ }
117
+
118
+ py::dict read_flare(const std::string &flare_file) {
119
+ gzFile file = gzopen(flare_file.c_str(), "rb");
120
+ if (!file)
121
+ throw std::runtime_error("Failed to open input VCF file");
122
+
123
+ std::string line;
124
+ bool found_header = false;
125
+ while (!(line = gz_readline(file)).empty()) {
126
+ if (line.substr(0, 6) == "#CHROM") {
127
+ found_header = true;
128
+ break;
129
+ }
130
+ }
131
+ if (!found_header) {
132
+ gzclose(file);
133
+ throw std::runtime_error("Missing #CHROM header line");
134
+ }
135
+
136
+ std::vector<std::string> header_fields = split(line, '\t');
137
+ int chrom_idx = -1, pos_idx = -1, format_idx = -1;
138
+ for (size_t i = 0; i < header_fields.size(); ++i) {
139
+ if (header_fields[i] == "#CHROM")
140
+ chrom_idx = i;
141
+ else if (header_fields[i] == "POS")
142
+ pos_idx = i;
143
+ else if (header_fields[i] == "FORMAT")
144
+ format_idx = i;
145
+ }
146
+ if (chrom_idx == -1 || pos_idx == -1 || format_idx == -1) {
147
+ gzclose(file);
148
+ throw std::runtime_error("Missing essential VCF columns");
149
+ }
150
+
151
+ std::vector<std::string> sample_ids(header_fields.begin() + format_idx + 1,
152
+ header_fields.end());
153
+ int n_samples = sample_ids.size();
154
+
155
+ std::unordered_map<std::string, std::vector<AncestryTract>> sample_tracts;
156
+ std::vector<uint8_t> prev_anc(n_samples * 2, 255);
157
+ std::vector<uint32_t> prev_spos(n_samples * 2, 0);
158
+ uint32_t cur_pos = 0, prev_pos = 0;
159
+ std::string cur_chrom = "chr0";
160
+ bool is_first_record = true;
161
+
162
+ while (!(line = gz_readline(file)).empty()) {
163
+ if (line.empty() || line[0] == '#')
164
+ continue;
165
+
166
+ std::vector<std::string> fields = split(line, '\t');
167
+ if (fields.size() < format_idx + 1 + n_samples)
168
+ continue;
169
+
170
+ std::string chrom = fields[chrom_idx];
171
+ uint32_t pos = std::stoul(fields[pos_idx]);
172
+
173
+ std::vector<std::string> format_fields = split(fields[format_idx], ':');
174
+ int an1_idx = -1, an2_idx = -1;
175
+ for (size_t i = 0; i < format_fields.size(); ++i) {
176
+ if (format_fields[i] == "AN1")
177
+ an1_idx = i;
178
+ else if (format_fields[i] == "AN2")
179
+ an2_idx = i;
180
+ }
181
+
182
+ VCFRecord record =
183
+ construct_vcf_record(fields, format_fields, an1_idx, an2_idx,
184
+ format_idx, n_samples, chrom, pos);
185
+
186
+ prev_pos = cur_pos;
187
+ cur_pos = pos;
188
+
189
+ if (is_first_record) {
190
+ for (size_t i = 0; i < n_samples; ++i) {
191
+ prev_spos[i * 2] = pos;
192
+ prev_spos[i * 2 + 1] = pos;
193
+ prev_anc[i * 2] = record.anc0[i];
194
+ prev_anc[i * 2 + 1] = record.anc1[i];
195
+ }
196
+ cur_chrom = chrom;
197
+ is_first_record = false;
198
+ continue;
199
+ }
200
+
201
+ if (chrom != cur_chrom) {
202
+ finalize_open_tracts(sample_ids, prev_anc, prev_spos, sample_tracts,
203
+ cur_chrom, pos - 1);
204
+ for (size_t i = 0; i < n_samples; ++i) {
205
+ prev_spos[i * 2] = pos;
206
+ prev_spos[i * 2 + 1] = pos;
207
+ prev_anc[i * 2] = record.anc0[i];
208
+ prev_anc[i * 2 + 1] = record.anc1[i];
209
+ }
210
+ cur_chrom = chrom;
211
+ continue;
212
+ }
213
+
214
+ for (size_t i = 0; i < n_samples; ++i) {
215
+ size_t idx0 = i * 2, idx1 = i * 2 + 1;
216
+ uint8_t new_anc0 = record.anc0[i];
217
+ uint8_t new_anc1 = record.anc1[i];
218
+ if (new_anc0 != prev_anc[idx0] || new_anc1 != prev_anc[idx1]) {
219
+ uint32_t midpoint = prev_pos + (cur_pos - prev_pos) / 2;
220
+ sample_tracts[sample_ids[i]].push_back(
221
+ {chrom, prev_spos[idx0], midpoint, prev_anc[idx0], prev_anc[idx1]});
222
+ prev_spos[idx0] = midpoint + 1;
223
+ prev_anc[idx0] = new_anc0;
224
+ prev_anc[idx1] = new_anc1;
225
+ }
226
+ }
227
+ }
228
+ gzclose(file);
229
+ finalize_open_tracts(sample_ids, prev_anc, prev_spos, sample_tracts,
230
+ cur_chrom, cur_pos);
231
+
232
+ py::dict result;
233
+
234
+ std::vector<std::string> samples, chroms;
235
+ std::vector<uint32_t> spos_vec, epos_vec;
236
+ std::vector<int> anc0_vec, anc1_vec;
237
+
238
+ for (const auto &[sample, tracts] : sample_tracts) {
239
+ for (const auto &tract : tracts) {
240
+ samples.push_back(sample);
241
+ chroms.push_back(tract.chrom);
242
+ spos_vec.push_back(tract.spos);
243
+ epos_vec.push_back(tract.epos);
244
+ anc0_vec.push_back(static_cast<int>(tract.anc0));
245
+ anc1_vec.push_back(static_cast<int>(tract.anc1));
246
+ }
247
+ }
248
+
249
+ result["sample"] = samples;
250
+ result["chrom"] = chroms;
251
+ result["spos"] = spos_vec;
252
+ result["epos"] = epos_vec;
253
+ result["anc0"] = anc0_vec;
254
+ result["anc1"] = anc1_vec;
255
+
256
+ return result;
257
+ }
258
+
259
+ void bind_flare(py::module_ &m) {
260
+ m.def("read_flare", &read_flare, "Read FLARE VCF and return ancestry tracts");
261
+ }
@@ -0,0 +1,7 @@
1
+ #pragma once
2
+ #include <pybind11/pybind11.h>
3
+
4
+ namespace py = pybind11;
5
+
6
+ py::dict read_flare(const std::string &flare_file);
7
+ void bind_flare(py::module_ &m);