LZGraphs 3.0.0__tar.gz → 3.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/MANIFEST.in +1 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/Makefile +6 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/PKG-INFO +3 -4
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/analytics.h +60 -15
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/diversity.h +2 -3
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/edge_builder.h +4 -2
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/gene_data.h +2 -2
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/graph.h +35 -5
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/hash_map.h +16 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/io.h +4 -4
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/pgen_dist.h +15 -10
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/posterior.h +1 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/simulate.h +22 -17
- lzgraphs-3.0.2/include/lzgraph/walk_dict.h +232 -0
- lzgraphs-3.0.2/lib/analytics/analytics.c +183 -0
- lzgraphs-3.0.2/lib/analytics/analytics_mc.c +136 -0
- lzgraphs-3.0.2/lib/analytics/analytics_mc.h +31 -0
- lzgraphs-3.0.2/lib/analytics/diversity.c +44 -0
- lzgraphs-3.0.2/lib/analytics/diversity_compare.c +107 -0
- lzgraphs-3.0.2/lib/analytics/diversity_counting.c +150 -0
- lzgraphs-3.0.2/lib/analytics/diversity_internal.h +32 -0
- lzgraphs-3.0.2/lib/analytics/diversity_perplexity.c +87 -0
- lzgraphs-3.0.2/lib/analytics/hill_curve.c +55 -0
- lzgraphs-3.0.2/lib/analytics/pgen_dist.c +74 -0
- lzgraphs-3.0.2/lib/analytics/pgen_mixture.c +117 -0
- lzgraphs-3.0.2/lib/analytics/pgen_mixture.h +10 -0
- lzgraphs-3.0.2/lib/analytics/pgen_moments.c +103 -0
- lzgraphs-3.0.2/lib/analytics/pgen_moments.h +8 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/hash_map.c +88 -34
- lzgraphs-3.0.2/lib/graph/csr_graph.c +643 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/edge_builder.c +18 -17
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/gene_data.c +2 -2
- lzgraphs-3.0.2/lib/graph/graph_build_ingest.c +582 -0
- lzgraphs-3.0.2/lib/graph/graph_build_ingest.h +109 -0
- lzgraphs-3.0.2/lib/graph/graph_finalize.c +396 -0
- lzgraphs-3.0.2/lib/graph/graph_finalize.h +30 -0
- lzgraphs-3.0.2/lib/graph/graph_ops.c +31 -0
- lzgraphs-3.0.2/lib/graph/graph_ops_internal.h +18 -0
- lzgraphs-3.0.2/lib/graph/graph_set_ops.c +364 -0
- lzgraphs-3.0.2/lib/graph/graph_summary.c +36 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/lz76.c +103 -26
- lzgraphs-3.0.2/lib/io/io.c +10 -0
- lzgraphs-3.0.2/lib/io/io_internal.c +87 -0
- lzgraphs-3.0.2/lib/io/io_internal.h +37 -0
- lzgraphs-3.0.2/lib/io/io_reader.c +548 -0
- lzgraphs-3.0.2/lib/io/io_reader.h +8 -0
- lzgraphs-3.0.2/lib/io/io_writer.c +380 -0
- lzgraphs-3.0.2/lib/io/io_writer.h +8 -0
- lzgraphs-3.0.2/lib/occupancy/occupancy.c +69 -0
- lzgraphs-3.0.2/lib/occupancy/occupancy_internal.h +47 -0
- lzgraphs-3.0.2/lib/occupancy/occupancy_numeric.c +83 -0
- lzgraphs-3.0.2/lib/occupancy/occupancy_richness.c +119 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/posterior/posterior.c +5 -5
- lzgraphs-3.0.2/lib/simulation/exact_model.c +122 -0
- lzgraphs-3.0.2/lib/simulation/exact_model.h +22 -0
- lzgraphs-3.0.2/lib/simulation/genomic_simulate.c +167 -0
- lzgraphs-3.0.2/lib/simulation/simulate.c +234 -0
- lzgraphs-3.0.2/lib/simulation/walk_engine.c +194 -0
- lzgraphs-3.0.2/lib/simulation/walk_engine.h +50 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/pyproject.toml +2 -3
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/__init__.py +1 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_clzgraph.c +129 -41
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_graph.py +187 -18
- lzgraphs-3.0.2/src/LZGraphs/_io.py +598 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_pgen_dist.py +2 -2
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/cli.py +175 -15
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/PKG-INFO +3 -4
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/SOURCES.txt +32 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_analytics.py +91 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_cli.py +70 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_graph.py +84 -1
- lzgraphs-3.0.2/tests/test_io.py +60 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_known_distributions.py +35 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_operations.py +17 -0
- lzgraphs-3.0.0/include/lzgraph/walk_dict.h +0 -123
- lzgraphs-3.0.0/lib/analytics/analytics.c +0 -291
- lzgraphs-3.0.0/lib/analytics/diversity.c +0 -288
- lzgraphs-3.0.0/lib/analytics/pgen_dist.c +0 -259
- lzgraphs-3.0.0/lib/graph/csr_graph.c +0 -674
- lzgraphs-3.0.0/lib/graph/graph_ops.c +0 -481
- lzgraphs-3.0.0/lib/io/io.c +0 -598
- lzgraphs-3.0.0/lib/occupancy/occupancy.c +0 -213
- lzgraphs-3.0.0/lib/simulation/genomic_simulate.c +0 -264
- lzgraphs-3.0.0/lib/simulation/simulate.c +0 -303
- lzgraphs-3.0.0/src/LZGraphs/_io.py +0 -191
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/CHANGELOG.md +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/CONTRIBUTING.md +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/LICENSE +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/README.md +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/common.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/crc32c.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/features.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/forward.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/genomic_simulate.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/graph_ops.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/lz76.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/occupancy.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/rng.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/sharing.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/string_pool.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/wynn.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/analytics/features.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/common.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/crc32c.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/rng.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/string_pool.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/wynn.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/forward.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/occupancy/sharing.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/setup.cfg +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/setup.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_errors.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_simulation_result.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/entry_points.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/requires.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/top_level.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_deep_audit.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_lzpgen.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_mathematician_audit.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_model_audit.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_simulate.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_statistical_validity.py +0 -0
|
@@ -6,7 +6,7 @@ LDFLAGS = -lm
|
|
|
6
6
|
SRCS = $(shell find lib -name '*.c')
|
|
7
7
|
OBJS = $(patsubst lib/%.c, build/%.o, $(SRCS))
|
|
8
8
|
|
|
9
|
-
.PHONY: all clean test
|
|
9
|
+
.PHONY: all clean test bench_ops
|
|
10
10
|
|
|
11
11
|
all: build/liblzgraph.a
|
|
12
12
|
|
|
@@ -123,3 +123,8 @@ build/test_walk_dict: tests/c_unit/test_walk_dict.c build/liblzgraph.a | build_d
|
|
|
123
123
|
|
|
124
124
|
clean:
|
|
125
125
|
rm -rf build
|
|
126
|
+
|
|
127
|
+
bench_ops: build/bench_ops
|
|
128
|
+
|
|
129
|
+
build/bench_ops: tools/bench_ops.c build/liblzgraph.a | build_dirs
|
|
130
|
+
$(CC) $(CFLAGS) $< -Lbuild -llzgraph $(LDFLAGS) -o $@
|
|
@@ -1,22 +1,21 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: LZGraphs
|
|
3
|
-
Version: 3.0.0
|
|
3
|
+
Version: 3.0.2
|
|
4
4
|
Summary: High-performance LZ76 compression graphs for immune receptor repertoire analysis
|
|
5
5
|
Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
6
6
|
Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
7
|
-
License: MIT
|
|
7
|
+
License-Expression: MIT
|
|
8
8
|
Project-URL: Homepage, https://github.com/MuteJester/LZGraphs
|
|
9
9
|
Project-URL: Documentation, https://mutejester.github.io/LZGraphs/
|
|
10
10
|
Project-URL: Repository, https://github.com/MuteJester/LZGraphs
|
|
11
11
|
Project-URL: Issues, https://github.com/MuteJester/LZGraphs/issues
|
|
12
|
-
Project-URL: Changelog, https://github.com/MuteJester/LZGraphs/blob/master/
|
|
12
|
+
Project-URL: Changelog, https://github.com/MuteJester/LZGraphs/blob/master/docs/resources/changelog.md
|
|
13
13
|
Keywords: Graph Theory,Immunology,Analytics,Biology,T-cell,Repertoire,CDR3,Bioinformatics,LZ76,Lempel-Ziv
|
|
14
14
|
Classifier: Development Status :: 4 - Beta
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
16
16
|
Classifier: Intended Audience :: Science/Research
|
|
17
17
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
18
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
19
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
20
19
|
Classifier: Programming Language :: Python :: 3
|
|
21
20
|
Classifier: Programming Language :: Python :: 3.9
|
|
22
21
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @file analytics.h
|
|
3
|
-
* @brief Graph-level analytics computed via LZ-constrained
|
|
3
|
+
* @brief Graph-level analytics computed via LZ-constrained Monte Carlo.
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
5
|
+
* All public analytics are Monte Carlo estimates, not exact graph invariants.
|
|
6
|
+
* They sample the accepted-walk model (same as simulate()/lzpgen()) and
|
|
7
|
+
* compute diversity/entropy/support statistics from the sampled log-probs.
|
|
8
|
+
*
|
|
9
|
+
* Default sample count is LZG_ANALYTICS_DEFAULT_MC_SAMPLES (10000).
|
|
10
|
+
* Functions with _mc suffixes accept a custom sample count.
|
|
11
|
+
*
|
|
12
|
+
* pgen_diagnostics() estimates absorbed/leaked mass of the raw structural
|
|
13
|
+
* walk law via a separate MC run — see its doc for caveats.
|
|
8
14
|
*/
|
|
9
15
|
#ifndef LZGRAPH_ANALYTICS_H
|
|
10
16
|
#define LZGRAPH_ANALYTICS_H
|
|
@@ -12,23 +18,46 @@
|
|
|
12
18
|
#include "lzgraph/common.h"
|
|
13
19
|
#include "lzgraph/graph.h"
|
|
14
20
|
|
|
21
|
+
#define LZG_ANALYTICS_DEFAULT_MC_SAMPLES 10000u
|
|
22
|
+
|
|
15
23
|
/* ── Simulation potential size ──────────────────────────────── */
|
|
16
24
|
|
|
17
25
|
/**
|
|
18
|
-
*
|
|
19
|
-
* can produce from initial states to terminal states.
|
|
26
|
+
* Estimate the number of distinct completed LZ-valid walks (support size).
|
|
20
27
|
*/
|
|
21
28
|
LZGError lzg_graph_path_count(const LZGGraph *g, double *out_count);
|
|
22
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Monte Carlo version of lzg_graph_path_count().
|
|
32
|
+
*
|
|
33
|
+
* @param n_samples Number of simulated walks. Use 0 for the default.
|
|
34
|
+
*/
|
|
35
|
+
LZGError lzg_graph_path_count_mc(const LZGGraph *g, uint32_t n_samples,
|
|
36
|
+
double *out_count);
|
|
37
|
+
|
|
23
38
|
/* ── PGEN diagnostics ───────────────────────────────────────── */
|
|
24
39
|
|
|
25
40
|
typedef struct {
|
|
26
|
-
double total_absorbed; /*
|
|
27
|
-
double total_leaked; /*
|
|
28
|
-
double initial_prob_sum; /* should be 1.0 */
|
|
29
|
-
bool is_proper; /* |total_absorbed - 1.0| < atol */
|
|
41
|
+
double total_absorbed; /* MC estimate of absorbed mass (≤ 1.0) */
|
|
42
|
+
double total_leaked; /* 1 - total_absorbed */
|
|
43
|
+
double initial_prob_sum; /* should be 1.0 */
|
|
44
|
+
bool is_proper; /* |total_absorbed - 1.0| < atol */
|
|
45
|
+
uint32_t mc_samples; /* number of MC samples used */
|
|
30
46
|
} LZGPgenDiagnostics;
|
|
31
47
|
|
|
48
|
+
/**
|
|
49
|
+
* Estimate absorption/leakage of the raw structural walk law via Monte Carlo.
|
|
50
|
+
*
|
|
51
|
+
* total_absorbed is a Monte Carlo estimate (not an exact proof): it reports
|
|
52
|
+
* the fraction of raw constrained walks that reached a sink node out of
|
|
53
|
+
* mc_samples trials. A value of 1.0 means "no leaks were observed," not
|
|
54
|
+
* "the graph is mathematically proven leak-free." Rare leakage events may
|
|
55
|
+
* be missed if mc_samples is too small relative to the leakage probability.
|
|
56
|
+
*
|
|
57
|
+
* @param g The graph.
|
|
58
|
+
* @param atol Tolerance for is_proper check.
|
|
59
|
+
* @param out Output diagnostics.
|
|
60
|
+
*/
|
|
32
61
|
LZGError lzg_pgen_diagnostics(const LZGGraph *g, double atol,
|
|
33
62
|
LZGPgenDiagnostics *out);
|
|
34
63
|
|
|
@@ -46,18 +75,26 @@ LZGError lzg_effective_diversity(const LZGGraph *g, LZGEffectiveDiversity *out);
|
|
|
46
75
|
/* ── Hill numbers ───────────────────────────────────────────── */
|
|
47
76
|
|
|
48
77
|
/**
|
|
49
|
-
*
|
|
50
|
-
*
|
|
78
|
+
* Estimate the power sum M(α) = Σ_s π(s)^α of the public accepted-walk model.
|
|
79
|
+
* In particular, M(1) = 1 for every graph with at least one live completion.
|
|
51
80
|
*/
|
|
52
81
|
LZGError lzg_power_sum(const LZGGraph *g, double alpha, double *out_m);
|
|
53
82
|
|
|
54
83
|
/**
|
|
55
|
-
*
|
|
56
|
-
* For α = 0: returns
|
|
57
|
-
* For α = 1: returns exp(Shannon entropy).
|
|
84
|
+
* Estimate the classical Hill number of the public accepted-walk law.
|
|
85
|
+
* For α = 0: returns support size.
|
|
86
|
+
* For α = 1: returns exp(Shannon entropy of the absorbed law).
|
|
58
87
|
*/
|
|
59
88
|
LZGError lzg_hill_number(const LZGGraph *g, double alpha, double *out_d);
|
|
60
89
|
|
|
90
|
+
/**
|
|
91
|
+
* Monte Carlo version of lzg_hill_number().
|
|
92
|
+
*
|
|
93
|
+
* @param n_samples Number of simulated walks. Use 0 for the default.
|
|
94
|
+
*/
|
|
95
|
+
LZGError lzg_hill_number_mc(const LZGGraph *g, double alpha,
|
|
96
|
+
uint32_t n_samples, double *out_d);
|
|
97
|
+
|
|
61
98
|
/**
|
|
62
99
|
* Compute Hill numbers for multiple orders at once.
|
|
63
100
|
*
|
|
@@ -69,6 +106,14 @@ LZGError lzg_hill_number(const LZGGraph *g, double alpha, double *out_d);
|
|
|
69
106
|
LZGError lzg_hill_numbers(const LZGGraph *g, const double *orders,
|
|
70
107
|
uint32_t n, double *out);
|
|
71
108
|
|
|
109
|
+
/**
|
|
110
|
+
* Monte Carlo version of lzg_hill_numbers().
|
|
111
|
+
*
|
|
112
|
+
* @param n_samples Number of simulated walks. Use 0 for the default.
|
|
113
|
+
*/
|
|
114
|
+
LZGError lzg_hill_numbers_mc(const LZGGraph *g, const double *orders,
|
|
115
|
+
uint32_t n, uint32_t n_samples, double *out);
|
|
116
|
+
|
|
72
117
|
/* ── PGEN dynamic range ─────────────────────────────────────── */
|
|
73
118
|
|
|
74
119
|
typedef struct {
|
|
@@ -2,9 +2,8 @@
|
|
|
2
2
|
* @file diversity.h
|
|
3
3
|
* @brief Diversity metrics: perplexity, entropy rate, K-diversity, saturation.
|
|
4
4
|
*
|
|
5
|
-
* All probability-based metrics use the
|
|
6
|
-
*
|
|
7
|
-
* the proper probability distribution.
|
|
5
|
+
* All probability-based metrics use the public constrained walk model
|
|
6
|
+
* exposed through lzg_walk_log_prob().
|
|
8
7
|
*
|
|
9
8
|
* Counting-based metrics (K-diversity, saturation) operate on
|
|
10
9
|
* raw sequences without touching the probability model.
|
|
@@ -20,7 +20,7 @@ typedef struct LZGEdgeBuilder_ {
|
|
|
20
20
|
LZGHashMap *edge_map; /* key: pack(src,dst), value: edge index */
|
|
21
21
|
uint32_t *src_ids; /* [n_edges] source node IDs */
|
|
22
22
|
uint32_t *dst_ids; /* [n_edges] destination node IDs */
|
|
23
|
-
|
|
23
|
+
uint64_t *counts; /* [n_edges] raw transition counts */
|
|
24
24
|
uint32_t n_edges;
|
|
25
25
|
uint32_t capacity;
|
|
26
26
|
} LZGEdgeBuilder;
|
|
@@ -34,10 +34,12 @@ void lzg_eb_destroy(LZGEdgeBuilder *eb);
|
|
|
34
34
|
/**
|
|
35
35
|
* Record a transition from src → dst with the given count.
|
|
36
36
|
* If the edge already exists, increments its count.
|
|
37
|
+
* If `out_edge_idx` is non-NULL, returns the edge index that was updated.
|
|
37
38
|
*/
|
|
38
39
|
LZGError lzg_eb_record(LZGEdgeBuilder *eb,
|
|
39
40
|
uint32_t src_id, uint32_t dst_id,
|
|
40
|
-
|
|
41
|
+
uint64_t count,
|
|
42
|
+
uint32_t *out_edge_idx);
|
|
41
43
|
|
|
42
44
|
/** Pack key from (src, dst) pair. */
|
|
43
45
|
static inline uint64_t lzg_eb_pack_key(uint32_t src, uint32_t dst) {
|
|
@@ -26,13 +26,13 @@ typedef struct LZGGeneData_ {
|
|
|
26
26
|
/* ── Per-edge V gene counts (CSR-within-CSR) ── */
|
|
27
27
|
uint32_t *v_offsets; /* [n_edges + 1]: range per edge */
|
|
28
28
|
uint32_t *v_gene_ids; /* [total_v_entries]: interned gene IDs */
|
|
29
|
-
|
|
29
|
+
uint64_t *v_gene_counts; /* [total_v_entries]: raw counts */
|
|
30
30
|
uint32_t total_v_entries;
|
|
31
31
|
|
|
32
32
|
/* ── Per-edge J gene counts (CSR-within-CSR) ── */
|
|
33
33
|
uint32_t *j_offsets;
|
|
34
34
|
uint32_t *j_gene_ids;
|
|
35
|
-
|
|
35
|
+
uint64_t *j_gene_counts;
|
|
36
36
|
uint32_t total_j_entries;
|
|
37
37
|
|
|
38
38
|
/* ── Graph-level marginal distributions ── */
|
|
@@ -17,6 +17,8 @@
|
|
|
17
17
|
#include "lzgraph/hash_map.h"
|
|
18
18
|
#include "lzgraph/edge_builder.h"
|
|
19
19
|
|
|
20
|
+
struct LZGExactModel_;
|
|
21
|
+
|
|
20
22
|
/**
|
|
21
23
|
* LZGraph — directed acyclic graph with sentinel-bounded walks.
|
|
22
24
|
*
|
|
@@ -37,7 +39,7 @@ typedef struct LZGGraph_ {
|
|
|
37
39
|
|
|
38
40
|
/* ── Per-edge data (parallel to col_indices) ── */
|
|
39
41
|
double *edge_weights; /* [n_edges]: normalized P(dst|src) */
|
|
40
|
-
|
|
42
|
+
uint64_t *edge_counts; /* [n_edges]: raw transition counts */
|
|
41
43
|
|
|
42
44
|
/* ── Per-edge LZ constraint info ── */
|
|
43
45
|
uint32_t *edge_sp_id; /* [n_edges]: interned subpattern of dst */
|
|
@@ -45,7 +47,7 @@ typedef struct LZGGraph_ {
|
|
|
45
47
|
uint32_t *edge_prefix_id; /* [n_edges]: interned prefix of dst sp */
|
|
46
48
|
|
|
47
49
|
/* ── Per-node data ── */
|
|
48
|
-
|
|
50
|
+
uint64_t *outgoing_counts; /* [n_nodes]: total raw outgoing count */
|
|
49
51
|
uint32_t *node_sp_id; /* [n_nodes]: interned subpattern string */
|
|
50
52
|
uint8_t *node_sp_len; /* [n_nodes]: subpattern character length */
|
|
51
53
|
uint32_t *node_pos; /* [n_nodes]: cumulative position integer */
|
|
@@ -59,7 +61,7 @@ typedef struct LZGGraph_ {
|
|
|
59
61
|
bool topo_valid;
|
|
60
62
|
|
|
61
63
|
/* ── Length distribution ── */
|
|
62
|
-
|
|
64
|
+
uint64_t *length_counts; /* indexed by sequence length */
|
|
63
65
|
uint32_t max_length; /* largest observed sequence length */
|
|
64
66
|
|
|
65
67
|
/* ── String pool (owns all string data) ── */
|
|
@@ -71,6 +73,15 @@ typedef struct LZGGraph_ {
|
|
|
71
73
|
|
|
72
74
|
/* ── Gene data (optional, NULL if no gene columns provided) ── */
|
|
73
75
|
struct LZGGeneData_ *gene_data;
|
|
76
|
+
|
|
77
|
+
/* ── Transient query cache (not serialized) ── */
|
|
78
|
+
LZGHashMap *query_node_map; /* structural key -> node index */
|
|
79
|
+
uint64_t *edge_sp_hash; /* [n_edges]: hash of destination token */
|
|
80
|
+
uint64_t *edge_prefix_hash; /* [n_edges]: hash of prefix token or 0 */
|
|
81
|
+
uint64_t *node_sp_hash; /* [n_nodes]: hash of node token */
|
|
82
|
+
uint8_t *edge_single_char_idx; /* [n_edges]: aa bit index or UINT8_MAX */
|
|
83
|
+
uint8_t *node_single_char_idx; /* [n_nodes]: aa bit index or UINT8_MAX */
|
|
84
|
+
struct LZGExactModel_ *exact_model_cache; /* accepted-model normalizer cache */
|
|
74
85
|
} LZGGraph;
|
|
75
86
|
|
|
76
87
|
/** Allocate and initialize an empty graph. */
|
|
@@ -95,12 +106,25 @@ void lzg_graph_destroy(LZGGraph *g);
|
|
|
95
106
|
LZGError lzg_graph_build(LZGGraph *g,
|
|
96
107
|
const char **sequences,
|
|
97
108
|
uint32_t n_seqs,
|
|
98
|
-
const
|
|
109
|
+
const uint64_t *abundances,
|
|
99
110
|
const char **v_genes,
|
|
100
111
|
const char **j_genes,
|
|
101
112
|
double smoothing,
|
|
102
113
|
uint32_t min_init);
|
|
103
114
|
|
|
115
|
+
/**
|
|
116
|
+
* Build a graph from a plain text file without materializing all sequences in Python.
|
|
117
|
+
*
|
|
118
|
+
* Supported input formats:
|
|
119
|
+
* - one sequence per line
|
|
120
|
+
* - sequence<TAB>abundance
|
|
121
|
+
*
|
|
122
|
+
* Gene columns and headered tabular formats are not supported by this path.
|
|
123
|
+
*/
|
|
124
|
+
LZGError lzg_graph_build_plain_file(LZGGraph *g,
|
|
125
|
+
const char *path,
|
|
126
|
+
double smoothing);
|
|
127
|
+
|
|
104
128
|
/** Compute topological sort (cached, invalidated on structural changes). */
|
|
105
129
|
LZGError lzg_graph_topo_sort(LZGGraph *g);
|
|
106
130
|
|
|
@@ -116,7 +140,7 @@ LZGError lzg_graph_finalize_from_edges(
|
|
|
116
140
|
LZGHashMap *initial_counts,
|
|
117
141
|
LZGHashMap *terminal_counts,
|
|
118
142
|
LZGHashMap *outgoing_counts,
|
|
119
|
-
|
|
143
|
+
uint64_t *len_counts, uint32_t max_len);
|
|
120
144
|
|
|
121
145
|
/* ── Recalculation flags ───────────────────────────────────── */
|
|
122
146
|
|
|
@@ -132,6 +156,12 @@ typedef enum {
|
|
|
132
156
|
*/
|
|
133
157
|
LZGError lzg_graph_recalculate(LZGGraph *g, uint32_t flags);
|
|
134
158
|
|
|
159
|
+
/**
|
|
160
|
+
* Internal: ensure transient query-side edge hash caches exist.
|
|
161
|
+
* These caches are derived from immutable edge metadata and are not serialized.
|
|
162
|
+
*/
|
|
163
|
+
LZGError lzg_graph_ensure_query_edge_hashes(LZGGraph *g);
|
|
164
|
+
|
|
135
165
|
/** Get the number of successors for node `node_id`. */
|
|
136
166
|
static inline uint32_t lzg_graph_out_degree(const LZGGraph *g, uint32_t node_id) {
|
|
137
167
|
return g->row_offsets[node_id + 1] - g->row_offsets[node_id];
|
|
@@ -27,6 +27,22 @@ bool lzg_hm_put(LZGHashMap *map, uint64_t key, uint64_t value);
|
|
|
27
27
|
/** Lookup. Returns pointer to value or NULL. */
|
|
28
28
|
uint64_t *lzg_hm_get(const LZGHashMap *map, uint64_t key);
|
|
29
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Lookup or insert in a single probe walk. Returns a writable value slot.
|
|
32
|
+
* If the key is absent, inserts it with `initial_value`.
|
|
33
|
+
* If `inserted` is non-NULL, it is set to true iff a new entry was created.
|
|
34
|
+
*/
|
|
35
|
+
uint64_t *lzg_hm_get_or_insert(LZGHashMap *map, uint64_t key,
|
|
36
|
+
uint64_t initial_value, bool *inserted);
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Add `delta` to the value stored at `key`, inserting `delta` if absent.
|
|
40
|
+
* Returns a writable value slot after the update.
|
|
41
|
+
* If `inserted` is non-NULL, it is set to true iff a new entry was created.
|
|
42
|
+
*/
|
|
43
|
+
uint64_t *lzg_hm_add_u64(LZGHashMap *map, uint64_t key,
|
|
44
|
+
uint64_t delta, bool *inserted);
|
|
45
|
+
|
|
30
46
|
/** Delete. Returns true if found. */
|
|
31
47
|
bool lzg_hm_delete(LZGHashMap *map, uint64_t key);
|
|
32
48
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @file io.h
|
|
3
|
-
* @brief
|
|
3
|
+
* @brief LZG binary file format: section-based save/load with CRC-32C.
|
|
4
4
|
*
|
|
5
5
|
* File extension: .lzg
|
|
6
6
|
* See FILE_FORMAT.md for the complete specification.
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
/* ── Format constants ──────────────────────────────────────── */
|
|
15
15
|
|
|
16
16
|
#define LZG_IO_MAGIC 0x4C5A4732u /* "LZG2" */
|
|
17
|
-
#define LZG_IO_FORMAT_VERSION
|
|
17
|
+
#define LZG_IO_FORMAT_VERSION 3
|
|
18
18
|
#define LZG_IO_ENDIAN_LE 0x01
|
|
19
19
|
#define LZG_IO_TRAILER_MAGIC 0x454E444Cu /* "ENDL" */
|
|
20
20
|
|
|
@@ -50,8 +50,8 @@
|
|
|
50
50
|
LZG_PACKED_BEGIN
|
|
51
51
|
typedef struct LZG_PACKED_ATTR {
|
|
52
52
|
uint32_t magic; /* 0x4C5A4732 */
|
|
53
|
-
uint16_t format_version; /*
|
|
54
|
-
uint16_t min_reader_version; /*
|
|
53
|
+
uint16_t format_version; /* current format version */
|
|
54
|
+
uint16_t min_reader_version; /* minimum supported reader version */
|
|
55
55
|
uint8_t endian_tag; /* 0x01 = LE */
|
|
56
56
|
uint8_t variant; /* 0=AAP, 1=NDP, 2=Naive */
|
|
57
57
|
uint8_t compression; /* 0=none, 1=zstd, 2=gzip */
|
|
@@ -2,14 +2,19 @@
|
|
|
2
2
|
* @file pgen_dist.h
|
|
3
3
|
* @brief LZPgen distribution: moments, analytical Gaussian mixture, pdf/cdf.
|
|
4
4
|
*
|
|
5
|
-
* The PGEN distribution characterizes
|
|
6
|
-
* (log P(seq)) across all sequences producible by the graph.
|
|
5
|
+
* The PGEN distribution characterizes log-probability mass over graph walks.
|
|
7
6
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
7
|
+
* Important: the current implementation is powered by the unconstrained
|
|
8
|
+
* topological forward DP in forward.c. Its moments and Gaussian mixture
|
|
9
|
+
* therefore summarize the graph-topology walk law, not the exact
|
|
10
|
+
* LZ-constrained per-history walk law used by lzg_walk_log_prob().
|
|
11
|
+
*
|
|
12
|
+
* lzg_pgen_moments() computes mean, variance, skewness, kurtosis via
|
|
13
|
+
* 5-dimensional moment propagation through that unconstrained forward DP.
|
|
10
14
|
*
|
|
11
15
|
* lzg_pgen_analytical_distribution() produces a Gaussian mixture
|
|
12
|
-
* (one component per walk length) for fast pdf/cdf/ppf evaluation
|
|
16
|
+
* (one component per walk length) for fast pdf/cdf/ppf evaluation of
|
|
17
|
+
* the same unconstrained approximation.
|
|
13
18
|
*/
|
|
14
19
|
#ifndef LZGRAPH_PGEN_DIST_H
|
|
15
20
|
#define LZGRAPH_PGEN_DIST_H
|
|
@@ -21,12 +26,12 @@
|
|
|
21
26
|
/* ── Moment results ────────────────────────────────────────── */
|
|
22
27
|
|
|
23
28
|
typedef struct {
|
|
24
|
-
double mean; /* E[log P]
|
|
25
|
-
double variance; /* Var[log P]
|
|
29
|
+
double mean; /* E[log P] under the unconstrained forward law */
|
|
30
|
+
double variance; /* Var[log P] under the same law */
|
|
26
31
|
double std; /* sqrt(variance) */
|
|
27
32
|
double skewness; /* standardized 3rd cumulant */
|
|
28
33
|
double kurtosis; /* standardized 4th cumulant */
|
|
29
|
-
double total_mass; /*
|
|
34
|
+
double total_mass; /* total absorbed mass in the unconstrained forward DP */
|
|
30
35
|
} LZGPgenMoments;
|
|
31
36
|
|
|
32
37
|
LZGError lzg_pgen_moments(const LZGGraph *g, LZGPgenMoments *out);
|
|
@@ -47,8 +52,8 @@ typedef struct {
|
|
|
47
52
|
} LZGPgenDist;
|
|
48
53
|
|
|
49
54
|
/**
|
|
50
|
-
* Compute the analytical PGEN distribution as a Gaussian mixture
|
|
51
|
-
*
|
|
55
|
+
* Compute the analytical PGEN distribution as a Gaussian mixture over the
|
|
56
|
+
* unconstrained forward-DP walk law. One component per walk length.
|
|
52
57
|
*/
|
|
53
58
|
LZGError lzg_pgen_analytical(const LZGGraph *g, LZGPgenDist *out);
|
|
54
59
|
|
|
@@ -2,14 +2,14 @@
|
|
|
2
2
|
* @file simulate.h
|
|
3
3
|
* @brief LZ76-constrained sequence simulation and walk probability.
|
|
4
4
|
*
|
|
5
|
-
* simulate() generates sequences
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
5
|
+
* simulate() generates sequences from a scalable accepted-sequence
|
|
6
|
+
* approximation: raw constrained walks are sampled forward until one
|
|
7
|
+
* reaches a terminal sink, and probabilities are normalized by a cached
|
|
8
|
+
* Monte Carlo estimate of the graph's absorption mass.
|
|
9
9
|
*
|
|
10
|
-
* walk_log_probability()
|
|
11
|
-
*
|
|
12
|
-
*
|
|
10
|
+
* walk_log_probability() uses that same cached normalization estimate, so
|
|
11
|
+
* its output stays consistent with simulate() without materializing the full
|
|
12
|
+
* history-expanded exact state space.
|
|
13
13
|
*/
|
|
14
14
|
#ifndef LZGRAPH_SIMULATE_H
|
|
15
15
|
#define LZGRAPH_SIMULATE_H
|
|
@@ -31,12 +31,12 @@ typedef struct {
|
|
|
31
31
|
} LZGSimResult;
|
|
32
32
|
|
|
33
33
|
/**
|
|
34
|
-
* Simulate n sequences from the
|
|
34
|
+
* Simulate n sequences from the public accepted-sequence model.
|
|
35
35
|
*
|
|
36
|
-
* Each walk enforces LZ76 dictionary constraints
|
|
37
|
-
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
36
|
+
* Each trial walk enforces LZ76 dictionary constraints exactly, stops on
|
|
37
|
+
* dead ends, and retries until a completed sink-reaching walk is obtained.
|
|
38
|
+
* This keeps generation scalable on large graphs while still returning only
|
|
39
|
+
* completed accepted walks.
|
|
40
40
|
*
|
|
41
41
|
* @param g The graph (must be finalized + topo-sorted).
|
|
42
42
|
* @param n Number of sequences to generate.
|
|
@@ -55,12 +55,12 @@ static inline void lzg_sim_result_free(LZGSimResult *r) {
|
|
|
55
55
|
}
|
|
56
56
|
|
|
57
57
|
/**
|
|
58
|
-
* Compute the log-probability of a sequence under the
|
|
58
|
+
* Compute the log-probability of a sequence under the public accepted model.
|
|
59
59
|
*
|
|
60
|
-
* The sequence is re-encoded via LZ76, producing its canonical
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
*
|
|
60
|
+
* The sequence is re-encoded via LZ76, producing its canonical raw
|
|
61
|
+
* constrained walk, and then normalized by the same cached Monte Carlo
|
|
62
|
+
* absorption estimate used by simulate(). This keeps lzpgen() consistent
|
|
63
|
+
* with public generation semantics without the exact state explosion.
|
|
64
64
|
*
|
|
65
65
|
* @param g The graph.
|
|
66
66
|
* @param seq Null-terminated amino acid sequence.
|
|
@@ -103,6 +103,11 @@ typedef struct {
|
|
|
103
103
|
* 2. Walk with edges filtered by LZ + live + V/J gene presence
|
|
104
104
|
* 3. Backtrack on dead ends (stack with blacklist)
|
|
105
105
|
*
|
|
106
|
+
* The reported log_prob is the gene-conditional walk probability: the
|
|
107
|
+
* product of transition probabilities over gene-filtered, LZ-valid edges.
|
|
108
|
+
* This is a different probability law from the unconditional model returned
|
|
109
|
+
* by simulate() / lzg_walk_log_prob(). The two are not directly comparable.
|
|
110
|
+
*
|
|
106
111
|
* @param g Graph with gene_data != NULL.
|
|
107
112
|
* @param n Number of sequences to generate.
|
|
108
113
|
* @param rng RNG state.
|