LZGraphs 3.0.0__tar.gz → 3.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/MANIFEST.in +1 -0
  2. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/Makefile +6 -1
  3. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/PKG-INFO +3 -4
  4. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/analytics.h +60 -15
  5. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/diversity.h +2 -3
  6. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/edge_builder.h +4 -2
  7. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/gene_data.h +2 -2
  8. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/graph.h +35 -5
  9. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/hash_map.h +16 -0
  10. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/io.h +4 -4
  11. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/pgen_dist.h +15 -10
  12. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/posterior.h +1 -1
  13. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/simulate.h +22 -17
  14. lzgraphs-3.0.2/include/lzgraph/walk_dict.h +232 -0
  15. lzgraphs-3.0.2/lib/analytics/analytics.c +183 -0
  16. lzgraphs-3.0.2/lib/analytics/analytics_mc.c +136 -0
  17. lzgraphs-3.0.2/lib/analytics/analytics_mc.h +31 -0
  18. lzgraphs-3.0.2/lib/analytics/diversity.c +44 -0
  19. lzgraphs-3.0.2/lib/analytics/diversity_compare.c +107 -0
  20. lzgraphs-3.0.2/lib/analytics/diversity_counting.c +150 -0
  21. lzgraphs-3.0.2/lib/analytics/diversity_internal.h +32 -0
  22. lzgraphs-3.0.2/lib/analytics/diversity_perplexity.c +87 -0
  23. lzgraphs-3.0.2/lib/analytics/hill_curve.c +55 -0
  24. lzgraphs-3.0.2/lib/analytics/pgen_dist.c +74 -0
  25. lzgraphs-3.0.2/lib/analytics/pgen_mixture.c +117 -0
  26. lzgraphs-3.0.2/lib/analytics/pgen_mixture.h +10 -0
  27. lzgraphs-3.0.2/lib/analytics/pgen_moments.c +103 -0
  28. lzgraphs-3.0.2/lib/analytics/pgen_moments.h +8 -0
  29. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/hash_map.c +88 -34
  30. lzgraphs-3.0.2/lib/graph/csr_graph.c +643 -0
  31. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/edge_builder.c +18 -17
  32. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/gene_data.c +2 -2
  33. lzgraphs-3.0.2/lib/graph/graph_build_ingest.c +582 -0
  34. lzgraphs-3.0.2/lib/graph/graph_build_ingest.h +109 -0
  35. lzgraphs-3.0.2/lib/graph/graph_finalize.c +396 -0
  36. lzgraphs-3.0.2/lib/graph/graph_finalize.h +30 -0
  37. lzgraphs-3.0.2/lib/graph/graph_ops.c +31 -0
  38. lzgraphs-3.0.2/lib/graph/graph_ops_internal.h +18 -0
  39. lzgraphs-3.0.2/lib/graph/graph_set_ops.c +364 -0
  40. lzgraphs-3.0.2/lib/graph/graph_summary.c +36 -0
  41. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/lz76.c +103 -26
  42. lzgraphs-3.0.2/lib/io/io.c +10 -0
  43. lzgraphs-3.0.2/lib/io/io_internal.c +87 -0
  44. lzgraphs-3.0.2/lib/io/io_internal.h +37 -0
  45. lzgraphs-3.0.2/lib/io/io_reader.c +548 -0
  46. lzgraphs-3.0.2/lib/io/io_reader.h +8 -0
  47. lzgraphs-3.0.2/lib/io/io_writer.c +380 -0
  48. lzgraphs-3.0.2/lib/io/io_writer.h +8 -0
  49. lzgraphs-3.0.2/lib/occupancy/occupancy.c +69 -0
  50. lzgraphs-3.0.2/lib/occupancy/occupancy_internal.h +47 -0
  51. lzgraphs-3.0.2/lib/occupancy/occupancy_numeric.c +83 -0
  52. lzgraphs-3.0.2/lib/occupancy/occupancy_richness.c +119 -0
  53. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/posterior/posterior.c +5 -5
  54. lzgraphs-3.0.2/lib/simulation/exact_model.c +122 -0
  55. lzgraphs-3.0.2/lib/simulation/exact_model.h +22 -0
  56. lzgraphs-3.0.2/lib/simulation/genomic_simulate.c +167 -0
  57. lzgraphs-3.0.2/lib/simulation/simulate.c +234 -0
  58. lzgraphs-3.0.2/lib/simulation/walk_engine.c +194 -0
  59. lzgraphs-3.0.2/lib/simulation/walk_engine.h +50 -0
  60. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/pyproject.toml +2 -3
  61. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/__init__.py +1 -1
  62. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_clzgraph.c +129 -41
  63. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_graph.py +187 -18
  64. lzgraphs-3.0.2/src/LZGraphs/_io.py +598 -0
  65. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_pgen_dist.py +2 -2
  66. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/cli.py +175 -15
  67. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/PKG-INFO +3 -4
  68. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/SOURCES.txt +32 -0
  69. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_analytics.py +91 -0
  70. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_cli.py +70 -1
  71. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_graph.py +84 -1
  72. lzgraphs-3.0.2/tests/test_io.py +60 -0
  73. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_known_distributions.py +35 -0
  74. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_operations.py +17 -0
  75. lzgraphs-3.0.0/include/lzgraph/walk_dict.h +0 -123
  76. lzgraphs-3.0.0/lib/analytics/analytics.c +0 -291
  77. lzgraphs-3.0.0/lib/analytics/diversity.c +0 -288
  78. lzgraphs-3.0.0/lib/analytics/pgen_dist.c +0 -259
  79. lzgraphs-3.0.0/lib/graph/csr_graph.c +0 -674
  80. lzgraphs-3.0.0/lib/graph/graph_ops.c +0 -481
  81. lzgraphs-3.0.0/lib/io/io.c +0 -598
  82. lzgraphs-3.0.0/lib/occupancy/occupancy.c +0 -213
  83. lzgraphs-3.0.0/lib/simulation/genomic_simulate.c +0 -264
  84. lzgraphs-3.0.0/lib/simulation/simulate.c +0 -303
  85. lzgraphs-3.0.0/src/LZGraphs/_io.py +0 -191
  86. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/CHANGELOG.md +0 -0
  87. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/CONTRIBUTING.md +0 -0
  88. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/LICENSE +0 -0
  89. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/README.md +0 -0
  90. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/common.h +0 -0
  91. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/crc32c.h +0 -0
  92. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/features.h +0 -0
  93. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/forward.h +0 -0
  94. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/genomic_simulate.h +0 -0
  95. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/graph_ops.h +0 -0
  96. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/lz76.h +0 -0
  97. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/occupancy.h +0 -0
  98. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/rng.h +0 -0
  99. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/sharing.h +0 -0
  100. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/string_pool.h +0 -0
  101. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/include/lzgraph/wynn.h +0 -0
  102. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/analytics/features.c +0 -0
  103. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/common.c +0 -0
  104. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/crc32c.c +0 -0
  105. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/rng.c +0 -0
  106. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/string_pool.c +0 -0
  107. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/core/wynn.c +0 -0
  108. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/graph/forward.c +0 -0
  109. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/lib/occupancy/sharing.c +0 -0
  110. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/setup.cfg +0 -0
  111. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/setup.py +0 -0
  112. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_errors.py +0 -0
  113. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs/_simulation_result.py +0 -0
  114. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
  115. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/entry_points.txt +0 -0
  116. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/requires.txt +0 -0
  117. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/src/LZGraphs.egg-info/top_level.txt +0 -0
  118. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_deep_audit.py +0 -0
  119. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_lzpgen.py +0 -0
  120. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_mathematician_audit.py +0 -0
  121. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_model_audit.py +0 -0
  122. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_simulate.py +0 -0
  123. {lzgraphs-3.0.0 → lzgraphs-3.0.2}/tests/test_statistical_validity.py +0 -0
@@ -4,4 +4,5 @@ include LICENSE
4
4
  include Makefile
5
5
  recursive-include include *.h
6
6
  recursive-include lib *.c
7
+ recursive-include lib *.h
7
8
  recursive-include src/LZGraphs *.c
@@ -6,7 +6,7 @@ LDFLAGS = -lm
6
6
  SRCS = $(shell find lib -name '*.c')
7
7
  OBJS = $(patsubst lib/%.c, build/%.o, $(SRCS))
8
8
 
9
- .PHONY: all clean test
9
+ .PHONY: all clean test bench_ops
10
10
 
11
11
  all: build/liblzgraph.a
12
12
 
@@ -123,3 +123,8 @@ build/test_walk_dict: tests/c_unit/test_walk_dict.c build/liblzgraph.a | build_d
123
123
 
124
124
  clean:
125
125
  rm -rf build
126
+
127
+ bench_ops: build/bench_ops
128
+
129
+ build/bench_ops: tools/bench_ops.c build/liblzgraph.a | build_dirs
130
+ $(CC) $(CFLAGS) $< -Lbuild -llzgraph $(LDFLAGS) -o $@
@@ -1,22 +1,21 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: LZGraphs
3
- Version: 3.0.0
3
+ Version: 3.0.2
4
4
  Summary: High-performance LZ76 compression graphs for immune receptor repertoire analysis
5
5
  Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
6
6
  Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
7
- License: MIT
7
+ License-Expression: MIT
8
8
  Project-URL: Homepage, https://github.com/MuteJester/LZGraphs
9
9
  Project-URL: Documentation, https://mutejester.github.io/LZGraphs/
10
10
  Project-URL: Repository, https://github.com/MuteJester/LZGraphs
11
11
  Project-URL: Issues, https://github.com/MuteJester/LZGraphs/issues
12
- Project-URL: Changelog, https://github.com/MuteJester/LZGraphs/blob/master/CHANGELOG.md
12
+ Project-URL: Changelog, https://github.com/MuteJester/LZGraphs/blob/master/docs/resources/changelog.md
13
13
  Keywords: Graph Theory,Immunology,Analytics,Biology,T-cell,Repertoire,CDR3,Bioinformatics,LZ76,Lempel-Ziv
14
14
  Classifier: Development Status :: 4 - Beta
15
15
  Classifier: Intended Audience :: Developers
16
16
  Classifier: Intended Audience :: Science/Research
17
17
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
18
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
19
- Classifier: License :: OSI Approved :: MIT License
20
19
  Classifier: Programming Language :: Python :: 3
21
20
  Classifier: Programming Language :: Python :: 3.9
22
21
  Classifier: Programming Language :: Python :: 3.10
@@ -1,10 +1,16 @@
1
1
  /**
2
2
  * @file analytics.h
3
- * @brief Graph-level analytics computed via LZ-constrained forward DP.
3
+ * @brief Graph-level analytics computed via LZ-constrained Monte Carlo.
4
4
  *
5
- * Every function here uses lzg_forward_propagate() internally, meaning
6
- * all results respect LZ76 dictionary constraints — only valid walks
7
- * contribute to the computed quantities.
5
+ * All public analytics are Monte Carlo estimates, not exact graph invariants.
6
+ * They sample the accepted-walk model (same as simulate()/lzpgen()) and
7
+ * compute diversity/entropy/support statistics from the sampled log-probs.
8
+ *
9
+ * Default sample count is LZG_ANALYTICS_DEFAULT_MC_SAMPLES (10000).
10
+ * Functions with _mc suffixes accept a custom sample count.
11
+ *
12
+ * pgen_diagnostics() estimates absorbed/leaked mass of the raw structural
13
+ * walk law via a separate MC run — see its doc for caveats.
8
14
  */
9
15
  #ifndef LZGRAPH_ANALYTICS_H
10
16
  #define LZGRAPH_ANALYTICS_H
@@ -12,23 +18,46 @@
12
18
  #include "lzgraph/common.h"
13
19
  #include "lzgraph/graph.h"
14
20
 
21
+ #define LZG_ANALYTICS_DEFAULT_MC_SAMPLES 10000u
22
+
15
23
  /* ── Simulation potential size ──────────────────────────────── */
16
24
 
17
25
  /**
18
- * Count the number of distinct LZ-valid walks (sequences) the graph
19
- * can produce from initial states to terminal states.
26
+ * Estimate the number of distinct completed LZ-valid walks (support size).
20
27
  */
21
28
  LZGError lzg_graph_path_count(const LZGGraph *g, double *out_count);
22
29
 
30
+ /**
31
+ * Monte Carlo version of lzg_graph_path_count().
32
+ *
33
+ * @param n_samples Number of simulated walks. Use 0 for the default.
34
+ */
35
+ LZGError lzg_graph_path_count_mc(const LZGGraph *g, uint32_t n_samples,
36
+ double *out_count);
37
+
23
38
  /* ── PGEN diagnostics ───────────────────────────────────────── */
24
39
 
25
40
  typedef struct {
26
- double total_absorbed; /* should be ≤ 1.0 */
27
- double total_leaked; /* mass at dead-end non-terminals */
28
- double initial_prob_sum; /* should be 1.0 */
29
- bool is_proper; /* |total_absorbed - 1.0| < atol */
41
+ double total_absorbed; /* MC estimate of absorbed mass (≤ 1.0) */
42
+ double total_leaked; /* 1 - total_absorbed */
43
+ double initial_prob_sum; /* should be 1.0 */
44
+ bool is_proper; /* |total_absorbed - 1.0| < atol */
45
+ uint32_t mc_samples; /* number of MC samples used */
30
46
  } LZGPgenDiagnostics;
31
47
 
48
+ /**
49
+ * Estimate absorption/leakage of the raw structural walk law via Monte Carlo.
50
+ *
51
+ * total_absorbed is a Monte Carlo estimate (not an exact proof): it reports
52
+ * the fraction of raw constrained walks that reached a sink node out of
53
+ * mc_samples trials. A value of 1.0 means "no leaks were observed," not
54
+ * "the graph is mathematically proven leak-free." Rare leakage events may
55
+ * be missed if mc_samples is too small relative to the leakage probability.
56
+ *
57
+ * @param g The graph.
58
+ * @param atol Tolerance for is_proper check.
59
+ * @param out Output diagnostics.
60
+ */
32
61
  LZGError lzg_pgen_diagnostics(const LZGGraph *g, double atol,
33
62
  LZGPgenDiagnostics *out);
34
63
 
@@ -46,18 +75,26 @@ LZGError lzg_effective_diversity(const LZGGraph *g, LZGEffectiveDiversity *out);
46
75
  /* ── Hill numbers ───────────────────────────────────────────── */
47
76
 
48
77
  /**
49
- * Compute M(α) = Σ_{valid walks} π(s)^α via LZ-constrained DP.
50
- * The Hill number is D(α) = M(α)^{1/(1-α)}.
78
+ * Estimate the power sum M(α) = Σ_s π(s)^α of the public accepted-walk model.
79
+ * In particular, M(1) = 1 for every graph with at least one live completion.
51
80
  */
52
81
  LZGError lzg_power_sum(const LZGGraph *g, double alpha, double *out_m);
53
82
 
54
83
  /**
55
- * Compute D(α) = M(α)^{1/(1-α)} for a single order.
56
- * For α = 0: returns simulation_potential_size.
57
- * For α = 1: returns exp(Shannon entropy).
84
+ * Estimate the classical Hill number of the public accepted-walk law.
85
+ * For α = 0: returns support size.
86
+ * For α = 1: returns exp(Shannon entropy of the absorbed law).
58
87
  */
59
88
  LZGError lzg_hill_number(const LZGGraph *g, double alpha, double *out_d);
60
89
 
90
+ /**
91
+ * Monte Carlo version of lzg_hill_number().
92
+ *
93
+ * @param n_samples Number of simulated walks. Use 0 for the default.
94
+ */
95
+ LZGError lzg_hill_number_mc(const LZGGraph *g, double alpha,
96
+ uint32_t n_samples, double *out_d);
97
+
61
98
  /**
62
99
  * Compute Hill numbers for multiple orders at once.
63
100
  *
@@ -69,6 +106,14 @@ LZGError lzg_hill_number(const LZGGraph *g, double alpha, double *out_d);
69
106
  LZGError lzg_hill_numbers(const LZGGraph *g, const double *orders,
70
107
  uint32_t n, double *out);
71
108
 
109
+ /**
110
+ * Monte Carlo version of lzg_hill_numbers().
111
+ *
112
+ * @param n_samples Number of simulated walks. Use 0 for the default.
113
+ */
114
+ LZGError lzg_hill_numbers_mc(const LZGGraph *g, const double *orders,
115
+ uint32_t n, uint32_t n_samples, double *out);
116
+
72
117
  /* ── PGEN dynamic range ─────────────────────────────────────── */
73
118
 
74
119
  typedef struct {
@@ -2,9 +2,8 @@
2
2
  * @file diversity.h
3
3
  * @brief Diversity metrics: perplexity, entropy rate, K-diversity, saturation.
4
4
  *
5
- * All probability-based metrics use the LZ-constrained model
6
- * (via lzg_walk_log_prob), giving correct values under
7
- * the proper probability distribution.
5
+ * All probability-based metrics use the public constrained walk model
6
+ * exposed through lzg_walk_log_prob().
8
7
  *
9
8
  * Counting-based metrics (K-diversity, saturation) operate on
10
9
  * raw sequences without touching the probability model.
@@ -20,7 +20,7 @@ typedef struct LZGEdgeBuilder_ {
20
20
  LZGHashMap *edge_map; /* key: pack(src,dst), value: edge index */
21
21
  uint32_t *src_ids; /* [n_edges] source node IDs */
22
22
  uint32_t *dst_ids; /* [n_edges] destination node IDs */
23
- uint32_t *counts; /* [n_edges] raw transition counts */
23
+ uint64_t *counts; /* [n_edges] raw transition counts */
24
24
  uint32_t n_edges;
25
25
  uint32_t capacity;
26
26
  } LZGEdgeBuilder;
@@ -34,10 +34,12 @@ void lzg_eb_destroy(LZGEdgeBuilder *eb);
34
34
  /**
35
35
  * Record a transition from src → dst with the given count.
36
36
  * If the edge already exists, increments its count.
37
+ * If `out_edge_idx` is non-NULL, returns the edge index that was updated.
37
38
  */
38
39
  LZGError lzg_eb_record(LZGEdgeBuilder *eb,
39
40
  uint32_t src_id, uint32_t dst_id,
40
- uint32_t count);
41
+ uint64_t count,
42
+ uint32_t *out_edge_idx);
41
43
 
42
44
  /** Pack key from (src, dst) pair. */
43
45
  static inline uint64_t lzg_eb_pack_key(uint32_t src, uint32_t dst) {
@@ -26,13 +26,13 @@ typedef struct LZGGeneData_ {
26
26
  /* ── Per-edge V gene counts (CSR-within-CSR) ── */
27
27
  uint32_t *v_offsets; /* [n_edges + 1]: range per edge */
28
28
  uint32_t *v_gene_ids; /* [total_v_entries]: interned gene IDs */
29
- uint32_t *v_gene_counts; /* [total_v_entries]: raw counts */
29
+ uint64_t *v_gene_counts; /* [total_v_entries]: raw counts */
30
30
  uint32_t total_v_entries;
31
31
 
32
32
  /* ── Per-edge J gene counts (CSR-within-CSR) ── */
33
33
  uint32_t *j_offsets;
34
34
  uint32_t *j_gene_ids;
35
- uint32_t *j_gene_counts;
35
+ uint64_t *j_gene_counts;
36
36
  uint32_t total_j_entries;
37
37
 
38
38
  /* ── Graph-level marginal distributions ── */
@@ -17,6 +17,8 @@
17
17
  #include "lzgraph/hash_map.h"
18
18
  #include "lzgraph/edge_builder.h"
19
19
 
20
+ struct LZGExactModel_;
21
+
20
22
  /**
21
23
  * LZGraph — directed acyclic graph with sentinel-bounded walks.
22
24
  *
@@ -37,7 +39,7 @@ typedef struct LZGGraph_ {
37
39
 
38
40
  /* ── Per-edge data (parallel to col_indices) ── */
39
41
  double *edge_weights; /* [n_edges]: normalized P(dst|src) */
40
- uint32_t *edge_counts; /* [n_edges]: raw transition counts */
42
+ uint64_t *edge_counts; /* [n_edges]: raw transition counts */
41
43
 
42
44
  /* ── Per-edge LZ constraint info ── */
43
45
  uint32_t *edge_sp_id; /* [n_edges]: interned subpattern of dst */
@@ -45,7 +47,7 @@ typedef struct LZGGraph_ {
45
47
  uint32_t *edge_prefix_id; /* [n_edges]: interned prefix of dst sp */
46
48
 
47
49
  /* ── Per-node data ── */
48
- uint32_t *outgoing_counts; /* [n_nodes]: total raw outgoing count */
50
+ uint64_t *outgoing_counts; /* [n_nodes]: total raw outgoing count */
49
51
  uint32_t *node_sp_id; /* [n_nodes]: interned subpattern string */
50
52
  uint8_t *node_sp_len; /* [n_nodes]: subpattern character length */
51
53
  uint32_t *node_pos; /* [n_nodes]: cumulative position integer */
@@ -59,7 +61,7 @@ typedef struct LZGGraph_ {
59
61
  bool topo_valid;
60
62
 
61
63
  /* ── Length distribution ── */
62
- uint32_t *length_counts; /* indexed by sequence length */
64
+ uint64_t *length_counts; /* indexed by sequence length */
63
65
  uint32_t max_length; /* largest observed sequence length */
64
66
 
65
67
  /* ── String pool (owns all string data) ── */
@@ -71,6 +73,15 @@ typedef struct LZGGraph_ {
71
73
 
72
74
  /* ── Gene data (optional, NULL if no gene columns provided) ── */
73
75
  struct LZGGeneData_ *gene_data;
76
+
77
+ /* ── Transient query cache (not serialized) ── */
78
+ LZGHashMap *query_node_map; /* structural key -> node index */
79
+ uint64_t *edge_sp_hash; /* [n_edges]: hash of destination token */
80
+ uint64_t *edge_prefix_hash; /* [n_edges]: hash of prefix token or 0 */
81
+ uint64_t *node_sp_hash; /* [n_nodes]: hash of node token */
82
+ uint8_t *edge_single_char_idx; /* [n_edges]: aa bit index or UINT8_MAX */
83
+ uint8_t *node_single_char_idx; /* [n_nodes]: aa bit index or UINT8_MAX */
84
+ struct LZGExactModel_ *exact_model_cache; /* accepted-model normalizer cache */
74
85
  } LZGGraph;
75
86
 
76
87
  /** Allocate and initialize an empty graph. */
@@ -95,12 +106,25 @@ void lzg_graph_destroy(LZGGraph *g);
95
106
  LZGError lzg_graph_build(LZGGraph *g,
96
107
  const char **sequences,
97
108
  uint32_t n_seqs,
98
- const uint32_t *abundances,
109
+ const uint64_t *abundances,
99
110
  const char **v_genes,
100
111
  const char **j_genes,
101
112
  double smoothing,
102
113
  uint32_t min_init);
103
114
 
115
+ /**
116
+ * Build a graph from a plain text file without materializing all sequences in Python.
117
+ *
118
+ * Supported input formats:
119
+ * - one sequence per line
120
+ * - sequence<TAB>abundance
121
+ *
122
+ * Gene columns and headered tabular formats are not supported by this path.
123
+ */
124
+ LZGError lzg_graph_build_plain_file(LZGGraph *g,
125
+ const char *path,
126
+ double smoothing);
127
+
104
128
  /** Compute topological sort (cached, invalidated on structural changes). */
105
129
  LZGError lzg_graph_topo_sort(LZGGraph *g);
106
130
 
@@ -116,7 +140,7 @@ LZGError lzg_graph_finalize_from_edges(
116
140
  LZGHashMap *initial_counts,
117
141
  LZGHashMap *terminal_counts,
118
142
  LZGHashMap *outgoing_counts,
119
- uint32_t *len_counts, uint32_t max_len);
143
+ uint64_t *len_counts, uint32_t max_len);
120
144
 
121
145
  /* ── Recalculation flags ───────────────────────────────────── */
122
146
 
@@ -132,6 +156,12 @@ typedef enum {
132
156
  */
133
157
  LZGError lzg_graph_recalculate(LZGGraph *g, uint32_t flags);
134
158
 
159
+ /**
160
+ * Internal: ensure transient query-side edge hash caches exist.
161
+ * These caches are derived from immutable edge metadata and are not serialized.
162
+ */
163
+ LZGError lzg_graph_ensure_query_edge_hashes(LZGGraph *g);
164
+
135
165
  /** Get the number of successors for node `node_id`. */
136
166
  static inline uint32_t lzg_graph_out_degree(const LZGGraph *g, uint32_t node_id) {
137
167
  return g->row_offsets[node_id + 1] - g->row_offsets[node_id];
@@ -27,6 +27,22 @@ bool lzg_hm_put(LZGHashMap *map, uint64_t key, uint64_t value);
27
27
  /** Lookup. Returns pointer to value or NULL. */
28
28
  uint64_t *lzg_hm_get(const LZGHashMap *map, uint64_t key);
29
29
 
30
+ /**
31
+ * Lookup or insert in a single probe walk. Returns a writable value slot.
32
+ * If the key is absent, inserts it with `initial_value`.
33
+ * If `inserted` is non-NULL, it is set to true iff a new entry was created.
34
+ */
35
+ uint64_t *lzg_hm_get_or_insert(LZGHashMap *map, uint64_t key,
36
+ uint64_t initial_value, bool *inserted);
37
+
38
+ /**
39
+ * Add `delta` to the value stored at `key`, inserting `delta` if absent.
40
+ * Returns a writable value slot after the update.
41
+ * If `inserted` is non-NULL, it is set to true iff a new entry was created.
42
+ */
43
+ uint64_t *lzg_hm_add_u64(LZGHashMap *map, uint64_t key,
44
+ uint64_t delta, bool *inserted);
45
+
30
46
  /** Delete. Returns true if found. */
31
47
  bool lzg_hm_delete(LZGHashMap *map, uint64_t key);
32
48
 
@@ -1,6 +1,6 @@
1
1
  /**
2
2
  * @file io.h
3
- * @brief LZG2 binary file format: section-based save/load with CRC-32C.
3
+ * @brief LZG binary file format: section-based save/load with CRC-32C.
4
4
  *
5
5
  * File extension: .lzg
6
6
  * See FILE_FORMAT.md for the complete specification.
@@ -14,7 +14,7 @@
14
14
  /* ── Format constants ──────────────────────────────────────── */
15
15
 
16
16
  #define LZG_IO_MAGIC 0x4C5A4732u /* "LZG2" */
17
- #define LZG_IO_FORMAT_VERSION 2
17
+ #define LZG_IO_FORMAT_VERSION 3
18
18
  #define LZG_IO_ENDIAN_LE 0x01
19
19
  #define LZG_IO_TRAILER_MAGIC 0x454E444Cu /* "ENDL" */
20
20
 
@@ -50,8 +50,8 @@
50
50
  LZG_PACKED_BEGIN
51
51
  typedef struct LZG_PACKED_ATTR {
52
52
  uint32_t magic; /* 0x4C5A4732 */
53
- uint16_t format_version; /* 2 */
54
- uint16_t min_reader_version; /* 2 */
53
+ uint16_t format_version; /* current format version */
54
+ uint16_t min_reader_version; /* minimum supported reader version */
55
55
  uint8_t endian_tag; /* 0x01 = LE */
56
56
  uint8_t variant; /* 0=AAP, 1=NDP, 2=Naive */
57
57
  uint8_t compression; /* 0=none, 1=zstd, 2=gzip */
@@ -2,14 +2,19 @@
2
2
  * @file pgen_dist.h
3
3
  * @brief LZPgen distribution: moments, analytical Gaussian mixture, pdf/cdf.
4
4
  *
5
- * The PGEN distribution characterizes the probability of generation
6
- * (log P(seq)) across all sequences producible by the graph.
5
+ * The PGEN distribution characterizes log-probability mass over graph walks.
7
6
  *
8
- * lzg_pgen_moments() computes exact mean, variance, skewness, kurtosis
9
- * via 5-dimensional moment propagation through the LZ-constrained DP.
7
+ * Important: the current implementation is powered by the unconstrained
8
+ * topological forward DP in forward.c. Its moments and Gaussian mixture
9
+ * therefore summarize the graph-topology walk law, not the exact
10
+ * LZ-constrained per-history walk law used by lzg_walk_log_prob().
11
+ *
12
+ * lzg_pgen_moments() computes mean, variance, skewness, kurtosis via
13
+ * 5-dimensional moment propagation through that unconstrained forward DP.
10
14
  *
11
15
  * lzg_pgen_analytical_distribution() produces a Gaussian mixture
12
- * (one component per walk length) for fast pdf/cdf/ppf evaluation.
16
+ * (one component per walk length) for fast pdf/cdf/ppf evaluation of
17
+ * the same unconstrained approximation.
13
18
  */
14
19
  #ifndef LZGRAPH_PGEN_DIST_H
15
20
  #define LZGRAPH_PGEN_DIST_H
@@ -21,12 +26,12 @@
21
26
  /* ── Moment results ────────────────────────────────────────── */
22
27
 
23
28
  typedef struct {
24
- double mean; /* E[log P] */
25
- double variance; /* Var[log P] */
29
+ double mean; /* E[log P] under the unconstrained forward law */
30
+ double variance; /* Var[log P] under the same law */
26
31
  double std; /* sqrt(variance) */
27
32
  double skewness; /* standardized 3rd cumulant */
28
33
  double kurtosis; /* standardized 4th cumulant */
29
- double total_mass; /* Σ π(s) over valid walks */
34
+ double total_mass; /* total absorbed mass in the unconstrained forward DP */
30
35
  } LZGPgenMoments;
31
36
 
32
37
  LZGError lzg_pgen_moments(const LZGGraph *g, LZGPgenMoments *out);
@@ -47,8 +52,8 @@ typedef struct {
47
52
  } LZGPgenDist;
48
53
 
49
54
  /**
50
- * Compute the analytical PGEN distribution as a Gaussian mixture.
51
- * One component per walk length, with exact per-length (weight, mean, std).
55
+ * Compute the analytical PGEN distribution as a Gaussian mixture over the
56
+ * unconstrained forward-DP walk law. One component per walk length.
52
57
  */
53
58
  LZGError lzg_pgen_analytical(const LZGGraph *g, LZGPgenDist *out);
54
59
 
@@ -34,7 +34,7 @@
34
34
  */
35
35
  LZGError lzg_graph_posterior(const LZGGraph *prior,
36
36
  const char **sequences, uint32_t n_seqs,
37
- const uint32_t *abundances,
37
+ const uint64_t *abundances,
38
38
  double kappa,
39
39
  LZGGraph **out);
40
40
 
@@ -2,14 +2,14 @@
2
2
  * @file simulate.h
3
3
  * @brief LZ76-constrained sequence simulation and walk probability.
4
4
  *
5
- * simulate() generates sequences by random walks that respect LZ76
6
- * dictionary constraints at every step: single-char tokens must be
7
- * novel, multi-char tokens must extend a known prefix. Edge weights
8
- * are renormalized over the LZ-valid successor set at each step.
5
+ * simulate() generates sequences from a scalable accepted-sequence
6
+ * approximation: raw constrained walks are sampled forward until one
7
+ * reaches a terminal sink, and probabilities are normalized by a cached
8
+ * Monte Carlo estimate of the graph's absorption mass.
9
9
  *
10
- * walk_log_probability() computes the log-probability of a given
11
- * sequence under the same LZ-constrained model by re-encoding via
12
- * LZ76 and tracing the canonical walk with renormalized weights.
10
+ * walk_log_probability() uses that same cached normalization estimate, so
11
+ * its output stays consistent with simulate() without materializing the full
12
+ * history-expanded exact state space.
13
13
  */
14
14
  #ifndef LZGRAPH_SIMULATE_H
15
15
  #define LZGRAPH_SIMULATE_H
@@ -31,12 +31,12 @@ typedef struct {
31
31
  } LZGSimResult;
32
32
 
33
33
  /**
34
- * Simulate n sequences from the LZ-constrained generative model.
34
+ * Simulate n sequences from the public accepted-sequence model.
35
35
  *
36
- * Each walk enforces LZ76 dictionary constraints: at each step, only
37
- * successors whose subpattern is LZ76-valid given the accumulated
38
- * dictionary are eligible. Edge weights are renormalized over the
39
- * valid subset.
36
+ * Each trial walk enforces LZ76 dictionary constraints exactly, stops on
37
+ * dead ends, and retries until a completed sink-reaching walk is obtained.
38
+ * This keeps generation scalable on large graphs while still returning only
39
+ * completed accepted walks.
40
40
  *
41
41
  * @param g The graph (must be finalized + topo-sorted).
42
42
  * @param n Number of sequences to generate.
@@ -55,12 +55,12 @@ static inline void lzg_sim_result_free(LZGSimResult *r) {
55
55
  }
56
56
 
57
57
  /**
58
- * Compute the log-probability of a sequence under the LZ-constrained model.
58
+ * Compute the log-probability of a sequence under the public accepted model.
59
59
  *
60
- * The sequence is re-encoded via LZ76, producing its canonical walk.
61
- * At each step, the edge weight is renormalized by the sum of weights
62
- * of LZ-valid successors (given the accumulated LZ dictionary at that
63
- * point in the walk). This makes the result consistent with simulate().
60
+ * The sequence is re-encoded via LZ76, producing its canonical raw
61
+ * constrained walk, and then normalized by the same cached Monte Carlo
62
+ * absorption estimate used by simulate(). This keeps lzpgen() consistent
63
+ * with public generation semantics without the exact state explosion.
64
64
  *
65
65
  * @param g The graph.
66
66
  * @param seq Null-terminated amino acid sequence.
@@ -103,6 +103,11 @@ typedef struct {
103
103
  * 2. Walk with edges filtered by LZ + live + V/J gene presence
104
104
  * 3. Backtrack on dead ends (stack with blacklist)
105
105
  *
106
+ * The reported log_prob is the gene-conditional walk probability: the
107
+ * product of transition probabilities over gene-filtered, LZ-valid edges.
108
+ * This is a different probability law from the unconditional model returned
109
+ * by simulate() / lzg_walk_log_prob(). The two are not directly comparable.
110
+ *
106
111
  * @param g Graph with gene_data != NULL.
107
112
  * @param n Number of sequences to generate.
108
113
  * @param rng RNG state.