LZGraphs 2.1.2__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/PKG-INFO +39 -28
  2. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/README.md +35 -26
  3. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/pyproject.toml +5 -2
  4. lzgraphs-2.3.0/setup.py +40 -0
  5. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/__init__.py +1 -1
  6. lzgraphs-2.3.0/src/LZGraphs/_fast_walk.c +321 -0
  7. lzgraphs-2.3.0/src/LZGraphs/constants.py +6 -0
  8. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/amino_acid_positional.py +87 -115
  9. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/graph_operations.py +4 -13
  10. lzgraphs-2.3.0/src/LZGraphs/graphs/lz_graph_base.py +1066 -0
  11. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/naive.py +6 -6
  12. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/nucleotide_double_positional.py +38 -45
  13. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/convenience.py +3 -4
  14. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/saturation.py +56 -47
  15. lzgraphs-2.3.0/src/LZGraphs/mixins/__init__.py +8 -0
  16. lzgraphs-2.3.0/src/LZGraphs/mixins/bayesian_posterior.py +267 -0
  17. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/mixins/gene_logic.py +40 -14
  18. lzgraphs-2.3.0/src/LZGraphs/mixins/graph_topology.py +59 -0
  19. lzgraphs-2.3.0/src/LZGraphs/mixins/lzpgen_distribution.py +457 -0
  20. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/mixins/random_walk.py +2 -2
  21. lzgraphs-2.3.0/src/LZGraphs/mixins/serialization.py +628 -0
  22. lzgraphs-2.3.0/src/LZGraphs/mixins/walk_analysis.py +177 -0
  23. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/PKG-INFO +39 -28
  24. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/SOURCES.txt +8 -0
  25. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/requires.txt +3 -1
  26. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_base_class_methods.py +8 -8
  27. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_diversity_theory.py +4 -4
  28. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_graph_operations.py +3 -3
  29. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_lzpgen_distribution.py +6 -1
  30. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_metrics.py +4 -4
  31. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_new_features.py +3 -3
  32. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_utilities.py +12 -14
  33. lzgraphs-2.1.2/src/LZGraphs/graphs/lz_graph_base.py +0 -2201
  34. lzgraphs-2.1.2/src/LZGraphs/mixins/__init__.py +0 -3
  35. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/CHANGELOG.md +0 -0
  36. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/CONTRIBUTING.md +0 -0
  37. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/LICENSE +0 -0
  38. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/MANIFEST.in +0 -0
  39. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/requirements.txt +0 -0
  40. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/setup.cfg +0 -0
  41. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/bag_of_words/__init__.py +0 -0
  42. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/bag_of_words/bow_encoder.py +0 -0
  43. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/exceptions/__init__.py +0 -0
  44. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/__init__.py +0 -0
  45. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/edge_data.py +0 -0
  46. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/__init__.py +0 -0
  47. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/diversity.py +0 -0
  48. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/entropy.py +0 -0
  49. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/pgen_distribution.py +0 -0
  50. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/mixins/gene_prediction.py +0 -0
  51. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/py.typed +0 -0
  52. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/__init__.py +0 -0
  53. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/decomposition.py +0 -0
  54. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/helpers.py +0 -0
  55. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/misc.py +0 -0
  56. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/visualization/__init__.py +0 -0
  57. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/visualization/visualize.py +0 -0
  58. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
  59. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/top_level.txt +0 -0
  60. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_aap_lzgraph.py +0 -0
  61. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_abundance.py +0 -0
  62. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_analytical_distribution.py +0 -0
  63. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_bow_encoder.py +0 -0
  64. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_flexible_input.py +0 -0
  65. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_naive_lzgraph.py +0 -0
  66. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_ndp_lzgraph.py +0 -0
  67. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_pgen_fixes.py +0 -0
  68. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_serialization.py +0 -0
  69. {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_simulate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: LZGraphs
3
- Version: 2.1.2
3
+ Version: 2.3.0
4
4
  Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
5
5
  Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
6
6
  Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
@@ -29,7 +29,6 @@ Description-Content-Type: text/markdown
29
29
  License-File: LICENSE
30
30
  Requires-Dist: networkx>=3.0
31
31
  Requires-Dist: numpy>=1.24
32
- Requires-Dist: pandas>=1.5
33
32
  Requires-Dist: tqdm>=4.65
34
33
  Requires-Dist: scipy>=1.10
35
34
  Provides-Extra: viz
@@ -38,6 +37,7 @@ Requires-Dist: seaborn>=0.12; extra == "viz"
38
37
  Provides-Extra: dev
39
38
  Requires-Dist: pytest>=7.0; extra == "dev"
40
39
  Requires-Dist: pytest-cov>=4.0; extra == "dev"
40
+ Requires-Dist: pandas>=1.5; extra == "dev"
41
41
  Requires-Dist: black>=23.0; extra == "dev"
42
42
  Requires-Dist: isort>=5.12; extra == "dev"
43
43
  Requires-Dist: ruff>=0.1.0; extra == "dev"
@@ -45,6 +45,8 @@ Requires-Dist: pre-commit>=3.0; extra == "dev"
45
45
  Requires-Dist: build>=1.0; extra == "dev"
46
46
  Requires-Dist: twine>=4.0; extra == "dev"
47
47
  Provides-Extra: docs
48
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
49
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
48
50
  Dynamic: license-file
49
51
 
50
52
  <p align="center">
@@ -125,6 +127,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
125
127
  - **Repertoire comparison** -- compare two repertoires via graph-level statistics
126
128
  - **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
127
129
  - **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
130
+ - **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
128
131
  - **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
129
132
  - **Serialization** -- save and load graphs in JSON format
130
133
 
@@ -150,23 +153,20 @@ print(LZGraphs.__version__)
150
153
  Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
151
154
 
152
155
  ```python
153
- import pandas as pd
154
156
  from LZGraphs import AAPLZGraph
155
157
 
156
- # Prepare data as a DataFrame with a 'cdr3_amino_acid' column
157
- data = pd.DataFrame({
158
- 'cdr3_amino_acid': [
159
- 'CASSLAPGATNEKLFF',
160
- 'CASSLGQAYEQYF',
161
- 'CASSFSTCSANYGYTF',
162
- 'CASSQEGTEAFF',
163
- 'CASSLGQGNIQYF',
164
- # ... your CDR3 amino acid sequences
165
- ]
166
- })
158
+ # Pass a plain list of CDR3 amino acid sequences
159
+ sequences = [
160
+ 'CASSLAPGATNEKLFF',
161
+ 'CASSLGQAYEQYF',
162
+ 'CASSFSTCSANYGYTF',
163
+ 'CASSQEGTEAFF',
164
+ 'CASSLGQGNIQYF',
165
+ # ... your CDR3 amino acid sequences
166
+ ]
167
167
 
168
168
  # Construct the graph
169
- graph = AAPLZGraph(data, verbose=True)
169
+ graph = AAPLZGraph(sequences, verbose=True)
170
170
 
171
171
  # Compute the log-probability of a sequence under the model
172
172
  log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
@@ -221,15 +221,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
221
221
 
222
222
  ### Gene Annotation
223
223
 
224
- All three graph types support optional V and J gene annotation. Include `V` and `J` columns in your DataFrame (or pass them separately for NaiveLZGraph) to track gene usage on graph edges:
224
+ All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
225
225
 
226
226
  ```python
227
- data = pd.DataFrame({
228
- 'cdr3_amino_acid': sequences,
229
- 'V': v_genes,
230
- 'J': j_genes,
231
- })
232
- graph = AAPLZGraph(data, verbose=True)
227
+ sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
228
+ v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
229
+ j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
230
+
231
+ graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
233
232
 
234
233
  # Gene data is now available
235
234
  print(graph.has_gene_data) # True
@@ -248,16 +247,14 @@ This is particularly important for:
248
247
  - **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
249
248
  - **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
250
249
 
251
- To use abundance weighting, include an `abundance` column in your DataFrame:
250
+ To use abundance weighting, pass an `abundances` list alongside your sequences:
252
251
 
253
252
  ```python
254
- data = pd.DataFrame({
255
- 'cdr3_amino_acid': ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF'],
256
- 'abundance': [150, 42, 7],
257
- })
253
+ sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
254
+ abundances = [150, 42, 7]
258
255
 
259
256
  # Each sequence is weighted by its abundance during graph construction
260
- graph = AAPLZGraph(data, verbose=True)
257
+ graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
261
258
  ```
262
259
 
263
260
  For `NaiveLZGraph`, pass abundances as a separate parameter:
@@ -335,6 +332,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
335
332
  comparison = compare_repertoires(graph1, graph2)
336
333
  ```
337
334
 
335
+ ### Bayesian Posterior Personalization
336
+
337
+ ```python
338
+ # Adapt a population graph to an individual
339
+ posterior = population_graph.get_posterior(
340
+ individual_sequences,
341
+ abundances=clonal_counts,
342
+ kappa=100.0 # prior strength
343
+ )
344
+
345
+ # The posterior is a full graph
346
+ simulated = posterior.simulate(1000, seed=42)
347
+ ```
348
+
338
349
  ### Visualization
339
350
 
340
351
  ```python
@@ -76,6 +76,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
76
76
  - **Repertoire comparison** -- compare two repertoires via graph-level statistics
77
77
  - **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
78
78
  - **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
79
+ - **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
79
80
  - **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
80
81
  - **Serialization** -- save and load graphs in JSON format
81
82
 
@@ -101,23 +102,20 @@ print(LZGraphs.__version__)
101
102
  Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
102
103
 
103
104
  ```python
104
- import pandas as pd
105
105
  from LZGraphs import AAPLZGraph
106
106
 
107
- # Prepare data as a DataFrame with a 'cdr3_amino_acid' column
108
- data = pd.DataFrame({
109
- 'cdr3_amino_acid': [
110
- 'CASSLAPGATNEKLFF',
111
- 'CASSLGQAYEQYF',
112
- 'CASSFSTCSANYGYTF',
113
- 'CASSQEGTEAFF',
114
- 'CASSLGQGNIQYF',
115
- # ... your CDR3 amino acid sequences
116
- ]
117
- })
107
+ # Pass a plain list of CDR3 amino acid sequences
108
+ sequences = [
109
+ 'CASSLAPGATNEKLFF',
110
+ 'CASSLGQAYEQYF',
111
+ 'CASSFSTCSANYGYTF',
112
+ 'CASSQEGTEAFF',
113
+ 'CASSLGQGNIQYF',
114
+ # ... your CDR3 amino acid sequences
115
+ ]
118
116
 
119
117
  # Construct the graph
120
- graph = AAPLZGraph(data, verbose=True)
118
+ graph = AAPLZGraph(sequences, verbose=True)
121
119
 
122
120
  # Compute the log-probability of a sequence under the model
123
121
  log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
@@ -172,15 +170,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
172
170
 
173
171
  ### Gene Annotation
174
172
 
175
- All three graph types support optional V and J gene annotation. Include `V` and `J` columns in your DataFrame (or pass them separately for NaiveLZGraph) to track gene usage on graph edges:
173
+ All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
176
174
 
177
175
  ```python
178
- data = pd.DataFrame({
179
- 'cdr3_amino_acid': sequences,
180
- 'V': v_genes,
181
- 'J': j_genes,
182
- })
183
- graph = AAPLZGraph(data, verbose=True)
176
+ sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
177
+ v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
178
+ j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
179
+
180
+ graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
184
181
 
185
182
  # Gene data is now available
186
183
  print(graph.has_gene_data) # True
@@ -199,16 +196,14 @@ This is particularly important for:
199
196
  - **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
200
197
  - **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
201
198
 
202
- To use abundance weighting, include an `abundance` column in your DataFrame:
199
+ To use abundance weighting, pass an `abundances` list alongside your sequences:
203
200
 
204
201
  ```python
205
- data = pd.DataFrame({
206
- 'cdr3_amino_acid': ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF'],
207
- 'abundance': [150, 42, 7],
208
- })
202
+ sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
203
+ abundances = [150, 42, 7]
209
204
 
210
205
  # Each sequence is weighted by its abundance during graph construction
211
- graph = AAPLZGraph(data, verbose=True)
206
+ graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
212
207
  ```
213
208
 
214
209
  For `NaiveLZGraph`, pass abundances as a separate parameter:
@@ -286,6 +281,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
286
281
  comparison = compare_repertoires(graph1, graph2)
287
282
  ```
288
283
 
284
+ ### Bayesian Posterior Personalization
285
+
286
+ ```python
287
+ # Adapt a population graph to an individual
288
+ posterior = population_graph.get_posterior(
289
+ individual_sequences,
290
+ abundances=clonal_counts,
291
+ kappa=100.0 # prior strength
292
+ )
293
+
294
+ # The posterior is a full graph
295
+ simulated = posterior.simulate(1000, seed=42)
296
+ ```
297
+
289
298
  ### Visualization
290
299
 
291
300
  ```python
@@ -45,7 +45,6 @@ classifiers = [
45
45
  dependencies = [
46
46
  "networkx>=3.0",
47
47
  "numpy>=1.24",
48
- "pandas>=1.5",
49
48
  "tqdm>=4.65",
50
49
  "scipy>=1.10",
51
50
  ]
@@ -58,6 +57,7 @@ viz = [
58
57
  dev = [
59
58
  "pytest>=7.0",
60
59
  "pytest-cov>=4.0",
60
+ "pandas>=1.5",
61
61
  "black>=23.0",
62
62
  "isort>=5.12",
63
63
  "ruff>=0.1.0",
@@ -65,7 +65,10 @@ dev = [
65
65
  "build>=1.0",
66
66
  "twine>=4.0",
67
67
  ]
68
- docs = []
68
+ docs = [
69
+ "mkdocs-material>=9.5",
70
+ "mkdocstrings[python]>=0.24",
71
+ ]
69
72
 
70
73
  [project.urls]
71
74
  Homepage = "https://github.com/MuteJester/LZGraphs"
@@ -0,0 +1,40 @@
1
+ """
2
+ Build script for optional C extensions.
3
+
4
+ The _fast_walk extension accelerates LZGraph.simulate() by ~50-100x.
5
+ If compilation fails (no C compiler), the package still installs and
6
+ falls back to the pure-Python implementation automatically.
7
+ """
8
+
9
+ import os
10
+ import sys
11
+ from setuptools import setup, Extension
12
+
13
+ # Ensure setuptools can resolve the dynamic version (attr = "LZGraphs.__version__")
14
+ # when running in an isolated build environment where src/ isn't on sys.path.
15
+ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))
16
+
17
ext_modules = [
    Extension(
        "LZGraphs._fast_walk",
        sources=[os.path.join("src", "LZGraphs", "_fast_walk.c")],
        # No external library dependencies — pure C + Python.h
        #
        # optional=True is what actually makes the extension optional:
        # build_ext then reports a failed compile as a warning and skips the
        # extension. Without it, setup() turns compiler errors into
        # SystemExit, which the `except Exception` fallback below can never
        # catch, so a missing compiler would abort the whole installation.
        optional=True,
    ),
]


def run_setup(extensions):
    """Invoke setuptools.setup() with the given list of extension modules.

    All other project metadata is expected to come from the declarative
    configuration; only ``ext_modules`` is supplied here.
    """
    setup(ext_modules=extensions)


try:
    run_setup(ext_modules)
except Exception:
    # Last-resort fallback for failures raised as ordinary exceptions:
    # retry as a pure-Python package so the install can still succeed.
    print(
        "\n"
        "WARNING: Failed to compile C extension _fast_walk.\n"
        " LZGraphs will use the pure-Python fallback for simulate().\n"
        " This is fine — the package works without it, just slower.\n"
        "\n"
    )
    run_setup([])
@@ -1,4 +1,4 @@
1
- __version__ = "2.1.2"
1
+ __version__ = "2.3.0"
2
2
 
3
3
  # =============================================================================
4
4
  # Graph classes
@@ -0,0 +1,321 @@
1
+ /*
2
+ * _fast_walk.c — CPython C extension for fast Markov chain random walks.
3
+ *
4
+ * Implements the full simulate() loop in C including string assembly,
5
+ * for ~100-200x speedup over the original pure-Python implementation.
6
+ * Uses xoshiro256++ for fast, high-quality RNG.
7
+ *
8
+ * The extension is optional: if it fails to compile (no C compiler),
9
+ * LZGraphs falls back to the pure-Python bisect-based implementation.
10
+ */
11
+
12
+ #define PY_SSIZE_T_CLEAN
13
+ #include <Python.h>
14
+ #include <stdint.h>
15
+ #include <string.h>
16
+
17
+ /* ========================================================================
18
+ * xoshiro256++ RNG — public domain by David Blackman and Sebastiano Vigna
19
+ * ======================================================================== */
20
+
21
/*
 * Rotate the 64-bit value `x` left by `k` bit positions.
 *
 * Both shift counts are reduced modulo 64, so the function is well defined
 * for every `k` (including 0 and multiples of 64, where a plain
 * `x >> (64 - k)` would shift by the full type width, which is undefined
 * behaviour). For 1 <= k <= 63 the result is identical to the classic
 * `(x << k) | (x >> (64 - k))` form.
 */
static inline uint64_t rotl(const uint64_t x, int k) {
    const unsigned int left = (unsigned int)k & 63u;
    const unsigned int right = (0u - (unsigned int)k) & 63u;
    return (x << left) | (x >> right);
}
24
+
25
/* Generator state for xoshiro256++: four 64-bit words. */
typedef struct xoshiro256_state {
    uint64_t s[4];
} xoshiro256_state;
28
+
29
+ static inline uint64_t xoshiro256pp_next(xoshiro256_state *state) {
30
+ const uint64_t result = rotl(state->s[0] + state->s[3], 23) + state->s[0];
31
+ const uint64_t t = state->s[1] << 17;
32
+ state->s[2] ^= state->s[0];
33
+ state->s[3] ^= state->s[1];
34
+ state->s[1] ^= state->s[2];
35
+ state->s[0] ^= state->s[3];
36
+ state->s[2] ^= t;
37
+ state->s[3] = rotl(state->s[3], 45);
38
+ return result;
39
+ }
40
+
41
+ static inline double xoshiro256pp_double(xoshiro256_state *state) {
42
+ return (double)(xoshiro256pp_next(state) >> 11) * 0x1.0p-53;
43
+ }
44
+
45
/*
 * One step of the splitmix64 generator.
 *
 * Adds the fixed increment to the counter `*x` (updating it in place) and
 * returns a mixed version of the new counter value.
 */
static inline uint64_t splitmix64(uint64_t *x) {
    *x += 0x9e3779b97f4a7c15ULL;

    uint64_t mixed = *x;
    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9ULL;
    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111ebULL;
    mixed ^= mixed >> 31;
    return mixed;
}
51
+
52
+ static void seed_xoshiro256(xoshiro256_state *state, uint64_t seed) {
53
+ state->s[0] = splitmix64(&seed);
54
+ state->s[1] = splitmix64(&seed);
55
+ state->s[2] = splitmix64(&seed);
56
+ state->s[3] = splitmix64(&seed);
57
+ }
58
+
59
+ /* ========================================================================
60
+ * Binary search (bisect_left) on a double array
61
+ * ======================================================================== */
62
+
63
+ static inline Py_ssize_t bisect_left_double(
64
+ const double *arr, Py_ssize_t n, double value
65
+ ) {
66
+ Py_ssize_t lo = 0, hi = n;
67
+ while (lo < hi) {
68
+ Py_ssize_t mid = lo + (hi - lo) / 2;
69
+ if (arr[mid] < value)
70
+ lo = mid + 1;
71
+ else
72
+ hi = mid;
73
+ }
74
+ return lo;
75
+ }
76
+
77
+ /* ========================================================================
78
+ * simulate_walks — full simulation with string assembly in C
79
+ *
80
+ * Args:
81
+ * n_walks : int
82
+ * offsets : intp array [n_nodes+1] (buffer)
83
+ * neighbors : intp array [total_edges] (buffer)
84
+ * cumweights : float64 array [total_edges] (buffer)
85
+ * stop_probs : float64 array [n_nodes] (buffer)
86
+ * initial_ids : intp array [n_initial] (buffer)
87
+ * initial_cw : float64 array [n_initial] (buffer)
88
+ * seed : uint64
89
+ * clean_labels : list[str] — label for each node ID
90
+ * return_walks : bool — if True, return (walk, seq) tuples
91
+ * id_to_node : list[str] — node names (only used if return_walks)
92
+ *
93
+ * Returns:
94
+ * list[str] or list[tuple[list[str], str]]
95
+ * ======================================================================== */
96
+
97
+ static PyObject* py_simulate_walks(PyObject *self, PyObject *args) {
98
+ int n_walks, return_walks;
99
+ Py_buffer offsets_buf, neighbors_buf, cumweights_buf;
100
+ Py_buffer stop_probs_buf, initial_ids_buf, initial_cw_buf;
101
+ unsigned long long seed;
102
+ PyObject *clean_labels; /* Python list of str */
103
+ PyObject *id_to_node; /* Python list of str */
104
+ PyObject *result_list = NULL;
105
+
106
+ if (!PyArg_ParseTuple(args, "iy*y*y*y*y*y*KOpO",
107
+ &n_walks,
108
+ &offsets_buf, &neighbors_buf, &cumweights_buf,
109
+ &stop_probs_buf, &initial_ids_buf, &initial_cw_buf,
110
+ &seed,
111
+ &clean_labels,
112
+ &return_walks,
113
+ &id_to_node))
114
+ return NULL;
115
+
116
+ const Py_ssize_t *offsets = (const Py_ssize_t *)offsets_buf.buf;
117
+ const Py_ssize_t *neighbors = (const Py_ssize_t *)neighbors_buf.buf;
118
+ const double *cumweights = (const double *)cumweights_buf.buf;
119
+ const double *stop_probs = (const double *)stop_probs_buf.buf;
120
+ const Py_ssize_t *initial_ids = (const Py_ssize_t *)initial_ids_buf.buf;
121
+ const double *initial_cw = (const double *)initial_cw_buf.buf;
122
+ const Py_ssize_t n_initial = initial_cw_buf.len / (Py_ssize_t)sizeof(double);
123
+
124
+ if (n_initial <= 0) {
125
+ PyErr_SetString(PyExc_ValueError,
126
+ "Cannot simulate: graph has no initial states.");
127
+ goto cleanup;
128
+ }
129
+
130
+ /* Pre-fetch label UTF-8 data for fast string assembly */
131
+ const Py_ssize_t n_labels = PyList_GET_SIZE(clean_labels);
132
+ const char **label_ptrs = (const char **)PyMem_Malloc(n_labels * sizeof(char *));
133
+ Py_ssize_t *label_lens = (Py_ssize_t *)PyMem_Malloc(n_labels * sizeof(Py_ssize_t));
134
+ if (!label_ptrs || !label_lens) {
135
+ PyMem_Free(label_ptrs);
136
+ PyMem_Free(label_lens);
137
+ PyErr_NoMemory();
138
+ goto cleanup;
139
+ }
140
+ for (Py_ssize_t i = 0; i < n_labels; i++) {
141
+ PyObject *s = PyList_GET_ITEM(clean_labels, i);
142
+ label_ptrs[i] = PyUnicode_AsUTF8AndSize(s, &label_lens[i]);
143
+ if (!label_ptrs[i]) {
144
+ PyMem_Free(label_ptrs);
145
+ PyMem_Free(label_lens);
146
+ goto cleanup;
147
+ }
148
+ }
149
+
150
+ xoshiro256_state rng;
151
+ seed_xoshiro256(&rng, (uint64_t)seed);
152
+
153
+ result_list = PyList_New(n_walks);
154
+ if (!result_list) {
155
+ PyMem_Free(label_ptrs);
156
+ PyMem_Free(label_lens);
157
+ goto cleanup;
158
+ }
159
+
160
+ /* Reusable walk buffer */
161
+ Py_ssize_t walk_cap = 64;
162
+ Py_ssize_t *walk_buf = (Py_ssize_t *)PyMem_Malloc(walk_cap * sizeof(Py_ssize_t));
163
+ /* Reusable string buffer */
164
+ Py_ssize_t str_cap = 256;
165
+ char *str_buf = (char *)PyMem_Malloc(str_cap);
166
+ if (!walk_buf || !str_buf) {
167
+ PyMem_Free(walk_buf);
168
+ PyMem_Free(str_buf);
169
+ PyMem_Free(label_ptrs);
170
+ PyMem_Free(label_lens);
171
+ Py_DECREF(result_list);
172
+ PyErr_NoMemory();
173
+ goto cleanup;
174
+ }
175
+
176
+ for (int i = 0; i < n_walks; i++) {
177
+ /* Pick initial state */
178
+ double r = xoshiro256pp_double(&rng);
179
+ Py_ssize_t init_idx = bisect_left_double(initial_cw, n_initial, r);
180
+ if (init_idx >= n_initial) init_idx = n_initial - 1;
181
+ Py_ssize_t current = initial_ids[init_idx];
182
+
183
+ Py_ssize_t walk_len = 0;
184
+ walk_buf[walk_len++] = current;
185
+
186
+ /* Build string incrementally */
187
+ Py_ssize_t str_len = 0;
188
+ Py_ssize_t llen = label_lens[current];
189
+ if (str_len + llen > str_cap) {
190
+ str_cap = (str_len + llen) * 2;
191
+ str_buf = (char *)PyMem_Realloc(str_buf, str_cap);
192
+ if (!str_buf) goto oom;
193
+ }
194
+ memcpy(str_buf + str_len, label_ptrs[current], llen);
195
+ str_len += llen;
196
+
197
+ while (1) {
198
+ double sp = stop_probs[current];
199
+ if (sp == sp) {
200
+ if (xoshiro256pp_double(&rng) < sp)
201
+ break;
202
+ }
203
+
204
+ Py_ssize_t start = offsets[current];
205
+ Py_ssize_t end = offsets[current + 1];
206
+ if (start == end)
207
+ break;
208
+
209
+ r = xoshiro256pp_double(&rng);
210
+ Py_ssize_t idx = bisect_left_double(cumweights + start, end - start, r);
211
+ if (idx >= end - start) idx = end - start - 1;
212
+ current = neighbors[start + idx];
213
+
214
+ /* Grow walk buffer if needed */
215
+ if (walk_len >= walk_cap) {
216
+ walk_cap *= 2;
217
+ Py_ssize_t *new_buf = (Py_ssize_t *)PyMem_Realloc(walk_buf, walk_cap * sizeof(Py_ssize_t));
218
+ if (!new_buf) goto oom;
219
+ walk_buf = new_buf;
220
+ }
221
+ walk_buf[walk_len++] = current;
222
+
223
+ /* Append label to string buffer */
224
+ llen = label_lens[current];
225
+ if (str_len + llen > str_cap) {
226
+ str_cap = (str_len + llen) * 2;
227
+ char *new_str = (char *)PyMem_Realloc(str_buf, str_cap);
228
+ if (!new_str) goto oom;
229
+ str_buf = new_str;
230
+ }
231
+ memcpy(str_buf + str_len, label_ptrs[current], llen);
232
+ str_len += llen;
233
+ }
234
+
235
+ /* Create Python string from buffer */
236
+ PyObject *seq = PyUnicode_FromStringAndSize(str_buf, str_len);
237
+ if (!seq) goto oom;
238
+
239
+ if (return_walks) {
240
+ /* Build walk list of node name strings */
241
+ PyObject *walk = PyList_New(walk_len);
242
+ if (!walk) { Py_DECREF(seq); goto oom; }
243
+ for (Py_ssize_t j = 0; j < walk_len; j++) {
244
+ PyObject *node_name = PyList_GET_ITEM(id_to_node, walk_buf[j]);
245
+ Py_INCREF(node_name);
246
+ PyList_SET_ITEM(walk, j, node_name);
247
+ }
248
+ PyObject *tup = PyTuple_Pack(2, walk, seq);
249
+ Py_DECREF(walk);
250
+ Py_DECREF(seq);
251
+ if (!tup) goto oom;
252
+ PyList_SET_ITEM(result_list, i, tup);
253
+ } else {
254
+ PyList_SET_ITEM(result_list, i, seq);
255
+ }
256
+ }
257
+
258
+ PyMem_Free(walk_buf);
259
+ PyMem_Free(str_buf);
260
+ PyMem_Free(label_ptrs);
261
+ PyMem_Free(label_lens);
262
+ goto cleanup;
263
+
264
+ oom:
265
+ PyMem_Free(walk_buf);
266
+ PyMem_Free(str_buf);
267
+ PyMem_Free(label_ptrs);
268
+ PyMem_Free(label_lens);
269
+ Py_XDECREF(result_list);
270
+ result_list = NULL;
271
+ if (!PyErr_Occurred())
272
+ PyErr_NoMemory();
273
+
274
+ cleanup:
275
+ PyBuffer_Release(&offsets_buf);
276
+ PyBuffer_Release(&neighbors_buf);
277
+ PyBuffer_Release(&cumweights_buf);
278
+ PyBuffer_Release(&stop_probs_buf);
279
+ PyBuffer_Release(&initial_ids_buf);
280
+ PyBuffer_Release(&initial_cw_buf);
281
+
282
+ return result_list;
283
+ }
284
+
285
+ /* ========================================================================
286
+ * Module definition
287
+ * ======================================================================== */
288
+
289
+ static PyMethodDef FastWalkMethods[] = {
290
+ {"simulate_walks", py_simulate_walks, METH_VARARGS,
291
+ "Run n random walks on a CSR-encoded graph with string assembly.\n\n"
292
+ "Args:\n"
293
+ " n_walks (int): Number of walks.\n"
294
+ " offsets (array): CSR row offsets [n_nodes+1], dtype=intp.\n"
295
+ " neighbors (array): Flat neighbor IDs, dtype=intp.\n"
296
+ " cumweights (array): Flat cumulative weights, dtype=float64.\n"
297
+ " stop_probs (array): Per-node stop probability (NaN=none), dtype=float64.\n"
298
+ " initial_ids (array): Initial state IDs, dtype=intp.\n"
299
+ " initial_cumprobs (array): Cumulative initial probs, dtype=float64.\n"
300
+ " seed (int): RNG seed (xoshiro256++).\n"
301
+ " clean_labels (list[str]): Subpattern label for each node.\n"
302
+ " return_walks (bool): If True, return (walk, seq) tuples.\n"
303
+ " id_to_node (list[str]): Node names for walk output.\n\n"
304
+ "Returns:\n"
305
+ " list[str] or list[tuple[list[str], str]]\n"},
306
+ {NULL, NULL, 0, NULL}
307
+ };
308
+
309
+ static struct PyModuleDef fast_walk_module = {
310
+ PyModuleDef_HEAD_INIT,
311
+ "_fast_walk",
312
+ "C-accelerated random walk simulation for LZGraphs.\n"
313
+ "Uses xoshiro256++ RNG for high-quality, fast random number generation.\n"
314
+ "This module is optional — LZGraphs falls back to pure Python if unavailable.",
315
+ -1,
316
+ FastWalkMethods
317
+ };
318
+
319
+ PyMODINIT_FUNC PyInit__fast_walk(void) {
320
+ return PyModule_Create(&fast_walk_module);
321
+ }
@@ -0,0 +1,6 @@
1
+ """Module-level numeric constants shared across LZGraphs internals."""
2
+ import numpy as np
3
+
4
+ # Machine epsilon — cached once at module level (avoids repeated np.finfo calls)
5
+ _EPS = np.finfo(np.float64).eps
6
+ _LOG_EPS = np.log(_EPS)