LZGraphs 2.1.2__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/PKG-INFO +39 -28
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/README.md +35 -26
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/pyproject.toml +5 -2
- lzgraphs-2.3.0/setup.py +40 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/__init__.py +1 -1
- lzgraphs-2.3.0/src/LZGraphs/_fast_walk.c +321 -0
- lzgraphs-2.3.0/src/LZGraphs/constants.py +6 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/amino_acid_positional.py +87 -115
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/graph_operations.py +4 -13
- lzgraphs-2.3.0/src/LZGraphs/graphs/lz_graph_base.py +1066 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/naive.py +6 -6
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/nucleotide_double_positional.py +38 -45
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/convenience.py +3 -4
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/saturation.py +56 -47
- lzgraphs-2.3.0/src/LZGraphs/mixins/__init__.py +8 -0
- lzgraphs-2.3.0/src/LZGraphs/mixins/bayesian_posterior.py +267 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/mixins/gene_logic.py +40 -14
- lzgraphs-2.3.0/src/LZGraphs/mixins/graph_topology.py +59 -0
- lzgraphs-2.3.0/src/LZGraphs/mixins/lzpgen_distribution.py +457 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/mixins/random_walk.py +2 -2
- lzgraphs-2.3.0/src/LZGraphs/mixins/serialization.py +628 -0
- lzgraphs-2.3.0/src/LZGraphs/mixins/walk_analysis.py +177 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/PKG-INFO +39 -28
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/SOURCES.txt +8 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/requires.txt +3 -1
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_base_class_methods.py +8 -8
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_diversity_theory.py +4 -4
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_graph_operations.py +3 -3
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_lzpgen_distribution.py +6 -1
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_metrics.py +4 -4
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_new_features.py +3 -3
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_utilities.py +12 -14
- lzgraphs-2.1.2/src/LZGraphs/graphs/lz_graph_base.py +0 -2201
- lzgraphs-2.1.2/src/LZGraphs/mixins/__init__.py +0 -3
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/CHANGELOG.md +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/CONTRIBUTING.md +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/LICENSE +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/MANIFEST.in +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/requirements.txt +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/setup.cfg +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/bag_of_words/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/bag_of_words/bow_encoder.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/exceptions/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/graphs/edge_data.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/diversity.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/entropy.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/metrics/pgen_distribution.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/mixins/gene_prediction.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/py.typed +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/decomposition.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/helpers.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/utilities/misc.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/visualization/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs/visualization/visualize.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/src/LZGraphs.egg-info/top_level.txt +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_aap_lzgraph.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_abundance.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_analytical_distribution.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_bow_encoder.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_flexible_input.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_naive_lzgraph.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_ndp_lzgraph.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_pgen_fixes.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_serialization.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.3.0}/tests/test_simulate.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: LZGraphs
|
|
3
|
-
Version: 2.1.2
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
|
|
5
5
|
Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
6
6
|
Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
@@ -29,7 +29,6 @@ Description-Content-Type: text/markdown
|
|
|
29
29
|
License-File: LICENSE
|
|
30
30
|
Requires-Dist: networkx>=3.0
|
|
31
31
|
Requires-Dist: numpy>=1.24
|
|
32
|
-
Requires-Dist: pandas>=1.5
|
|
33
32
|
Requires-Dist: tqdm>=4.65
|
|
34
33
|
Requires-Dist: scipy>=1.10
|
|
35
34
|
Provides-Extra: viz
|
|
@@ -38,6 +37,7 @@ Requires-Dist: seaborn>=0.12; extra == "viz"
|
|
|
38
37
|
Provides-Extra: dev
|
|
39
38
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
40
39
|
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pandas>=1.5; extra == "dev"
|
|
41
41
|
Requires-Dist: black>=23.0; extra == "dev"
|
|
42
42
|
Requires-Dist: isort>=5.12; extra == "dev"
|
|
43
43
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
@@ -45,6 +45,8 @@ Requires-Dist: pre-commit>=3.0; extra == "dev"
|
|
|
45
45
|
Requires-Dist: build>=1.0; extra == "dev"
|
|
46
46
|
Requires-Dist: twine>=4.0; extra == "dev"
|
|
47
47
|
Provides-Extra: docs
|
|
48
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
49
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
|
|
48
50
|
Dynamic: license-file
|
|
49
51
|
|
|
50
52
|
<p align="center">
|
|
@@ -125,6 +127,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
|
|
|
125
127
|
- **Repertoire comparison** -- compare two repertoires via graph-level statistics
|
|
126
128
|
- **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
|
|
127
129
|
- **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
|
|
130
|
+
- **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
|
|
128
131
|
- **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
|
|
129
132
|
- **Serialization** -- save and load graphs in JSON format
|
|
130
133
|
|
|
@@ -150,23 +153,20 @@ print(LZGraphs.__version__)
|
|
|
150
153
|
Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
|
|
151
154
|
|
|
152
155
|
```python
|
|
153
|
-
import pandas as pd
|
|
154
156
|
from LZGraphs import AAPLZGraph
|
|
155
157
|
|
|
156
|
-
#
|
|
157
|
-
|
|
158
|
-
'
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
]
|
|
166
|
-
})
|
|
158
|
+
# Pass a plain list of CDR3 amino acid sequences
|
|
159
|
+
sequences = [
|
|
160
|
+
'CASSLAPGATNEKLFF',
|
|
161
|
+
'CASSLGQAYEQYF',
|
|
162
|
+
'CASSFSTCSANYGYTF',
|
|
163
|
+
'CASSQEGTEAFF',
|
|
164
|
+
'CASSLGQGNIQYF',
|
|
165
|
+
# ... your CDR3 amino acid sequences
|
|
166
|
+
]
|
|
167
167
|
|
|
168
168
|
# Construct the graph
|
|
169
|
-
graph = AAPLZGraph(
|
|
169
|
+
graph = AAPLZGraph(sequences, verbose=True)
|
|
170
170
|
|
|
171
171
|
# Compute the log-probability of a sequence under the model
|
|
172
172
|
log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
|
|
@@ -221,15 +221,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
|
|
|
221
221
|
|
|
222
222
|
### Gene Annotation
|
|
223
223
|
|
|
224
|
-
All three graph types support optional V and J gene annotation.
|
|
224
|
+
All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
|
|
225
225
|
|
|
226
226
|
```python
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
graph = AAPLZGraph(data, verbose=True)
|
|
227
|
+
sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
|
|
228
|
+
v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
|
|
229
|
+
j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
|
|
230
|
+
|
|
231
|
+
graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
|
|
233
232
|
|
|
234
233
|
# Gene data is now available
|
|
235
234
|
print(graph.has_gene_data) # True
|
|
@@ -248,16 +247,14 @@ This is particularly important for:
|
|
|
248
247
|
- **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
|
|
249
248
|
- **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
|
|
250
249
|
|
|
251
|
-
To use abundance weighting,
|
|
250
|
+
To use abundance weighting, pass an `abundances` list alongside your sequences:
|
|
252
251
|
|
|
253
252
|
```python
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
'abundance': [150, 42, 7],
|
|
257
|
-
})
|
|
253
|
+
sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
|
|
254
|
+
abundances = [150, 42, 7]
|
|
258
255
|
|
|
259
256
|
# Each sequence is weighted by its abundance during graph construction
|
|
260
|
-
graph = AAPLZGraph(
|
|
257
|
+
graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
|
|
261
258
|
```
|
|
262
259
|
|
|
263
260
|
For `NaiveLZGraph`, pass abundances as a separate parameter:
|
|
@@ -335,6 +332,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
|
|
|
335
332
|
comparison = compare_repertoires(graph1, graph2)
|
|
336
333
|
```
|
|
337
334
|
|
|
335
|
+
### Bayesian Posterior Personalization
|
|
336
|
+
|
|
337
|
+
```python
|
|
338
|
+
# Adapt a population graph to an individual
|
|
339
|
+
posterior = population_graph.get_posterior(
|
|
340
|
+
individual_sequences,
|
|
341
|
+
abundances=clonal_counts,
|
|
342
|
+
kappa=100.0 # prior strength
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
# The posterior is a full graph
|
|
346
|
+
simulated = posterior.simulate(1000, seed=42)
|
|
347
|
+
```
|
|
348
|
+
|
|
338
349
|
### Visualization
|
|
339
350
|
|
|
340
351
|
```python
|
|
@@ -76,6 +76,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
|
|
|
76
76
|
- **Repertoire comparison** -- compare two repertoires via graph-level statistics
|
|
77
77
|
- **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
|
|
78
78
|
- **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
|
|
79
|
+
- **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
|
|
79
80
|
- **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
|
|
80
81
|
- **Serialization** -- save and load graphs in JSON format
|
|
81
82
|
|
|
@@ -101,23 +102,20 @@ print(LZGraphs.__version__)
|
|
|
101
102
|
Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
|
|
102
103
|
|
|
103
104
|
```python
|
|
104
|
-
import pandas as pd
|
|
105
105
|
from LZGraphs import AAPLZGraph
|
|
106
106
|
|
|
107
|
-
#
|
|
108
|
-
|
|
109
|
-
'
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
]
|
|
117
|
-
})
|
|
107
|
+
# Pass a plain list of CDR3 amino acid sequences
|
|
108
|
+
sequences = [
|
|
109
|
+
'CASSLAPGATNEKLFF',
|
|
110
|
+
'CASSLGQAYEQYF',
|
|
111
|
+
'CASSFSTCSANYGYTF',
|
|
112
|
+
'CASSQEGTEAFF',
|
|
113
|
+
'CASSLGQGNIQYF',
|
|
114
|
+
# ... your CDR3 amino acid sequences
|
|
115
|
+
]
|
|
118
116
|
|
|
119
117
|
# Construct the graph
|
|
120
|
-
graph = AAPLZGraph(
|
|
118
|
+
graph = AAPLZGraph(sequences, verbose=True)
|
|
121
119
|
|
|
122
120
|
# Compute the log-probability of a sequence under the model
|
|
123
121
|
log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
|
|
@@ -172,15 +170,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
|
|
|
172
170
|
|
|
173
171
|
### Gene Annotation
|
|
174
172
|
|
|
175
|
-
All three graph types support optional V and J gene annotation.
|
|
173
|
+
All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
|
|
176
174
|
|
|
177
175
|
```python
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
graph = AAPLZGraph(data, verbose=True)
|
|
176
|
+
sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
|
|
177
|
+
v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
|
|
178
|
+
j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
|
|
179
|
+
|
|
180
|
+
graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
|
|
184
181
|
|
|
185
182
|
# Gene data is now available
|
|
186
183
|
print(graph.has_gene_data) # True
|
|
@@ -199,16 +196,14 @@ This is particularly important for:
|
|
|
199
196
|
- **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
|
|
200
197
|
- **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
|
|
201
198
|
|
|
202
|
-
To use abundance weighting,
|
|
199
|
+
To use abundance weighting, pass an `abundances` list alongside your sequences:
|
|
203
200
|
|
|
204
201
|
```python
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
'abundance': [150, 42, 7],
|
|
208
|
-
})
|
|
202
|
+
sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
|
|
203
|
+
abundances = [150, 42, 7]
|
|
209
204
|
|
|
210
205
|
# Each sequence is weighted by its abundance during graph construction
|
|
211
|
-
graph = AAPLZGraph(
|
|
206
|
+
graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
|
|
212
207
|
```
|
|
213
208
|
|
|
214
209
|
For `NaiveLZGraph`, pass abundances as a separate parameter:
|
|
@@ -286,6 +281,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
|
|
|
286
281
|
comparison = compare_repertoires(graph1, graph2)
|
|
287
282
|
```
|
|
288
283
|
|
|
284
|
+
### Bayesian Posterior Personalization
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
# Adapt a population graph to an individual
|
|
288
|
+
posterior = population_graph.get_posterior(
|
|
289
|
+
individual_sequences,
|
|
290
|
+
abundances=clonal_counts,
|
|
291
|
+
kappa=100.0 # prior strength
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
# The posterior is a full graph
|
|
295
|
+
simulated = posterior.simulate(1000, seed=42)
|
|
296
|
+
```
|
|
297
|
+
|
|
289
298
|
### Visualization
|
|
290
299
|
|
|
291
300
|
```python
|
|
@@ -45,7 +45,6 @@ classifiers = [
|
|
|
45
45
|
dependencies = [
|
|
46
46
|
"networkx>=3.0",
|
|
47
47
|
"numpy>=1.24",
|
|
48
|
-
"pandas>=1.5",
|
|
49
48
|
"tqdm>=4.65",
|
|
50
49
|
"scipy>=1.10",
|
|
51
50
|
]
|
|
@@ -58,6 +57,7 @@ viz = [
|
|
|
58
57
|
dev = [
|
|
59
58
|
"pytest>=7.0",
|
|
60
59
|
"pytest-cov>=4.0",
|
|
60
|
+
"pandas>=1.5",
|
|
61
61
|
"black>=23.0",
|
|
62
62
|
"isort>=5.12",
|
|
63
63
|
"ruff>=0.1.0",
|
|
@@ -65,7 +65,10 @@ dev = [
|
|
|
65
65
|
"build>=1.0",
|
|
66
66
|
"twine>=4.0",
|
|
67
67
|
]
|
|
68
|
-
docs = [
|
|
68
|
+
docs = [
|
|
69
|
+
"mkdocs-material>=9.5",
|
|
70
|
+
"mkdocstrings[python]>=0.24",
|
|
71
|
+
]
|
|
69
72
|
|
|
70
73
|
[project.urls]
|
|
71
74
|
Homepage = "https://github.com/MuteJester/LZGraphs"
|
lzgraphs-2.3.0/setup.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Build script for optional C extensions.
|
|
3
|
+
|
|
4
|
+
The _fast_walk extension accelerates LZGraph.simulate() by ~50-100x.
|
|
5
|
+
If compilation fails (no C compiler), the package still installs and
|
|
6
|
+
falls back to the pure-Python implementation automatically.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
from setuptools import setup, Extension
|
|
12
|
+
|
|
13
|
+
# Ensure setuptools can resolve the dynamic version (attr = "LZGraphs.__version__")
|
|
14
|
+
# when running in an isolated build environment where src/ isn't on sys.path.
|
|
15
|
+
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))
|
|
16
|
+
|
|
17
|
+
ext_modules = [
    Extension(
        "LZGraphs._fast_walk",
        sources=[os.path.join("src", "LZGraphs", "_fast_walk.c")],
        # No external library dependencies — pure C + Python.h
        #
        # optional=True is what actually makes the extension optional:
        # build_ext then reports a failed compile as a warning and skips the
        # extension instead of aborting the build.  Without it, setup()
        # converts compiler errors into SystemExit, which the
        # ``except Exception`` guard below cannot intercept.
        optional=True,
    ),
]


def run_setup(extensions):
    """Run ``setuptools.setup`` with the given list of extension modules.

    All other project metadata is supplied outside this script; only
    ``ext_modules`` is passed here.
    """
    setup(ext_modules=extensions)


try:
    run_setup(ext_modules)
except Exception:
    # Last-resort guard for errors that escape setup() as ordinary
    # exceptions: retry once with no extensions so the pure-Python package
    # still installs.
    print(
        "\n"
        "WARNING: Failed to compile C extension _fast_walk.\n"
        "         LZGraphs will use the pure-Python fallback for simulate().\n"
        "         This is fine — the package works without it, just slower.\n"
        "\n"
    )
    run_setup([])
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* _fast_walk.c — CPython C extension for fast Markov chain random walks.
|
|
3
|
+
*
|
|
4
|
+
* Implements the full simulate() loop in C including string assembly,
|
|
5
|
+
* for ~100-200x speedup over the original pure-Python implementation.
|
|
6
|
+
* Uses xoshiro256++ for fast, high-quality RNG.
|
|
7
|
+
*
|
|
8
|
+
* The extension is optional: if it fails to compile (no C compiler),
|
|
9
|
+
* LZGraphs falls back to the pure-Python bisect-based implementation.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
#define PY_SSIZE_T_CLEAN
|
|
13
|
+
#include <Python.h>
|
|
14
|
+
#include <stdint.h>
|
|
15
|
+
#include <string.h>
|
|
16
|
+
|
|
17
|
+
/* ========================================================================
|
|
18
|
+
* xoshiro256++ RNG — public domain by David Blackman and Sebastiano Vigna
|
|
19
|
+
* ======================================================================== */
|
|
20
|
+
|
|
21
|
+
/* Rotate the 64-bit value x left by k bits.
 * The callers visible in this file pass k = 23 and k = 45. */
static inline uint64_t rotl(const uint64_t x, int k) {
    const uint64_t upper = x << k;
    const uint64_t wrapped = x >> (64 - k);
    return upper | wrapped;
}
|
|
24
|
+
|
|
25
|
+
/* Generator state for xoshiro256++: four 64-bit words. */
typedef struct xoshiro256_state {
    uint64_t s[4];
} xoshiro256_state;
|
|
28
|
+
|
|
29
|
+
/* Advance the generator by one step and return the next 64-bit output.
 * The output is computed from the state *before* it is updated. */
static inline uint64_t xoshiro256pp_next(xoshiro256_state *state) {
    uint64_t *const s = state->s;
    const uint64_t output = rotl(s[0] + s[3], 23) + s[0];
    const uint64_t shifted = s[1] << 17;

    s[2] ^= s[0];
    s[3] ^= s[1];
    s[1] ^= s[2];
    s[0] ^= s[3];

    s[2] ^= shifted;
    s[3] = rotl(s[3], 45);

    return output;
}
|
|
40
|
+
|
|
41
|
+
/* Draw a double in [0, 1): keep the top 53 bits of the next 64-bit output
 * and scale them by 2^-53. */
static inline double xoshiro256pp_double(xoshiro256_state *state) {
    const uint64_t top53 = xoshiro256pp_next(state) >> 11;
    return (double)top53 * 0x1.0p-53;
}
|
|
44
|
+
|
|
45
|
+
/* One splitmix64 step: advance the counter *x in place and return a mixed
 * 64-bit value derived from the new counter. */
static inline uint64_t splitmix64(uint64_t *x) {
    *x += 0x9e3779b97f4a7c15ULL;

    uint64_t mixed = *x;
    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9ULL;
    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111ebULL;
    mixed ^= mixed >> 31;
    return mixed;
}
|
|
51
|
+
|
|
52
|
+
/* Fill all four state words from successive splitmix64 outputs, starting
 * from the caller-supplied seed (the local copy of seed is advanced). */
static void seed_xoshiro256(xoshiro256_state *state, uint64_t seed) {
    for (int word = 0; word < 4; word++) {
        state->s[word] = splitmix64(&seed);
    }
}
|
|
58
|
+
|
|
59
|
+
/* ========================================================================
|
|
60
|
+
* Binary search (bisect_left) on a double array
|
|
61
|
+
* ======================================================================== */
|
|
62
|
+
|
|
63
|
+
/* Return the first index i in [0, n] such that arr[i] >= value, or n when
 * every element is smaller.  Returns 0 when n <= 0.
 * Presumably arr is sorted ascending (callers pass cumulative weights). */
static inline Py_ssize_t bisect_left_double(
    const double *arr, Py_ssize_t n, double value
) {
    Py_ssize_t first = 0;
    Py_ssize_t remaining = n;

    while (remaining > 0) {
        const Py_ssize_t half = remaining / 2;
        if (arr[first + half] < value) {
            /* Answer lies strictly to the right of the probe. */
            first += half + 1;
            remaining -= half + 1;
        } else {
            /* Probe is a candidate; keep searching the left part. */
            remaining = half;
        }
    }
    return first;
}
|
|
76
|
+
|
|
77
|
+
/* ========================================================================
|
|
78
|
+
* simulate_walks — full simulation with string assembly in C
|
|
79
|
+
*
|
|
80
|
+
* Args:
|
|
81
|
+
* n_walks : int
|
|
82
|
+
* offsets : intp array [n_nodes+1] (buffer)
|
|
83
|
+
* neighbors : intp array [total_edges] (buffer)
|
|
84
|
+
* cumweights : float64 array [total_edges] (buffer)
|
|
85
|
+
* stop_probs : float64 array [n_nodes] (buffer)
|
|
86
|
+
* initial_ids : intp array [n_initial] (buffer)
|
|
87
|
+
* initial_cw : float64 array [n_initial] (buffer)
|
|
88
|
+
* seed : uint64
|
|
89
|
+
* clean_labels : list[str] — label for each node ID
|
|
90
|
+
* return_walks : bool — if True, return (walk, seq) tuples
|
|
91
|
+
* id_to_node : list[str] — node names (only used if return_walks)
|
|
92
|
+
*
|
|
93
|
+
* Returns:
|
|
94
|
+
* list[str] or list[tuple[list[str], str]]
|
|
95
|
+
* ======================================================================== */
|
|
96
|
+
|
|
97
|
+
static PyObject* py_simulate_walks(PyObject *self, PyObject *args) {
|
|
98
|
+
int n_walks, return_walks;
|
|
99
|
+
Py_buffer offsets_buf, neighbors_buf, cumweights_buf;
|
|
100
|
+
Py_buffer stop_probs_buf, initial_ids_buf, initial_cw_buf;
|
|
101
|
+
unsigned long long seed;
|
|
102
|
+
PyObject *clean_labels; /* Python list of str */
|
|
103
|
+
PyObject *id_to_node; /* Python list of str */
|
|
104
|
+
PyObject *result_list = NULL;
|
|
105
|
+
|
|
106
|
+
if (!PyArg_ParseTuple(args, "iy*y*y*y*y*y*KOpO",
|
|
107
|
+
&n_walks,
|
|
108
|
+
&offsets_buf, &neighbors_buf, &cumweights_buf,
|
|
109
|
+
&stop_probs_buf, &initial_ids_buf, &initial_cw_buf,
|
|
110
|
+
&seed,
|
|
111
|
+
&clean_labels,
|
|
112
|
+
&return_walks,
|
|
113
|
+
&id_to_node))
|
|
114
|
+
return NULL;
|
|
115
|
+
|
|
116
|
+
const Py_ssize_t *offsets = (const Py_ssize_t *)offsets_buf.buf;
|
|
117
|
+
const Py_ssize_t *neighbors = (const Py_ssize_t *)neighbors_buf.buf;
|
|
118
|
+
const double *cumweights = (const double *)cumweights_buf.buf;
|
|
119
|
+
const double *stop_probs = (const double *)stop_probs_buf.buf;
|
|
120
|
+
const Py_ssize_t *initial_ids = (const Py_ssize_t *)initial_ids_buf.buf;
|
|
121
|
+
const double *initial_cw = (const double *)initial_cw_buf.buf;
|
|
122
|
+
const Py_ssize_t n_initial = initial_cw_buf.len / (Py_ssize_t)sizeof(double);
|
|
123
|
+
|
|
124
|
+
if (n_initial <= 0) {
|
|
125
|
+
PyErr_SetString(PyExc_ValueError,
|
|
126
|
+
"Cannot simulate: graph has no initial states.");
|
|
127
|
+
goto cleanup;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
/* Pre-fetch label UTF-8 data for fast string assembly */
|
|
131
|
+
const Py_ssize_t n_labels = PyList_GET_SIZE(clean_labels);
|
|
132
|
+
const char **label_ptrs = (const char **)PyMem_Malloc(n_labels * sizeof(char *));
|
|
133
|
+
Py_ssize_t *label_lens = (Py_ssize_t *)PyMem_Malloc(n_labels * sizeof(Py_ssize_t));
|
|
134
|
+
if (!label_ptrs || !label_lens) {
|
|
135
|
+
PyMem_Free(label_ptrs);
|
|
136
|
+
PyMem_Free(label_lens);
|
|
137
|
+
PyErr_NoMemory();
|
|
138
|
+
goto cleanup;
|
|
139
|
+
}
|
|
140
|
+
for (Py_ssize_t i = 0; i < n_labels; i++) {
|
|
141
|
+
PyObject *s = PyList_GET_ITEM(clean_labels, i);
|
|
142
|
+
label_ptrs[i] = PyUnicode_AsUTF8AndSize(s, &label_lens[i]);
|
|
143
|
+
if (!label_ptrs[i]) {
|
|
144
|
+
PyMem_Free(label_ptrs);
|
|
145
|
+
PyMem_Free(label_lens);
|
|
146
|
+
goto cleanup;
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
xoshiro256_state rng;
|
|
151
|
+
seed_xoshiro256(&rng, (uint64_t)seed);
|
|
152
|
+
|
|
153
|
+
result_list = PyList_New(n_walks);
|
|
154
|
+
if (!result_list) {
|
|
155
|
+
PyMem_Free(label_ptrs);
|
|
156
|
+
PyMem_Free(label_lens);
|
|
157
|
+
goto cleanup;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/* Reusable walk buffer */
|
|
161
|
+
Py_ssize_t walk_cap = 64;
|
|
162
|
+
Py_ssize_t *walk_buf = (Py_ssize_t *)PyMem_Malloc(walk_cap * sizeof(Py_ssize_t));
|
|
163
|
+
/* Reusable string buffer */
|
|
164
|
+
Py_ssize_t str_cap = 256;
|
|
165
|
+
char *str_buf = (char *)PyMem_Malloc(str_cap);
|
|
166
|
+
if (!walk_buf || !str_buf) {
|
|
167
|
+
PyMem_Free(walk_buf);
|
|
168
|
+
PyMem_Free(str_buf);
|
|
169
|
+
PyMem_Free(label_ptrs);
|
|
170
|
+
PyMem_Free(label_lens);
|
|
171
|
+
Py_DECREF(result_list);
|
|
172
|
+
PyErr_NoMemory();
|
|
173
|
+
goto cleanup;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
for (int i = 0; i < n_walks; i++) {
|
|
177
|
+
/* Pick initial state */
|
|
178
|
+
double r = xoshiro256pp_double(&rng);
|
|
179
|
+
Py_ssize_t init_idx = bisect_left_double(initial_cw, n_initial, r);
|
|
180
|
+
if (init_idx >= n_initial) init_idx = n_initial - 1;
|
|
181
|
+
Py_ssize_t current = initial_ids[init_idx];
|
|
182
|
+
|
|
183
|
+
Py_ssize_t walk_len = 0;
|
|
184
|
+
walk_buf[walk_len++] = current;
|
|
185
|
+
|
|
186
|
+
/* Build string incrementally */
|
|
187
|
+
Py_ssize_t str_len = 0;
|
|
188
|
+
Py_ssize_t llen = label_lens[current];
|
|
189
|
+
if (str_len + llen > str_cap) {
|
|
190
|
+
str_cap = (str_len + llen) * 2;
|
|
191
|
+
str_buf = (char *)PyMem_Realloc(str_buf, str_cap);
|
|
192
|
+
if (!str_buf) goto oom;
|
|
193
|
+
}
|
|
194
|
+
memcpy(str_buf + str_len, label_ptrs[current], llen);
|
|
195
|
+
str_len += llen;
|
|
196
|
+
|
|
197
|
+
while (1) {
|
|
198
|
+
double sp = stop_probs[current];
|
|
199
|
+
if (sp == sp) {
|
|
200
|
+
if (xoshiro256pp_double(&rng) < sp)
|
|
201
|
+
break;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
Py_ssize_t start = offsets[current];
|
|
205
|
+
Py_ssize_t end = offsets[current + 1];
|
|
206
|
+
if (start == end)
|
|
207
|
+
break;
|
|
208
|
+
|
|
209
|
+
r = xoshiro256pp_double(&rng);
|
|
210
|
+
Py_ssize_t idx = bisect_left_double(cumweights + start, end - start, r);
|
|
211
|
+
if (idx >= end - start) idx = end - start - 1;
|
|
212
|
+
current = neighbors[start + idx];
|
|
213
|
+
|
|
214
|
+
/* Grow walk buffer if needed */
|
|
215
|
+
if (walk_len >= walk_cap) {
|
|
216
|
+
walk_cap *= 2;
|
|
217
|
+
Py_ssize_t *new_buf = (Py_ssize_t *)PyMem_Realloc(walk_buf, walk_cap * sizeof(Py_ssize_t));
|
|
218
|
+
if (!new_buf) goto oom;
|
|
219
|
+
walk_buf = new_buf;
|
|
220
|
+
}
|
|
221
|
+
walk_buf[walk_len++] = current;
|
|
222
|
+
|
|
223
|
+
/* Append label to string buffer */
|
|
224
|
+
llen = label_lens[current];
|
|
225
|
+
if (str_len + llen > str_cap) {
|
|
226
|
+
str_cap = (str_len + llen) * 2;
|
|
227
|
+
char *new_str = (char *)PyMem_Realloc(str_buf, str_cap);
|
|
228
|
+
if (!new_str) goto oom;
|
|
229
|
+
str_buf = new_str;
|
|
230
|
+
}
|
|
231
|
+
memcpy(str_buf + str_len, label_ptrs[current], llen);
|
|
232
|
+
str_len += llen;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
/* Create Python string from buffer */
|
|
236
|
+
PyObject *seq = PyUnicode_FromStringAndSize(str_buf, str_len);
|
|
237
|
+
if (!seq) goto oom;
|
|
238
|
+
|
|
239
|
+
if (return_walks) {
|
|
240
|
+
/* Build walk list of node name strings */
|
|
241
|
+
PyObject *walk = PyList_New(walk_len);
|
|
242
|
+
if (!walk) { Py_DECREF(seq); goto oom; }
|
|
243
|
+
for (Py_ssize_t j = 0; j < walk_len; j++) {
|
|
244
|
+
PyObject *node_name = PyList_GET_ITEM(id_to_node, walk_buf[j]);
|
|
245
|
+
Py_INCREF(node_name);
|
|
246
|
+
PyList_SET_ITEM(walk, j, node_name);
|
|
247
|
+
}
|
|
248
|
+
PyObject *tup = PyTuple_Pack(2, walk, seq);
|
|
249
|
+
Py_DECREF(walk);
|
|
250
|
+
Py_DECREF(seq);
|
|
251
|
+
if (!tup) goto oom;
|
|
252
|
+
PyList_SET_ITEM(result_list, i, tup);
|
|
253
|
+
} else {
|
|
254
|
+
PyList_SET_ITEM(result_list, i, seq);
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
PyMem_Free(walk_buf);
|
|
259
|
+
PyMem_Free(str_buf);
|
|
260
|
+
PyMem_Free(label_ptrs);
|
|
261
|
+
PyMem_Free(label_lens);
|
|
262
|
+
goto cleanup;
|
|
263
|
+
|
|
264
|
+
oom:
|
|
265
|
+
PyMem_Free(walk_buf);
|
|
266
|
+
PyMem_Free(str_buf);
|
|
267
|
+
PyMem_Free(label_ptrs);
|
|
268
|
+
PyMem_Free(label_lens);
|
|
269
|
+
Py_XDECREF(result_list);
|
|
270
|
+
result_list = NULL;
|
|
271
|
+
if (!PyErr_Occurred())
|
|
272
|
+
PyErr_NoMemory();
|
|
273
|
+
|
|
274
|
+
cleanup:
|
|
275
|
+
PyBuffer_Release(&offsets_buf);
|
|
276
|
+
PyBuffer_Release(&neighbors_buf);
|
|
277
|
+
PyBuffer_Release(&cumweights_buf);
|
|
278
|
+
PyBuffer_Release(&stop_probs_buf);
|
|
279
|
+
PyBuffer_Release(&initial_ids_buf);
|
|
280
|
+
PyBuffer_Release(&initial_cw_buf);
|
|
281
|
+
|
|
282
|
+
return result_list;
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
/* ========================================================================
|
|
286
|
+
* Module definition
|
|
287
|
+
* ======================================================================== */
|
|
288
|
+
|
|
289
|
+
static PyMethodDef FastWalkMethods[] = {
|
|
290
|
+
{"simulate_walks", py_simulate_walks, METH_VARARGS,
|
|
291
|
+
"Run n random walks on a CSR-encoded graph with string assembly.\n\n"
|
|
292
|
+
"Args:\n"
|
|
293
|
+
" n_walks (int): Number of walks.\n"
|
|
294
|
+
" offsets (array): CSR row offsets [n_nodes+1], dtype=intp.\n"
|
|
295
|
+
" neighbors (array): Flat neighbor IDs, dtype=intp.\n"
|
|
296
|
+
" cumweights (array): Flat cumulative weights, dtype=float64.\n"
|
|
297
|
+
" stop_probs (array): Per-node stop probability (NaN=none), dtype=float64.\n"
|
|
298
|
+
" initial_ids (array): Initial state IDs, dtype=intp.\n"
|
|
299
|
+
" initial_cumprobs (array): Cumulative initial probs, dtype=float64.\n"
|
|
300
|
+
" seed (int): RNG seed (xoshiro256++).\n"
|
|
301
|
+
" clean_labels (list[str]): Subpattern label for each node.\n"
|
|
302
|
+
" return_walks (bool): If True, return (walk, seq) tuples.\n"
|
|
303
|
+
" id_to_node (list[str]): Node names for walk output.\n\n"
|
|
304
|
+
"Returns:\n"
|
|
305
|
+
" list[str] or list[tuple[list[str], str]]\n"},
|
|
306
|
+
{NULL, NULL, 0, NULL}
|
|
307
|
+
};
|
|
308
|
+
|
|
309
|
+
static struct PyModuleDef fast_walk_module = {
|
|
310
|
+
PyModuleDef_HEAD_INIT,
|
|
311
|
+
"_fast_walk",
|
|
312
|
+
"C-accelerated random walk simulation for LZGraphs.\n"
|
|
313
|
+
"Uses xoshiro256++ RNG for high-quality, fast random number generation.\n"
|
|
314
|
+
"This module is optional — LZGraphs falls back to pure Python if unavailable.",
|
|
315
|
+
-1,
|
|
316
|
+
FastWalkMethods
|
|
317
|
+
};
|
|
318
|
+
|
|
319
|
+
PyMODINIT_FUNC PyInit__fast_walk(void) {
|
|
320
|
+
return PyModule_Create(&fast_walk_module);
|
|
321
|
+
}
|