LZGraphs 2.1.2__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/PKG-INFO +39 -28
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/README.md +35 -26
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/pyproject.toml +5 -2
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/__init__.py +1 -1
- lzgraphs-2.2.0/src/LZGraphs/constants.py +6 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/amino_acid_positional.py +87 -113
- lzgraphs-2.2.0/src/LZGraphs/graphs/lz_graph_base.py +962 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/naive.py +6 -6
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/nucleotide_double_positional.py +38 -43
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/convenience.py +3 -4
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/saturation.py +56 -47
- lzgraphs-2.2.0/src/LZGraphs/mixins/__init__.py +8 -0
- lzgraphs-2.2.0/src/LZGraphs/mixins/bayesian_posterior.py +267 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/mixins/gene_logic.py +21 -13
- lzgraphs-2.2.0/src/LZGraphs/mixins/graph_topology.py +59 -0
- lzgraphs-2.2.0/src/LZGraphs/mixins/lzpgen_distribution.py +457 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/mixins/random_walk.py +2 -2
- lzgraphs-2.2.0/src/LZGraphs/mixins/serialization.py +615 -0
- lzgraphs-2.2.0/src/LZGraphs/mixins/walk_analysis.py +177 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/PKG-INFO +39 -28
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/SOURCES.txt +6 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/requires.txt +3 -1
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_base_class_methods.py +8 -8
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_diversity_theory.py +4 -4
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_graph_operations.py +3 -3
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_metrics.py +4 -4
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_new_features.py +3 -3
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_utilities.py +12 -14
- lzgraphs-2.1.2/src/LZGraphs/graphs/lz_graph_base.py +0 -2201
- lzgraphs-2.1.2/src/LZGraphs/mixins/__init__.py +0 -3
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/CHANGELOG.md +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/CONTRIBUTING.md +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/LICENSE +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/MANIFEST.in +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/requirements.txt +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/setup.cfg +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/bag_of_words/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/bag_of_words/bow_encoder.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/exceptions/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/edge_data.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/graph_operations.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/diversity.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/entropy.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/pgen_distribution.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/mixins/gene_prediction.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/py.typed +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/decomposition.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/helpers.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/misc.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/visualization/__init__.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/visualization/visualize.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/top_level.txt +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_aap_lzgraph.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_abundance.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_analytical_distribution.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_bow_encoder.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_flexible_input.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_lzpgen_distribution.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_naive_lzgraph.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_ndp_lzgraph.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_pgen_fixes.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_serialization.py +0 -0
- {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_simulate.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: LZGraphs
|
|
3
|
-
Version: 2.1.2
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
|
|
5
5
|
Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
6
6
|
Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
@@ -29,7 +29,6 @@ Description-Content-Type: text/markdown
|
|
|
29
29
|
License-File: LICENSE
|
|
30
30
|
Requires-Dist: networkx>=3.0
|
|
31
31
|
Requires-Dist: numpy>=1.24
|
|
32
|
-
Requires-Dist: pandas>=1.5
|
|
33
32
|
Requires-Dist: tqdm>=4.65
|
|
34
33
|
Requires-Dist: scipy>=1.10
|
|
35
34
|
Provides-Extra: viz
|
|
@@ -38,6 +37,7 @@ Requires-Dist: seaborn>=0.12; extra == "viz"
|
|
|
38
37
|
Provides-Extra: dev
|
|
39
38
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
40
39
|
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pandas>=1.5; extra == "dev"
|
|
41
41
|
Requires-Dist: black>=23.0; extra == "dev"
|
|
42
42
|
Requires-Dist: isort>=5.12; extra == "dev"
|
|
43
43
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
@@ -45,6 +45,8 @@ Requires-Dist: pre-commit>=3.0; extra == "dev"
|
|
|
45
45
|
Requires-Dist: build>=1.0; extra == "dev"
|
|
46
46
|
Requires-Dist: twine>=4.0; extra == "dev"
|
|
47
47
|
Provides-Extra: docs
|
|
48
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
49
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
|
|
48
50
|
Dynamic: license-file
|
|
49
51
|
|
|
50
52
|
<p align="center">
|
|
@@ -125,6 +127,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
|
|
|
125
127
|
- **Repertoire comparison** -- compare two repertoires via graph-level statistics
|
|
126
128
|
- **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
|
|
127
129
|
- **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
|
|
130
|
+
- **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
|
|
128
131
|
- **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
|
|
129
132
|
- **Serialization** -- save and load graphs in JSON format
|
|
130
133
|
|
|
@@ -150,23 +153,20 @@ print(LZGraphs.__version__)
|
|
|
150
153
|
Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
|
|
151
154
|
|
|
152
155
|
```python
|
|
153
|
-
import pandas as pd
|
|
154
156
|
from LZGraphs import AAPLZGraph
|
|
155
157
|
|
|
156
|
-
#
|
|
157
|
-
|
|
158
|
-
'
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
]
|
|
166
|
-
})
|
|
158
|
+
# Pass a plain list of CDR3 amino acid sequences
|
|
159
|
+
sequences = [
|
|
160
|
+
'CASSLAPGATNEKLFF',
|
|
161
|
+
'CASSLGQAYEQYF',
|
|
162
|
+
'CASSFSTCSANYGYTF',
|
|
163
|
+
'CASSQEGTEAFF',
|
|
164
|
+
'CASSLGQGNIQYF',
|
|
165
|
+
# ... your CDR3 amino acid sequences
|
|
166
|
+
]
|
|
167
167
|
|
|
168
168
|
# Construct the graph
|
|
169
|
-
graph = AAPLZGraph(
|
|
169
|
+
graph = AAPLZGraph(sequences, verbose=True)
|
|
170
170
|
|
|
171
171
|
# Compute the log-probability of a sequence under the model
|
|
172
172
|
log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
|
|
@@ -221,15 +221,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
|
|
|
221
221
|
|
|
222
222
|
### Gene Annotation
|
|
223
223
|
|
|
224
|
-
All three graph types support optional V and J gene annotation.
|
|
224
|
+
All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
|
|
225
225
|
|
|
226
226
|
```python
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
graph = AAPLZGraph(data, verbose=True)
|
|
227
|
+
sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
|
|
228
|
+
v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
|
|
229
|
+
j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
|
|
230
|
+
|
|
231
|
+
graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
|
|
233
232
|
|
|
234
233
|
# Gene data is now available
|
|
235
234
|
print(graph.has_gene_data) # True
|
|
@@ -248,16 +247,14 @@ This is particularly important for:
|
|
|
248
247
|
- **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
|
|
249
248
|
- **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
|
|
250
249
|
|
|
251
|
-
To use abundance weighting,
|
|
250
|
+
To use abundance weighting, pass an `abundances` list alongside your sequences:
|
|
252
251
|
|
|
253
252
|
```python
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
'abundance': [150, 42, 7],
|
|
257
|
-
})
|
|
253
|
+
sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
|
|
254
|
+
abundances = [150, 42, 7]
|
|
258
255
|
|
|
259
256
|
# Each sequence is weighted by its abundance during graph construction
|
|
260
|
-
graph = AAPLZGraph(
|
|
257
|
+
graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
|
|
261
258
|
```
|
|
262
259
|
|
|
263
260
|
For `NaiveLZGraph`, pass abundances as a separate parameter:
|
|
@@ -335,6 +332,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
|
|
|
335
332
|
comparison = compare_repertoires(graph1, graph2)
|
|
336
333
|
```
|
|
337
334
|
|
|
335
|
+
### Bayesian Posterior Personalization
|
|
336
|
+
|
|
337
|
+
```python
|
|
338
|
+
# Adapt a population graph to an individual
|
|
339
|
+
posterior = population_graph.get_posterior(
|
|
340
|
+
individual_sequences,
|
|
341
|
+
abundances=clonal_counts,
|
|
342
|
+
kappa=100.0 # prior strength
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
# The posterior is a full graph
|
|
346
|
+
simulated = posterior.simulate(1000, seed=42)
|
|
347
|
+
```
|
|
348
|
+
|
|
338
349
|
### Visualization
|
|
339
350
|
|
|
340
351
|
```python
|
|
@@ -76,6 +76,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
|
|
|
76
76
|
- **Repertoire comparison** -- compare two repertoires via graph-level statistics
|
|
77
77
|
- **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
|
|
78
78
|
- **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
|
|
79
|
+
- **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
|
|
79
80
|
- **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
|
|
80
81
|
- **Serialization** -- save and load graphs in JSON format
|
|
81
82
|
|
|
@@ -101,23 +102,20 @@ print(LZGraphs.__version__)
|
|
|
101
102
|
Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
|
|
102
103
|
|
|
103
104
|
```python
|
|
104
|
-
import pandas as pd
|
|
105
105
|
from LZGraphs import AAPLZGraph
|
|
106
106
|
|
|
107
|
-
#
|
|
108
|
-
|
|
109
|
-
'
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
]
|
|
117
|
-
})
|
|
107
|
+
# Pass a plain list of CDR3 amino acid sequences
|
|
108
|
+
sequences = [
|
|
109
|
+
'CASSLAPGATNEKLFF',
|
|
110
|
+
'CASSLGQAYEQYF',
|
|
111
|
+
'CASSFSTCSANYGYTF',
|
|
112
|
+
'CASSQEGTEAFF',
|
|
113
|
+
'CASSLGQGNIQYF',
|
|
114
|
+
# ... your CDR3 amino acid sequences
|
|
115
|
+
]
|
|
118
116
|
|
|
119
117
|
# Construct the graph
|
|
120
|
-
graph = AAPLZGraph(
|
|
118
|
+
graph = AAPLZGraph(sequences, verbose=True)
|
|
121
119
|
|
|
122
120
|
# Compute the log-probability of a sequence under the model
|
|
123
121
|
log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
|
|
@@ -172,15 +170,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
|
|
|
172
170
|
|
|
173
171
|
### Gene Annotation
|
|
174
172
|
|
|
175
|
-
All three graph types support optional V and J gene annotation.
|
|
173
|
+
All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
|
|
176
174
|
|
|
177
175
|
```python
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
graph = AAPLZGraph(data, verbose=True)
|
|
176
|
+
sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
|
|
177
|
+
v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
|
|
178
|
+
j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
|
|
179
|
+
|
|
180
|
+
graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
|
|
184
181
|
|
|
185
182
|
# Gene data is now available
|
|
186
183
|
print(graph.has_gene_data) # True
|
|
@@ -199,16 +196,14 @@ This is particularly important for:
|
|
|
199
196
|
- **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
|
|
200
197
|
- **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
|
|
201
198
|
|
|
202
|
-
To use abundance weighting,
|
|
199
|
+
To use abundance weighting, pass an `abundances` list alongside your sequences:
|
|
203
200
|
|
|
204
201
|
```python
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
'abundance': [150, 42, 7],
|
|
208
|
-
})
|
|
202
|
+
sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
|
|
203
|
+
abundances = [150, 42, 7]
|
|
209
204
|
|
|
210
205
|
# Each sequence is weighted by its abundance during graph construction
|
|
211
|
-
graph = AAPLZGraph(
|
|
206
|
+
graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
|
|
212
207
|
```
|
|
213
208
|
|
|
214
209
|
For `NaiveLZGraph`, pass abundances as a separate parameter:
|
|
@@ -286,6 +281,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
|
|
|
286
281
|
comparison = compare_repertoires(graph1, graph2)
|
|
287
282
|
```
|
|
288
283
|
|
|
284
|
+
### Bayesian Posterior Personalization
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
# Adapt a population graph to an individual
|
|
288
|
+
posterior = population_graph.get_posterior(
|
|
289
|
+
individual_sequences,
|
|
290
|
+
abundances=clonal_counts,
|
|
291
|
+
kappa=100.0 # prior strength
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
# The posterior is a full graph
|
|
295
|
+
simulated = posterior.simulate(1000, seed=42)
|
|
296
|
+
```
|
|
297
|
+
|
|
289
298
|
### Visualization
|
|
290
299
|
|
|
291
300
|
```python
|
|
@@ -45,7 +45,6 @@ classifiers = [
|
|
|
45
45
|
dependencies = [
|
|
46
46
|
"networkx>=3.0",
|
|
47
47
|
"numpy>=1.24",
|
|
48
|
-
"pandas>=1.5",
|
|
49
48
|
"tqdm>=4.65",
|
|
50
49
|
"scipy>=1.10",
|
|
51
50
|
]
|
|
@@ -58,6 +57,7 @@ viz = [
|
|
|
58
57
|
dev = [
|
|
59
58
|
"pytest>=7.0",
|
|
60
59
|
"pytest-cov>=4.0",
|
|
60
|
+
"pandas>=1.5",
|
|
61
61
|
"black>=23.0",
|
|
62
62
|
"isort>=5.12",
|
|
63
63
|
"ruff>=0.1.0",
|
|
@@ -65,7 +65,10 @@ dev = [
|
|
|
65
65
|
"build>=1.0",
|
|
66
66
|
"twine>=4.0",
|
|
67
67
|
]
|
|
68
|
-
docs = []
|
|
68
|
+
docs = [
|
|
69
|
+
"mkdocs-material>=9.5",
|
|
70
|
+
"mkdocstrings[python]>=0.24",
|
|
71
|
+
]
|
|
69
72
|
|
|
70
73
|
[project.urls]
|
|
71
74
|
Homepage = "https://github.com/MuteJester/LZGraphs"
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import random as _random
|
|
2
3
|
import time
|
|
4
|
+
from collections import defaultdict
|
|
3
5
|
from typing import List, Tuple, Union, Optional, Generator
|
|
4
6
|
|
|
5
7
|
import networkx as nx
|
|
6
8
|
import numpy as np
|
|
7
|
-
import pandas as pd
|
|
8
9
|
from tqdm.auto import tqdm
|
|
9
10
|
|
|
10
11
|
from .lz_graph_base import LZGraphBase
|
|
@@ -67,7 +68,7 @@ class AAPLZGraph(LZGraphBase):
|
|
|
67
68
|
|
|
68
69
|
def __init__(
|
|
69
70
|
self,
|
|
70
|
-
data
|
|
71
|
+
data,
|
|
71
72
|
*,
|
|
72
73
|
abundances: Optional[List[int]] = None,
|
|
73
74
|
v_genes: Optional[List[str]] = None,
|
|
@@ -81,17 +82,18 @@ class AAPLZGraph(LZGraphBase):
|
|
|
81
82
|
"""
|
|
82
83
|
Create an amino-acid-positional LZGraph.
|
|
83
84
|
|
|
84
|
-
*data* can be a
|
|
85
|
-
a plain list of amino-acid sequences, or
|
|
85
|
+
*data* can be a DataFrame-like object with a ``cdr3_amino_acid``
|
|
86
|
+
column, a plain list of amino-acid sequences, or any iterable with
|
|
87
|
+
a ``.tolist()`` method.
|
|
86
88
|
|
|
87
|
-
When *data* is a list
|
|
88
|
-
*
|
|
89
|
-
|
|
90
|
-
|
|
89
|
+
When *data* is a list the optional keyword arguments *abundances*,
|
|
90
|
+
*v_genes* and *j_genes* may be used to supply additional
|
|
91
|
+
per-sequence information. When *data* is a DataFrame these must be
|
|
92
|
+
``None`` — use DataFrame columns instead.
|
|
91
93
|
|
|
92
94
|
Args:
|
|
93
95
|
data: Sequence data. DataFrame (with ``cdr3_amino_acid`` column),
|
|
94
|
-
list of strings, or
|
|
96
|
+
list of strings, or any iterable of strings.
|
|
95
97
|
abundances: Per-sequence abundance counts (list input only).
|
|
96
98
|
v_genes: Per-sequence V gene annotations (list input only).
|
|
97
99
|
j_genes: Per-sequence J gene annotations (list input only).
|
|
@@ -112,7 +114,7 @@ class AAPLZGraph(LZGraphBase):
|
|
|
112
114
|
"""
|
|
113
115
|
super().__init__() # Initialize LZGraphBase
|
|
114
116
|
|
|
115
|
-
# Normalize flexible input →
|
|
117
|
+
# Normalize flexible input → dict-of-lists
|
|
116
118
|
data = self._normalize_input(
|
|
117
119
|
data, "cdr3_amino_acid",
|
|
118
120
|
abundances=abundances, v_genes=v_genes, j_genes=j_genes,
|
|
@@ -127,20 +129,16 @@ class AAPLZGraph(LZGraphBase):
|
|
|
127
129
|
self._validate_input(data, validate_sequences)
|
|
128
130
|
|
|
129
131
|
# Determine if we have gene data
|
|
130
|
-
self.has_gene_data = (
|
|
131
|
-
isinstance(data, pd.DataFrame) and
|
|
132
|
-
("V" in data.columns) and
|
|
133
|
-
("J" in data.columns)
|
|
134
|
-
)
|
|
132
|
+
self.has_gene_data = data.get('v_genes') is not None
|
|
135
133
|
|
|
136
134
|
# Load gene data if present
|
|
137
135
|
if self.has_gene_data:
|
|
138
136
|
self._load_gene_data(data)
|
|
139
|
-
self.
|
|
137
|
+
self._log_step("Gene information loaded.", verbose)
|
|
140
138
|
|
|
141
139
|
# Build the graph with a custom routine
|
|
142
140
|
self.__simultaneous_graph_construction(data)
|
|
143
|
-
self.
|
|
141
|
+
self._log_step("Graph constructed.", verbose)
|
|
144
142
|
|
|
145
143
|
# Normalize and derive probability dicts
|
|
146
144
|
self.length_counts = dict(self.lengths)
|
|
@@ -162,103 +160,87 @@ class AAPLZGraph(LZGraphBase):
|
|
|
162
160
|
if total_initial > 0 else {}
|
|
163
161
|
)
|
|
164
162
|
|
|
165
|
-
self.
|
|
163
|
+
self._log_step("Graph metadata derived.", verbose)
|
|
166
164
|
|
|
167
165
|
# Derive subpattern probabilities & normalize edges
|
|
168
166
|
self._derive_node_probability()
|
|
169
|
-
self.
|
|
167
|
+
self._log_step("Node probabilities derived.", verbose)
|
|
170
168
|
|
|
171
169
|
self._normalize_edge_weights()
|
|
172
|
-
self.
|
|
170
|
+
self._log_step("Edge weights normalized.", verbose)
|
|
173
171
|
|
|
174
172
|
# Additional map derivations
|
|
175
173
|
self._edges_cache = None
|
|
176
174
|
self._derive_stop_probability_data()
|
|
177
|
-
self.
|
|
175
|
+
self._log_step("Stop probabilities derived.", verbose)
|
|
178
176
|
|
|
179
177
|
# Optionally compute the PGEN for each sequence
|
|
180
178
|
if calculate_trainset_pgen:
|
|
181
179
|
logger.info("Calculating PGEN for the training set. This may take some time...")
|
|
182
180
|
self.train_pgen = np.array([
|
|
183
181
|
self.walk_probability(seq, verbose=False)
|
|
184
|
-
for seq in data[
|
|
182
|
+
for seq in data['sequences']
|
|
185
183
|
])
|
|
186
184
|
|
|
187
185
|
self.constructor_end_time = time.time()
|
|
188
|
-
self.
|
|
189
|
-
self.verbose_driver(-2, verbose)
|
|
186
|
+
self._log_step("LZGraph created successfully.", verbose)
|
|
190
187
|
|
|
191
188
|
# --------------------------------------------------------------------------
|
|
192
189
|
# Input Validation
|
|
193
190
|
# --------------------------------------------------------------------------
|
|
194
191
|
|
|
195
|
-
def _validate_input(self, data:
|
|
192
|
+
def _validate_input(self, data: dict, validate_sequences: bool) -> None:
|
|
196
193
|
"""
|
|
197
194
|
Validate input data before graph construction.
|
|
198
195
|
|
|
199
196
|
Args:
|
|
200
|
-
data:
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
Raises:
|
|
204
|
-
TypeError: If data is not a pandas DataFrame
|
|
205
|
-
ValueError: If required columns are missing or data is invalid
|
|
197
|
+
data: Normalised dict with key ``'sequences'`` (and optionally
|
|
198
|
+
``'v_genes'``, ``'j_genes'``, ``'abundances'``).
|
|
199
|
+
validate_sequences: Whether to check sequence content.
|
|
206
200
|
"""
|
|
207
|
-
|
|
208
|
-
if not isinstance(data, pd.DataFrame):
|
|
209
|
-
raise TypeError(
|
|
210
|
-
f"Expected pandas DataFrame, got {type(data).__name__}. "
|
|
211
|
-
"Please provide a DataFrame with a 'cdr3_amino_acid' column."
|
|
212
|
-
)
|
|
213
|
-
|
|
214
|
-
# Check for required column
|
|
215
|
-
if 'cdr3_amino_acid' not in data.columns:
|
|
216
|
-
raise MissingColumnError(
|
|
217
|
-
column_name='cdr3_amino_acid',
|
|
218
|
-
available_columns=list(data.columns)
|
|
219
|
-
)
|
|
201
|
+
sequences = data['sequences']
|
|
220
202
|
|
|
221
203
|
# Check for empty data
|
|
222
|
-
if len(
|
|
223
|
-
raise EmptyDataError("
|
|
204
|
+
if len(sequences) == 0:
|
|
205
|
+
raise EmptyDataError("No sequences provided. Cannot build LZGraph from zero sequences.")
|
|
224
206
|
|
|
225
|
-
# Check for null values
|
|
226
|
-
null_count =
|
|
207
|
+
# Check for null values
|
|
208
|
+
null_count = sum(1 for x in sequences if x is None)
|
|
227
209
|
if null_count > 0:
|
|
228
210
|
raise ValueError(
|
|
229
|
-
f"Found {null_count} null values in
|
|
211
|
+
f"Found {null_count} null values in sequences. "
|
|
230
212
|
"Please remove or fill null values before building the graph."
|
|
231
213
|
)
|
|
232
214
|
|
|
233
215
|
# Check for empty strings
|
|
234
|
-
empty_count = (
|
|
216
|
+
empty_count = sum(1 for x in sequences if not x)
|
|
235
217
|
if empty_count > 0:
|
|
236
218
|
raise ValueError(
|
|
237
|
-
f"Found {empty_count} empty strings in
|
|
219
|
+
f"Found {empty_count} empty strings in sequences. "
|
|
238
220
|
"Please remove empty sequences before building the graph."
|
|
239
221
|
)
|
|
240
222
|
|
|
241
223
|
# Validate sequence content if requested
|
|
242
224
|
if validate_sequences:
|
|
243
|
-
self._validate_sequence_content(
|
|
225
|
+
self._validate_sequence_content(sequences)
|
|
244
226
|
|
|
245
227
|
# Validate gene columns if present
|
|
246
|
-
if
|
|
228
|
+
if data.get('v_genes') is not None:
|
|
247
229
|
self._validate_gene_columns(data)
|
|
248
230
|
|
|
249
|
-
def _validate_sequence_content(self, sequences:
|
|
231
|
+
def _validate_sequence_content(self, sequences: list) -> None:
|
|
250
232
|
"""
|
|
251
233
|
Validate that sequences contain only valid amino acid characters.
|
|
252
234
|
|
|
253
235
|
Args:
|
|
254
|
-
sequences:
|
|
255
|
-
|
|
256
|
-
Raises:
|
|
257
|
-
ValueError: If invalid characters are found
|
|
236
|
+
sequences: List of amino acid sequences.
|
|
258
237
|
"""
|
|
259
238
|
# Sample up to 1000 sequences for validation (performance)
|
|
260
239
|
sample_size = min(1000, len(sequences))
|
|
261
|
-
|
|
240
|
+
if len(sequences) > sample_size:
|
|
241
|
+
sample = _random.Random(42).sample(sequences, k=sample_size)
|
|
242
|
+
else:
|
|
243
|
+
sample = sequences
|
|
262
244
|
|
|
263
245
|
invalid_chars_found = set()
|
|
264
246
|
invalid_sequences = []
|
|
@@ -287,19 +269,15 @@ class AAPLZGraph(LZGraphBase):
|
|
|
287
269
|
)
|
|
288
270
|
)
|
|
289
271
|
|
|
290
|
-
def _validate_gene_columns(self, data:
|
|
272
|
+
def _validate_gene_columns(self, data: dict) -> None:
|
|
291
273
|
"""
|
|
292
|
-
Validate V and J gene
|
|
274
|
+
Validate V and J gene lists.
|
|
293
275
|
|
|
294
276
|
Args:
|
|
295
|
-
data:
|
|
296
|
-
|
|
297
|
-
Raises:
|
|
298
|
-
ValueError: If gene columns contain invalid data
|
|
277
|
+
data: Dict with ``'v_genes'`` and ``'j_genes'`` lists.
|
|
299
278
|
"""
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
j_nulls = data['J'].isna().sum()
|
|
279
|
+
v_nulls = sum(1 for x in data['v_genes'] if x is None)
|
|
280
|
+
j_nulls = sum(1 for x in data['j_genes'] if x is None)
|
|
303
281
|
|
|
304
282
|
if v_nulls > 0 or j_nulls > 0:
|
|
305
283
|
raise ValueError(
|
|
@@ -328,34 +306,31 @@ class AAPLZGraph(LZGraphBase):
|
|
|
328
306
|
idx = base.rfind('_')
|
|
329
307
|
return base[:idx] if idx > 0 else base
|
|
330
308
|
|
|
331
|
-
def _decomposed_sequence_generator(
|
|
332
|
-
self,
|
|
333
|
-
data: Union[pd.DataFrame, pd.Series]
|
|
334
|
-
) -> Generator:
|
|
309
|
+
def _decomposed_sequence_generator(self, data: dict) -> Generator:
|
|
335
310
|
"""
|
|
336
311
|
A generator that yields the information needed to build the graph.
|
|
337
312
|
|
|
338
|
-
|
|
339
|
-
|
|
313
|
+
Args:
|
|
314
|
+
data: Normalised dict with ``'sequences'`` (and optionally
|
|
315
|
+
``'abundances'``, ``'v_genes'``, ``'j_genes'``).
|
|
340
316
|
|
|
341
317
|
Yields:
|
|
342
318
|
If genetic: (steps, locations, v, j, count)
|
|
343
319
|
Otherwise: (steps, locations, count)
|
|
344
320
|
"""
|
|
345
|
-
|
|
321
|
+
sequences = data['sequences']
|
|
322
|
+
abundances = data.get('abundances')
|
|
346
323
|
|
|
347
324
|
if self.has_gene_data:
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
count = int(abundance)
|
|
355
|
-
else:
|
|
356
|
-
cdr3, v, j = row
|
|
357
|
-
count = 1
|
|
325
|
+
v_genes = data['v_genes']
|
|
326
|
+
j_genes = data['j_genes']
|
|
327
|
+
if abundances is not None:
|
|
328
|
+
row_iter = zip(sequences, v_genes, j_genes, abundances)
|
|
329
|
+
else:
|
|
330
|
+
row_iter = ((s, v, j, 1) for s, v, j in zip(sequences, v_genes, j_genes))
|
|
358
331
|
|
|
332
|
+
for cdr3, v, j, abundance in tqdm(row_iter, desc="Building Graph", leave=False):
|
|
333
|
+
count = int(abundance)
|
|
359
334
|
lz, locs = derive_lz_and_position(cdr3)
|
|
360
335
|
steps = window(lz, 2)
|
|
361
336
|
locations = window(locs, 2)
|
|
@@ -366,12 +341,10 @@ class AAPLZGraph(LZGraphBase):
|
|
|
366
341
|
|
|
367
342
|
yield (steps, locations, v, j, count)
|
|
368
343
|
else:
|
|
369
|
-
if
|
|
370
|
-
seq_iter = zip(
|
|
371
|
-
elif isinstance(data, pd.DataFrame):
|
|
372
|
-
seq_iter = ((cdr3, 1) for cdr3 in data["cdr3_amino_acid"])
|
|
344
|
+
if abundances is not None:
|
|
345
|
+
seq_iter = zip(sequences, abundances)
|
|
373
346
|
else:
|
|
374
|
-
seq_iter = ((
|
|
347
|
+
seq_iter = ((s, 1) for s in sequences)
|
|
375
348
|
|
|
376
349
|
for cdr3, abundance in tqdm(seq_iter, desc="Building Graph", leave=False):
|
|
377
350
|
count = int(abundance)
|
|
@@ -385,7 +358,7 @@ class AAPLZGraph(LZGraphBase):
|
|
|
385
358
|
|
|
386
359
|
yield (steps, locations, count)
|
|
387
360
|
|
|
388
|
-
def __simultaneous_graph_construction(self, data:
|
|
361
|
+
def __simultaneous_graph_construction(self, data: dict) -> None:
|
|
389
362
|
"""
|
|
390
363
|
Custom simultaneous construction of the graph, mirroring the parent's
|
|
391
364
|
_simultaneous_graph_construction but applying our specialized decomposition.
|
|
@@ -582,50 +555,51 @@ class AAPLZGraph(LZGraphBase):
|
|
|
582
555
|
|
|
583
556
|
return results
|
|
584
557
|
|
|
585
|
-
def random_walk_distribution_based(self, length_distribution
|
|
558
|
+
def random_walk_distribution_based(self, length_distribution):
|
|
586
559
|
"""
|
|
587
560
|
Creates random walks in proportion to a given length distribution.
|
|
588
561
|
We do a large number of unsupervised walks, then sample from them
|
|
589
562
|
to match the specified distribution.
|
|
590
563
|
|
|
591
564
|
Args:
|
|
592
|
-
length_distribution: A
|
|
593
|
-
|
|
565
|
+
length_distribution: A dict (or Series-like) mapping sequence
|
|
566
|
+
lengths to the number of sequences desired at that length.
|
|
594
567
|
|
|
595
568
|
Returns:
|
|
596
|
-
A 2D array
|
|
569
|
+
A 2D numpy array of shape [N, 2], where each row is (Seq, Walk).
|
|
597
570
|
"""
|
|
598
|
-
|
|
599
|
-
|
|
571
|
+
# Accept both dict and Series-like objects
|
|
572
|
+
if hasattr(length_distribution, 'to_dict'):
|
|
573
|
+
length_distribution = length_distribution.to_dict()
|
|
600
574
|
|
|
601
|
-
|
|
602
|
-
|
|
575
|
+
total = sum(length_distribution.values())
|
|
576
|
+
N = int(total * 3)
|
|
577
|
+
|
|
578
|
+
# Generate random walks and group by length
|
|
579
|
+
by_length = defaultdict(list)
|
|
603
580
|
logger.info(f"Generating ~{N} random walks to filter by length distribution...")
|
|
604
581
|
for _ in tqdm(range(N), desc="Random Walk Distribution"):
|
|
605
582
|
rw, rseq = self.unsupervised_random_walk()
|
|
606
|
-
|
|
607
|
-
seqs.append(rseq)
|
|
608
|
-
|
|
609
|
-
df = pd.DataFrame({"Seqs": seqs, "Walks": walks})
|
|
610
|
-
df["L"] = df["Seqs"].str.len()
|
|
583
|
+
by_length[len(rseq)].append((rseq, rw))
|
|
611
584
|
|
|
585
|
+
rng = _random.Random(42)
|
|
612
586
|
samples = []
|
|
613
|
-
for length_val in length_distribution.
|
|
614
|
-
needed =
|
|
615
|
-
|
|
616
|
-
if len(
|
|
587
|
+
for length_val, needed in length_distribution.items():
|
|
588
|
+
needed = int(needed)
|
|
589
|
+
available = by_length.get(length_val, [])
|
|
590
|
+
if len(available) < needed:
|
|
617
591
|
logger.warning(
|
|
618
|
-
f"Requested {needed} sequences of length {length_val},
|
|
592
|
+
f"Requested {needed} sequences of length {length_val}, "
|
|
593
|
+
f"but only found {len(available)}."
|
|
619
594
|
)
|
|
620
|
-
needed = len(
|
|
595
|
+
needed = len(available)
|
|
621
596
|
if needed > 0:
|
|
622
|
-
samples.
|
|
597
|
+
samples.extend(rng.sample(available, k=needed))
|
|
623
598
|
|
|
624
599
|
if not samples:
|
|
625
600
|
return np.array([])
|
|
626
601
|
|
|
627
|
-
|
|
628
|
-
return final[["Seqs", "Walks"]].values
|
|
602
|
+
return np.array(samples, dtype=object)
|
|
629
603
|
|
|
630
604
|
def get_gene_graph(self, v: str, j: str) -> nx.DiGraph:
|
|
631
605
|
"""
|