LZGraphs 2.1.2__tar.gz → 2.2.0__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (67)
  1. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/PKG-INFO +39 -28
  2. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/README.md +35 -26
  3. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/pyproject.toml +5 -2
  4. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/__init__.py +1 -1
  5. lzgraphs-2.2.0/src/LZGraphs/constants.py +6 -0
  6. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/amino_acid_positional.py +87 -113
  7. lzgraphs-2.2.0/src/LZGraphs/graphs/lz_graph_base.py +962 -0
  8. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/naive.py +6 -6
  9. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/nucleotide_double_positional.py +38 -43
  10. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/convenience.py +3 -4
  11. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/saturation.py +56 -47
  12. lzgraphs-2.2.0/src/LZGraphs/mixins/__init__.py +8 -0
  13. lzgraphs-2.2.0/src/LZGraphs/mixins/bayesian_posterior.py +267 -0
  14. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/mixins/gene_logic.py +21 -13
  15. lzgraphs-2.2.0/src/LZGraphs/mixins/graph_topology.py +59 -0
  16. lzgraphs-2.2.0/src/LZGraphs/mixins/lzpgen_distribution.py +457 -0
  17. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/mixins/random_walk.py +2 -2
  18. lzgraphs-2.2.0/src/LZGraphs/mixins/serialization.py +615 -0
  19. lzgraphs-2.2.0/src/LZGraphs/mixins/walk_analysis.py +177 -0
  20. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/PKG-INFO +39 -28
  21. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/SOURCES.txt +6 -0
  22. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/requires.txt +3 -1
  23. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_base_class_methods.py +8 -8
  24. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_diversity_theory.py +4 -4
  25. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_graph_operations.py +3 -3
  26. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_metrics.py +4 -4
  27. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_new_features.py +3 -3
  28. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_utilities.py +12 -14
  29. lzgraphs-2.1.2/src/LZGraphs/graphs/lz_graph_base.py +0 -2201
  30. lzgraphs-2.1.2/src/LZGraphs/mixins/__init__.py +0 -3
  31. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/CHANGELOG.md +0 -0
  32. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/CONTRIBUTING.md +0 -0
  33. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/LICENSE +0 -0
  34. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/MANIFEST.in +0 -0
  35. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/requirements.txt +0 -0
  36. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/setup.cfg +0 -0
  37. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/bag_of_words/__init__.py +0 -0
  38. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/bag_of_words/bow_encoder.py +0 -0
  39. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/exceptions/__init__.py +0 -0
  40. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/__init__.py +0 -0
  41. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/edge_data.py +0 -0
  42. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/graphs/graph_operations.py +0 -0
  43. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/__init__.py +0 -0
  44. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/diversity.py +0 -0
  45. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/entropy.py +0 -0
  46. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/metrics/pgen_distribution.py +0 -0
  47. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/mixins/gene_prediction.py +0 -0
  48. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/py.typed +0 -0
  49. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/__init__.py +0 -0
  50. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/decomposition.py +0 -0
  51. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/helpers.py +0 -0
  52. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/utilities/misc.py +0 -0
  53. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/visualization/__init__.py +0 -0
  54. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs/visualization/visualize.py +0 -0
  55. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
  56. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/src/LZGraphs.egg-info/top_level.txt +0 -0
  57. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_aap_lzgraph.py +0 -0
  58. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_abundance.py +0 -0
  59. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_analytical_distribution.py +0 -0
  60. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_bow_encoder.py +0 -0
  61. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_flexible_input.py +0 -0
  62. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_lzpgen_distribution.py +0 -0
  63. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_naive_lzgraph.py +0 -0
  64. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_ndp_lzgraph.py +0 -0
  65. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_pgen_fixes.py +0 -0
  66. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_serialization.py +0 -0
  67. {lzgraphs-2.1.2 → lzgraphs-2.2.0}/tests/test_simulate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: LZGraphs
3
- Version: 2.1.2
3
+ Version: 2.2.0
4
4
  Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
5
5
  Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
6
6
  Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
@@ -29,7 +29,6 @@ Description-Content-Type: text/markdown
29
29
  License-File: LICENSE
30
30
  Requires-Dist: networkx>=3.0
31
31
  Requires-Dist: numpy>=1.24
32
- Requires-Dist: pandas>=1.5
33
32
  Requires-Dist: tqdm>=4.65
34
33
  Requires-Dist: scipy>=1.10
35
34
  Provides-Extra: viz
@@ -38,6 +37,7 @@ Requires-Dist: seaborn>=0.12; extra == "viz"
38
37
  Provides-Extra: dev
39
38
  Requires-Dist: pytest>=7.0; extra == "dev"
40
39
  Requires-Dist: pytest-cov>=4.0; extra == "dev"
40
+ Requires-Dist: pandas>=1.5; extra == "dev"
41
41
  Requires-Dist: black>=23.0; extra == "dev"
42
42
  Requires-Dist: isort>=5.12; extra == "dev"
43
43
  Requires-Dist: ruff>=0.1.0; extra == "dev"
@@ -45,6 +45,8 @@ Requires-Dist: pre-commit>=3.0; extra == "dev"
45
45
  Requires-Dist: build>=1.0; extra == "dev"
46
46
  Requires-Dist: twine>=4.0; extra == "dev"
47
47
  Provides-Extra: docs
48
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
49
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
48
50
  Dynamic: license-file
49
51
 
50
52
  <p align="center">
@@ -125,6 +127,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
125
127
  - **Repertoire comparison** -- compare two repertoires via graph-level statistics
126
128
  - **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
127
129
  - **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
130
+ - **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
128
131
  - **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
129
132
  - **Serialization** -- save and load graphs in JSON format
130
133
 
@@ -150,23 +153,20 @@ print(LZGraphs.__version__)
150
153
  Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
151
154
 
152
155
  ```python
153
- import pandas as pd
154
156
  from LZGraphs import AAPLZGraph
155
157
 
156
- # Prepare data as a DataFrame with a 'cdr3_amino_acid' column
157
- data = pd.DataFrame({
158
- 'cdr3_amino_acid': [
159
- 'CASSLAPGATNEKLFF',
160
- 'CASSLGQAYEQYF',
161
- 'CASSFSTCSANYGYTF',
162
- 'CASSQEGTEAFF',
163
- 'CASSLGQGNIQYF',
164
- # ... your CDR3 amino acid sequences
165
- ]
166
- })
158
+ # Pass a plain list of CDR3 amino acid sequences
159
+ sequences = [
160
+ 'CASSLAPGATNEKLFF',
161
+ 'CASSLGQAYEQYF',
162
+ 'CASSFSTCSANYGYTF',
163
+ 'CASSQEGTEAFF',
164
+ 'CASSLGQGNIQYF',
165
+ # ... your CDR3 amino acid sequences
166
+ ]
167
167
 
168
168
  # Construct the graph
169
- graph = AAPLZGraph(data, verbose=True)
169
+ graph = AAPLZGraph(sequences, verbose=True)
170
170
 
171
171
  # Compute the log-probability of a sequence under the model
172
172
  log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
@@ -221,15 +221,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
221
221
 
222
222
  ### Gene Annotation
223
223
 
224
- All three graph types support optional V and J gene annotation. Include `V` and `J` columns in your DataFrame (or pass them separately for NaiveLZGraph) to track gene usage on graph edges:
224
+ All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
225
225
 
226
226
  ```python
227
- data = pd.DataFrame({
228
- 'cdr3_amino_acid': sequences,
229
- 'V': v_genes,
230
- 'J': j_genes,
231
- })
232
- graph = AAPLZGraph(data, verbose=True)
227
+ sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
228
+ v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
229
+ j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
230
+
231
+ graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
233
232
 
234
233
  # Gene data is now available
235
234
  print(graph.has_gene_data) # True
@@ -248,16 +247,14 @@ This is particularly important for:
248
247
  - **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
249
248
  - **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
250
249
 
251
- To use abundance weighting, include an `abundance` column in your DataFrame:
250
+ To use abundance weighting, pass an `abundances` list alongside your sequences:
252
251
 
253
252
  ```python
254
- data = pd.DataFrame({
255
- 'cdr3_amino_acid': ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF'],
256
- 'abundance': [150, 42, 7],
257
- })
253
+ sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
254
+ abundances = [150, 42, 7]
258
255
 
259
256
  # Each sequence is weighted by its abundance during graph construction
260
- graph = AAPLZGraph(data, verbose=True)
257
+ graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
261
258
  ```
262
259
 
263
260
  For `NaiveLZGraph`, pass abundances as a separate parameter:
@@ -335,6 +332,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
335
332
  comparison = compare_repertoires(graph1, graph2)
336
333
  ```
337
334
 
335
+ ### Bayesian Posterior Personalization
336
+
337
+ ```python
338
+ # Adapt a population graph to an individual
339
+ posterior = population_graph.get_posterior(
340
+ individual_sequences,
341
+ abundances=clonal_counts,
342
+ kappa=100.0 # prior strength
343
+ )
344
+
345
+ # The posterior is a full graph
346
+ simulated = posterior.simulate(1000, seed=42)
347
+ ```
348
+
338
349
  ### Visualization
339
350
 
340
351
  ```python
@@ -76,6 +76,7 @@ The diversity of T-cells and B-cells is crucial for producing receptors that rec
76
76
  - **Repertoire comparison** -- compare two repertoires via graph-level statistics
77
77
  - **Analytical probability distributions** -- exact moments and scipy-like distribution objects for generation probabilities
78
78
  - **Gene annotation support** -- optional V/J gene tracking on edges for gene usage analysis
79
+ - **Bayesian posterior personalization** -- adapt population-level models to individual repertoires using Dirichlet-Multinomial conjugacy
79
80
  - **Abundance weighting** -- weight sequences by clonal abundance for more realistic models
80
81
  - **Serialization** -- save and load graphs in JSON format
81
82
 
@@ -101,23 +102,20 @@ print(LZGraphs.__version__)
101
102
  Build an amino acid positional graph from CDR3 sequences and compute sequence probabilities:
102
103
 
103
104
  ```python
104
- import pandas as pd
105
105
  from LZGraphs import AAPLZGraph
106
106
 
107
- # Prepare data as a DataFrame with a 'cdr3_amino_acid' column
108
- data = pd.DataFrame({
109
- 'cdr3_amino_acid': [
110
- 'CASSLAPGATNEKLFF',
111
- 'CASSLGQAYEQYF',
112
- 'CASSFSTCSANYGYTF',
113
- 'CASSQEGTEAFF',
114
- 'CASSLGQGNIQYF',
115
- # ... your CDR3 amino acid sequences
116
- ]
117
- })
107
+ # Pass a plain list of CDR3 amino acid sequences
108
+ sequences = [
109
+ 'CASSLAPGATNEKLFF',
110
+ 'CASSLGQAYEQYF',
111
+ 'CASSFSTCSANYGYTF',
112
+ 'CASSQEGTEAFF',
113
+ 'CASSLGQGNIQYF',
114
+ # ... your CDR3 amino acid sequences
115
+ ]
118
116
 
119
117
  # Construct the graph
120
- graph = AAPLZGraph(data, verbose=True)
118
+ graph = AAPLZGraph(sequences, verbose=True)
121
119
 
122
120
  # Compute the log-probability of a sequence under the model
123
121
  log_prob = graph.walk_log_probability('CASSLAPGATNEKLFF')
@@ -172,15 +170,14 @@ graph = NaiveLZGraph(cdr3_list, dictionary, verbose=True)
172
170
 
173
171
  ### Gene Annotation
174
172
 
175
- All three graph types support optional V and J gene annotation. Include `V` and `J` columns in your DataFrame (or pass them separately for NaiveLZGraph) to track gene usage on graph edges:
173
+ All three graph types support optional V and J gene annotation. Pass gene lists alongside sequences to track gene usage on graph edges:
176
174
 
177
175
  ```python
178
- data = pd.DataFrame({
179
- 'cdr3_amino_acid': sequences,
180
- 'V': v_genes,
181
- 'J': j_genes,
182
- })
183
- graph = AAPLZGraph(data, verbose=True)
176
+ sequences = ['CASSLEPSGGTDTQYF', 'CASSDTSGGTDTQYF', ...]
177
+ v_genes = ['TRBV16-1*01', 'TRBV1-1*01', ...]
178
+ j_genes = ['TRBJ1-2*01', 'TRBJ1-5*01', ...]
179
+
180
+ graph = AAPLZGraph(sequences, v_genes=v_genes, j_genes=j_genes, verbose=True)
184
181
 
185
182
  # Gene data is now available
186
183
  print(graph.has_gene_data) # True
@@ -199,16 +196,14 @@ This is particularly important for:
199
196
  - **Better representation of clonal expansion** -- dominant clones shape the graph structure proportionally to their prevalence
200
197
  - **More realistic sequence generation** -- simulated sequences reflect the abundance-weighted landscape, not just the unique sequence set
201
198
 
202
- To use abundance weighting, include an `abundance` column in your DataFrame:
199
+ To use abundance weighting, pass an `abundances` list alongside your sequences:
203
200
 
204
201
  ```python
205
- data = pd.DataFrame({
206
- 'cdr3_amino_acid': ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF'],
207
- 'abundance': [150, 42, 7],
208
- })
202
+ sequences = ['CASSLAPGATNEKLFF', 'CASSLGQAYEQYF', 'CASSFSTCSANYGYTF']
203
+ abundances = [150, 42, 7]
209
204
 
210
205
  # Each sequence is weighted by its abundance during graph construction
211
- graph = AAPLZGraph(data, verbose=True)
206
+ graph = AAPLZGraph(sequences, abundances=abundances, verbose=True)
212
207
  ```
213
208
 
214
209
  For `NaiveLZGraph`, pass abundances as a separate parameter:
@@ -286,6 +281,20 @@ jsd = jensen_shannon_divergence(graph1, graph2)
286
281
  comparison = compare_repertoires(graph1, graph2)
287
282
  ```
288
283
 
284
+ ### Bayesian Posterior Personalization
285
+
286
+ ```python
287
+ # Adapt a population graph to an individual
288
+ posterior = population_graph.get_posterior(
289
+ individual_sequences,
290
+ abundances=clonal_counts,
291
+ kappa=100.0 # prior strength
292
+ )
293
+
294
+ # The posterior is a full graph
295
+ simulated = posterior.simulate(1000, seed=42)
296
+ ```
297
+
289
298
  ### Visualization
290
299
 
291
300
  ```python
@@ -45,7 +45,6 @@ classifiers = [
45
45
  dependencies = [
46
46
  "networkx>=3.0",
47
47
  "numpy>=1.24",
48
- "pandas>=1.5",
49
48
  "tqdm>=4.65",
50
49
  "scipy>=1.10",
51
50
  ]
@@ -58,6 +57,7 @@ viz = [
58
57
  dev = [
59
58
  "pytest>=7.0",
60
59
  "pytest-cov>=4.0",
60
+ "pandas>=1.5",
61
61
  "black>=23.0",
62
62
  "isort>=5.12",
63
63
  "ruff>=0.1.0",
@@ -65,7 +65,10 @@ dev = [
65
65
  "build>=1.0",
66
66
  "twine>=4.0",
67
67
  ]
68
- docs = []
68
+ docs = [
69
+ "mkdocs-material>=9.5",
70
+ "mkdocstrings[python]>=0.24",
71
+ ]
69
72
 
70
73
  [project.urls]
71
74
  Homepage = "https://github.com/MuteJester/LZGraphs"
@@ -1,4 +1,4 @@
1
- __version__ = "2.1.2"
1
+ __version__ = "2.2.0"
2
2
 
3
3
  # =============================================================================
4
4
  # Graph classes
@@ -0,0 +1,6 @@
1
+ """Module-level numeric constants shared across LZGraphs internals."""
2
+ import numpy as np
3
+
4
+ # Machine epsilon — cached once at module level (avoids repeated np.finfo calls)
5
+ _EPS = np.finfo(np.float64).eps
6
+ _LOG_EPS = np.log(_EPS)
@@ -1,10 +1,11 @@
1
1
  import logging
2
+ import random as _random
2
3
  import time
4
+ from collections import defaultdict
3
5
  from typing import List, Tuple, Union, Optional, Generator
4
6
 
5
7
  import networkx as nx
6
8
  import numpy as np
7
- import pandas as pd
8
9
  from tqdm.auto import tqdm
9
10
 
10
11
  from .lz_graph_base import LZGraphBase
@@ -67,7 +68,7 @@ class AAPLZGraph(LZGraphBase):
67
68
 
68
69
  def __init__(
69
70
  self,
70
- data: Union[pd.DataFrame, List[str], pd.Series],
71
+ data,
71
72
  *,
72
73
  abundances: Optional[List[int]] = None,
73
74
  v_genes: Optional[List[str]] = None,
@@ -81,17 +82,18 @@ class AAPLZGraph(LZGraphBase):
81
82
  """
82
83
  Create an amino-acid-positional LZGraph.
83
84
 
84
- *data* can be a pandas DataFrame with a ``cdr3_amino_acid`` column,
85
- a plain list of amino-acid sequences, or a pandas Series.
85
+ *data* can be a DataFrame-like object with a ``cdr3_amino_acid``
86
+ column, a plain list of amino-acid sequences, or any iterable with
87
+ a ``.tolist()`` method.
86
88
 
87
- When *data* is a list or Series the optional keyword arguments
88
- *abundances*, *v_genes* and *j_genes* may be used to supply
89
- additional per-sequence information. When *data* is a DataFrame
90
- these must be ``None`` — use DataFrame columns instead.
89
+ When *data* is a list the optional keyword arguments *abundances*,
90
+ *v_genes* and *j_genes* may be used to supply additional
91
+ per-sequence information. When *data* is a DataFrame these must be
92
+ ``None`` — use DataFrame columns instead.
91
93
 
92
94
  Args:
93
95
  data: Sequence data. DataFrame (with ``cdr3_amino_acid`` column),
94
- list of strings, or pandas Series.
96
+ list of strings, or any iterable of strings.
95
97
  abundances: Per-sequence abundance counts (list input only).
96
98
  v_genes: Per-sequence V gene annotations (list input only).
97
99
  j_genes: Per-sequence J gene annotations (list input only).
@@ -112,7 +114,7 @@ class AAPLZGraph(LZGraphBase):
112
114
  """
113
115
  super().__init__() # Initialize LZGraphBase
114
116
 
115
- # Normalize flexible input → DataFrame
117
+ # Normalize flexible input → dict-of-lists
116
118
  data = self._normalize_input(
117
119
  data, "cdr3_amino_acid",
118
120
  abundances=abundances, v_genes=v_genes, j_genes=j_genes,
@@ -127,20 +129,16 @@ class AAPLZGraph(LZGraphBase):
127
129
  self._validate_input(data, validate_sequences)
128
130
 
129
131
  # Determine if we have gene data
130
- self.has_gene_data = (
131
- isinstance(data, pd.DataFrame) and
132
- ("V" in data.columns) and
133
- ("J" in data.columns)
134
- )
132
+ self.has_gene_data = data.get('v_genes') is not None
135
133
 
136
134
  # Load gene data if present
137
135
  if self.has_gene_data:
138
136
  self._load_gene_data(data)
139
- self.verbose_driver(0, verbose) # "Gene Information Loaded"
137
+ self._log_step("Gene information loaded.", verbose)
140
138
 
141
139
  # Build the graph with a custom routine
142
140
  self.__simultaneous_graph_construction(data)
143
- self.verbose_driver(1, verbose) # "Graph Constructed"
141
+ self._log_step("Graph constructed.", verbose)
144
142
 
145
143
  # Normalize and derive probability dicts
146
144
  self.length_counts = dict(self.lengths)
@@ -162,103 +160,87 @@ class AAPLZGraph(LZGraphBase):
162
160
  if total_initial > 0 else {}
163
161
  )
164
162
 
165
- self.verbose_driver(2, verbose) # "Graph Metadata Derived"
163
+ self._log_step("Graph metadata derived.", verbose)
166
164
 
167
165
  # Derive subpattern probabilities & normalize edges
168
166
  self._derive_node_probability()
169
- self.verbose_driver(8, verbose)
167
+ self._log_step("Node probabilities derived.", verbose)
170
168
 
171
169
  self._normalize_edge_weights()
172
- self.verbose_driver(3, verbose)
170
+ self._log_step("Edge weights normalized.", verbose)
173
171
 
174
172
  # Additional map derivations
175
173
  self._edges_cache = None
176
174
  self._derive_stop_probability_data()
177
- self.verbose_driver(9, verbose)
175
+ self._log_step("Stop probabilities derived.", verbose)
178
176
 
179
177
  # Optionally compute the PGEN for each sequence
180
178
  if calculate_trainset_pgen:
181
179
  logger.info("Calculating PGEN for the training set. This may take some time...")
182
180
  self.train_pgen = np.array([
183
181
  self.walk_probability(seq, verbose=False)
184
- for seq in data["cdr3_amino_acid"]
182
+ for seq in data['sequences']
185
183
  ])
186
184
 
187
185
  self.constructor_end_time = time.time()
188
- self.verbose_driver(6, verbose)
189
- self.verbose_driver(-2, verbose)
186
+ self._log_step("LZGraph created successfully.", verbose)
190
187
 
191
188
  # --------------------------------------------------------------------------
192
189
  # Input Validation
193
190
  # --------------------------------------------------------------------------
194
191
 
195
- def _validate_input(self, data: pd.DataFrame, validate_sequences: bool) -> None:
192
+ def _validate_input(self, data: dict, validate_sequences: bool) -> None:
196
193
  """
197
194
  Validate input data before graph construction.
198
195
 
199
196
  Args:
200
- data: Input DataFrame
201
- validate_sequences: Whether to check sequence content
202
-
203
- Raises:
204
- TypeError: If data is not a pandas DataFrame
205
- ValueError: If required columns are missing or data is invalid
197
+ data: Normalised dict with key ``'sequences'`` (and optionally
198
+ ``'v_genes'``, ``'j_genes'``, ``'abundances'``).
199
+ validate_sequences: Whether to check sequence content.
206
200
  """
207
- # Check type
208
- if not isinstance(data, pd.DataFrame):
209
- raise TypeError(
210
- f"Expected pandas DataFrame, got {type(data).__name__}. "
211
- "Please provide a DataFrame with a 'cdr3_amino_acid' column."
212
- )
213
-
214
- # Check for required column
215
- if 'cdr3_amino_acid' not in data.columns:
216
- raise MissingColumnError(
217
- column_name='cdr3_amino_acid',
218
- available_columns=list(data.columns)
219
- )
201
+ sequences = data['sequences']
220
202
 
221
203
  # Check for empty data
222
- if len(data) == 0:
223
- raise EmptyDataError("DataFrame is empty. Cannot build LZGraph from zero sequences.")
204
+ if len(sequences) == 0:
205
+ raise EmptyDataError("No sequences provided. Cannot build LZGraph from zero sequences.")
224
206
 
225
- # Check for null values in CDR3 column
226
- null_count = data['cdr3_amino_acid'].isna().sum()
207
+ # Check for null values
208
+ null_count = sum(1 for x in sequences if x is None)
227
209
  if null_count > 0:
228
210
  raise ValueError(
229
- f"Found {null_count} null values in 'cdr3_amino_acid' column. "
211
+ f"Found {null_count} null values in sequences. "
230
212
  "Please remove or fill null values before building the graph."
231
213
  )
232
214
 
233
215
  # Check for empty strings
234
- empty_count = (data['cdr3_amino_acid'].str.len() == 0).sum()
216
+ empty_count = sum(1 for x in sequences if not x)
235
217
  if empty_count > 0:
236
218
  raise ValueError(
237
- f"Found {empty_count} empty strings in 'cdr3_amino_acid' column. "
219
+ f"Found {empty_count} empty strings in sequences. "
238
220
  "Please remove empty sequences before building the graph."
239
221
  )
240
222
 
241
223
  # Validate sequence content if requested
242
224
  if validate_sequences:
243
- self._validate_sequence_content(data['cdr3_amino_acid'])
225
+ self._validate_sequence_content(sequences)
244
226
 
245
227
  # Validate gene columns if present
246
- if 'V' in data.columns and 'J' in data.columns:
228
+ if data.get('v_genes') is not None:
247
229
  self._validate_gene_columns(data)
248
230
 
249
- def _validate_sequence_content(self, sequences: pd.Series) -> None:
231
+ def _validate_sequence_content(self, sequences: list) -> None:
250
232
  """
251
233
  Validate that sequences contain only valid amino acid characters.
252
234
 
253
235
  Args:
254
- sequences: Series of amino acid sequences
255
-
256
- Raises:
257
- ValueError: If invalid characters are found
236
+ sequences: List of amino acid sequences.
258
237
  """
259
238
  # Sample up to 1000 sequences for validation (performance)
260
239
  sample_size = min(1000, len(sequences))
261
- sample = sequences.sample(n=sample_size, random_state=42) if len(sequences) > sample_size else sequences
240
+ if len(sequences) > sample_size:
241
+ sample = _random.Random(42).sample(sequences, k=sample_size)
242
+ else:
243
+ sample = sequences
262
244
 
263
245
  invalid_chars_found = set()
264
246
  invalid_sequences = []
@@ -287,19 +269,15 @@ class AAPLZGraph(LZGraphBase):
287
269
  )
288
270
  )
289
271
 
290
- def _validate_gene_columns(self, data: pd.DataFrame) -> None:
272
+ def _validate_gene_columns(self, data: dict) -> None:
291
273
  """
292
- Validate V and J gene columns.
274
+ Validate V and J gene lists.
293
275
 
294
276
  Args:
295
- data: DataFrame with V and J columns
296
-
297
- Raises:
298
- ValueError: If gene columns contain invalid data
277
+ data: Dict with ``'v_genes'`` and ``'j_genes'`` lists.
299
278
  """
300
- # Check for nulls in gene columns
301
- v_nulls = data['V'].isna().sum()
302
- j_nulls = data['J'].isna().sum()
279
+ v_nulls = sum(1 for x in data['v_genes'] if x is None)
280
+ j_nulls = sum(1 for x in data['j_genes'] if x is None)
303
281
 
304
282
  if v_nulls > 0 or j_nulls > 0:
305
283
  raise ValueError(
@@ -328,34 +306,31 @@ class AAPLZGraph(LZGraphBase):
328
306
  idx = base.rfind('_')
329
307
  return base[:idx] if idx > 0 else base
330
308
 
331
- def _decomposed_sequence_generator(
332
- self,
333
- data: Union[pd.DataFrame, pd.Series]
334
- ) -> Generator:
309
+ def _decomposed_sequence_generator(self, data: dict) -> Generator:
335
310
  """
336
311
  A generator that yields the information needed to build the graph.
337
312
 
338
- If an ``abundance`` column is present in the DataFrame, each sequence
339
- is weighted by its abundance count. Otherwise each sequence counts as 1.
313
+ Args:
314
+ data: Normalised dict with ``'sequences'`` (and optionally
315
+ ``'abundances'``, ``'v_genes'``, ``'j_genes'``).
340
316
 
341
317
  Yields:
342
318
  If genetic: (steps, locations, v, j, count)
343
319
  Otherwise: (steps, locations, count)
344
320
  """
345
- has_abundance = isinstance(data, pd.DataFrame) and 'abundance' in data.columns
321
+ sequences = data['sequences']
322
+ abundances = data.get('abundances')
346
323
 
347
324
  if self.has_gene_data:
348
- iterables = [data["cdr3_amino_acid"], data["V"], data["J"]]
349
- if has_abundance:
350
- iterables.append(data["abundance"])
351
- for row in tqdm(zip(*iterables), desc="Building Graph", leave=False):
352
- if has_abundance:
353
- cdr3, v, j, abundance = row
354
- count = int(abundance)
355
- else:
356
- cdr3, v, j = row
357
- count = 1
325
+ v_genes = data['v_genes']
326
+ j_genes = data['j_genes']
327
+ if abundances is not None:
328
+ row_iter = zip(sequences, v_genes, j_genes, abundances)
329
+ else:
330
+ row_iter = ((s, v, j, 1) for s, v, j in zip(sequences, v_genes, j_genes))
358
331
 
332
+ for cdr3, v, j, abundance in tqdm(row_iter, desc="Building Graph", leave=False):
333
+ count = int(abundance)
359
334
  lz, locs = derive_lz_and_position(cdr3)
360
335
  steps = window(lz, 2)
361
336
  locations = window(locs, 2)
@@ -366,12 +341,10 @@ class AAPLZGraph(LZGraphBase):
366
341
 
367
342
  yield (steps, locations, v, j, count)
368
343
  else:
369
- if has_abundance:
370
- seq_iter = zip(data["cdr3_amino_acid"], data["abundance"])
371
- elif isinstance(data, pd.DataFrame):
372
- seq_iter = ((cdr3, 1) for cdr3 in data["cdr3_amino_acid"])
344
+ if abundances is not None:
345
+ seq_iter = zip(sequences, abundances)
373
346
  else:
374
- seq_iter = ((cdr3, 1) for cdr3 in data)
347
+ seq_iter = ((s, 1) for s in sequences)
375
348
 
376
349
  for cdr3, abundance in tqdm(seq_iter, desc="Building Graph", leave=False):
377
350
  count = int(abundance)
@@ -385,7 +358,7 @@ class AAPLZGraph(LZGraphBase):
385
358
 
386
359
  yield (steps, locations, count)
387
360
 
388
- def __simultaneous_graph_construction(self, data: pd.DataFrame) -> None:
361
+ def __simultaneous_graph_construction(self, data: dict) -> None:
389
362
  """
390
363
  Custom simultaneous construction of the graph, mirroring the parent's
391
364
  _simultaneous_graph_construction but applying our specialized decomposition.
@@ -582,50 +555,51 @@ class AAPLZGraph(LZGraphBase):
582
555
 
583
556
  return results
584
557
 
585
- def random_walk_distribution_based(self, length_distribution: pd.Series):
558
+ def random_walk_distribution_based(self, length_distribution):
586
559
  """
587
560
  Creates random walks in proportion to a given length distribution.
588
561
  We do a large number of unsupervised walks, then sample from them
589
562
  to match the specified distribution.
590
563
 
591
564
  Args:
592
- length_distribution: A Series whose index is lengths and values are
593
- how many sequences of that length we want.
565
+ length_distribution: A dict (or Series-like) mapping sequence
566
+ lengths to the number of sequences desired at that length.
594
567
 
595
568
  Returns:
596
- A 2D array (list of pairs) of shape [N, 2], where each row is (Seq, Walk).
569
+ A 2D numpy array of shape [N, 2], where each row is (Seq, Walk).
597
570
  """
598
- N = length_distribution.sum() * 3 # multiply by some factor
599
- N = int(N)
571
+ # Accept both dict and Series-like objects
572
+ if hasattr(length_distribution, 'to_dict'):
573
+ length_distribution = length_distribution.to_dict()
600
574
 
601
- walks = []
602
- seqs = []
575
+ total = sum(length_distribution.values())
576
+ N = int(total * 3)
577
+
578
+ # Generate random walks and group by length
579
+ by_length = defaultdict(list)
603
580
  logger.info(f"Generating ~{N} random walks to filter by length distribution...")
604
581
  for _ in tqdm(range(N), desc="Random Walk Distribution"):
605
582
  rw, rseq = self.unsupervised_random_walk()
606
- walks.append(rw)
607
- seqs.append(rseq)
608
-
609
- df = pd.DataFrame({"Seqs": seqs, "Walks": walks})
610
- df["L"] = df["Seqs"].str.len()
583
+ by_length[len(rseq)].append((rseq, rw))
611
584
 
585
+ rng = _random.Random(42)
612
586
  samples = []
613
- for length_val in length_distribution.index:
614
- needed = length_distribution[length_val]
615
- subset = df[df["L"] == length_val]
616
- if len(subset) < needed:
587
+ for length_val, needed in length_distribution.items():
588
+ needed = int(needed)
589
+ available = by_length.get(length_val, [])
590
+ if len(available) < needed:
617
591
  logger.warning(
618
- f"Requested {needed} sequences of length {length_val}, but only found {len(subset)}."
592
+ f"Requested {needed} sequences of length {length_val}, "
593
+ f"but only found {len(available)}."
619
594
  )
620
- needed = len(subset)
595
+ needed = len(available)
621
596
  if needed > 0:
622
- samples.append(subset.sample(n=needed, replace=False))
597
+ samples.extend(rng.sample(available, k=needed))
623
598
 
624
599
  if not samples:
625
600
  return np.array([])
626
601
 
627
- final = pd.concat(samples, ignore_index=True)
628
- return final[["Seqs", "Walks"]].values
602
+ return np.array(samples, dtype=object)
629
603
 
630
604
  def get_gene_graph(self, v: str, j: str) -> nx.DiGraph:
631
605
  """