LZGraphs 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ from collections.abc import Iterable
2
+
3
+ import numpy as np
4
+ from tqdm.auto import tqdm
5
+
6
+ from ..Utilities.decomposition import lempel_ziv_decomposition
7
+ from ..Exceptions import EncodingFunctionMismatchError
8
+
9
+
10
class LZBOW:
    """
    Bag-of-words encoder over sequence sub-patterns.

    The encoder is fitted on one or more sequences to collect the set of
    unique sub-patterns produced by ``encoding_function``. After fitting,
    ``transform`` maps a sequence (or a collection of sequences) onto a
    vector with one slot per known sub-pattern.

    Args:
        encoding_function (callable): function that takes a sequence and
            returns an iterable of hashable sub-patterns. Defaults to
            ``lempel_ziv_decomposition``.

    Attributes:
        dictionary (set): sub-patterns collected so far.
        dictionary_size (int): number of entries in ``dictionary``.
        observed_sequences (int): number of sequences passed through ``fit``.
        encoding_function (callable): the sub-pattern extraction function.
        dictionary_index_map (dict): sub-pattern -> position in the BOW vector.
        dictionary_index_inverse_map (dict): position in the BOW vector ->
            sub-pattern.
    """

    def __init__(self, encoding_function=lempel_ziv_decomposition):
        self.dictionary = set()
        self.dictionary_size = 0
        self.observed_sequences = 0
        self.encoding_function = encoding_function

        self.dictionary_index_map = dict()
        self.dictionary_index_inverse_map = dict()

    def _derive_index_maps(self):
        """Rebuild both index maps and the cached size from ``dictionary``."""
        # A single enumerate pass guarantees the two maps are exact inverses.
        self.dictionary_index_inverse_map = dict(enumerate(self.dictionary))
        self.dictionary_index_map = {
            pattern: idx for idx, pattern in self.dictionary_index_inverse_map.items()
        }
        self.dictionary_size = len(self.dictionary)

    def fit(self, data):
        """
        Add the sub-patterns of ``data`` to the dictionary.

        Args:
            data: a single sequence (``str``) or an iterable of sequences.
                Any other input is ignored, as before.
        """
        if isinstance(data, str):
            sequences = (data,)
        elif isinstance(data, Iterable):
            sequences = tqdm(data, leave=False, position=0)
        else:
            return

        # Accumulate into one working copy instead of rebuilding the whole
        # set for every sequence; the copy keeps any set shared through
        # load_from() untouched.
        patterns = set(self.dictionary)
        seen = 0
        for seq in sequences:
            patterns.update(self.encoding_function(seq))
            seen += 1

        self.dictionary = patterns
        # A single string is one observed sequence too.
        self.observed_sequences += seen
        self._derive_index_maps()

    def _seq_to_index(self, seq):
        """Return vector positions of the known sub-patterns found in ``seq``."""
        index_of = self.dictionary_index_map
        return [index_of[p] for p in self.encoding_function(seq) if p in index_of]

    def transform(self, data, normalize=False):
        """
        Convert ``data`` into a bag-of-words vector.

        Args:
            data: a single sequence (``str``) or an iterable of sequences.
            normalize (bool): when True, divide the vector by its sum. A
                vector with no matches is returned as all zeros rather than
                being divided by zero.

        Returns:
            numpy.ndarray of length ``dictionary_size``, or ``None`` when
            ``data`` is neither a string nor an iterable.
        """
        if isinstance(data, str):
            sequences = (data,)
        elif isinstance(data, Iterable):
            sequences = tqdm(data, leave=False, position=0)
        else:
            return None

        result = np.zeros(self.dictionary_size)
        for seq in sequences:
            # NOTE(review): index-array "+=" adds 1 per distinct position, so a
            # sub-pattern repeated inside one sequence is counted once for that
            # sequence. Kept as-is; use np.add.at if per-occurrence counts are
            # wanted.
            result[self._seq_to_index(seq)] += 1

        if normalize:
            total = result.sum()
            if total > 0:
                result = result / total
        return result

    def load_from(self, other):
        """Copy the fitted state of ``other`` onto this instance (references are shared)."""
        self.dictionary = other.dictionary
        self.dictionary_size = other.dictionary_size
        self.observed_sequences = other.observed_sequences
        self.encoding_function = other.encoding_function

        self.dictionary_index_map = other.dictionary_index_map
        self.dictionary_index_inverse_map = other.dictionary_index_inverse_map

    def __add__(self, other):
        """
        Return a new encoder whose dictionary is the union of both operands.

        Raises:
            EncodingFunctionMismatchError: if the operands use different
                encoding functions.
        """
        if self.encoding_function != other.encoding_function:
            raise EncodingFunctionMismatchError(
                "Cannot combine BOW objects with different encoding functions. "
                "Both objects must use the same encoding function."
            )
        union = LZBOW(self.encoding_function)
        union.dictionary = self.dictionary | other.dictionary
        union.observed_sequences = self.observed_sequences + other.observed_sequences
        # Derive maps and size from the union itself, not from the left operand.
        union._derive_index_maps()
        return union
File without changes
@@ -0,0 +1,550 @@
1
+ """
2
+ Custom exceptions for the LZGraphs library.
3
+
4
+ This module provides a hierarchy of exception classes that give users
5
+ clear, actionable error messages when something goes wrong. Using specific
6
+ exception types allows for targeted error handling in downstream code.
7
+
8
+ Exception Hierarchy:
9
+ LZGraphError (base)
10
+ ├── InputValidationError
11
+ │ ├── EmptyDataError
12
+ │ ├── MissingColumnError
13
+ │ ├── InvalidSequenceError
+ │ └── InvalidProbabilityError
14
+ ├── GraphConstructionError
15
+ │ └── EncodingError
16
+ ├── GeneDataError
17
+ │ ├── NoGeneDataError
18
+ │ └── GeneAnnotationError
19
+ ├── WalkError
20
+ │ ├── NoValidPathError
21
+ │ ├── MissingNodeError
+ │ └── MissingEdgeError
22
+ ├── SerializationError
23
+ │ ├── UnsupportedFormatError
+ │ └── CorruptedFileError
24
+ ├── BOWError
25
+ │ ├── EncodingFunctionMismatchError
+ │ └── UnfittedBOWError
26
+ ├── GraphOperationError
27
+ │ └── IncompatibleGraphsError
+ └── MetricsError
+ └── InsufficientDataError
28
+
29
+ Example:
30
+ >>> from LZGraphs.Exceptions import NoGeneDataError, InvalidSequenceError
31
+ >>> try:
32
+ ... graph.genomic_random_walk()
33
+ ... except NoGeneDataError as e:
34
+ ... print(f"Gene data required: {e}")
35
+ """
36
+
37
+
38
+ # =============================================================================
39
+ # Base Exception
40
+ # =============================================================================
41
+
42
class LZGraphError(Exception):
    """
    Root of the LZGraphs exception hierarchy.

    The custom exceptions defined in this module derive from this class,
    so a single ``except LZGraphError`` clause intercepts any of them.

    Example:
        >>> try:
        ...     graph = AAPLZGraph(data)
        ... except LZGraphError as e:
        ...     print(f"LZGraphs error: {e}")
    """
58
+
59
+
60
+ # =============================================================================
61
+ # Input Validation Errors
62
+ # =============================================================================
63
+
64
class InputValidationError(LZGraphError):
    """
    Input data did not pass validation.

    Parent of the input-related exceptions in this module; raise one of
    the more specific subclasses whenever the failure type is known.
    """
72
+
73
+
74
class EmptyDataError(InputValidationError):
    """
    Empty data was supplied where non-empty data is required.

    Intended for situations such as:
        - an empty DataFrame handed to a graph constructor
        - an empty list of sequences handed to a transform
        - an empty sequence list handed to a diversity metric

    Example:
        >>> graph = AAPLZGraph(pd.DataFrame())  # Raises EmptyDataError
    """
87
+
88
+
89
class MissingColumnError(InputValidationError):
    """
    A required column is absent from the input DataFrame.

    Unless an explicit ``message`` is given, the message names the missing
    column and, when provided, lists the columns that were available.

    Attributes:
        column_name: name of the missing column.
        available_columns: the columns that were found, or None.

    Example:
        >>> df = pd.DataFrame({'wrong_col': ['CASS']})
        >>> graph = AAPLZGraph(df)  # Raises MissingColumnError
    """

    def __init__(self, column_name: str, available_columns: list = None, message: str = None):
        self.column_name = column_name
        self.available_columns = available_columns

        if message is not None:
            text = message
        else:
            text = f"Required column '{column_name}' not found in DataFrame"
            if available_columns:
                text = f"{text}. Available columns: {available_columns}"
        super().__init__(text)
110
+
111
+
112
class InvalidSequenceError(InputValidationError):
    """
    A sequence contains invalid characters or is malformed.

    Unless an explicit ``message`` is given, the message is assembled from
    the sequence (cut to 50 characters plus an ellipsis when longer) and
    the invalid characters, each included only when supplied.

    Attributes:
        sequence: the offending sequence, or None.
        invalid_chars: the invalid characters, or None.

    Example:
        >>> graph.walk_probability("INVALID123SEQUENCE")  # Raises InvalidSequenceError
    """

    def __init__(self, sequence: str = None, invalid_chars: str = None, message: str = None):
        if message is None:
            pieces = ["Invalid sequence"]
            if sequence:
                shown = sequence if len(sequence) <= 50 else sequence[:50] + "..."
                pieces.append(f": '{shown}'")
            if invalid_chars:
                pieces.append(f". Invalid characters: {invalid_chars}")
            message = "".join(pieces)

        self.sequence = sequence
        self.invalid_chars = invalid_chars
        super().__init__(message)
136
+
137
+
138
class InvalidProbabilityError(InputValidationError):
    """
    Probability values are invalid.

    Intended for situations such as:
        - probabilities that do not sum to 1.0
        - negative probability values
        - a probability array whose length does not match

    Attributes:
        prob_sum: the offending sum of probabilities, or None.

    Example:
        >>> choice(['A', 'B'], [0.3, 0.3])  # Raises InvalidProbabilityError (sum != 1)
    """

    def __init__(self, message: str = None, prob_sum: float = None):
        if message is None:
            # Report the observed sum when the caller supplied one.
            if prob_sum is None:
                message = "Invalid probability distribution"
            else:
                message = f"Probabilities must sum to ~1.0, got {prob_sum:.4f}"
        self.prob_sum = prob_sum
        super().__init__(message)
158
+
159
+
160
+ # =============================================================================
161
+ # Graph Construction Errors
162
+ # =============================================================================
163
+
164
class GraphConstructionError(LZGraphError):
    """
    Building a graph failed.

    Parent of the exceptions that describe failures while an LZGraph is
    being constructed from sequence data.
    """
172
+
173
+
174
class EncodingError(GraphConstructionError):
    """
    Encoding a sequence into sub-patterns failed.

    Intended for situations such as:
        - unsupported characters in the sequence
        - an unexpected pattern met by the encoding function
        - a failed position calculation

    Unless an explicit ``message`` is given, the message includes the
    sequence, cut to 30 characters plus an ellipsis when longer.

    Attributes:
        sequence: the sequence that could not be encoded, or None.

    Example:
        >>> graph.encode_sequence("???")  # May raise EncodingError
    """

    def __init__(self, sequence: str = None, message: str = None):
        if message is None:
            message = "Failed to encode sequence"
            if sequence:
                shown = sequence if len(sequence) <= 30 else sequence[:30] + "..."
                message = f"{message}: '{shown}'"
        self.sequence = sequence
        super().__init__(message)
195
+
196
+
197
+ # =============================================================================
198
+ # Gene Data Errors
199
+ # =============================================================================
200
+
201
class GeneDataError(LZGraphError):
    """
    Parent of the gene-related exceptions.

    Covers problems with V/J gene annotations in genetic LZGraphs.
    """
209
+
210
+
211
class NoGeneDataError(GeneDataError):
    """
    A gene-related operation was attempted on a non-genetic graph.

    Intended for situations such as:
        - calling genomic_random_walk() on a graph with genetic=False
        - using gene prediction features without gene data
        - gene-based filtering without annotations

    Solution:
        Build the graph with V and J gene columns in the input DataFrame.

    Attributes:
        operation: name of the operation that needed gene data, or None.

    Example:
        >>> graph = AAPLZGraph(df_without_genes)
        >>> graph.genomic_random_walk()  # Raises NoGeneDataError
    """

    def __init__(self, operation: str = None, message: str = None):
        if message is None:
            if operation:
                lead = f"'{operation}' requires gene annotation data (genetic=True)"
            else:
                lead = "This operation requires gene annotation data (genetic=True)"
            message = lead + ". Build the graph with V and J gene columns to enable this feature."
        self.operation = operation
        super().__init__(message)
236
+
237
+
238
class GeneAnnotationError(GeneDataError):
    """
    Gene annotation data is malformed or inconsistent.

    Intended for situations such as:
        - V/J gene names that do not match the expected patterns
        - gene data missing from edge attributes
        - gene annotations that disagree across edges

    Example:
        >>> graph.walk_genes(walk)  # May raise GeneAnnotationError if data corrupt
    """
251
+
252
+
253
+ # =============================================================================
254
+ # Walk and Probability Errors
255
+ # =============================================================================
256
+
257
class WalkError(LZGraphError):
    """
    Parent of the graph-traversal exceptions.

    Covers failures during random walks, probability calculations and
    path-finding on the graph.
    """
265
+
266
+
267
class NoValidPathError(WalkError):
    """
    No valid path exists for the requested operation.

    Intended for situations such as:
        - a random walk that cannot proceed (no outgoing edges)
        - no path between the start and end nodes
        - every candidate path being blocked

    Attributes:
        start_node: the node the search started from, or None.

    Example:
        >>> graph.random_walk()  # May raise NoValidPathError if graph disconnected
    """

    def __init__(self, start_node: str = None, message: str = None):
        if message is None:
            origin = f" from node '{start_node}'" if start_node else ""
            message = "No valid path found" + origin
        self.start_node = start_node
        super().__init__(message)
287
+
288
+
289
class MissingNodeError(WalkError):
    """
    A required node is not present in the graph.

    Intended for situations such as:
        - computing a walk probability for an unseen sequence
        - a sub-pattern that was never observed during training
        - a reference to a node that was removed

    Attributes:
        node: the missing node, or None.

    Example:
        >>> graph.walk_probability("CASSXYZABC")  # Raises MissingNodeError if XYZ never seen
    """

    def __init__(self, node: str = None, message: str = None):
        if message is None:
            if node:
                message = f"Node '{node}' not found in graph"
            else:
                message = "Node not found in graph"
        self.node = node
        super().__init__(message)
309
+
310
+
311
class MissingEdgeError(WalkError):
    """
    A required edge is not present in the graph.

    Intended for situations such as:
        - a transition between two nodes that was never observed
        - computing the probability of an impossible transition

    Attributes:
        source: the source node of the missing edge, or None.
        target: the target node of the missing edge, or None.

    Example:
        >>> # If 'CA_0' -> 'XY_1' was never seen during training
        >>> graph.walk_probability("CAXY...")  # May raise MissingEdgeError
    """

    def __init__(self, source: str = None, target: str = None, message: str = None):
        if message is None:
            # Both endpoints are needed to name the edge; otherwise stay generic.
            message = "Edge not found in graph"
            if source and target:
                message = f"Edge '{source}' -> '{target}' not found in graph"
        self.source = source
        self.target = target
        super().__init__(message)
333
+
334
+
335
+ # =============================================================================
336
+ # Serialization Errors
337
+ # =============================================================================
338
+
339
class SerializationError(LZGraphError):
    """
    Parent of the save/load exceptions.

    Covers failures while writing graphs to files or reading them back
    into memory.
    """
347
+
348
+
349
class UnsupportedFormatError(SerializationError):
    """
    An unsupported serialization format was requested.

    When no ``supported`` list is given, the formats reported are:
        - 'pickle': Binary format (recommended)
        - 'json': Human-readable format

    Attributes:
        format: the rejected format name.
        supported: the supported formats (defaults to ['pickle', 'json']).

    Example:
        >>> graph.save('file.xml', format='xml')  # Raises UnsupportedFormatError
    """

    def __init__(self, format: str = None, supported: list = None, message: str = None):
        if message is None:
            # The fallback text matches how the default list is rendered.
            listed = supported if supported else "['pickle', 'json']"
            message = f"Unsupported format: '{format}'" + f". Supported formats: {listed}"
        self.format = format
        self.supported = supported or ['pickle', 'json']
        super().__init__(message)
371
+
372
+
373
class CorruptedFileError(SerializationError):
    """
    A saved graph file appears to be corrupted.

    Intended for situations such as:
        - a file that was only partially written
        - a file that was modified externally
        - a file written by an incompatible version

    Example:
        >>> graph = AAPLZGraph.load('corrupted.pkl')  # Raises CorruptedFileError
    """
386
+
387
+
388
+ # =============================================================================
389
+ # BOW (Bag of Words) Errors
390
+ # =============================================================================
391
+
392
class BOWError(LZGraphError):
    """
    Parent of the Bag of Words encoder exceptions.

    Covers failures while fitting, transforming with, or combining BOW
    encoders.
    """
400
+
401
+
402
class EncodingFunctionMismatchError(BOWError):
    """
    Two BOW objects with different encoding functions were combined.

    BOW objects may only be added together (``+``) when they share the
    same encoding function, which keeps the merged dictionary consistent.

    Example:
        >>> bow1 = LZBOW(encoding_function=lempel_ziv_decomposition)
        >>> bow2 = LZBOW(encoding_function=lambda x: list(x))
        >>> combined = bow1 + bow2  # Raises EncodingFunctionMismatchError
    """
415
+
416
+
417
class UnfittedBOWError(BOWError):
    """
    Intended to signal that transform was called on an unfitted BOW object.

    A BOW encoder is expected to be fitted with fit() before transform()
    is used.

    Example:
        >>> bow = LZBOW()
        >>> bow.transform("CASSABC")  # Raises UnfittedBOWError if not fitted
    """
428
+
429
+
430
+ # =============================================================================
431
+ # Graph Operation Errors
432
+ # =============================================================================
433
+
434
class GraphOperationError(LZGraphError):
    """
    Parent of the graph-operation exceptions.

    Covers failures during operations such as graph union, comparison or
    modification.
    """
442
+
443
+
444
class IncompatibleGraphsError(GraphOperationError):
    """
    Incompatible graphs were combined.

    Graphs must be of the same type to be combined; for example, an
    AAPLZGraph cannot be unioned with an NDPLZGraph.

    Attributes:
        type1: type name of the first graph, or None.
        type2: type name of the second graph, or None.

    Example:
        >>> graph_union(aap_graph, ndp_graph)  # Raises IncompatibleGraphsError
    """

    def __init__(self, type1: str = None, type2: str = None, message: str = None):
        if message is None:
            message = "Cannot combine graphs of different types"
            # Name the two types only when both are known.
            if type1 and type2:
                message += f": {type1} and {type2}"
        self.type1 = type1
        self.type2 = type2
        super().__init__(message)
464
+
465
+
466
+ # =============================================================================
467
+ # Metrics Errors
468
+ # =============================================================================
469
+
470
class MetricsError(LZGraphError):
    """
    Parent of the metrics-calculation exceptions.

    Covers failures while computing diversity, entropy or other
    statistical metrics.
    """
478
+
479
+
480
class InsufficientDataError(MetricsError):
    """
    There is not enough data for a statistical calculation.

    Intended for situations such as:
        - K-diversity needing more sequences than are available
        - confidence intervals needing more samples
        - statistical tests needing more data points

    Unless an explicit ``message`` is given, the message reports the
    required and available counts whenever both are supplied, including a
    count of zero.

    Attributes:
        required: the number of items needed, or None.
        available: the number of items actually present, or None.

    Example:
        >>> K_Diversity(sequences, k=1000, draws=100)  # Raises if < 1000 sequences
    """

    def __init__(self, required: int = None, available: int = None, message: str = None):
        if message is None:
            # Compare against None, not truthiness: "got 0" is the most
            # informative case and must not fall back to the generic text.
            if required is not None and available is not None:
                message = f"Insufficient data: need at least {required}, got {available}"
            else:
                message = "Insufficient data for calculation"
        self.required = required
        self.available = available
        super().__init__(message)
501
+
502
+
503
+ # =============================================================================
504
+ # Convenience exports
505
+ # =============================================================================
506
+
507
# Public API of the exceptions module: the names exported by
# ``from <module> import *``, grouped by the section that defines them.
__all__ = [
    # Base
    'LZGraphError',

    # Input validation
    'InputValidationError',
    'EmptyDataError',
    'MissingColumnError',
    'InvalidSequenceError',
    'InvalidProbabilityError',

    # Graph construction
    'GraphConstructionError',
    'EncodingError',

    # Gene data
    'GeneDataError',
    'NoGeneDataError',
    'GeneAnnotationError',

    # Walk/probability
    'WalkError',
    'NoValidPathError',
    'MissingNodeError',
    'MissingEdgeError',

    # Serialization
    'SerializationError',
    'UnsupportedFormatError',
    'CorruptedFileError',

    # BOW
    'BOWError',
    'EncodingFunctionMismatchError',
    'UnfittedBOWError',

    # Graph operations
    'GraphOperationError',
    'IncompatibleGraphsError',

    # Metrics
    'MetricsError',
    'InsufficientDataError',
]