LZGraphs 1.2.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/CHANGELOG.md +1 -1
  2. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/PKG-INFO +5 -11
  3. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/pyproject.toml +5 -3
  4. lzgraphs-2.1.0/requirements.txt +5 -0
  5. lzgraphs-2.1.0/src/LZGraphs/__init__.py +195 -0
  6. lzgraphs-2.1.0/src/LZGraphs/bag_of_words/__init__.py +3 -0
  7. lzgraphs-1.2.0/src/LZGraphs/BagOfWords/BOWEncoder.py → lzgraphs-2.1.0/src/LZGraphs/bag_of_words/bow_encoder.py +92 -11
  8. {lzgraphs-1.2.0/src/LZGraphs/Exceptions → lzgraphs-2.1.0/src/LZGraphs/exceptions}/__init__.py +1 -1
  9. lzgraphs-2.1.0/src/LZGraphs/graphs/__init__.py +6 -0
  10. lzgraphs-1.2.0/src/LZGraphs/Graphs/AminoAcidPositional.py → lzgraphs-2.1.0/src/LZGraphs/graphs/amino_acid_positional.py +59 -335
  11. lzgraphs-2.1.0/src/LZGraphs/graphs/edge_data.py +197 -0
  12. lzgraphs-2.1.0/src/LZGraphs/graphs/graph_operations.py +115 -0
  13. lzgraphs-1.2.0/src/LZGraphs/Graphs/LZGraphBase.py → lzgraphs-2.1.0/src/LZGraphs/graphs/lz_graph_base.py +646 -66
  14. lzgraphs-2.1.0/src/LZGraphs/graphs/naive.py +337 -0
  15. lzgraphs-1.2.0/src/LZGraphs/Graphs/NucleotideDoublePositional.py → lzgraphs-2.1.0/src/LZGraphs/graphs/nucleotide_double_positional.py +37 -266
  16. lzgraphs-2.1.0/src/LZGraphs/metrics/__init__.py +72 -0
  17. lzgraphs-2.1.0/src/LZGraphs/metrics/convenience.py +91 -0
  18. lzgraphs-1.2.0/src/LZGraphs/Metrics/Metrics.py → lzgraphs-2.1.0/src/LZGraphs/metrics/diversity.py +13 -2
  19. lzgraphs-2.1.0/src/LZGraphs/metrics/entropy.py +1007 -0
  20. lzgraphs-1.2.0/src/LZGraphs/Utilities/NodeEdgeSaturationProbe.py → lzgraphs-2.1.0/src/LZGraphs/metrics/saturation.py +15 -6
  21. lzgraphs-2.1.0/src/LZGraphs/mixins/__init__.py +3 -0
  22. lzgraphs-1.2.0/src/LZGraphs/Mixins/GeneLogicMixin.py → lzgraphs-2.1.0/src/LZGraphs/mixins/gene_logic.py +10 -44
  23. lzgraphs-1.2.0/src/LZGraphs/Mixins/GenePredictionMixin.py → lzgraphs-2.1.0/src/LZGraphs/mixins/gene_prediction.py +35 -86
  24. lzgraphs-1.2.0/src/LZGraphs/Mixins/RandomWalkMixin.py → lzgraphs-2.1.0/src/LZGraphs/mixins/random_walk.py +1 -1
  25. lzgraphs-2.1.0/src/LZGraphs/utilities/__init__.py +13 -0
  26. {lzgraphs-1.2.0/src/LZGraphs/Utilities → lzgraphs-2.1.0/src/LZGraphs/utilities}/decomposition.py +2 -0
  27. lzgraphs-2.1.0/src/LZGraphs/utilities/helpers.py +50 -0
  28. {lzgraphs-1.2.0/src/LZGraphs/Utilities → lzgraphs-2.1.0/src/LZGraphs/utilities}/misc.py +40 -13
  29. lzgraphs-2.1.0/src/LZGraphs/visualization/__init__.py +18 -0
  30. lzgraphs-1.2.0/src/LZGraphs/Visualization/Visualize.py → lzgraphs-2.1.0/src/LZGraphs/visualization/visualize.py +92 -54
  31. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/PKG-INFO +5 -11
  32. lzgraphs-2.1.0/src/LZGraphs.egg-info/SOURCES.txt +52 -0
  33. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/requires.txt +4 -2
  34. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_aap_lzgraph.py +73 -4
  35. lzgraphs-2.1.0/tests/test_base_class_methods.py +320 -0
  36. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_bow_encoder.py +88 -2
  37. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_diversity_theory.py +3 -3
  38. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_graph_operations.py +56 -9
  39. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_metrics.py +232 -2
  40. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_naive_lzgraph.py +20 -1
  41. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_ndp_lzgraph.py +1 -1
  42. lzgraphs-2.1.0/tests/test_new_features.py +396 -0
  43. lzgraphs-2.1.0/tests/test_pgen_fixes.py +225 -0
  44. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_serialization.py +1 -1
  45. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_utilities.py +57 -2
  46. lzgraphs-1.2.0/requirements.txt +0 -7
  47. lzgraphs-1.2.0/setup.py +0 -37
  48. lzgraphs-1.2.0/src/LZGraphs/BagOfWords/__init__.py +0 -0
  49. lzgraphs-1.2.0/src/LZGraphs/Graphs/Naive.py +0 -735
  50. lzgraphs-1.2.0/src/LZGraphs/Graphs/__init__.py +0 -0
  51. lzgraphs-1.2.0/src/LZGraphs/Metrics/__init__.py +0 -22
  52. lzgraphs-1.2.0/src/LZGraphs/Metrics/entropy.py +0 -477
  53. lzgraphs-1.2.0/src/LZGraphs/Mixins/__init__.py +0 -3
  54. lzgraphs-1.2.0/src/LZGraphs/Utilities/Utilities.py +0 -101
  55. lzgraphs-1.2.0/src/LZGraphs/Utilities/__init__.py +0 -1
  56. lzgraphs-1.2.0/src/LZGraphs/Utilities/graph_operations.py +0 -123
  57. lzgraphs-1.2.0/src/LZGraphs/Visualization/__init__.py +0 -0
  58. lzgraphs-1.2.0/src/LZGraphs/__init__.py +0 -50
  59. lzgraphs-1.2.0/src/LZGraphs.egg-info/SOURCES.txt +0 -48
  60. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/CONTRIBUTING.md +0 -0
  61. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/LICENSE +0 -0
  62. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/MANIFEST.in +0 -0
  63. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/README.md +0 -0
  64. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/setup.cfg +0 -0
  65. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs/py.typed +0 -0
  66. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
  67. {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/top_level.txt +0 -0
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
9
9
 
10
10
  ### Added
11
11
  - Custom exceptions module with comprehensive exception hierarchy for better error handling
12
- - Information-theoretic metrics module (`LZGraphs.Metrics.entropy`)
12
+ - Information-theoretic metrics module (`LZGraphs.metrics.entropy`)
13
13
  - `node_entropy()` - Shannon entropy of node probability distribution
14
14
  - `edge_entropy()` - Shannon entropy of edge transition probabilities
15
15
  - `graph_entropy()` - Combined graph entropy measure
@@ -1,10 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: LZGraphs
3
- Version: 1.2.0
3
+ Version: 2.1.0
4
4
  Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
5
- Home-page: https://github.com/MuteJester/LZGraphs
6
- Download-URL: https://github.com/MuteJester/LZGraphs/archive/refs/tags/Beta1.1.1.tar.gz
7
- Author: Thomas Konstantinovsky
8
5
  Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
9
6
  Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
10
7
  License: MIT
@@ -27,16 +24,17 @@ Classifier: Programming Language :: Python :: 3.11
27
24
  Classifier: Programming Language :: Python :: 3.12
28
25
  Classifier: Operating System :: OS Independent
29
26
  Classifier: Typing :: Typed
30
- Requires-Python: >=3.8, <4
27
+ Requires-Python: >=3.9
31
28
  Description-Content-Type: text/markdown
32
29
  License-File: LICENSE
33
30
  Requires-Dist: networkx>=3.0
34
31
  Requires-Dist: numpy>=1.24
35
32
  Requires-Dist: pandas>=1.5
36
33
  Requires-Dist: tqdm>=4.65
37
- Requires-Dist: matplotlib>=3.7
38
- Requires-Dist: seaborn>=0.12
39
34
  Requires-Dist: scipy>=1.10
35
+ Provides-Extra: viz
36
+ Requires-Dist: matplotlib>=3.7; extra == "viz"
37
+ Requires-Dist: seaborn>=0.12; extra == "viz"
40
38
  Provides-Extra: dev
41
39
  Requires-Dist: pytest>=7.0; extra == "dev"
42
40
  Requires-Dist: pytest-cov>=4.0; extra == "dev"
@@ -51,11 +49,7 @@ Requires-Dist: mkdocs>=1.5; extra == "docs"
51
49
  Requires-Dist: mkdocs-material>=9.0; extra == "docs"
52
50
  Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
53
51
  Requires-Dist: pymdown-extensions>=10.0; extra == "docs"
54
- Dynamic: author
55
- Dynamic: download-url
56
- Dynamic: home-page
57
52
  Dynamic: license-file
58
- Dynamic: requires-python
59
53
 
60
54
  <p align="center">
61
55
 
@@ -47,12 +47,14 @@ dependencies = [
47
47
  "numpy>=1.24",
48
48
  "pandas>=1.5",
49
49
  "tqdm>=4.65",
50
- "matplotlib>=3.7",
51
- "seaborn>=0.12",
52
50
  "scipy>=1.10",
53
51
  ]
54
52
 
55
53
  [project.optional-dependencies]
54
+ viz = [
55
+ "matplotlib>=3.7",
56
+ "seaborn>=0.12",
57
+ ]
56
58
  dev = [
57
59
  "pytest>=7.0",
58
60
  "pytest-cov>=4.0",
@@ -79,7 +81,7 @@ Changelog = "https://github.com/MuteJester/LZGraphs/blob/master/CHANGELOG.md"
79
81
 
80
82
  [tool.setuptools]
81
83
  package-dir = {"" = "src"}
82
- packages = ["LZGraphs", "LZGraphs.Graphs", "LZGraphs.Metrics", "LZGraphs.Utilities", "LZGraphs.Mixins", "LZGraphs.BagOfWords", "LZGraphs.Visualization", "LZGraphs.Exceptions"]
84
+ packages = ["LZGraphs", "LZGraphs.graphs", "LZGraphs.metrics", "LZGraphs.utilities", "LZGraphs.mixins", "LZGraphs.bag_of_words", "LZGraphs.visualization", "LZGraphs.exceptions"]
83
85
 
84
86
  [tool.setuptools.dynamic]
85
87
  version = {attr = "LZGraphs.__version__"}
@@ -0,0 +1,5 @@
1
+ networkx>=3.0
2
+ numpy>=1.24
3
+ pandas>=1.5
4
+ tqdm>=4.65
5
+ scipy>=1.10
@@ -0,0 +1,195 @@
1
+ __version__ = "2.1.0"
2
+
3
+ # =============================================================================
4
+ # Graph classes
5
+ # =============================================================================
6
+ from .graphs.amino_acid_positional import AAPLZGraph
7
+ from .graphs.nucleotide_double_positional import NDPLZGraph
8
+ from .graphs.naive import NaiveLZGraph
9
+
10
+ # =============================================================================
11
+ # Graph operations
12
+ # =============================================================================
13
+ from .graphs.graph_operations import graph_union
14
+
15
+ # =============================================================================
16
+ # Bag of Words
17
+ # =============================================================================
18
+ from .bag_of_words.bow_encoder import LZBOW
19
+
20
+ # =============================================================================
21
+ # Metrics - Diversity
22
+ # =============================================================================
23
+ from .metrics.diversity import (
24
+ LZCentrality,
25
+ K_Diversity,
26
+ K100_Diversity,
27
+ K500_Diversity,
28
+ K1000_Diversity,
29
+ K5000_Diversity,
30
+ adaptive_K_Diversity,
31
+ )
32
+
33
+ # =============================================================================
34
+ # Metrics - Entropy / Information Theory
35
+ # =============================================================================
36
+ from .metrics.entropy import (
37
+ node_entropy,
38
+ edge_entropy,
39
+ graph_entropy,
40
+ normalized_graph_entropy,
41
+ sequence_perplexity,
42
+ repertoire_perplexity,
43
+ jensen_shannon_divergence,
44
+ cross_entropy,
45
+ kl_divergence,
46
+ mutual_information_genes,
47
+ transition_predictability,
48
+ graph_compression_ratio,
49
+ repertoire_compressibility_index,
50
+ transition_kl_divergence,
51
+ transition_jsd,
52
+ transition_mutual_information_profile,
53
+ path_entropy_rate,
54
+ )
55
+
56
+ # =============================================================================
57
+ # Metrics - Saturation
58
+ # =============================================================================
59
+ from .metrics.saturation import NodeEdgeSaturationProbe
60
+
61
+ # =============================================================================
62
+ # Metrics - Convenience
63
+ # =============================================================================
64
+ from .metrics.convenience import compare_repertoires
65
+
66
+ # =============================================================================
67
+ # Utilities
68
+ # =============================================================================
69
+ from .utilities.helpers import generate_kmer_dictionary
70
+ from .utilities.decomposition import lempel_ziv_decomposition
71
+
72
+ # =============================================================================
73
+ # Visualization (optional dependency)
74
+ # =============================================================================
75
+ try:
76
+ from .visualization.visualize import (
77
+ sequence_genomic_edges_variability_plot,
78
+ sequence_genomic_node_variability_plot,
79
+ sequence_possible_paths_plot,
80
+ ancestors_descendants_curves_plot,
81
+ draw_graph,
82
+ )
83
+ except ImportError:
84
+ pass # Visualization features not available without matplotlib/seaborn
85
+
86
+ # =============================================================================
87
+ # Exceptions
88
+ # =============================================================================
89
+ from .exceptions import (
90
+ # Base
91
+ LZGraphError,
92
+ # Input validation
93
+ InputValidationError,
94
+ EmptyDataError,
95
+ MissingColumnError,
96
+ InvalidSequenceError,
97
+ InvalidProbabilityError,
98
+ # Graph construction
99
+ GraphConstructionError,
100
+ EncodingError,
101
+ # Gene data
102
+ GeneDataError,
103
+ NoGeneDataError,
104
+ GeneAnnotationError,
105
+ # Walk/probability
106
+ WalkError,
107
+ NoValidPathError,
108
+ MissingNodeError,
109
+ MissingEdgeError,
110
+ # Serialization
111
+ SerializationError,
112
+ UnsupportedFormatError,
113
+ CorruptedFileError,
114
+ # BOW
115
+ BOWError,
116
+ EncodingFunctionMismatchError,
117
+ UnfittedBOWError,
118
+ # Graph operations
119
+ GraphOperationError,
120
+ IncompatibleGraphsError,
121
+ # Metrics
122
+ MetricsError,
123
+ InsufficientDataError,
124
+ )
125
+
126
+
127
+ __all__ = [
128
+ # Graph classes
129
+ 'AAPLZGraph',
130
+ 'NDPLZGraph',
131
+ 'NaiveLZGraph',
132
+ # Graph operations
133
+ 'graph_union',
134
+ # Bag of Words
135
+ 'LZBOW',
136
+ # Diversity metrics
137
+ 'LZCentrality',
138
+ 'K_Diversity',
139
+ 'K100_Diversity',
140
+ 'K500_Diversity',
141
+ 'K1000_Diversity',
142
+ 'K5000_Diversity',
143
+ 'adaptive_K_Diversity',
144
+ # Entropy metrics
145
+ 'node_entropy',
146
+ 'edge_entropy',
147
+ 'graph_entropy',
148
+ 'normalized_graph_entropy',
149
+ 'sequence_perplexity',
150
+ 'repertoire_perplexity',
151
+ 'jensen_shannon_divergence',
152
+ 'cross_entropy',
153
+ 'kl_divergence',
154
+ 'mutual_information_genes',
155
+ 'transition_predictability',
156
+ 'graph_compression_ratio',
157
+ 'repertoire_compressibility_index',
158
+ 'transition_kl_divergence',
159
+ 'transition_jsd',
160
+ 'transition_mutual_information_profile',
161
+ 'path_entropy_rate',
162
+ # Saturation
163
+ 'NodeEdgeSaturationProbe',
164
+ # Convenience
165
+ 'compare_repertoires',
166
+ # Utilities
167
+ 'generate_kmer_dictionary',
168
+ 'lempel_ziv_decomposition',
169
+ # Exceptions
170
+ 'LZGraphError',
171
+ 'InputValidationError',
172
+ 'EmptyDataError',
173
+ 'MissingColumnError',
174
+ 'InvalidSequenceError',
175
+ 'InvalidProbabilityError',
176
+ 'GraphConstructionError',
177
+ 'EncodingError',
178
+ 'GeneDataError',
179
+ 'NoGeneDataError',
180
+ 'GeneAnnotationError',
181
+ 'WalkError',
182
+ 'NoValidPathError',
183
+ 'MissingNodeError',
184
+ 'MissingEdgeError',
185
+ 'SerializationError',
186
+ 'UnsupportedFormatError',
187
+ 'CorruptedFileError',
188
+ 'BOWError',
189
+ 'EncodingFunctionMismatchError',
190
+ 'UnfittedBOWError',
191
+ 'GraphOperationError',
192
+ 'IncompatibleGraphsError',
193
+ 'MetricsError',
194
+ 'InsufficientDataError',
195
+ ]
@@ -0,0 +1,3 @@
1
+ from .bow_encoder import LZBOW
2
+
3
+ __all__ = ['LZBOW']
@@ -3,8 +3,10 @@ from collections.abc import Iterable
3
3
  import numpy as np
4
4
  from tqdm.auto import tqdm
5
5
 
6
- from ..Utilities.decomposition import lempel_ziv_decomposition
7
- from ..Exceptions import EncodingFunctionMismatchError
6
+ from ..utilities.decomposition import lempel_ziv_decomposition
7
+ from ..exceptions import EncodingFunctionMismatchError
8
+
9
+ __all__ = ["LZBOW"]
8
10
 
9
11
 
10
12
  class LZBOW:
@@ -44,13 +46,17 @@ class LZBOW:
44
46
  self.dictionary_index_map = dict()
45
47
  self.dictionary_index_inverse_map = dict()
46
48
 
49
+ def __repr__(self):
50
+ return (f"LZBOW(dictionary_size={self.dictionary_size}, "
51
+ f"observed_sequences={self.observed_sequences})")
52
+
47
53
  def _derive_index_maps(self):
48
54
  self.dictionary_index_map = {pattern: idx for idx, pattern in enumerate(self.dictionary)}
49
55
  self.dictionary_index_inverse_map = {idx: pattern for idx, pattern in enumerate(self.dictionary)}
50
56
  self.dictionary_size = len(self.dictionary)
51
57
 
52
58
  def fit(self, data):
53
- if type(data) == str:
59
+ if isinstance(data, str):
54
60
  encoded = self.encoding_function(data)
55
61
  self.dictionary = self.dictionary | set(encoded)
56
62
  self._derive_index_maps()
@@ -66,18 +72,29 @@ class LZBOW:
66
72
  encoded = self.encoding_function(seq)
67
73
  return [self.dictionary_index_map[i] for i in encoded if i in self.dictionary]
68
74
 
69
- def transform(self, data, normalize=False):
70
- if type(data) == str:
75
+ def transform(self, data, normalize=False, per_sequence=False):
76
+ if isinstance(data, str):
71
77
  result = np.zeros(self.dictionary_size)
72
78
  result[self._seq_to_index(data)] += 1
73
79
  return result
74
80
  elif isinstance(data, Iterable):
75
- result = np.zeros(self.dictionary_size)
76
- for seq in tqdm(data, leave=False, position=0):
77
- result[self._seq_to_index(seq)] += 1
78
- if normalize:
79
- return result / result.sum()
81
+ if per_sequence:
82
+ data_list = list(data)
83
+ matrix = np.zeros((len(data_list), self.dictionary_size))
84
+ for i, seq in enumerate(tqdm(data_list, leave=False, position=0)):
85
+ matrix[i, self._seq_to_index(seq)] += 1
86
+ if normalize:
87
+ row_sums = matrix.sum(axis=1, keepdims=True)
88
+ row_sums[row_sums == 0] = 1 # avoid division by zero
89
+ return matrix / row_sums
90
+ return matrix
80
91
  else:
92
+ result = np.zeros(self.dictionary_size)
93
+ for seq in tqdm(data, leave=False, position=0):
94
+ result[self._seq_to_index(seq)] += 1
95
+ if normalize:
96
+ total = result.sum()
97
+ return result / total if total > 0 else result
81
98
  return result
82
99
 
83
100
  def load_from(self, other):
@@ -100,5 +117,69 @@ class LZBOW:
100
117
  union.observed_sequences = self.observed_sequences + other.observed_sequences
101
118
  union.dictionary_index_map = {pattern: idx for idx, pattern in enumerate(union.dictionary)}
102
119
  union.dictionary_index_inverse_map = {idx: pattern for idx, pattern in enumerate(union.dictionary)}
103
- union.dictionary_size = len(self.dictionary)
120
+ union.dictionary_size = len(union.dictionary)
104
121
  return union
122
+
123
+ def fit_transform(self, data, normalize=False, per_sequence=False):
124
+ """
125
+ Fit the encoder on data and transform it in one step.
126
+
127
+ Equivalent to calling fit(data) followed by transform(data), but
128
+ avoids processing the data twice for fitting.
129
+
130
+ Args:
131
+ data: A string (single sequence) or iterable of strings.
132
+ normalize (bool): If True, normalize the output vectors.
133
+ per_sequence (bool): If True and data is iterable, return a
134
+ 2D matrix (n_sequences x dictionary_size).
135
+
136
+ Returns:
137
+ np.ndarray: BOW vector(s) for the input data.
138
+
139
+ Example:
140
+ >>> bow = LZBOW()
141
+ >>> matrix = bow.fit_transform(sequences, per_sequence=True)
142
+ """
143
+ self.fit(data)
144
+ return self.transform(data, normalize=normalize, per_sequence=per_sequence)
145
+
146
+ def tfidf_transform(self, data):
147
+ """
148
+ Transform sequences into TF-IDF weighted bag-of-words vectors.
149
+
150
+ TF-IDF (Term Frequency - Inverse Document Frequency) weights
151
+ down-weight subpatterns that appear in many sequences and up-weight
152
+ those that are more discriminative.
153
+
154
+ The encoder must be fitted before calling this method.
155
+
156
+ Args:
157
+ data: An iterable of sequence strings.
158
+
159
+ Returns:
160
+ np.ndarray: 2D matrix (n_sequences x dictionary_size) with TF-IDF weights.
161
+
162
+ Example:
163
+ >>> bow = LZBOW()
164
+ >>> bow.fit(train_sequences)
165
+ >>> tfidf_matrix = bow.tfidf_transform(test_sequences)
166
+ """
167
+ # Get per-sequence term frequency matrix
168
+ tf_matrix = self.transform(data, per_sequence=True)
169
+
170
+ n_docs = tf_matrix.shape[0]
171
+ if n_docs == 0:
172
+ return tf_matrix
173
+
174
+ # Compute document frequency: number of sequences containing each term
175
+ doc_freq = np.count_nonzero(tf_matrix, axis=0).astype(np.float64)
176
+
177
+ # IDF = log(1 + N / (1 + df)), smoothed variant that's always non-negative
178
+ idf = np.log1p(n_docs / (1.0 + doc_freq))
179
+
180
+ # Normalize TF per row (L1 normalization)
181
+ row_sums = tf_matrix.sum(axis=1, keepdims=True)
182
+ row_sums[row_sums == 0] = 1 # avoid division by zero
183
+ tf_normalized = tf_matrix / row_sums
184
+
185
+ return tf_normalized * idf
@@ -27,7 +27,7 @@ Exception Hierarchy:
27
27
  └── IncompatibleGraphsError
28
28
 
29
29
  Example:
30
- >>> from LZGraphs.Exceptions import NoGeneDataError, InvalidSequenceError
30
+ >>> from LZGraphs.exceptions import NoGeneDataError, InvalidSequenceError
31
31
  >>> try:
32
32
  ... graph.genomic_random_walk()
33
33
  ... except NoGeneDataError as e:
@@ -0,0 +1,6 @@
1
+ from .amino_acid_positional import AAPLZGraph
2
+ from .nucleotide_double_positional import NDPLZGraph
3
+ from .naive import NaiveLZGraph
4
+ from .graph_operations import graph_union
5
+
6
+ __all__ = ['AAPLZGraph', 'NDPLZGraph', 'NaiveLZGraph', 'graph_union']