LZGraphs 1.2.0__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/CHANGELOG.md +1 -1
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/PKG-INFO +5 -11
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/pyproject.toml +5 -3
- lzgraphs-2.1.0/requirements.txt +5 -0
- lzgraphs-2.1.0/src/LZGraphs/__init__.py +195 -0
- lzgraphs-2.1.0/src/LZGraphs/bag_of_words/__init__.py +3 -0
- lzgraphs-1.2.0/src/LZGraphs/BagOfWords/BOWEncoder.py → lzgraphs-2.1.0/src/LZGraphs/bag_of_words/bow_encoder.py +92 -11
- {lzgraphs-1.2.0/src/LZGraphs/Exceptions → lzgraphs-2.1.0/src/LZGraphs/exceptions}/__init__.py +1 -1
- lzgraphs-2.1.0/src/LZGraphs/graphs/__init__.py +6 -0
- lzgraphs-1.2.0/src/LZGraphs/Graphs/AminoAcidPositional.py → lzgraphs-2.1.0/src/LZGraphs/graphs/amino_acid_positional.py +59 -335
- lzgraphs-2.1.0/src/LZGraphs/graphs/edge_data.py +197 -0
- lzgraphs-2.1.0/src/LZGraphs/graphs/graph_operations.py +115 -0
- lzgraphs-1.2.0/src/LZGraphs/Graphs/LZGraphBase.py → lzgraphs-2.1.0/src/LZGraphs/graphs/lz_graph_base.py +646 -66
- lzgraphs-2.1.0/src/LZGraphs/graphs/naive.py +337 -0
- lzgraphs-1.2.0/src/LZGraphs/Graphs/NucleotideDoublePositional.py → lzgraphs-2.1.0/src/LZGraphs/graphs/nucleotide_double_positional.py +37 -266
- lzgraphs-2.1.0/src/LZGraphs/metrics/__init__.py +72 -0
- lzgraphs-2.1.0/src/LZGraphs/metrics/convenience.py +91 -0
- lzgraphs-1.2.0/src/LZGraphs/Metrics/Metrics.py → lzgraphs-2.1.0/src/LZGraphs/metrics/diversity.py +13 -2
- lzgraphs-2.1.0/src/LZGraphs/metrics/entropy.py +1007 -0
- lzgraphs-1.2.0/src/LZGraphs/Utilities/NodeEdgeSaturationProbe.py → lzgraphs-2.1.0/src/LZGraphs/metrics/saturation.py +15 -6
- lzgraphs-2.1.0/src/LZGraphs/mixins/__init__.py +3 -0
- lzgraphs-1.2.0/src/LZGraphs/Mixins/GeneLogicMixin.py → lzgraphs-2.1.0/src/LZGraphs/mixins/gene_logic.py +10 -44
- lzgraphs-1.2.0/src/LZGraphs/Mixins/GenePredictionMixin.py → lzgraphs-2.1.0/src/LZGraphs/mixins/gene_prediction.py +35 -86
- lzgraphs-1.2.0/src/LZGraphs/Mixins/RandomWalkMixin.py → lzgraphs-2.1.0/src/LZGraphs/mixins/random_walk.py +1 -1
- lzgraphs-2.1.0/src/LZGraphs/utilities/__init__.py +13 -0
- {lzgraphs-1.2.0/src/LZGraphs/Utilities → lzgraphs-2.1.0/src/LZGraphs/utilities}/decomposition.py +2 -0
- lzgraphs-2.1.0/src/LZGraphs/utilities/helpers.py +50 -0
- {lzgraphs-1.2.0/src/LZGraphs/Utilities → lzgraphs-2.1.0/src/LZGraphs/utilities}/misc.py +40 -13
- lzgraphs-2.1.0/src/LZGraphs/visualization/__init__.py +18 -0
- lzgraphs-1.2.0/src/LZGraphs/Visualization/Visualize.py → lzgraphs-2.1.0/src/LZGraphs/visualization/visualize.py +92 -54
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/PKG-INFO +5 -11
- lzgraphs-2.1.0/src/LZGraphs.egg-info/SOURCES.txt +52 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/requires.txt +4 -2
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_aap_lzgraph.py +73 -4
- lzgraphs-2.1.0/tests/test_base_class_methods.py +320 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_bow_encoder.py +88 -2
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_diversity_theory.py +3 -3
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_graph_operations.py +56 -9
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_metrics.py +232 -2
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_naive_lzgraph.py +20 -1
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_ndp_lzgraph.py +1 -1
- lzgraphs-2.1.0/tests/test_new_features.py +396 -0
- lzgraphs-2.1.0/tests/test_pgen_fixes.py +225 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_serialization.py +1 -1
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/tests/test_utilities.py +57 -2
- lzgraphs-1.2.0/requirements.txt +0 -7
- lzgraphs-1.2.0/setup.py +0 -37
- lzgraphs-1.2.0/src/LZGraphs/BagOfWords/__init__.py +0 -0
- lzgraphs-1.2.0/src/LZGraphs/Graphs/Naive.py +0 -735
- lzgraphs-1.2.0/src/LZGraphs/Graphs/__init__.py +0 -0
- lzgraphs-1.2.0/src/LZGraphs/Metrics/__init__.py +0 -22
- lzgraphs-1.2.0/src/LZGraphs/Metrics/entropy.py +0 -477
- lzgraphs-1.2.0/src/LZGraphs/Mixins/__init__.py +0 -3
- lzgraphs-1.2.0/src/LZGraphs/Utilities/Utilities.py +0 -101
- lzgraphs-1.2.0/src/LZGraphs/Utilities/__init__.py +0 -1
- lzgraphs-1.2.0/src/LZGraphs/Utilities/graph_operations.py +0 -123
- lzgraphs-1.2.0/src/LZGraphs/Visualization/__init__.py +0 -0
- lzgraphs-1.2.0/src/LZGraphs/__init__.py +0 -50
- lzgraphs-1.2.0/src/LZGraphs.egg-info/SOURCES.txt +0 -48
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/CONTRIBUTING.md +0 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/LICENSE +0 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/MANIFEST.in +0 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/README.md +0 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/setup.cfg +0 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs/py.typed +0 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
- {lzgraphs-1.2.0 → lzgraphs-2.1.0}/src/LZGraphs.egg-info/top_level.txt +0 -0
|
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
9
9
|
|
|
10
10
|
### Added
|
|
11
11
|
- Custom exceptions module with comprehensive exception hierarchy for better error handling
|
|
12
|
-
- Information-theoretic metrics module (`LZGraphs.Metrics.entropy`)
|
|
12
|
+
- Information-theoretic metrics module (`LZGraphs.metrics.entropy`)
|
|
13
13
|
- `node_entropy()` - Shannon entropy of node probability distribution
|
|
14
14
|
- `edge_entropy()` - Shannon entropy of edge transition probabilities
|
|
15
15
|
- `graph_entropy()` - Combined graph entropy measure
|
|
@@ -1,10 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: LZGraphs
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: An Implementation of LZ76 Based Graphs for Repertoire Representation and Analysis
|
|
5
|
-
Home-page: https://github.com/MuteJester/LZGraphs
|
|
6
|
-
Download-URL: https://github.com/MuteJester/LZGraphs/archive/refs/tags/Beta1.1.1.tar.gz
|
|
7
|
-
Author: Thomas Konstantinovsky
|
|
8
5
|
Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
9
6
|
Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
10
7
|
License: MIT
|
|
@@ -27,16 +24,17 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
27
24
|
Classifier: Programming Language :: Python :: 3.12
|
|
28
25
|
Classifier: Operating System :: OS Independent
|
|
29
26
|
Classifier: Typing :: Typed
|
|
30
|
-
Requires-Python: >=3.
|
|
27
|
+
Requires-Python: >=3.9
|
|
31
28
|
Description-Content-Type: text/markdown
|
|
32
29
|
License-File: LICENSE
|
|
33
30
|
Requires-Dist: networkx>=3.0
|
|
34
31
|
Requires-Dist: numpy>=1.24
|
|
35
32
|
Requires-Dist: pandas>=1.5
|
|
36
33
|
Requires-Dist: tqdm>=4.65
|
|
37
|
-
Requires-Dist: matplotlib>=3.7
|
|
38
|
-
Requires-Dist: seaborn>=0.12
|
|
39
34
|
Requires-Dist: scipy>=1.10
|
|
35
|
+
Provides-Extra: viz
|
|
36
|
+
Requires-Dist: matplotlib>=3.7; extra == "viz"
|
|
37
|
+
Requires-Dist: seaborn>=0.12; extra == "viz"
|
|
40
38
|
Provides-Extra: dev
|
|
41
39
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
42
40
|
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
@@ -51,11 +49,7 @@ Requires-Dist: mkdocs>=1.5; extra == "docs"
|
|
|
51
49
|
Requires-Dist: mkdocs-material>=9.0; extra == "docs"
|
|
52
50
|
Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
|
|
53
51
|
Requires-Dist: pymdown-extensions>=10.0; extra == "docs"
|
|
54
|
-
Dynamic: author
|
|
55
|
-
Dynamic: download-url
|
|
56
|
-
Dynamic: home-page
|
|
57
52
|
Dynamic: license-file
|
|
58
|
-
Dynamic: requires-python
|
|
59
53
|
|
|
60
54
|
<p align="center">
|
|
61
55
|
|
|
@@ -47,12 +47,14 @@ dependencies = [
|
|
|
47
47
|
"numpy>=1.24",
|
|
48
48
|
"pandas>=1.5",
|
|
49
49
|
"tqdm>=4.65",
|
|
50
|
-
"matplotlib>=3.7",
|
|
51
|
-
"seaborn>=0.12",
|
|
52
50
|
"scipy>=1.10",
|
|
53
51
|
]
|
|
54
52
|
|
|
55
53
|
[project.optional-dependencies]
|
|
54
|
+
viz = [
|
|
55
|
+
"matplotlib>=3.7",
|
|
56
|
+
"seaborn>=0.12",
|
|
57
|
+
]
|
|
56
58
|
dev = [
|
|
57
59
|
"pytest>=7.0",
|
|
58
60
|
"pytest-cov>=4.0",
|
|
@@ -79,7 +81,7 @@ Changelog = "https://github.com/MuteJester/LZGraphs/blob/master/CHANGELOG.md"
|
|
|
79
81
|
|
|
80
82
|
[tool.setuptools]
|
|
81
83
|
package-dir = {"" = "src"}
|
|
82
|
-
packages = ["LZGraphs", "LZGraphs.
|
|
84
|
+
packages = ["LZGraphs", "LZGraphs.graphs", "LZGraphs.metrics", "LZGraphs.utilities", "LZGraphs.mixins", "LZGraphs.bag_of_words", "LZGraphs.visualization", "LZGraphs.exceptions"]
|
|
83
85
|
|
|
84
86
|
[tool.setuptools.dynamic]
|
|
85
87
|
version = {attr = "LZGraphs.__version__"}
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
__version__ = "2.1.0"
|
|
2
|
+
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Graph classes
|
|
5
|
+
# =============================================================================
|
|
6
|
+
from .graphs.amino_acid_positional import AAPLZGraph
|
|
7
|
+
from .graphs.nucleotide_double_positional import NDPLZGraph
|
|
8
|
+
from .graphs.naive import NaiveLZGraph
|
|
9
|
+
|
|
10
|
+
# =============================================================================
|
|
11
|
+
# Graph operations
|
|
12
|
+
# =============================================================================
|
|
13
|
+
from .graphs.graph_operations import graph_union
|
|
14
|
+
|
|
15
|
+
# =============================================================================
|
|
16
|
+
# Bag of Words
|
|
17
|
+
# =============================================================================
|
|
18
|
+
from .bag_of_words.bow_encoder import LZBOW
|
|
19
|
+
|
|
20
|
+
# =============================================================================
|
|
21
|
+
# Metrics - Diversity
|
|
22
|
+
# =============================================================================
|
|
23
|
+
from .metrics.diversity import (
|
|
24
|
+
LZCentrality,
|
|
25
|
+
K_Diversity,
|
|
26
|
+
K100_Diversity,
|
|
27
|
+
K500_Diversity,
|
|
28
|
+
K1000_Diversity,
|
|
29
|
+
K5000_Diversity,
|
|
30
|
+
adaptive_K_Diversity,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# =============================================================================
|
|
34
|
+
# Metrics - Entropy / Information Theory
|
|
35
|
+
# =============================================================================
|
|
36
|
+
from .metrics.entropy import (
|
|
37
|
+
node_entropy,
|
|
38
|
+
edge_entropy,
|
|
39
|
+
graph_entropy,
|
|
40
|
+
normalized_graph_entropy,
|
|
41
|
+
sequence_perplexity,
|
|
42
|
+
repertoire_perplexity,
|
|
43
|
+
jensen_shannon_divergence,
|
|
44
|
+
cross_entropy,
|
|
45
|
+
kl_divergence,
|
|
46
|
+
mutual_information_genes,
|
|
47
|
+
transition_predictability,
|
|
48
|
+
graph_compression_ratio,
|
|
49
|
+
repertoire_compressibility_index,
|
|
50
|
+
transition_kl_divergence,
|
|
51
|
+
transition_jsd,
|
|
52
|
+
transition_mutual_information_profile,
|
|
53
|
+
path_entropy_rate,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# =============================================================================
|
|
57
|
+
# Metrics - Saturation
|
|
58
|
+
# =============================================================================
|
|
59
|
+
from .metrics.saturation import NodeEdgeSaturationProbe
|
|
60
|
+
|
|
61
|
+
# =============================================================================
|
|
62
|
+
# Metrics - Convenience
|
|
63
|
+
# =============================================================================
|
|
64
|
+
from .metrics.convenience import compare_repertoires
|
|
65
|
+
|
|
66
|
+
# =============================================================================
|
|
67
|
+
# Utilities
|
|
68
|
+
# =============================================================================
|
|
69
|
+
from .utilities.helpers import generate_kmer_dictionary
|
|
70
|
+
from .utilities.decomposition import lempel_ziv_decomposition
|
|
71
|
+
|
|
72
|
+
# =============================================================================
|
|
73
|
+
# Visualization (optional dependency)
|
|
74
|
+
# =============================================================================
|
|
75
|
+
try:
|
|
76
|
+
from .visualization.visualize import (
|
|
77
|
+
sequence_genomic_edges_variability_plot,
|
|
78
|
+
sequence_genomic_node_variability_plot,
|
|
79
|
+
sequence_possible_paths_plot,
|
|
80
|
+
ancestors_descendants_curves_plot,
|
|
81
|
+
draw_graph,
|
|
82
|
+
)
|
|
83
|
+
except ImportError:
|
|
84
|
+
pass # Visualization features not available without matplotlib/seaborn
|
|
85
|
+
|
|
86
|
+
# =============================================================================
|
|
87
|
+
# Exceptions
|
|
88
|
+
# =============================================================================
|
|
89
|
+
from .exceptions import (
|
|
90
|
+
# Base
|
|
91
|
+
LZGraphError,
|
|
92
|
+
# Input validation
|
|
93
|
+
InputValidationError,
|
|
94
|
+
EmptyDataError,
|
|
95
|
+
MissingColumnError,
|
|
96
|
+
InvalidSequenceError,
|
|
97
|
+
InvalidProbabilityError,
|
|
98
|
+
# Graph construction
|
|
99
|
+
GraphConstructionError,
|
|
100
|
+
EncodingError,
|
|
101
|
+
# Gene data
|
|
102
|
+
GeneDataError,
|
|
103
|
+
NoGeneDataError,
|
|
104
|
+
GeneAnnotationError,
|
|
105
|
+
# Walk/probability
|
|
106
|
+
WalkError,
|
|
107
|
+
NoValidPathError,
|
|
108
|
+
MissingNodeError,
|
|
109
|
+
MissingEdgeError,
|
|
110
|
+
# Serialization
|
|
111
|
+
SerializationError,
|
|
112
|
+
UnsupportedFormatError,
|
|
113
|
+
CorruptedFileError,
|
|
114
|
+
# BOW
|
|
115
|
+
BOWError,
|
|
116
|
+
EncodingFunctionMismatchError,
|
|
117
|
+
UnfittedBOWError,
|
|
118
|
+
# Graph operations
|
|
119
|
+
GraphOperationError,
|
|
120
|
+
IncompatibleGraphsError,
|
|
121
|
+
# Metrics
|
|
122
|
+
MetricsError,
|
|
123
|
+
InsufficientDataError,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
__all__ = [
|
|
128
|
+
# Graph classes
|
|
129
|
+
'AAPLZGraph',
|
|
130
|
+
'NDPLZGraph',
|
|
131
|
+
'NaiveLZGraph',
|
|
132
|
+
# Graph operations
|
|
133
|
+
'graph_union',
|
|
134
|
+
# Bag of Words
|
|
135
|
+
'LZBOW',
|
|
136
|
+
# Diversity metrics
|
|
137
|
+
'LZCentrality',
|
|
138
|
+
'K_Diversity',
|
|
139
|
+
'K100_Diversity',
|
|
140
|
+
'K500_Diversity',
|
|
141
|
+
'K1000_Diversity',
|
|
142
|
+
'K5000_Diversity',
|
|
143
|
+
'adaptive_K_Diversity',
|
|
144
|
+
# Entropy metrics
|
|
145
|
+
'node_entropy',
|
|
146
|
+
'edge_entropy',
|
|
147
|
+
'graph_entropy',
|
|
148
|
+
'normalized_graph_entropy',
|
|
149
|
+
'sequence_perplexity',
|
|
150
|
+
'repertoire_perplexity',
|
|
151
|
+
'jensen_shannon_divergence',
|
|
152
|
+
'cross_entropy',
|
|
153
|
+
'kl_divergence',
|
|
154
|
+
'mutual_information_genes',
|
|
155
|
+
'transition_predictability',
|
|
156
|
+
'graph_compression_ratio',
|
|
157
|
+
'repertoire_compressibility_index',
|
|
158
|
+
'transition_kl_divergence',
|
|
159
|
+
'transition_jsd',
|
|
160
|
+
'transition_mutual_information_profile',
|
|
161
|
+
'path_entropy_rate',
|
|
162
|
+
# Saturation
|
|
163
|
+
'NodeEdgeSaturationProbe',
|
|
164
|
+
# Convenience
|
|
165
|
+
'compare_repertoires',
|
|
166
|
+
# Utilities
|
|
167
|
+
'generate_kmer_dictionary',
|
|
168
|
+
'lempel_ziv_decomposition',
|
|
169
|
+
# Exceptions
|
|
170
|
+
'LZGraphError',
|
|
171
|
+
'InputValidationError',
|
|
172
|
+
'EmptyDataError',
|
|
173
|
+
'MissingColumnError',
|
|
174
|
+
'InvalidSequenceError',
|
|
175
|
+
'InvalidProbabilityError',
|
|
176
|
+
'GraphConstructionError',
|
|
177
|
+
'EncodingError',
|
|
178
|
+
'GeneDataError',
|
|
179
|
+
'NoGeneDataError',
|
|
180
|
+
'GeneAnnotationError',
|
|
181
|
+
'WalkError',
|
|
182
|
+
'NoValidPathError',
|
|
183
|
+
'MissingNodeError',
|
|
184
|
+
'MissingEdgeError',
|
|
185
|
+
'SerializationError',
|
|
186
|
+
'UnsupportedFormatError',
|
|
187
|
+
'CorruptedFileError',
|
|
188
|
+
'BOWError',
|
|
189
|
+
'EncodingFunctionMismatchError',
|
|
190
|
+
'UnfittedBOWError',
|
|
191
|
+
'GraphOperationError',
|
|
192
|
+
'IncompatibleGraphsError',
|
|
193
|
+
'MetricsError',
|
|
194
|
+
'InsufficientDataError',
|
|
195
|
+
]
|
|
@@ -3,8 +3,10 @@ from collections.abc import Iterable
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
from tqdm.auto import tqdm
|
|
5
5
|
|
|
6
|
-
from ..
|
|
7
|
-
from ..
|
|
6
|
+
from ..utilities.decomposition import lempel_ziv_decomposition
|
|
7
|
+
from ..exceptions import EncodingFunctionMismatchError
|
|
8
|
+
|
|
9
|
+
__all__ = ["LZBOW"]
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class LZBOW:
|
|
@@ -44,13 +46,17 @@ class LZBOW:
|
|
|
44
46
|
self.dictionary_index_map = dict()
|
|
45
47
|
self.dictionary_index_inverse_map = dict()
|
|
46
48
|
|
|
49
|
+
def __repr__(self):
|
|
50
|
+
return (f"LZBOW(dictionary_size={self.dictionary_size}, "
|
|
51
|
+
f"observed_sequences={self.observed_sequences})")
|
|
52
|
+
|
|
47
53
|
def _derive_index_maps(self):
|
|
48
54
|
self.dictionary_index_map = {pattern: idx for idx, pattern in enumerate(self.dictionary)}
|
|
49
55
|
self.dictionary_index_inverse_map = {idx: pattern for idx, pattern in enumerate(self.dictionary)}
|
|
50
56
|
self.dictionary_size = len(self.dictionary)
|
|
51
57
|
|
|
52
58
|
def fit(self, data):
|
|
53
|
-
if
|
|
59
|
+
if isinstance(data, str):
|
|
54
60
|
encoded = self.encoding_function(data)
|
|
55
61
|
self.dictionary = self.dictionary | set(encoded)
|
|
56
62
|
self._derive_index_maps()
|
|
@@ -66,18 +72,29 @@ class LZBOW:
|
|
|
66
72
|
encoded = self.encoding_function(seq)
|
|
67
73
|
return [self.dictionary_index_map[i] for i in encoded if i in self.dictionary]
|
|
68
74
|
|
|
69
|
-
def transform(self, data, normalize=False):
|
|
70
|
-
if
|
|
75
|
+
def transform(self, data, normalize=False, per_sequence=False):
|
|
76
|
+
if isinstance(data, str):
|
|
71
77
|
result = np.zeros(self.dictionary_size)
|
|
72
78
|
result[self._seq_to_index(data)] += 1
|
|
73
79
|
return result
|
|
74
80
|
elif isinstance(data, Iterable):
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
81
|
+
if per_sequence:
|
|
82
|
+
data_list = list(data)
|
|
83
|
+
matrix = np.zeros((len(data_list), self.dictionary_size))
|
|
84
|
+
for i, seq in enumerate(tqdm(data_list, leave=False, position=0)):
|
|
85
|
+
matrix[i, self._seq_to_index(seq)] += 1
|
|
86
|
+
if normalize:
|
|
87
|
+
row_sums = matrix.sum(axis=1, keepdims=True)
|
|
88
|
+
row_sums[row_sums == 0] = 1 # avoid division by zero
|
|
89
|
+
return matrix / row_sums
|
|
90
|
+
return matrix
|
|
80
91
|
else:
|
|
92
|
+
result = np.zeros(self.dictionary_size)
|
|
93
|
+
for seq in tqdm(data, leave=False, position=0):
|
|
94
|
+
result[self._seq_to_index(seq)] += 1
|
|
95
|
+
if normalize:
|
|
96
|
+
total = result.sum()
|
|
97
|
+
return result / total if total > 0 else result
|
|
81
98
|
return result
|
|
82
99
|
|
|
83
100
|
def load_from(self, other):
|
|
@@ -100,5 +117,69 @@ class LZBOW:
|
|
|
100
117
|
union.observed_sequences = self.observed_sequences + other.observed_sequences
|
|
101
118
|
union.dictionary_index_map = {pattern: idx for idx, pattern in enumerate(union.dictionary)}
|
|
102
119
|
union.dictionary_index_inverse_map = {idx: pattern for idx, pattern in enumerate(union.dictionary)}
|
|
103
|
-
union.dictionary_size = len(
|
|
120
|
+
union.dictionary_size = len(union.dictionary)
|
|
104
121
|
return union
|
|
122
|
+
|
|
123
|
+
def fit_transform(self, data, normalize=False, per_sequence=False):
|
|
124
|
+
"""
|
|
125
|
+
Fit the encoder on data and transform it in one step.
|
|
126
|
+
|
|
127
|
+
Equivalent to calling fit(data) followed by transform(data), but
|
|
128
|
+
avoids processing the data twice for fitting.
|
|
129
|
+
|
|
130
|
+
Args:
|
|
131
|
+
data: A string (single sequence) or iterable of strings.
|
|
132
|
+
normalize (bool): If True, normalize the output vectors.
|
|
133
|
+
per_sequence (bool): If True and data is iterable, return a
|
|
134
|
+
2D matrix (n_sequences x dictionary_size).
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
np.ndarray: BOW vector(s) for the input data.
|
|
138
|
+
|
|
139
|
+
Example:
|
|
140
|
+
>>> bow = LZBOW()
|
|
141
|
+
>>> matrix = bow.fit_transform(sequences, per_sequence=True)
|
|
142
|
+
"""
|
|
143
|
+
self.fit(data)
|
|
144
|
+
return self.transform(data, normalize=normalize, per_sequence=per_sequence)
|
|
145
|
+
|
|
146
|
+
def tfidf_transform(self, data):
|
|
147
|
+
"""
|
|
148
|
+
Transform sequences into TF-IDF weighted bag-of-words vectors.
|
|
149
|
+
|
|
150
|
+
TF-IDF (Term Frequency - Inverse Document Frequency) weights
|
|
151
|
+
down-weight subpatterns that appear in many sequences and up-weight
|
|
152
|
+
those that are more discriminative.
|
|
153
|
+
|
|
154
|
+
The encoder must be fitted before calling this method.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
data: An iterable of sequence strings.
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
np.ndarray: 2D matrix (n_sequences x dictionary_size) with TF-IDF weights.
|
|
161
|
+
|
|
162
|
+
Example:
|
|
163
|
+
>>> bow = LZBOW()
|
|
164
|
+
>>> bow.fit(train_sequences)
|
|
165
|
+
>>> tfidf_matrix = bow.tfidf_transform(test_sequences)
|
|
166
|
+
"""
|
|
167
|
+
# Get per-sequence term frequency matrix
|
|
168
|
+
tf_matrix = self.transform(data, per_sequence=True)
|
|
169
|
+
|
|
170
|
+
n_docs = tf_matrix.shape[0]
|
|
171
|
+
if n_docs == 0:
|
|
172
|
+
return tf_matrix
|
|
173
|
+
|
|
174
|
+
# Compute document frequency: number of sequences containing each term
|
|
175
|
+
doc_freq = np.count_nonzero(tf_matrix, axis=0).astype(np.float64)
|
|
176
|
+
|
|
177
|
+
# IDF = log(1 + N / (1 + df)), smoothed variant that's always non-negative
|
|
178
|
+
idf = np.log1p(n_docs / (1.0 + doc_freq))
|
|
179
|
+
|
|
180
|
+
# Normalize TF per row (L1 normalization)
|
|
181
|
+
row_sums = tf_matrix.sum(axis=1, keepdims=True)
|
|
182
|
+
row_sums[row_sums == 0] = 1 # avoid division by zero
|
|
183
|
+
tf_normalized = tf_matrix / row_sums
|
|
184
|
+
|
|
185
|
+
return tf_normalized * idf
|
{lzgraphs-1.2.0/src/LZGraphs/Exceptions → lzgraphs-2.1.0/src/LZGraphs/exceptions}/__init__.py
RENAMED
|
@@ -27,7 +27,7 @@ Exception Hierarchy:
|
|
|
27
27
|
└── IncompatibleGraphsError
|
|
28
28
|
|
|
29
29
|
Example:
|
|
30
|
-
>>> from LZGraphs.Exceptions import NoGeneDataError, InvalidSequenceError
|
|
30
|
+
>>> from LZGraphs.exceptions import NoGeneDataError, InvalidSequenceError
|
|
31
31
|
>>> try:
|
|
32
32
|
... graph.genomic_random_walk()
|
|
33
33
|
... except NoGeneDataError as e:
|