imageatlas 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imageatlas/__init__.py +42 -0
- imageatlas/clustering/__init__.py +14 -0
- imageatlas/clustering/base.py +129 -0
- imageatlas/clustering/factory.py +43 -0
- imageatlas/clustering/gmm.py +165 -0
- imageatlas/clustering/hdbscan_clustering.py +175 -0
- imageatlas/clustering/kmeans.py +148 -0
- imageatlas/core/__init__.py +15 -0
- imageatlas/core/clusterer.py +377 -0
- imageatlas/core/results.py +362 -0
- imageatlas/features/__init__.py +18 -0
- imageatlas/features/adapter.py +0 -0
- imageatlas/features/batch.py +142 -0
- imageatlas/features/cache.py +257 -0
- imageatlas/features/extractors/__init__.py +20 -0
- imageatlas/features/extractors/base.py +73 -0
- imageatlas/features/extractors/clip.py +26 -0
- imageatlas/features/extractors/convnext.py +58 -0
- imageatlas/features/extractors/dinov2.py +42 -0
- imageatlas/features/extractors/efficientnet.py +54 -0
- imageatlas/features/extractors/factory.py +47 -0
- imageatlas/features/extractors/mobilenet.py +58 -0
- imageatlas/features/extractors/resnet.py +63 -0
- imageatlas/features/extractors/swin.py +60 -0
- imageatlas/features/extractors/vgg.py +46 -0
- imageatlas/features/extractors/vit.py +67 -0
- imageatlas/features/loaders.py +187 -0
- imageatlas/features/metadata.py +81 -0
- imageatlas/features/pipeline.py +347 -0
- imageatlas/reduction/__init__.py +20 -0
- imageatlas/reduction/base.py +131 -0
- imageatlas/reduction/factory.py +51 -0
- imageatlas/reduction/pca.py +148 -0
- imageatlas/reduction/tsne.py +173 -0
- imageatlas/reduction/umap_reducer.py +110 -0
- imageatlas/visualization/__init__.py +10 -0
- imageatlas/visualization/grids.py +197 -0
- imageatlas-0.1.0.dist-info/METADATA +203 -0
- imageatlas-0.1.0.dist-info/RECORD +42 -0
- imageatlas-0.1.0.dist-info/WHEEL +5 -0
- imageatlas-0.1.0.dist-info/licenses/LICENSE +21 -0
- imageatlas-0.1.0.dist-info/top_level.txt +1 -0
imageatlas/__init__.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ImageAtlas: A toolkit for organizing, cleaning and analysing your image datasets.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
__version__ = '0.1.0'
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# 1. High level API (The everything tool)
|
|
9
|
+
from .core.clusterer import ImageClusterer
|
|
10
|
+
from .core.results import ClusteringResults
|
|
11
|
+
|
|
12
|
+
# 2. Modular APIs (For specific tasks)
|
|
13
|
+
|
|
14
|
+
# Feature extraction tools
|
|
15
|
+
from .features.pipeline import FeaturePipeline
|
|
16
|
+
from .features.extractors.factory import create_feature_extractor
|
|
17
|
+
|
|
18
|
+
# ImageLoader, BatchProcessing later
|
|
19
|
+
|
|
20
|
+
# Dimensionality Reduction Tools
|
|
21
|
+
from .reduction.factory import create_reducer
|
|
22
|
+
from .reduction.pca import PCAReducer
|
|
23
|
+
from .reduction.umap_reducer import UMAPReducer
|
|
24
|
+
|
|
25
|
+
# Clustering Tools
|
|
26
|
+
from .clustering.factory import create_clustering_algorithm
|
|
27
|
+
|
|
28
|
+
# Visualization Tools
|
|
29
|
+
from .visualization.grids import GridVisualizer, create_cluster_grids
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
'ImageClusterer',
|
|
33
|
+
'ClusteringResults',
|
|
34
|
+
'FeaturePipeline',
|
|
35
|
+
'create_feature_extractor',
|
|
36
|
+
'create_reducer',
|
|
37
|
+
'PCAReducer',
|
|
38
|
+
'UMAPReducer',
|
|
39
|
+
'create_clustering_algorithm',
|
|
40
|
+
'GridVisualizer',
|
|
41
|
+
'create_cluster_grids',
|
|
42
|
+
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Re-export the clustering primitives so callers can import them from
# `imageatlas.clustering` directly instead of reaching into submodules.
from .base import ClusteringResult, ClusteringAlgorithm
from .kmeans import KMeansClustering
from .hdbscan_clustering import HDBSCANClustering
from .gmm import GMMClustering


# Public API of the clustering subpackage.
__all__ = [
    'ClusteringAlgorithm',
    'ClusteringResult',
    'KMeansClustering',
    'HDBSCANClustering',
    'GMMClustering',
]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import Dict, List, Optional, Any
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ClusteringResult:
    """
    Container for clustering Results.

    Holds the per-sample labels, the grouping of samples by cluster, the
    cluster count, and any algorithm-specific metadata.
    """
    # Per-sample cluster assignments; -1 marks outliers/noise (e.g. HDBSCAN).
    cluster_labels: np.ndarray
    # Maps cluster id -> list of member indices (or filenames, per the algorithm).
    cluster_dict: Dict[int, List[int]]
    # Number of clusters found (the -1 outlier bucket is not a cluster).
    n_clusters: int
    # Algorithm-specific extras (probabilities, model diagnostics, ...).
    metadata: Dict[str, Any] = field(default_factory=dict)

    def get_cluster_sizes(self) -> Dict[int, int]:
        """Get the size of each cluster."""
        return {cluster_id: len(indices) for cluster_id, indices in self.cluster_dict.items()}

    def get_cluster(self, cluster_id: int) -> List[int]:
        """Get indices belonging to a specific cluster."""
        return self.cluster_dict.get(cluster_id, [])

    def get_outliers(self) -> Optional[List[int]]:
        """Get outlier indices (cluster_id = -1) if any exist."""
        return self.cluster_dict.get(-1, None)

    def summary(self) -> str:
        """Get a summary string of clustering results."""
        summary_lines = [
            "Clustering Results Summary:",
            # Bug fix: the original read `self.clustering_labels`, which does
            # not exist (the field is `cluster_labels`) and always raised
            # AttributeError when summary() was called.
            f"  Total Samples: {len(self.cluster_labels)}",
            f"  Number of Clusters: {self.n_clusters}",
            f"  Cluster Sizes: {self.get_cluster_sizes()}",
        ]

        outliers = self.get_outliers()

        if outliers:
            summary_lines.append(f"  Outliers: {len(outliers)}")

        if self.metadata:
            summary_lines.append(f"  Metadata: {list(self.metadata.keys())}")

        return "\n".join(summary_lines)
|
|
48
|
+
|
|
49
|
+
class ClusteringAlgorithm(ABC):
    """
    Abstract base class for all clustering algorithms.

    Concrete subclasses implement fit_predict() and get_algorithm_name();
    the base class provides input validation, label grouping, and a
    parameter-aware repr.
    """

    def __init__(self, random_state=42, **kwargs):
        """
        Initialize the clustering algorithm.
        """
        self.random_state = random_state
        self.params = kwargs
        # Set to True by subclasses once a model has been fitted.
        self.is_fitted = False
        # Underlying library model instance, populated by subclasses.
        self._model = None

    @abstractmethod
    def fit_predict(self, features) -> ClusteringResult:
        """
        Fit the clustering algorithms and predict cluster labels.
        """

    @abstractmethod
    def get_algorithm_name(self):
        """
        Return the name of the clustering algorithm.
        """

    def _validate_features(self, features: np.ndarray) -> None:
        """
        Validate the input feature matrix.

        Raises ValueError on a non-array, non-2D, empty, or non-finite input.
        """
        if not isinstance(features, np.ndarray):
            raise ValueError(f"Feature must be a numpy array, got {type(features)}")

        if features.ndim != 2:
            raise ValueError(f"Features must be 2D array, got shape: {features.shape}")

        if features.shape[0] == 0:
            raise ValueError("Feature matrix is empty.")

        if np.any(np.isnan(features)) or np.any(np.isinf(features)):
            raise ValueError("Features contain NaN or Inf values")

    def _create_cluster_dict(self, cluster_labels, filenames=None):
        """
        Create a dictionary mapping cluster IDs to indices or filenames.
        """
        grouped = {}

        for position, raw_label in enumerate(cluster_labels):
            label = int(raw_label)
            # Report filenames when provided, otherwise positional indices.
            member = filenames[position] if filenames else position
            grouped.setdefault(label, []).append(member)

        return grouped

    def get_params(self):
        """
        Get parameters of the clustering algorithms.
        """
        # Extra constructor kwargs intentionally override random_state on clash.
        combined = {'random_state': self.random_state}
        combined.update(self.params)
        return combined

    def __repr__(self):
        """
        String representation of the clustering algorithm.
        """
        rendered = [f"{key}={value}" for key, value in self.get_params().items()]
        return f"{self.get_algorithm_name()}({', '.join(rendered)})"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Factory function for creating clustering algorithms.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from .base import ClusteringAlgorithm
|
|
7
|
+
from .kmeans import KMeansClustering
|
|
8
|
+
from .hdbscan_clustering import HDBSCANClustering
|
|
9
|
+
from .gmm import GMMClustering
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Registry of available clustering algorithms.
|
|
13
|
+
CLUSTERING_ALGORITHMS = {
|
|
14
|
+
'kmeans': KMeansClustering,
|
|
15
|
+
'hdbscan': HDBSCANClustering,
|
|
16
|
+
'gmm': GMMClustering,
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
def create_clustering_algorithm(
    method: str,
    **kwargs,
) -> ClusteringAlgorithm:
    """
    Factory function to create clustering algorithms.

    Args:
        method: Registry key ('kmeans', 'hdbscan' or 'gmm'); case-insensitive.
        **kwargs: Forwarded to the algorithm's constructor.

    Returns:
        A ClusteringAlgorithm instance for the requested method.

    Raises:
        ValueError: If `method` is not a registered algorithm name.
    """
    method = method.lower()

    if method not in CLUSTERING_ALGORITHMS:
        available = ", ".join(sorted(CLUSTERING_ALGORITHMS.keys()))
        # Bug fix: the original passed two separate f-strings to ValueError,
        # so the exception's message was a 2-tuple rather than one readable
        # string. Implicit literal concatenation builds a single message.
        raise ValueError(
            f"Unknown clustering Method: '{method}'. "
            f"Available methods: {available}"
        )

    algorithm_class = CLUSTERING_ALGORITHMS[method]
    return algorithm_class(**kwargs)
|
|
38
|
+
|
|
39
|
+
def get_available_algorithms():
    """
    Get a list of available clustering algorithms.
    """
    # Iterating the registry dict yields its keys directly.
    return sorted(CLUSTERING_ALGORITHMS)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gaussian Mixture Model (GMM) clustering implementation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from typing import Optional, List
|
|
7
|
+
from sklearn.mixture import GaussianMixture
|
|
8
|
+
from .base import ClusteringAlgorithm, ClusteringResult
|
|
9
|
+
|
|
10
|
+
class GMMClustering(ClusteringAlgorithm):
    """
    Gaussian Mixture Model clustering algorithm.

    Wraps scikit-learn's GaussianMixture: produces hard cluster labels plus
    soft assignment probabilities and model-selection diagnostics (BIC/AIC).
    """

    def __init__(
        self,
        n_components=5,
        covariance_type='diag',
        max_iter=100,
        n_init=10,
        reg_covar=1e-6,
        random_state=42,
        **kwargs,
    ):
        """
        Initialize GMM clustering.

        Args:
            n_components: Number of mixture components (clusters); must be >= 2.
            covariance_type: One of 'full', 'tied', 'diag', 'spherical'.
            max_iter: Maximum EM iterations per initialization.
            n_init: Number of EM restarts; the best run is kept.
            reg_covar: Regularization added to covariance diagonals.
            random_state: Seed for reproducible fits.
            **kwargs: Extra parameters stored on the base class.

        Raises:
            ValueError: On invalid n_components or covariance_type.
        """
        super().__init__(random_state, **kwargs)

        if n_components < 2:
            raise ValueError("n_components must be atleast 2")

        valid_covariance_types = ['full', 'tied', 'diag', 'spherical']
        if covariance_type not in valid_covariance_types:
            raise ValueError(f"covariance_type must be one of {valid_covariance_types}")

        self.n_components = n_components
        self.covariance_type = covariance_type
        self.max_iter = max_iter
        self.n_init = n_init
        self.reg_covar = reg_covar

    def fit_predict(
        self,
        features,
        filenames=None,
    ) -> ClusteringResult:
        """
        Fit GMM and predict cluster labels.

        Args:
            features: 2D array of shape (n_samples, n_features).
            filenames: Optional names; when given, cluster_dict maps cluster
                ids to filenames instead of sample indices.

        Returns:
            ClusteringResult with labels, grouping, and GMM diagnostics
            (convergence, BIC/AIC, probabilities, means) in metadata.
        """
        self._validate_features(features)
        # Bug fix: removed a leftover debug print of the feature shape.

        n_samples = features.shape[0]

        # A mixture cannot have more components than samples.
        actual_n_components = min(self.n_components, n_samples)

        if actual_n_components < self.n_components:
            # Message fix: the original dropped the word "but", producing
            # "Requested N components only M samples".
            print(f"Warning: Requested {self.n_components} components but only "
                  f"{n_samples} samples. Using {actual_n_components} components.")

        # GaussianMixture is numerically more stable on float64 input.
        features_float64 = features.astype(np.float64)

        # Create and fit GMM
        self._model = GaussianMixture(
            n_components=actual_n_components,
            covariance_type=self.covariance_type,
            max_iter=self.max_iter,
            n_init=self.n_init,
            reg_covar=self.reg_covar,
            random_state=self.random_state,
            verbose=0
        )

        # Fit and predict hard assignments.
        cluster_labels = self._model.fit_predict(features_float64)

        # Soft assignment probabilities per sample.
        probabilities = self._model.predict_proba(features_float64)

        # Group sample indices (or filenames) by cluster id.
        cluster_dict = self._create_cluster_dict(cluster_labels, filenames)

        # Diagnostics callers may want for model selection / inspection.
        metadata = {
            'algorithm': 'gmm',
            'converged': bool(self._model.converged_),
            'n_iter': int(self._model.n_iter_),
            'bic': float(self._model.bic(features_float64)),
            'aic': float(self._model.aic(features_float64)),
            'lower_bound': float(self._model.lower_bound_),
            'covariance_type': self.covariance_type,
            'probabilities': probabilities,
            'means': self._model.means_,
            'covariance': self._model.covariances_,
        }

        self.is_fitted = True

        return ClusteringResult(
            cluster_labels=cluster_labels,
            cluster_dict=cluster_dict,
            n_clusters=actual_n_components,
            metadata=metadata
        )

    def predict(self, features):
        """
        Predict cluster label for new samples.

        Raises:
            RuntimeError: If called before fit_predict().
        """
        if not self.is_fitted or self._model is None:
            raise RuntimeError("Model must be fitted before prediction. Call fit_predict first.")

        self._validate_features(features)
        # NOTE(review): unlike fit_predict, input is not cast to float64 here —
        # presumably fine for sklearn, but confirm dtype expectations.
        return self._model.predict(features)

    def predict_proba(self, features):
        """
        Predict probability of each cluster for new samples.

        Raises:
            RuntimeError: If called before fit_predict().
        """
        if not self.is_fitted or self._model is None:
            raise RuntimeError("Model must be fitted before prediction. Call fit_predict first.")

        self._validate_features(features)
        return self._model.predict_proba(features)

    def get_cluster_means(self):
        """
        Get cluster means (centers) if model is fitted.

        Returns None when the model has not been fitted yet.
        """
        if self.is_fitted and self._model is not None:
            return self._model.means_

        return None

    def score(self, features):
        """
        Compute the log-likelihood of the data under the model.

        Raises:
            RuntimeError: If called before fit_predict().
        """
        if not self.is_fitted or self._model is None:
            raise RuntimeError("Model must be fitted before scoring.")

        return self._model.score(features)

    def get_algorithm_name(self):
        """Return the name of the clustering algorithm."""
        return "GaussianMixtureModel (GMM)"

    def get_params(self):
        """Get parameters of the clustering algorithm."""
        return {
            'n_components': self.n_components,
            'covariance_type': self.covariance_type,
            'max_iter': self.max_iter,
            'n_init': self.n_init,
            'random_state': self.random_state,
            'reg_covar': self.reg_covar
        }
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""HDBSCAN clustering Implementation."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from typing import Optional, List
|
|
6
|
+
from .base import ClusteringAlgorithm, ClusteringResult
|
|
7
|
+
|
|
8
|
+
class HDBSCANClustering(ClusteringAlgorithm):
    """
    HDBSCAN algorithm.

    Density-based clustering that discovers the number of clusters itself
    and labels low-density samples as noise (cluster id -1). The `hdbscan`
    package is imported lazily so the rest of the library works without it.
    """

    def __init__(
        self,
        min_cluster_size=None,
        min_samples=None,
        metric='euclidean',
        cluster_selection_method='eom',
        auto_params=True,
        random_state=42,
        **kwargs,
    ):
        """
        Initialize HDBSCAN clustering.

        Args:
            min_cluster_size: Smallest group treated as a cluster; None defers
                to auto-selection (or a default of 10 when auto_params=False).
            min_samples: Core-point neighborhood size; None defers likewise.
            metric: Distance metric passed to hdbscan.
            cluster_selection_method: 'eom' (excess of mass) or 'leaf'.
            auto_params: When True, derive min_cluster_size/min_samples from
                the dataset size at fit time.
            random_state: Stored for interface consistency; HDBSCAN itself is
                deterministic and does not take a seed here.
            **kwargs: Extra parameters stored on the base class.

        Raises:
            ValueError: If cluster_selection_method is invalid.
        """
        super().__init__(random_state=random_state, **kwargs)

        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.metric = metric
        self.cluster_selection_method = cluster_selection_method
        self.auto_params = auto_params

        # Validate parameters
        if cluster_selection_method not in ['eom', 'leaf']:
            raise ValueError("cluster_selection_method must be 'eom' or 'leaf'")

    def _auto_select_params(self, n_samples):
        """
        Automatically select HDBSCAN parameters based on dataset size.

        Returns:
            Tuple of (min_cluster_size, min_samples) heuristics that scale
            with n_samples.
        """
        if n_samples < 100:
            min_cluster_size = max(5, n_samples // 20)
            min_samples = 1
        elif n_samples < 2000:
            min_cluster_size = 10
            min_samples = 1
        elif n_samples < 10000:
            min_cluster_size = 20
            min_samples = 3
        else:
            min_cluster_size = 30
            min_samples = 5

        return min_cluster_size, min_samples

    def fit_predict(
        self,
        features,
        filenames=None,
    ) -> ClusteringResult:
        """
        Fit HDBSCAN and predict cluster labels.

        Args:
            features: 2D array of shape (n_samples, n_features).
            filenames: Optional names; when given, cluster_dict maps cluster
                ids to filenames instead of sample indices.
                Consistency fix: this parameter was required here while every
                other algorithm (and _create_cluster_dict) treats it as
                optional; it now defaults to None, which is backward
                compatible for existing callers.

        Returns:
            ClusteringResult; noise points carry label -1 and metadata holds
            probabilities, outlier scores, and the effective parameters.

        Raises:
            ImportError: If the `hdbscan` package is not installed.
        """
        try:
            import hdbscan
        except ImportError:
            raise ImportError(
                "hdbscan is not installed. Install it with: pip install hdbscan"
            )

        self._validate_features(features)

        n_samples = features.shape[0]

        # Auto-select parameters if needed; explicit user values win.
        if self.auto_params:
            auto_min_cluster_size, auto_min_samples = self._auto_select_params(n_samples)
            min_cluster_size = self.min_cluster_size or auto_min_cluster_size
            min_samples = self.min_samples or auto_min_samples
        else:
            min_cluster_size = self.min_cluster_size or 10
            min_samples = self.min_samples or 1

        # Ensure parameters are valid for the dataset size.
        min_cluster_size = min(min_cluster_size, n_samples)
        min_samples = min(min_samples, min_cluster_size)

        # Create and fit HDBSCAN
        self._model = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=self.metric,
            cluster_selection_method=self.cluster_selection_method,
            core_dist_n_jobs=-1  # Use all cores
        )

        # Fit and predict
        cluster_labels = self._model.fit_predict(features)

        # Group sample indices (or filenames) by cluster id.
        cluster_dict = self._create_cluster_dict(cluster_labels, filenames)

        # Count actual clusters; label -1 is noise, not a cluster.
        unique_labels = np.unique(cluster_labels)
        n_clusters = len(unique_labels[unique_labels >= 0])
        n_noise = np.sum(cluster_labels == -1)

        # Prepare metadata
        metadata = {
            'algorithm': 'hdbscan',
            'n_clusters_found': n_clusters,
            'n_noise_points': int(n_noise),
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'metric': self.metric,
            'cluster_selection_method': self.cluster_selection_method,
            'probabilities': self._model.probabilities_,
            'outlier_scores': self._model.outlier_scores_,
        }

        # Add exemplars if available (representative points for each cluster)
        if hasattr(self._model, 'exemplars_'):
            metadata['exemplars'] = self._model.exemplars_

        self.is_fitted = True

        return ClusteringResult(
            cluster_labels=cluster_labels,
            cluster_dict=cluster_dict,
            n_clusters=n_clusters,
            metadata=metadata
        )

    def get_outlier_score(self):
        """
        Get outlier score for each sample.

        Returns None when the model has not been fitted yet.
        """
        if self.is_fitted and self._model is not None:
            return self._model.outlier_scores_

        return None

    def get_condensed_tree(self):
        """
        Get condensed cluster hierarchy tree.

        Returns None when the model has not been fitted yet.
        """
        if self.is_fitted and self._model is not None:
            return self._model.condensed_tree_

        return None

    def get_algorithm_name(self):
        """
        Return the name of the clustering algorithm.
        """
        return "HDBSCAN"

    def get_params(self):
        """
        Get parameters of the clustering algorithm.
        """
        return {
            'min_cluster_size': self.min_cluster_size,
            'min_samples': self.min_samples,
            'metric': self.metric,
            'cluster_selection_method': self.cluster_selection_method,
            'auto_params': self.auto_params,
            'random_state': self.random_state
        }
|
|
175
|
+
|