imageatlas 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. imageatlas/__init__.py +42 -0
  2. imageatlas/clustering/__init__.py +14 -0
  3. imageatlas/clustering/base.py +129 -0
  4. imageatlas/clustering/factory.py +43 -0
  5. imageatlas/clustering/gmm.py +165 -0
  6. imageatlas/clustering/hdbscan_clustering.py +175 -0
  7. imageatlas/clustering/kmeans.py +148 -0
  8. imageatlas/core/__init__.py +15 -0
  9. imageatlas/core/clusterer.py +377 -0
  10. imageatlas/core/results.py +362 -0
  11. imageatlas/features/__init__.py +18 -0
  12. imageatlas/features/adapter.py +0 -0
  13. imageatlas/features/batch.py +142 -0
  14. imageatlas/features/cache.py +257 -0
  15. imageatlas/features/extractors/__init__.py +20 -0
  16. imageatlas/features/extractors/base.py +73 -0
  17. imageatlas/features/extractors/clip.py +26 -0
  18. imageatlas/features/extractors/convnext.py +58 -0
  19. imageatlas/features/extractors/dinov2.py +42 -0
  20. imageatlas/features/extractors/efficientnet.py +54 -0
  21. imageatlas/features/extractors/factory.py +47 -0
  22. imageatlas/features/extractors/mobilenet.py +58 -0
  23. imageatlas/features/extractors/resnet.py +63 -0
  24. imageatlas/features/extractors/swin.py +60 -0
  25. imageatlas/features/extractors/vgg.py +46 -0
  26. imageatlas/features/extractors/vit.py +67 -0
  27. imageatlas/features/loaders.py +187 -0
  28. imageatlas/features/metadata.py +81 -0
  29. imageatlas/features/pipeline.py +347 -0
  30. imageatlas/reduction/__init__.py +20 -0
  31. imageatlas/reduction/base.py +131 -0
  32. imageatlas/reduction/factory.py +51 -0
  33. imageatlas/reduction/pca.py +148 -0
  34. imageatlas/reduction/tsne.py +173 -0
  35. imageatlas/reduction/umap_reducer.py +110 -0
  36. imageatlas/visualization/__init__.py +10 -0
  37. imageatlas/visualization/grids.py +197 -0
  38. imageatlas-0.1.0.dist-info/METADATA +203 -0
  39. imageatlas-0.1.0.dist-info/RECORD +42 -0
  40. imageatlas-0.1.0.dist-info/WHEEL +5 -0
  41. imageatlas-0.1.0.dist-info/licenses/LICENSE +21 -0
  42. imageatlas-0.1.0.dist-info/top_level.txt +1 -0
imageatlas/__init__.py ADDED
@@ -0,0 +1,42 @@
1
+ """
2
+ ImageAtlas: A toolkit for organizing, cleaning and analysing your image datasets.
3
+ """
4
+
5
+ __version__ = '0.1.0'
6
+
7
+
8
+ # 1. High level API (The everything tool)
9
+ from .core.clusterer import ImageClusterer
10
+ from .core.results import ClusteringResults
11
+
12
+ # 2. Modular APIs (For specific tasks)
13
+
14
+ # Feature extraction tools
15
+ from .features.pipeline import FeaturePipeline
16
+ from .features.extractors.factory import create_feature_extractor
17
+
18
+ # ImageLoader, BatchProcessing later
19
+
20
+ # Dimensionality Reduction Tools
21
+ from .reduction.factory import create_reducer
22
+ from .reduction.pca import PCAReducer
23
+ from .reduction.umap_reducer import UMAPReducer
24
+
25
+ # Clustering Tools
26
+ from .clustering.factory import create_clustering_algorithm
27
+
28
+ # Visualization Tools
29
+ from .visualization.grids import GridVisualizer, create_cluster_grids
30
+
31
+ __all__ = [
32
+ 'ImageClusterer',
33
+ 'ClusteringResults',
34
+ 'FeaturePipeline',
35
+ 'create_feature_extractor',
36
+ 'create_reducer',
37
+ 'PCAReducer',
38
+ 'UMAPReducer',
39
+ 'create_clustering_algorithm',
40
+ 'GridVisualizer',
41
+ 'create_cluster_grids',
42
+ ]
@@ -0,0 +1,14 @@
1
"""Clustering subpackage: shared result/base types and the concrete algorithms."""

from .base import ClusteringResult, ClusteringAlgorithm
from .kmeans import KMeansClustering
from .hdbscan_clustering import HDBSCANClustering
from .gmm import GMMClustering


# Public API of the clustering subpackage.
__all__ = [
    'ClusteringAlgorithm',
    'ClusteringResult',
    'KMeansClustering',
    'HDBSCANClustering',
    'GMMClustering',
]
@@ -0,0 +1,129 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass, field
3
+ from typing import Dict, List, Optional, Any
4
+ import numpy as np
5
+
6
+
7
@dataclass
class ClusteringResult:
    """
    Container for clustering results.

    Attributes:
        cluster_labels: Per-sample cluster id array (outliers use id -1).
        cluster_dict: Mapping of cluster id -> list of sample indices
            (or filenames, when provided at fit time).
        n_clusters: Number of clusters reported by the algorithm.
        metadata: Algorithm-specific extras (probabilities, BIC, ...).
    """
    cluster_labels: np.ndarray
    cluster_dict: Dict[int, List[int]]
    n_clusters: int
    metadata: Dict[str, Any] = field(default_factory=dict)

    def get_cluster_sizes(self) -> Dict[int, int]:
        """Get the size of each cluster."""
        return {cluster_id: len(indices) for cluster_id, indices in self.cluster_dict.items()}

    def get_cluster(self, cluster_id: int) -> List[int]:
        """Get indices belonging to a specific cluster."""
        return self.cluster_dict.get(cluster_id, [])

    def get_outliers(self) -> Optional[List[int]]:
        """Get outlier indices (cluster_id = -1) if any exist."""
        return self.cluster_dict.get(-1, None)

    def summary(self) -> str:
        """Get a summary string of clustering results."""
        # BUG FIX: previously read ``self.clustering_labels`` (nonexistent
        # attribute -> AttributeError); the dataclass field is ``cluster_labels``.
        summary_lines = [
            "Clustering Results Summary:",
            f"  Total Samples: {len(self.cluster_labels)}",
            f"  Number of Clusters: {self.n_clusters}",
            f"  Cluster Sizes: {self.get_cluster_sizes()}",
        ]

        outliers = self.get_outliers()

        if outliers:
            summary_lines.append(f"  Outliers: {len(outliers)}")

        if self.metadata:
            summary_lines.append(f"  Metadata: {list(self.metadata.keys())}")

        return "\n".join(summary_lines)
49
class ClusteringAlgorithm(ABC):
    """
    Abstract base class for all clustering algorithms.

    Subclasses implement :meth:`fit_predict` and :meth:`get_algorithm_name`;
    the helpers here provide input validation and label bookkeeping.
    """

    def __init__(self, random_state=42, **kwargs):
        """
        Initialize the clustering algorithm.

        Args:
            random_state: Seed forwarded to the underlying estimator.
            **kwargs: Extra, algorithm-specific parameters.
        """
        self.random_state = random_state
        self.params = kwargs
        self.is_fitted = False
        self._model = None

    @abstractmethod
    def fit_predict(self, features) -> ClusteringResult:
        """
        Fit the clustering algorithm on ``features`` and predict cluster labels.
        """
        ...

    @abstractmethod
    def get_algorithm_name(self):
        """
        Return a human-readable name for the clustering algorithm.
        """
        ...

    def _validate_features(self, features: np.ndarray) -> None:
        """
        Raise ``ValueError`` unless ``features`` is a clean, non-empty 2-D array.
        """
        if not isinstance(features, np.ndarray):
            raise ValueError(f"Feature must be a numpy array, got {type(features)}")

        if features.ndim != 2:
            raise ValueError(f"Features must be 2D array, got shape: {features.shape}")

        if features.shape[0] == 0:
            raise ValueError("Feature matrix is empty.")

        if np.any(np.isnan(features)) or np.any(np.isinf(features)):
            raise ValueError("Features contain NaN or Inf values")

    def _create_cluster_dict(self, cluster_labels, filenames=None):
        """
        Create a dictionary mapping cluster IDs to indices or filenames.
        """
        grouped = {}

        for position, raw_label in enumerate(cluster_labels):
            label = int(raw_label)
            # Record the filename when one was supplied, else the sample index.
            entry = filenames[position] if filenames else position
            grouped.setdefault(label, []).append(entry)

        return grouped

    def get_params(self):
        """
        Get parameters of the clustering algorithm.
        """
        # random_state first, then any algorithm-specific extras.
        merged = {'random_state': self.random_state}
        merged.update(self.params)
        return merged

    def __repr__(self):
        """
        String representation: ``AlgorithmName(param=value, ...)``.
        """
        rendered = ", ".join(f"{key}={value}" for key, value in self.get_params().items())
        return f"{self.get_algorithm_name()}({rendered})"
@@ -0,0 +1,43 @@
1
+ """
2
+ Factory function for creating clustering algorithms.
3
+ """
4
+
5
+
6
+ from .base import ClusteringAlgorithm
7
+ from .kmeans import KMeansClustering
8
+ from .hdbscan_clustering import HDBSCANClustering
9
+ from .gmm import GMMClustering
10
+
11
+
12
# Registry of available clustering algorithms.
CLUSTERING_ALGORITHMS = {
    'kmeans': KMeansClustering,
    'hdbscan': HDBSCANClustering,
    'gmm': GMMClustering,
}


def create_clustering_algorithm(
    method: str,
    **kwargs,
) -> ClusteringAlgorithm:
    """
    Factory function to create clustering algorithms.

    Args:
        method: Registry key, case-insensitive ('kmeans', 'hdbscan', 'gmm').
        **kwargs: Forwarded verbatim to the algorithm's constructor.

    Returns:
        A configured ``ClusteringAlgorithm`` instance.

    Raises:
        ValueError: If ``method`` is not a registered algorithm.
    """
    method = method.lower()

    if method not in CLUSTERING_ALGORITHMS:
        available = ", ".join(sorted(CLUSTERING_ALGORITHMS.keys()))
        # BUG FIX: the two f-strings were separated by a comma, so ValueError
        # received a 2-tuple and the message rendered as a tuple repr. The
        # adjacent literals now concatenate into one message.
        raise ValueError(
            f"Unknown clustering Method: '{method}'. "
            f"Available methods: {available}"
        )

    algorithm_class = CLUSTERING_ALGORITHMS[method]
    return algorithm_class(**kwargs)


def get_available_algorithms():
    """
    Get a sorted list of available clustering algorithm names.
    """
    return sorted(CLUSTERING_ALGORITHMS.keys())
@@ -0,0 +1,165 @@
1
+ """
2
+ Gaussian Mixture Model (GMM) clustering implementation.
3
+ """
4
+
5
+ import numpy as np
6
+ from typing import Optional, List
7
+ from sklearn.mixture import GaussianMixture
8
+ from .base import ClusteringAlgorithm, ClusteringResult
9
+
10
class GMMClustering(ClusteringAlgorithm):
    """
    Gaussian Mixture Model clustering algorithm.

    Wraps :class:`sklearn.mixture.GaussianMixture`, returning hard labels plus
    soft-assignment metadata (per-sample probabilities, BIC/AIC, means).
    """

    def __init__(
        self,
        n_components=5,
        covariance_type='diag',
        max_iter=100,
        n_init=10,
        reg_covar=1e-6,
        random_state=42,
        **kwargs,
    ):
        """
        Configure the mixture model.

        Args:
            n_components: Number of mixture components; must be >= 2.
            covariance_type: One of 'full', 'tied', 'diag', 'spherical'.
            max_iter: EM iteration cap per initialization.
            n_init: Number of random initializations to try.
            reg_covar: Regularization added to covariance diagonals.
            random_state: Seed for reproducibility.

        Raises:
            ValueError: On invalid ``n_components`` or ``covariance_type``.
        """
        super().__init__(random_state, **kwargs)

        if n_components < 2:
            raise ValueError("n_components must be at least 2")

        valid_covariance_types = ['full', 'tied', 'diag', 'spherical']
        if covariance_type not in valid_covariance_types:
            raise ValueError(f"covariance_type must be one of {valid_covariance_types}")

        self.n_components = n_components
        self.covariance_type = covariance_type
        self.max_iter = max_iter
        self.n_init = n_init
        self.reg_covar = reg_covar

    def fit_predict(
        self,
        features,
        filenames=None,
    ) -> ClusteringResult:
        """
        Fit GMM and predict cluster labels.

        Args:
            features: 2-D (n_samples, n_features) numpy array.
            filenames: Optional per-sample names used as cluster_dict values.

        Returns:
            ClusteringResult with labels, grouping, and GMM metadata.
        """
        self._validate_features(features)
        # BUG FIX: removed a stray debug print of the feature shape.

        n_samples = features.shape[0]

        # GaussianMixture cannot have more components than samples.
        actual_n_components = min(self.n_components, n_samples)

        if actual_n_components < self.n_components:
            print(f"Warning: Requested {self.n_components} components but only "
                  f"{n_samples} samples. Using {actual_n_components} components.")

        # float64 keeps the EM updates numerically stable.
        features_float64 = features.astype(np.float64)

        # Create and fit GMM
        self._model = GaussianMixture(
            n_components=actual_n_components,
            covariance_type=self.covariance_type,
            max_iter=self.max_iter,
            n_init=self.n_init,
            reg_covar=self.reg_covar,
            random_state=self.random_state,
            verbose=0
        )

        # Fit and predict
        cluster_labels = self._model.fit_predict(features_float64)

        # Get probability scores (soft assignments)
        probabilities = self._model.predict_proba(features_float64)

        # Create cluster dictionary
        cluster_dict = self._create_cluster_dict(cluster_labels, filenames)

        # Prepare metadata
        metadata = {
            'algorithm': 'gmm',
            'converged': bool(self._model.converged_),
            'n_iter': int(self._model.n_iter_),
            'bic': float(self._model.bic(features_float64)),
            'aic': float(self._model.aic(features_float64)),
            'lower_bound': float(self._model.lower_bound_),
            'covariance_type': self.covariance_type,
            'probabilities': probabilities,
            'means': self._model.means_,
            'covariance': self._model.covariances_,
        }

        self.is_fitted = True

        return ClusteringResult(
            cluster_labels=cluster_labels,
            cluster_dict=cluster_dict,
            n_clusters=actual_n_components,
            metadata=metadata
        )

    def predict(self, features):
        """
        Predict cluster label for new samples.

        Raises:
            RuntimeError: If called before :meth:`fit_predict`.
        """
        if not self.is_fitted or self._model is None:
            raise RuntimeError("Model must be fitted before prediction. Call fit_predict first.")

        self._validate_features(features)
        return self._model.predict(features)

    def predict_proba(self, features):
        """
        Predict probability of each cluster for new samples.

        Raises:
            RuntimeError: If called before :meth:`fit_predict`.
        """
        if not self.is_fitted or self._model is None:
            raise RuntimeError("Model must be fitted before prediction. Call fit_predict first.")

        self._validate_features(features)
        return self._model.predict_proba(features)

    def get_cluster_means(self):
        """
        Get cluster means (centers) if the model is fitted, else ``None``.
        """
        if self.is_fitted and self._model is not None:
            return self._model.means_

        return None

    def score(self, features):
        """
        Compute the average log-likelihood of the data under the model.

        Raises:
            RuntimeError: If called before :meth:`fit_predict`.
        """
        if not self.is_fitted or self._model is None:
            raise RuntimeError("Model must be fitted before scoring.")

        # Consistency fix: validate like predict()/predict_proba() do, so bad
        # input fails with the same clear error instead of deep in sklearn.
        self._validate_features(features)
        return self._model.score(features)

    def get_algorithm_name(self):
        """Return the name of the clustering algorithm."""
        return "GaussianMixtureModel (GMM)"

    def get_params(self):
        """Get parameters of the clustering algorithm."""
        return {
            'n_components': self.n_components,
            'covariance_type': self.covariance_type,
            'max_iter': self.max_iter,
            'n_init': self.n_init,
            'random_state': self.random_state,
            'reg_covar': self.reg_covar
        }
@@ -0,0 +1,175 @@
1
+ """HDBSCAN clustering Implementation."""
2
+
3
+
4
+ import numpy as np
5
+ from typing import Optional, List
6
+ from .base import ClusteringAlgorithm, ClusteringResult
7
+
8
class HDBSCANClustering(ClusteringAlgorithm):
    """
    HDBSCAN density-based clustering.

    Noise points are labeled -1 and reported separately from clusters; the
    main parameters can be auto-tuned from the dataset size.
    """

    def __init__(
        self,
        min_cluster_size=None,
        min_samples=None,
        metric='euclidean',
        cluster_selection_method='eom',
        auto_params=True,
        random_state=42,
        **kwargs,
    ):
        """
        Configure HDBSCAN.

        Args:
            min_cluster_size: Smallest group treated as a cluster; ``None``
                defers to auto-selection (or 10 when ``auto_params`` is off).
            min_samples: Core-distance neighborhood size; ``None`` defers
                likewise (fallback 1).
            metric: Distance metric name passed through to hdbscan.
            cluster_selection_method: 'eom' or 'leaf'.
            auto_params: When True, derive defaults from the dataset size.
            random_state: Stored for base-class API consistency.

        Raises:
            ValueError: If ``cluster_selection_method`` is invalid.
        """
        super().__init__(random_state=random_state, **kwargs)

        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.metric = metric
        self.cluster_selection_method = cluster_selection_method
        self.auto_params = auto_params

        # Validate parameters
        if cluster_selection_method not in ['eom', 'leaf']:
            raise ValueError("cluster_selection_method must be 'eom' or 'leaf'")

    def _auto_select_params(self, n_samples):
        """
        Heuristic (min_cluster_size, min_samples) based on dataset size.
        """
        if n_samples < 100:
            min_cluster_size = max(5, n_samples // 20)
            min_samples = 1
        elif n_samples < 2000:
            min_cluster_size = 10
            min_samples = 1
        elif n_samples < 10000:
            min_cluster_size = 20
            min_samples = 3
        else:
            min_cluster_size = 30
            min_samples = 5

        return min_cluster_size, min_samples

    def fit_predict(
        self,
        features,
        filenames=None,
    ) -> ClusteringResult:
        """
        Fit HDBSCAN and predict cluster labels.

        FIX: ``filenames`` now defaults to ``None`` (it was required before),
        matching the base-class contract and ``GMMClustering.fit_predict``.

        Args:
            features: 2-D (n_samples, n_features) numpy array.
            filenames: Optional per-sample names used as cluster_dict values.

        Returns:
            ClusteringResult with labels, grouping, and HDBSCAN metadata.

        Raises:
            ImportError: If the optional ``hdbscan`` package is missing.
        """
        # Imported lazily so the rest of the package works without hdbscan.
        try:
            import hdbscan
        except ImportError:
            raise ImportError(
                "hdbscan is not installed. Install it with: pip install hdbscan"
            )

        self._validate_features(features)

        n_samples = features.shape[0]

        # Resolve parameters: explicit values win over the heuristics.
        # NOTE(review): ``or`` treats 0 as "unset" -- acceptable here since
        # both parameters must be >= 1 to be meaningful.
        if self.auto_params:
            auto_min_cluster_size, auto_min_samples = self._auto_select_params(n_samples)
            min_cluster_size = self.min_cluster_size or auto_min_cluster_size
            min_samples = self.min_samples or auto_min_samples
        else:
            min_cluster_size = self.min_cluster_size or 10
            min_samples = self.min_samples or 1

        # Clamp so parameters never exceed what the data can support.
        min_cluster_size = min(min_cluster_size, n_samples)
        min_samples = min(min_samples, min_cluster_size)

        # Create and fit HDBSCAN
        self._model = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=self.metric,
            cluster_selection_method=self.cluster_selection_method,
            core_dist_n_jobs=-1  # Use all cores
        )

        # Fit and predict
        cluster_labels = self._model.fit_predict(features)

        # Create cluster dictionary
        cluster_dict = self._create_cluster_dict(cluster_labels, filenames)

        # Count actual clusters; label -1 marks noise, not a cluster.
        unique_labels = np.unique(cluster_labels)
        n_clusters = len(unique_labels[unique_labels >= 0])
        n_noise = np.sum(cluster_labels == -1)

        # Prepare metadata
        metadata = {
            'algorithm': 'hdbscan',
            'n_clusters_found': n_clusters,
            'n_noise_points': int(n_noise),
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'metric': self.metric,
            'cluster_selection_method': self.cluster_selection_method,
            'probabilities': self._model.probabilities_,
            'outlier_scores': self._model.outlier_scores_,
        }

        # Add exemplars if available (representative points for each cluster)
        if hasattr(self._model, 'exemplars_'):
            metadata['exemplars'] = self._model.exemplars_

        self.is_fitted = True

        return ClusteringResult(
            cluster_labels=cluster_labels,
            cluster_dict=cluster_dict,
            n_clusters=n_clusters,
            metadata=metadata
        )

    def get_outlier_score(self):
        """
        Get the per-sample outlier score, or ``None`` before fitting.
        """
        if self.is_fitted and self._model is not None:
            return self._model.outlier_scores_

        return None

    def get_condensed_tree(self):
        """
        Get the condensed cluster hierarchy tree, or ``None`` before fitting.
        """
        if self.is_fitted and self._model is not None:
            return self._model.condensed_tree_

        return None

    def get_algorithm_name(self):
        """
        Return the name of the clustering algorithm.
        """
        return "HDBSCAN"

    def get_params(self):
        """
        Get parameters of the clustering algorithm.
        """
        return {
            'min_cluster_size': self.min_cluster_size,
            'min_samples': self.min_samples,
            'metric': self.metric,
            'cluster_selection_method': self.cluster_selection_method,
            'auto_params': self.auto_params,
            'random_state': self.random_state
        }