imageatlas 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. imageatlas/__init__.py +42 -0
  2. imageatlas/clustering/__init__.py +14 -0
  3. imageatlas/clustering/base.py +129 -0
  4. imageatlas/clustering/factory.py +43 -0
  5. imageatlas/clustering/gmm.py +165 -0
  6. imageatlas/clustering/hdbscan_clustering.py +175 -0
  7. imageatlas/clustering/kmeans.py +148 -0
  8. imageatlas/core/__init__.py +15 -0
  9. imageatlas/core/clusterer.py +377 -0
  10. imageatlas/core/results.py +362 -0
  11. imageatlas/features/__init__.py +18 -0
  12. imageatlas/features/adapter.py +0 -0
  13. imageatlas/features/batch.py +142 -0
  14. imageatlas/features/cache.py +257 -0
  15. imageatlas/features/extractors/__init__.py +20 -0
  16. imageatlas/features/extractors/base.py +73 -0
  17. imageatlas/features/extractors/clip.py +26 -0
  18. imageatlas/features/extractors/convnext.py +58 -0
  19. imageatlas/features/extractors/dinov2.py +42 -0
  20. imageatlas/features/extractors/efficientnet.py +54 -0
  21. imageatlas/features/extractors/factory.py +47 -0
  22. imageatlas/features/extractors/mobilenet.py +58 -0
  23. imageatlas/features/extractors/resnet.py +63 -0
  24. imageatlas/features/extractors/swin.py +60 -0
  25. imageatlas/features/extractors/vgg.py +46 -0
  26. imageatlas/features/extractors/vit.py +67 -0
  27. imageatlas/features/loaders.py +187 -0
  28. imageatlas/features/metadata.py +81 -0
  29. imageatlas/features/pipeline.py +347 -0
  30. imageatlas/reduction/__init__.py +20 -0
  31. imageatlas/reduction/base.py +131 -0
  32. imageatlas/reduction/factory.py +51 -0
  33. imageatlas/reduction/pca.py +148 -0
  34. imageatlas/reduction/tsne.py +173 -0
  35. imageatlas/reduction/umap_reducer.py +110 -0
  36. imageatlas/visualization/__init__.py +10 -0
  37. imageatlas/visualization/grids.py +197 -0
  38. imageatlas-0.1.0.dist-info/METADATA +203 -0
  39. imageatlas-0.1.0.dist-info/RECORD +42 -0
  40. imageatlas-0.1.0.dist-info/WHEEL +5 -0
  41. imageatlas-0.1.0.dist-info/licenses/LICENSE +21 -0
  42. imageatlas-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,148 @@
1
+ """
2
+ K-Means clustering implementation
3
+ """
4
+
5
+ import numpy as np
6
+ from sklearn.cluster import KMeans, MiniBatchKMeans
7
+ from .base import ClusteringAlgorithm, ClusteringResult
8
+ from typing import Optional
9
+
10
class KMeansClustering(ClusteringAlgorithm):
    """
    K-Means clustering algorithm.

    Wraps scikit-learn's ``KMeans`` and switches to ``MiniBatchKMeans`` when
    explicitly requested or when the number of samples is large.
    """

    # Sample count above which MiniBatchKMeans is selected automatically,
    # even if ``use_minibatch`` is False.
    _MINIBATCH_THRESHOLD = 10000

    def __init__(
        self,
        n_clusters: int = 5,
        n_init: int = 15,
        max_iter: int = 300,
        use_minibatch: bool = False,
        batch_size: int = 1024,
        random_state: Optional[int] = 42,
        **kwargs
    ):
        """
        Store the K-Means hyper-parameters.

        Args:
            n_clusters: Requested number of clusters (must be >= 2).
            n_init: Number of initialisations forwarded to scikit-learn.
            max_iter: Maximum number of iterations per run.
            use_minibatch: Force the use of ``MiniBatchKMeans``.
            batch_size: Mini-batch size (only used by ``MiniBatchKMeans``).
            random_state: Seed forwarded to the base class and to scikit-learn.
            **kwargs: Forwarded unchanged to the base class constructor.

        Raises:
            ValueError: If ``n_clusters`` is smaller than 2.
        """
        super().__init__(random_state=random_state, **kwargs)

        if n_clusters < 2:
            raise ValueError("n_clusters must be at least 2.")

        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter
        self.use_minibatch = use_minibatch
        self.batch_size = batch_size

    def fit_predict(
        self,
        features,
        filenames=None,
    ) -> ClusteringResult:
        """
        Fit K-Means and predict cluster labels.

        Args:
            features: 2-D array-like whose first axis indexes samples
                (``features.shape[0]`` is used as the sample count).
            filenames: Optional per-sample names, passed to the inherited
                ``_create_cluster_dict`` helper (defined outside this file).

        Returns:
            A ``ClusteringResult`` holding the labels, the cluster dictionary,
            the number of clusters actually used and fit metadata.
        """
        # Inherited validation helper; its exact checks are not visible here.
        self._validate_features(features)

        n_samples = features.shape[0]

        # K-Means cannot produce more clusters than there are samples.
        actual_n_clusters = min(self.n_clusters, n_samples)

        if actual_n_clusters < self.n_clusters:
            # Single f-string: the original passed two arguments to print()
            # and lacked a space, yielding "5clusters but only  3 samples".
            print(
                f"Warning: Requested {self.n_clusters} clusters but only "
                f"{n_samples} samples. Using {actual_n_clusters} clusters."
            )

        # Use the mini-batch variant on request or for large datasets.
        use_minibatch = self.use_minibatch or n_samples > self._MINIBATCH_THRESHOLD

        if use_minibatch:
            self._model = MiniBatchKMeans(
                n_clusters=actual_n_clusters,
                init='k-means++',
                max_iter=self.max_iter,
                # A batch larger than the dataset is pointless.
                batch_size=min(self.batch_size, n_samples),
                random_state=self.random_state,
                n_init=self.n_init,
                verbose=0
            )
        else:
            self._model = KMeans(
                n_clusters=actual_n_clusters,
                init='k-means++',
                n_init=self.n_init,
                max_iter=self.max_iter,
                random_state=self.random_state,
                verbose=0
            )

        # Fit and predict
        cluster_labels = self._model.fit_predict(features)

        # Create cluster dictionary
        cluster_dict = self._create_cluster_dict(cluster_labels, filenames)

        # Prepare metadata (converted to plain Python types)
        metadata = {
            'algorithm': 'kmeans',
            'inertia': float(self._model.inertia_),
            'n_iter': int(self._model.n_iter_),
            'cluster_centers': self._model.cluster_centers_.tolist(),
            'used_minibatch': use_minibatch,
        }

        self.is_fitted = True

        return ClusteringResult(
            cluster_labels=cluster_labels,
            cluster_dict=cluster_dict,
            n_clusters=actual_n_clusters,
            metadata=metadata
        )

    def predict(self, features):
        """
        Predict cluster label for new samples.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        # Identity check: ``== None`` may be overridden by custom __eq__.
        if not self.is_fitted or self._model is None:
            raise RuntimeError("Model must be fitted before prediction. Call fit_predict first.")

        self._validate_features(features)
        return self._model.predict(features)

    def get_cluster_centers(self):
        """
        Get cluster centers if model is fitted, otherwise ``None``.
        """
        if self.is_fitted and self._model is not None:
            return self._model.cluster_centers_

        return None

    def get_algorithm_name(self) -> str:
        """
        Return the name of the clustering algorithm.
        """
        return "KMeans"

    def get_params(self) -> dict:
        """
        Get parameters of the clustering algorithm.
        """
        return {
            'n_clusters': self.n_clusters,
            'n_init': self.n_init,
            'max_iter': self.max_iter,
            'use_minibatch': self.use_minibatch,
            'batch_size': self.batch_size,
            'random_state': self.random_state,
        }
147
+
148
+
@@ -0,0 +1,15 @@
1
+ """
2
+ Core module for ImageClusterViz.
3
+
4
+ This module provides the main ImageClusterer API that ties together
5
+ feature extraction, dimensionality reduction, and clustering.
6
+ """
7
+
8
+ from .clusterer import ImageClusterer
9
+ from .results import ClusteringResults, ExportManager
10
+
11
# Public names re-exported by the core package.
__all__ = ['ImageClusterer', 'ClusteringResults', 'ExportManager']
@@ -0,0 +1,377 @@
1
+ import os
2
+ import numpy as np
3
+ from pathlib import Path
4
+
5
+ from ..features import FeaturePipeline
6
+ from ..features.extractors import create_feature_extractor
7
+ from ..clustering.factory import create_clustering_algorithm
8
+ from ..reduction.factory import create_reducer
9
+ from .results import ClusteringResults
10
+
11
+
12
class ImageClusterer:
    """
    Main API for Image Clustering.

    Chains feature extraction, optional dimensionality reduction and
    clustering, and keeps the outcome of the latest run on the instance
    (``features_``, ``reduced_features_``, ``filenames_``, ``results_``).
    """

    def __init__(
        self,
        model='resnet',
        model_variant=None,
        n_clusters=5,
        clustering_method='kmeans',
        reducer=None,
        n_components=50,
        batch_size=32,
        device='auto',
        random_state=42,
        verbose=True
    ):
        """
        Initialize ImageClusterer.

        Args:
            model: Feature extractor type passed to ``create_feature_extractor``.
            model_variant: Optional extractor variant.
            n_clusters: Number of clusters (kmeans) or components (gmm).
            clustering_method: One of 'kmeans', 'gmm' or 'hdbscan'.
            reducer: Name of the dimensionality reducer, or None to skip it.
            n_components: Target dimensionality for the reducer.
            batch_size: Batch size for feature extraction.
            device: Device string; 'auto' picks 'cuda' when available, else 'cpu'.
            random_state: Seed forwarded to the reducer and the clusterer.
            verbose: Print progress information when True.
        """
        self.model = model
        self.model_variant = model_variant
        self.n_clusters = n_clusters
        self.clustering_method = clustering_method
        self.reducer = reducer
        self.n_components = n_components
        self.batch_size = batch_size
        self.device = self._validate_device(device)
        self.random_state = random_state
        self.verbose = verbose

        # Lazily created components
        self._extractor = None
        self._feature_pipeline = None
        self._reducer_instance = None
        self._clusterer_instance = None

        # Storage for fitted data
        self.features_ = None
        self.reduced_features_ = None
        self.filenames_ = None
        self.results_ = None
        self.is_fitted_ = False

    def _validate_device(self, device):
        """Validate and normalize device string."""
        if device == 'auto':
            # Imported lazily so torch is only required for auto-detection.
            import torch
            return 'cuda' if torch.cuda.is_available() else 'cpu'

        return device

    def _get_extractor(self):
        """
        Get or create feature extractor.
        """
        if self._extractor is None:
            if self.verbose:
                print(f"Creating feature extractor: {self.model}")
            self._extractor = create_feature_extractor(
                model_type=self.model,
                variant=self.model_variant,
                device=self.device
            )

        return self._extractor

    def _get_feature_pipeline(self):
        """
        Get or create feature extraction pipeline.
        """
        if self._feature_pipeline is None:
            extractor = self._get_extractor()
            self._feature_pipeline = FeaturePipeline(
                extractor=extractor,
                batch_size=self.batch_size,
                device=self.device,
                verbose=self.verbose
            )

        return self._feature_pipeline

    def _get_reducer(self):
        """
        Get or create dimensionality reducer.

        Returns None when no reducer was configured.
        """
        if self.reducer is None:
            return None

        if self._reducer_instance is None:
            if self.verbose:
                print(f"Creating dimensionality reducer: {self.reducer}")

            self._reducer_instance = create_reducer(
                algorithm=self.reducer,
                n_components=self.n_components,
                random_state=self.random_state
            )

        return self._reducer_instance

    def _get_clusterer(self):
        """
        Get or create clustering algorithm.

        Raises:
            ValueError: If ``clustering_method`` is not a known method.
        """
        if self._clusterer_instance is None:
            if self.verbose:
                print(f"Creating clusterer: {self.clustering_method}")

            # Pass only the keyword that belongs to the selected algorithm.
            # The original also forwarded the other algorithm's size argument
            # as None (e.g. n_components=None for kmeans).
            method = self.clustering_method
            if method == 'kmeans':
                algorithm_kwargs = {'n_clusters': self.n_clusters}
            elif method == 'gmm':
                algorithm_kwargs = {'n_components': self.n_clusters}
            elif method == 'hdbscan':
                algorithm_kwargs = {'auto_params': True}
            else:
                raise ValueError(f"Unknown clustering method: {self.clustering_method}")

            self._clusterer_instance = create_clustering_algorithm(
                method,
                random_state=self.random_state,
                **algorithm_kwargs
            )

        return self._clusterer_instance

    def _reduce(self, features):
        """
        Apply the configured reducer and return the features to cluster.

        Updates ``reduced_features_``; it is reset to None when no reducer
        is configured, in which case ``features`` is returned unchanged.
        """
        reducer = self._get_reducer()
        if reducer is None:
            self.reduced_features_ = None
            return features

        self.reduced_features_ = reducer.fit_transform(features)
        if self.verbose:
            print(f" Reduced to: {self.reduced_features_.shape}")

        return self.reduced_features_

    def _store_results(self, clustering_result, filenames, features, metadata):
        """
        Build the ClusteringResults object, store it and mark as fitted.
        """
        self.results_ = ClusteringResults(
            cluster_labels=clustering_result.cluster_labels,
            cluster_dict=clustering_result.cluster_dict,
            filenames=filenames,
            features=features,
            reduced_features=self.reduced_features_,
            n_clusters=clustering_result.n_clusters,
            metadata=metadata
        )
        self.is_fitted_ = True
        return self.results_

    def fit(
        self,
        image_dir,
        pattern="*",
        recursive=False,
        cache_path=None,
        use_cache=False
    ):
        """
        Fit the clusterer to the images in a directory.

        This method:
        1. Extracts features from images (or loads them from ``cache_path``
           when ``use_cache`` is True and the file exists).
        2. Optionally reduces dimensionality.
        3. Clusters the features.
        4. Returns the results object.

        Args:
            image_dir: Directory containing the images.
            pattern: Filename pattern forwarded to the feature pipeline.
            recursive: Whether to search sub-directories.
            cache_path: Optional path used to load/save extracted features.
            use_cache: Load features from ``cache_path`` if it exists.

        Raises:
            ValueError: If no features are available for clustering.
        """
        if self.verbose:
            print("=" * 60)
            print("IMAGE CLUSTERING PIPELINE")
            print("=" * 60)

        # Step 1: Feature Extraction
        if self.verbose:
            print("\nStep 1: Feature Extraction.")

        pipeline = self._get_feature_pipeline()

        if use_cache and cache_path and os.path.exists(cache_path):
            if self.verbose:
                print(f"Loading cached features from: {cache_path}")
            pipeline.load(cache_path)
        else:
            pipeline.extract_from_directory(
                image_dir,
                pattern=pattern,
                recursive=recursive
            )

            # Save cache if path provided
            if cache_path:
                pipeline.save(cache_path)

        self.features_ = pipeline.get_features()
        self.filenames_ = pipeline.get_filenames()

        # Fail early with a clear message instead of an obscure error later.
        if self.features_ is None or len(self.features_) == 0:
            raise ValueError(
                "No features available for clustering; check image_dir, "
                "pattern and cache_path."
            )

        if self.verbose:
            print(f" Extracted features: {self.features_.shape}")

        # Step 2: Dimensionality Reduction (optional)
        if self.verbose:
            if self.reducer is not None:
                print(f"\nStep 2: Dimensionality Reduction ({self.reducer.upper()})")
            else:
                print("\nStep 2: Dimensionality Reduction (skipped)")

        features_for_clustering = self._reduce(self.features_)

        if self.verbose and self.reducer is not None:
            # Only reducers that report explained variance (e.g. PCA) expose
            # this key in their metadata.
            reducer_metadata = self._get_reducer().get_metadata()
            if 'total_variance_explained' in reducer_metadata:
                print(f" Variance explained: {reducer_metadata['total_variance_explained']:.2%}")

        # Step 3: Clustering
        if self.verbose:
            print(f"\nStep 3: Clustering ({self.clustering_method.upper()})")

        clusterer = self._get_clusterer()
        clustering_result = clusterer.fit_predict(
            features_for_clustering,
            filenames=self.filenames_
        )

        if self.verbose:
            print(f" Found {clustering_result.n_clusters} clusters")
            print(f" Cluster sizes: {clustering_result.get_cluster_sizes()}")

        # Step 4: Create results object
        metadata = {
            'model_type': self.model,
            'model_variant': self.model_variant,
            'clustering_method': self.clustering_method,
            'n_clusters': clustering_result.n_clusters,
            'reducer': self.reducer,
            'n_components': self.n_components if self.reducer else None,
            'device': self.device,
            'batch_size': self.batch_size
        }

        # Add clustering metadata
        if clustering_result.metadata:
            metadata['clustering_metadata'] = clustering_result.metadata

        self._store_results(
            clustering_result,
            filenames=self.filenames_,
            features=self.features_,
            metadata=metadata
        )

        if self.verbose:
            print("\n" + "=" * 60)
            print("CLUSTERING COMPLETE")
            print("=" * 60)
            print(f"\n{self.results_.summary()}")

        return self.results_

    def fit_features(
        self,
        features,
        filenames=None
    ):
        """
        Fit the clusterer to pre-computed features.

        Args:
            features: Array-like of per-sample feature vectors; converted
                with ``np.asarray`` so plain sequences are accepted.
            filenames: Optional per-sample names. Defaults to
                ``sample_0 .. sample_{n-1}``.

        Raises:
            ValueError: If ``filenames`` does not have one entry per sample.
        """
        features = np.asarray(features)

        if filenames is None:
            filenames = [f"sample_{i}" for i in range(len(features))]
        elif len(filenames) != len(features):
            # A mismatch would silently mis-assign names to cluster labels.
            raise ValueError(
                f"filenames has {len(filenames)} entries but features has "
                f"{len(features)} samples."
            )

        self.features_ = features
        self.filenames_ = filenames

        if self.verbose:
            print("=" * 60)
            print("CLUSTERING PRE-COMPUTED FEATURES")
            print("=" * 60)
            print(f"\nInput features: {features.shape}")

        # Apply dimensionality reduction if specified
        if self.verbose and self.reducer is not None:
            print(f"\nApplying {self.reducer.upper()} reduction...")

        features_for_clustering = self._reduce(features)

        # Cluster
        if self.verbose:
            print(f"\nClustering with {self.clustering_method.upper()}...")

        clusterer = self._get_clusterer()
        clustering_result = clusterer.fit_predict(
            features_for_clustering,
            filenames=filenames
        )

        if self.verbose:
            print(f" Found {clustering_result.n_clusters} clusters")

        # Create results
        metadata = {
            'clustering_method': self.clustering_method,
            'n_clusters': clustering_result.n_clusters,
            'reducer': self.reducer,
            'n_components': self.n_components if self.reducer else None,
            'clustering_metadata': clustering_result.metadata,
        }

        self._store_results(
            clustering_result,
            filenames=filenames,
            features=features,
            metadata=metadata
        )

        if self.verbose:
            print(f"\n{self.results_.summary()}")

        return self.results_

    def get_results(self):
        """
        Get clustering results.

        Raises:
            RuntimeError: If neither fit() nor fit_features() has been run.
        """
        if not self.is_fitted_:
            raise RuntimeError("Clusterer has not been fitted yet. Call fit() first.")
        return self.results_

    def __repr__(self):
        """
        String representation.
        """
        status = "fitted" if self.is_fitted_ else "not fitted"
        return (f"ImageClusterer(model='{self.model}', "
                f"n_clusters={self.n_clusters}, "
                f"method='{self.clustering_method}', "
                f"status={status})")
372
+
373
+
374
+
375
+
376
+
377
+