imageatlas 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imageatlas/__init__.py +1 -1
- imageatlas/clustering/__init__.py +10 -0
- imageatlas/clustering/base.py +33 -0
- imageatlas/clustering/factory.py +24 -0
- imageatlas/clustering/gmm.py +42 -1
- imageatlas/clustering/hdbscan_clustering.py +28 -1
- imageatlas/clustering/kmeans.py +27 -0
- imageatlas/features/batch.py +23 -1
- imageatlas/features/cache.py +39 -1
- imageatlas/features/loaders.py +52 -0
- imageatlas/features/metadata.py +3 -0
- imageatlas/features/pipeline.py +50 -0
- {imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/METADATA +8 -1
- {imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/RECORD +17 -17
- {imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/WHEEL +1 -1
- {imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/top_level.txt +0 -0
imageatlas/clustering/__init__.py
CHANGED
@@ -1,7 +1,15 @@
+"""
+Clustering Algorithms module.
+
+This module provides various clustering algorithms with a unified interface for clustering
+on image features.
+
+"""
 from .base import ClusteringResult, ClusteringAlgorithm
 from .kmeans import KMeansClustering
 from .hdbscan_clustering import HDBSCANClustering
 from .gmm import GMMClustering
+from .factory import create_clustering_algorithm, get_available_algorithms



@@ -11,4 +19,6 @@ __all__ = [
     'KMeansClustering',
     'HDBSCANClustering',
     'GMMClustering',
+    'create_clustering_algorithm',
+    'get_available_algorithms'
 ]
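With the new factory exports, algorithm construction can go through the subpackage's public interface. A minimal sketch, assuming the names resolve exactly as listed in `__all__` above:

```
from imageatlas.clustering import (
    create_clustering_algorithm,
    get_available_algorithms,
)

# Names accepted by create_clustering_algorithm; expected to include
# 'kmeans', 'gmm' and 'hdbscan' per the factory docstring further down.
print(get_available_algorithms())
```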
imageatlas/clustering/base.py
CHANGED
@@ -8,7 +8,13 @@ import numpy as np
 class ClusteringResult:
     """
     Container for clustering Results.
+    Attributes:
+        cluster_labels: Array of cluster assignments for each sample.
+        cluster_dict: Dictionary mapping cluster IDs to list of sample indices.
+        n_clusters: Number of clusters found.
+        metadata: Additional algorithm-specific metadata.
     """
+
     cluster_labels: np.ndarray
     cluster_dict: Dict[int, List[int]]
     n_clusters: int
@@ -49,11 +55,18 @@ class ClusteringResult:
 class ClusteringAlgorithm(ABC):
     """
     Abstract base class for all clustering algorithms.
+
+    All the clustering algorithms must implement the fit_predict method and
+    provide a consistent interface for clustering operations.
     """

     def __init__(self, random_state=42, **kwargs):
         """
         Initialize the clustering algorithm.
+
+        Args:
+            random_state: Random seed for reproducibility.
+            **kwargs: Additional algorithm related parameters.
         """
         self.random_state = random_state
         self.params = kwargs
@@ -64,6 +77,12 @@ class ClusteringAlgorithm(ABC):
     def fit_predict(self, features) -> ClusteringResult:
         """
         Fit the clustering algorithms and predict cluster labels.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+
+        Returns:
+            ClusteringResult object containing cluster assignments and metadata.
         """
         pass

@@ -77,7 +96,14 @@ class ClusteringAlgorithm(ABC):
     def _validate_features(self, features:np.ndarray) -> None:
         """
         Validate the input feature matrix.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features) to validate.
+
+        Raises:
+            ValueError: If features are invalid.
         """
+
         if not isinstance(features, np.ndarray):
             raise ValueError(f"Feature must be a numpy array, got {type(features)}")

@@ -93,6 +119,13 @@ class ClusteringAlgorithm(ABC):
     def _create_cluster_dict(self, cluster_labels, filenames=None):
         """
         Createa dictionary mapping cluster IDs to indices or filenames
+
+        Args:
+            cluster_labels: Array of cluster assignments.
+            filenames: Optional list of filenames corresponding to images.
+
+        Returns:
+            Dictionary mapping cluster IDs to lists of indices or filenames
         """

         cluster_dict = {}
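The documented contract (a `fit_predict` returning a `ClusteringResult`, plus the `_validate_features` and `_create_cluster_dict` helpers) is enough to sketch a custom algorithm. The class below is a hypothetical illustration, not part of the package; it assumes `ClusteringResult` can be constructed from its documented attributes and that implementations accept an optional `filenames` argument, as the built-in algorithms further down do:

```
import numpy as np
from imageatlas.clustering import ClusteringAlgorithm, ClusteringResult


class ThresholdClustering(ClusteringAlgorithm):
    """Toy example: split samples on the mean of their first feature dimension."""

    def fit_predict(self, features, filenames=None) -> ClusteringResult:
        self._validate_features(features)          # inherited validation helper
        labels = (features[:, 0] > features[:, 0].mean()).astype(int)
        cluster_dict = self._create_cluster_dict(labels, filenames)
        # Assumes ClusteringResult accepts its documented attributes as keyword arguments.
        return ClusteringResult(
            cluster_labels=labels,
            cluster_dict=cluster_dict,
            n_clusters=len(cluster_dict),
            metadata={'algorithm': 'threshold'},
        )
```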
imageatlas/clustering/factory.py
CHANGED
@@ -22,6 +22,27 @@ def create_clustering_algorithm(
 ) -> ClusteringAlgorithm:
     """
     Factory function to create clustering algorithms.
+
+    Args:
+        method: Name of the clustering algorithm ('kmeans', 'gmm', 'hdbscan')
+        **kwargs: Algorithm specific parameters
+
+    Returns:
+        Instance of the requested clustering algorithm
+
+    Raises:
+        Value Error: If clustering method is not supported.
+
+
+    Examples:
+        >>> # Create KMeans with 5 clusters
+        >>> clusterer = create_clustering_algorithm('kmeans', n_clusters=5)
+
+        >>> # Create GMM with full covariance
+        >>> clusterer = create_clustering_algorithm('gmm', n_components=8, covariance_type='full')
+
+        >>> # Create HDBSCAN with auto parameters
+        >>> clusterer = create_clustering_algorithm('hdbscan', auto_params=True)
     """

     method = method.lower()
@@ -39,5 +60,8 @@ def create_clustering_algorithm(
 def get_available_algorithms():
     """
     Get a list of available clustering algorithms.
+
+    Returns:
+        List of algorithm names.
     """
     return sorted(CLUSTERING_ALGORITHMS.keys())
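Putting the factory together with the docstring examples above, end-to-end usage looks roughly like this; the random matrix is only a stand-in for real image features:

```
import numpy as np
from imageatlas.clustering import create_clustering_algorithm, get_available_algorithms

features = np.random.rand(200, 128).astype(np.float32)   # placeholder feature matrix

clusterer = create_clustering_algorithm('kmeans', n_clusters=5)
result = clusterer.fit_predict(features)
print(result.n_clusters, result.cluster_labels.shape)

print(get_available_algorithms())            # sorted list of supported method names

try:
    create_clustering_algorithm('spectral')  # unsupported name
except ValueError as err:
    print(err)                               # the factory raises ValueError per its docstring
```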
imageatlas/clustering/gmm.py
CHANGED
@@ -10,6 +10,14 @@ from .base import ClusteringAlgorithm, ClusteringResult
 class GMMClustering(ClusteringAlgorithm):
     """
     Gaussian Mixture Model clustering algorithm.
+
+    Args:
+        n_components: Number of mixture components (clusters)
+        covariance_type: Type of covarince parameters ('full', 'diag', 'tied', 'spherical')
+        max_iter: Maximum number of EM iterations
+        n_init: Number of initializations to perform
+        reg_covar: Regularization added to diagonal of covariance (prevents singular matrices)
+        random_state: Random seed for reproducibility
     """

     def __init__(
@@ -46,10 +54,16 @@ class GMMClustering(ClusteringAlgorithm):

         """
         Fit GMM and predict cluster labels.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+            filenames: Optional list of filenames for cluster mapping
+
+        Returns:
+            ClusteringResult object with cluster assignments.
         """

         self._validate_features(features)
-        print('fshape: ', features.shape)

         n_samples = features.shape[0]

@@ -110,6 +124,15 @@ class GMMClustering(ClusteringAlgorithm):
     def predict(self, features):
         """
         Predict cluster label for new samples.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+
+        Returns:
+            Array of cluster labels
+
+        Raises:
+            RuntimeError: If model has not been fitted yet.
         """

         if not self.is_fitted or self._model is None:
@@ -121,6 +144,15 @@ class GMMClustering(ClusteringAlgorithm):
     def predict_proba(self, features):
         """
         Predict probability of each cluster for new samples.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+
+        Returns:
+            Array of cluster labels
+
+        Raises:
+            RuntimeError: If model has not been fitted yet.
         """

         if not self.is_fitted or self._model is None:
@@ -132,6 +164,9 @@ class GMMClustering(ClusteringAlgorithm):
     def get_cluster_means(self):
         """
         Get cluster means (centers) if model is fitted.
+
+        Returns:
+            Array of cluster centers or None if not fitted.
         """

         if self.is_fitted and self._model is not None:
@@ -142,6 +177,12 @@ class GMMClustering(ClusteringAlgorithm):
     def score(self, features):
         """
         Compute the log-likelihood of the data under the model.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+
+        Returns:
+            Log-likelihood score
         """

         if not self.is_fitted or self._model is None:
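A short usage sketch for the GMM wrapper, based only on the constructor arguments and methods documented above (specific argument values are illustrative assumptions):

```
import numpy as np
from imageatlas.clustering import GMMClustering

features = np.random.rand(500, 64).astype(np.float32)

gmm = GMMClustering(n_components=8, covariance_type='diag', random_state=42)
result = gmm.fit_predict(features)
print(result.n_clusters)

probs = gmm.predict_proba(features[:10])   # soft assignments for new samples
print(gmm.score(features))                 # log-likelihood of the data under the fitted model
```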
imageatlas/clustering/hdbscan_clustering.py
CHANGED
@@ -7,7 +7,16 @@ from .base import ClusteringAlgorithm, ClusteringResult

 class HDBSCANClustering(ClusteringAlgorithm):
     """
-    HDBSCAN
+    HDBSCAN (Hierarchical Density-Based Spatial Clustering) Algorithm.
+
+    Args:
+        min_cluster_size: Minimum number of samples in a cluster
+        min_samples: Number of samples in a neighborhood for core points.
+        metric: Distance metric to use
+        cluster_selection_method: Method for selecting clusters ('eom' or 'leaf')
+        auto_params: Whether to automatically set parameters based on dataset size
+        random_state: Random seed (note: HDBSCAN is deterministic, this is for consistency)
+
     """

     def __init__(
@@ -37,6 +46,12 @@ class HDBSCANClustering(ClusteringAlgorithm):
     def _auto_select_params(self, n_samples):
         """
         Automatically select HDBSCAN parameters based on dataset size.
+
+        Args:
+            n_samples: Number of samples in the dataset.
+
+        Returns:
+            Tuple of (min_cluster_size, min_samples)
         """

         if n_samples < 100:
@@ -62,6 +77,10 @@ class HDBSCANClustering(ClusteringAlgorithm):

         """
         Fit HDBSCAN and predict cluster labels.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+            filenames: Optional list of filenames for cluster mapping.
         """

         try:
@@ -137,6 +156,11 @@ class HDBSCANClustering(ClusteringAlgorithm):
     def get_outlier_score(self):
         """
         Get outlier score for each sample.
+
+        Higher scores indicate more likely outliers.
+
+        Returns:
+            Array of outlier scores or None if model is not fitted.
         """

         if self.is_fitted and self._model is not None:
@@ -147,6 +171,9 @@ class HDBSCANClustering(ClusteringAlgorithm):
     def get_condensed_tree(self):
         """
         Get condensed cluster hierarchy tree.
+
+        Returns:
+            Array of membership probabilities or None if model not fitted.
         """

         if self.is_fitted and self._model is not None:
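HDBSCAN usage follows the same pattern; with `auto_params=True` the wrapper picks `min_cluster_size` and `min_samples` from the dataset size. A sketch using only the interface documented above:

```
import numpy as np
from imageatlas.clustering import HDBSCANClustering

features = np.random.rand(1000, 32).astype(np.float32)

hdb = HDBSCANClustering(auto_params=True)
result = hdb.fit_predict(features)          # density-based, so some samples may be labelled noise

outlier_scores = hdb.get_outlier_score()    # higher score = more likely outlier; None if unfitted
tree = hdb.get_condensed_tree()             # condensed hierarchy; None if unfitted
```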
imageatlas/clustering/kmeans.py
CHANGED
@@ -10,6 +10,14 @@ from typing import Optional
 class KMeansClustering(ClusteringAlgorithm):
     """
     K-Means clustering algorithm.
+
+    Args:
+        n_clusters: Number of clusters to form
+        n_init: Number of times to run with different centroid seeds
+        max_iter: Maximum number of iterations
+        use_minibatch: Whether to use MiniBatchKMeans for large datasets
+        batch_size: Batch size for MiniBatchKMeans
+        random_state: Random seed for reproducibility
     """

     def __init__(
@@ -42,6 +50,13 @@ class KMeansClustering(ClusteringAlgorithm):

         """
         Fit K-Means and predict cluster labels.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+            filenames: Optional list of filenames for cluster mapping
+
+        Returns:
+            ClusteringResult object with cluster assignments.
         """

         self._validate_features(features)
@@ -108,6 +123,15 @@ class KMeansClustering(ClusteringAlgorithm):
     def predict(self, features):
         """
         Predict cluster label for new samples.
+
+        Args:
+            features: Feature matrix of shape (n_samples, n_features)
+
+        Returns:
+            Array of cluster labels
+
+        Raises:
+            RuntimeError: If model has not yet been fitted.
         """

         if not self.is_fitted or self._model == None:
@@ -119,6 +143,9 @@ class KMeansClustering(ClusteringAlgorithm):
     def get_cluster_centers(self):
         """
         Get cluster centers if model is fitted.
+
+        Returns:
+            Array of cluster centers or None if not fitted.
         """
         if self.is_fitted and self._model is not None:
             return self._model.cluster_centers_
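For the K-Means wrapper, the `use_minibatch` flag is documented for large datasets; a sketch assuming the constructor arguments listed above:

```
import numpy as np
from imageatlas.clustering import KMeansClustering

features = np.random.rand(50_000, 256).astype(np.float32)

km = KMeansClustering(n_clusters=20, use_minibatch=True, batch_size=1024)
result = km.fit_predict(features)

centers = km.get_cluster_centers()        # None if the model has not been fitted
new_labels = km.predict(features[:100])   # assign new samples to existing clusters
```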
imageatlas/features/batch.py
CHANGED
@@ -10,6 +10,8 @@ import warnings
 class BatchProcessor:
     """
     Handles batch processing of images through feature extractors.
+
+    Manages batching, device placement and memory cleanup.
     """

     def __init__(
@@ -20,6 +22,11 @@ class BatchProcessor:
     ):
         """
         Initialize batch processor.
+
+        Args:
+            batch_size: Number of images to process at once.
+            device: Device to use ('cpu', 'cuda', 'cuda:0', etc.)
+            clear_cache: Whether to clear GPU cache after each batch
         """

         self.batch_size = batch_size
@@ -50,8 +57,15 @@ class BatchProcessor:
     ):
         """
         Process a batch of extractors through the feature extractor.
-
+        Args:
+            images: List of PIL Images.
+            extractor: Feature extractor with extract_features method
+            return_numpy: Whether to return numpy array (vs torch tensor)
+
+        Returns:
+            Array of feature vectors, shape (batch_size, feature_dim)
         """
+        # TODO: use the correct batching method in the feature_extractors module.

         if not images:
             return np.array([])
@@ -108,6 +122,14 @@ class BatchProcessor:
     ):
         """
         Estimate memory usage for a batch.
+
+        Args:
+            n_images: Number of images in a batch
+            feature_dim: Dimensions of feature vector
+            dtype: Data type of features
+
+        Returns:
+            Estimated memory in GB
         """

         bytes_per_element = np.dtype(dtype).itemsize
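The memory estimate boils down to `n_images * feature_dim * itemsize` bytes. A standalone sketch of the same calculation (the exact rounding and unit convention inside `BatchProcessor` is an assumption):

```
import numpy as np

def estimate_feature_memory_gb(n_images: int, feature_dim: int, dtype: str = 'float32') -> float:
    """Rough size of an (n_images, feature_dim) feature matrix in gigabytes."""
    bytes_per_element = np.dtype(dtype).itemsize
    return n_images * feature_dim * bytes_per_element / 1024 ** 3

# ~0.29 GB for 100k 768-dimensional float32 vectors
print(estimate_feature_memory_gb(100_000, 768))
```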
imageatlas/features/cache.py
CHANGED
@@ -70,6 +70,12 @@ class HDF5Cache(FeatureCache):
     ):
         """
         Save features to HDF5 file.
+
+        Args:
+            features: Feature array, shape (n_samples, feature_dim)
+            filenames: List of filenames corresponding to features
+            metadata: Feature metadata
+            path: Path to save HDF5 file
         """

         # Make sure path has .h5 extension
@@ -115,13 +121,20 @@ class HDF5Cache(FeatureCache):
     ):
         """
         Load features from HDF5 file.
+
+        Args:
+            path: Path to HDF5 File
+            lazy: If True, return memory-mapped array instead of loading to RAM
+
+        Returns:
+            Tuple of (features, filenames, metadata)
         """

         if not path.endswith(".h5"):
             path = path + ".h5"

         if not self.exists(path):
-            raise FileNotFoundError("
+            raise FileNotFoundError(f"Cache file not found: {path}")

         with h5py.File(path, 'r') as f:
             # Load filenames
@@ -161,6 +174,14 @@ class HDF5Cache(FeatureCache):
     ):
         """
         Load a subset of features.
+
+        Args:
+            path: Path to HDF5 file
+            indices: Indices to load (if provided)
+            filenames: Filenames to load (if provided)
+
+        Returns:
+            Tuple of (features, filenames)
         """

         if not path.endswith(".h5"):
@@ -193,6 +214,11 @@ class HDF5Cache(FeatureCache):
     ):
         """
         Append new features to existing cache.
+
+        Args:
+            path: Path to the HDF5 file
+            new_features: New features to append
+            new_filenames: Corresponding filenames
         """

         if not path.endswith(".h5"):
@@ -223,6 +249,12 @@ class HDF5Cache(FeatureCache):
     def get_feature_dict(self, path):
         """
         Load features as dictionary (for backward compatibility)
+
+        Args:
+            path: Path to HDF5 file
+
+        Returns:
+            Dictionary mapping filenames to feature vectors
         """
         features, filenames, _ = self.load(path)
         return {fn: feat for fn, feat in zip(filenames, features)}
@@ -230,6 +262,12 @@ class HDF5Cache(FeatureCache):
     def get_info(self, path):
         """
         Get information about the cache without loading data.
+
+        Args:
+            path: Path to HDF5 file
+
+        Returns:
+            Dictionary with cache information
         """

         if not path.endswith(".h5"):
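A round-trip sketch for the HDF5 cache, using only the signatures documented above. The no-argument constructor and passing `metadata=None` are assumptions; in the real pipeline a `FeatureMetadata` object is supplied for the `metadata` argument:

```
import numpy as np
from imageatlas.features.cache import HDF5Cache

cache = HDF5Cache()                              # constructor arguments are an assumption
features = np.random.rand(100, 512).astype(np.float32)
filenames = [f'img_{i:03d}.jpg' for i in range(100)]

# metadata=None keeps the sketch short; a FeatureMetadata instance is expected here.
cache.save(features=features, filenames=filenames, metadata=None, path='./features/demo')

feats, names, meta = cache.load('./features/demo')    # '.h5' is appended automatically
print(cache.get_info('./features/demo'))              # summary without loading the arrays
lookup = cache.get_feature_dict('./features/demo')    # {filename: feature_vector}
```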
imageatlas/features/loaders.py
CHANGED
@@ -12,6 +12,8 @@ import warnings
 class ImageLoader:
     """
     Image loader with validation and error handling.
+
+    Handles corrupted images, format conversions, and EXIF orientations.
     """
     VALID_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp'}

@@ -23,6 +25,11 @@ class ImageLoader:
     ):
         """
         Initialize image loader.
+
+        Args:
+            max_size: Optional maximum size (width, height) for images
+            convert_mode: PIL odel to covert images to ('RGB', 'L', etc.)
+            handle_exif: Whether to handle EXIF orientation
         """

         self.max_size = max_size
@@ -33,6 +40,12 @@ class ImageLoader:
     def validate_path(self, path):
         """
         Check if path is valid
+
+        Args:
+            path: Path to image file
+
+        Returns:
+            True if valid, False otherwise
         """
         if not os.path.exists(path):
             return False
@@ -43,6 +56,12 @@ class ImageLoader:
     def load_image(self, path):
         """
         Load a single image.
+
+        Args:
+            path: Path to image file
+
+        Returns:
+            PIL Image or None if loadig failed
         """

         try:
@@ -78,6 +97,12 @@ class ImageLoader:
     def load_batch(self, paths):
         """
         Load a batch of images.
+
+        Args:
+            paths: List of image paths
+
+        Returns:
+            Tuple of (loaded_images, successful_paths, failed_paths)
         """

         images = []
@@ -97,6 +122,12 @@ class ImageLoader:
     def _handle_orientation(self, image):
         """
         Handle EXIF orientation tag.
+
+        Args:
+            image: PIL Image
+
+        Returns:
+            Oriented image
         """

         try:
@@ -129,6 +160,12 @@ class ImageLoader:
     def _resize_if_needed(self, image):
         """
         Resize image if it exceeds max size.
+
+        Args:
+            image: PIL Image
+
+        Returns:
+            Resized image
         """
         if self.max_size is None:
             return image
@@ -149,6 +186,14 @@ class ImageLoader:
     ):
         """
         Find all images in a directory.
+
+        Args:
+            directory: Directory to search
+            pattern: Glob pattern for filenames
+            recursive: Whether to search recursively
+
+        Returns:
+            List of image paths
         """
         path = Path(directory)

@@ -180,6 +225,13 @@ class ImageLoader:

         """
         Create batches from a list of items.
+
+        Args:
+            items: List of items to batch
+            batch_size: Size of each batch
+
+        Yeilds:
+            Batches of items
         """

         for i in range(0, len(items), batch_size):
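The loader's documented methods combine into a simple discovery-and-load loop; a sketch assuming the constructor arguments listed above and the default glob pattern:

```
from imageatlas.features.loaders import ImageLoader

loader = ImageLoader(max_size=(1024, 1024), convert_mode='RGB', handle_exif=True)

paths = loader.find_images('./images', recursive=True)
images, ok_paths, failed_paths = loader.load_batch(paths[:32])
print(f"{len(images)} loaded, {len(failed_paths)} failed")
```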
imageatlas/features/metadata.py
CHANGED
imageatlas/features/pipeline.py
CHANGED
@@ -17,6 +17,18 @@ from .metadata import FeatureMetadata
 class FeaturePipeline:
     """
     Main pipeline for feature extraction.
+
+    Handles batch processing, caching, progress tracking, and error recovery.
+
+    Example:
+        >>> from features import FeaturePipeline
+        >>> from feature_extractors import create_feature_extractor
+        >>>
+        >>> extractor = create_feature_extractor('dinov2', device='cuda')
+        >>> pipeline = FeaturePipeline(extractor, batch_size=32)
+        >>>
+        >>> result = pipeline.extract_from_directory('./images')
+        >>> pipeline.save('./features/features.h5')
     """

     def __init__(
@@ -30,6 +42,14 @@ class FeaturePipeline:
     ):
         """
         Initialize feature extraction pipeline.
+
+        Args:
+            extractor: Feature extractor (from feature_extractors module)
+            batch_size: Number of images to process at once
+            device: Device for processing ('cpu', 'cuda')
+            cache_backend: Cache backend to use ('hdf5')
+            max_image_size: Optional max size for images (width, height)
+            verbose: Whether to show progress bars
         """

         self.extractor = extractor
@@ -79,6 +99,16 @@ class FeaturePipeline:

         """
         Extract features from all images in a directory.
+
+        Args:
+            directory: Directory containing images
+            pattern: Glob pattern for filenames
+            recursive: Whether to search recursively
+            save_every: Save checkpoint every N images (optional)
+            save_path: Path for checkpoint saves (required if save_every is set)
+
+        Returns:
+            Self for method chaining
         """

         # Find all images
@@ -112,6 +142,14 @@ class FeaturePipeline:

         """
         Extract features from a list of filepaths.
+
+        Args:
+            file_paths: List of image file paths
+            save_every: Save checkpoint every N images (optional)
+            save_path: Path for checkpoint saves (required if save_every is set)
+
+        Returns:
+            Self for method chaining
         """

         if save_every is not None and save_path is None:
@@ -223,6 +261,10 @@ class FeaturePipeline:
     def save(self, path, format='hdf5'):
         """
         Save extracted features to disk.
+
+        Args:
+            path: Path to save features
+            format: Format to use ('hdf5')
         """

         if self.features is None or self.metadata is None:
@@ -244,6 +286,11 @@ class FeaturePipeline:
     def load(self, path):
         """
         Load features from disk.
+
+        Args:
+            path: Path to feature cache
+
+        Returns: Self for method chaining
         """

         self.features, self.filenames, self.metadata = self.cache.load(path)
@@ -271,6 +318,9 @@ class FeaturePipeline:
     def get_feature_dict(self):
         """
         Get features as dictionary
+
+        Returns:
+            Dictionary mapping filenames to feature vectors
         """

         if self.features is None:
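Beyond the class docstring example, the checkpointing and reload paths documented above combine roughly like this; the `feature_extractors` import follows the class docstring and may differ in an installed layout:

```
from imageatlas.features.pipeline import FeaturePipeline
from feature_extractors import create_feature_extractor    # as in the class docstring above

extractor = create_feature_extractor('dinov2', device='cuda')
pipeline = FeaturePipeline(extractor, batch_size=32, verbose=True)

# save_path is required whenever save_every is set, per the docstring.
pipeline.extract_from_directory('./images', recursive=True,
                                save_every=1000, save_path='./features/checkpoint.h5')
pipeline.save('./features/features.h5')

# Later: reload and use the filename -> feature-vector mapping; load() returns self.
feature_dict = pipeline.load('./features/features.h5').get_feature_dict()
```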
{imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: imageatlas
-Version: 0.1.0
+Version: 0.1.2
 Summary: ImageAtlas: A toolkit for organizing, cleaning and analysing your image datasets.
 Author-email: Ahmad Javed <ahmadjaved97@gmail.com>
 Maintainer-email: Ahmad Javed <ahmadjaved97@gmail.com>
@@ -63,6 +63,7 @@ Requires-Dist: openpyxl; extra == "full"
 Dynamic: license-file

 # ImageAtlas
+[](https://pepy.tech/projects/imageatlas)

 ## Overview

@@ -86,6 +87,12 @@ pip install imageatlas
 pip install imageatlas[full]
 ```

+**Note on CLIP**: If you wish to use the CLIP model, you must install it manually from GitHub using:
+
+```
+pip install git+https://github.com/openai/CLIP.git
+```
+
 **From Source**
 ```
 git clone https://github.com/ahmadjaved97/ImageAtlas.git
{imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/RECORD
CHANGED
@@ -1,20 +1,20 @@
-imageatlas/__init__.py,sha256=
-imageatlas/clustering/__init__.py,sha256=
-imageatlas/clustering/base.py,sha256=
-imageatlas/clustering/factory.py,sha256=
-imageatlas/clustering/gmm.py,sha256=
-imageatlas/clustering/hdbscan_clustering.py,sha256=
-imageatlas/clustering/kmeans.py,sha256=
+imageatlas/__init__.py,sha256=nNeRH7OYG4CT5lQjy5CcDAsXJD5gpX58IKfefgriskY,1062
+imageatlas/clustering/__init__.py,sha256=5BlL9QeyQbml08f8YfHtrSpLUfwb5tlFRc7u_VRtRsc,602
+imageatlas/clustering/base.py,sha256=rpy_JI6nUakwF7qg1vUjy40FDqr7WyXu7W8dJvRd2qU,5082
+imageatlas/clustering/factory.py,sha256=h_NRt-edgJh-5jzfDYIuv5eeXgV-mQiJCPzj29YjV_E,1822
+imageatlas/clustering/gmm.py,sha256=heADnjFuQfZHxsedOmDLwORW2o1533sHrRGVOsjRUPE,6368
+imageatlas/clustering/hdbscan_clustering.py,sha256=H8Yj74XCiCBnuVhqygGL4gHFaZrBiNwl14GiedmyQMI,6291
+imageatlas/clustering/kmeans.py,sha256=M5ibQpgC43FQbEzDwKBPj4c4fjzWT3h3SvY670b9vRs,5043
 imageatlas/core/__init__.py,sha256=FnKCmANLS0flQzoPNAwTJbIJvn0JQXTlXDQ5n6F5rWo,347
 imageatlas/core/clusterer.py,sha256=-q6wovIfOhNJWwaU9sV1A9dTeksWFkCg8hZ14QwdDXM,11661
 imageatlas/core/results.py,sha256=jekDXZG4bjcmmsob21QKwWrRBucIxekEpz90E_uEnWs,11372
 imageatlas/features/__init__.py,sha256=Zk2IzFNhULQvzQWjscz2q9-lorpPHeARseoo-TwBJwU,442
 imageatlas/features/adapter.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-imageatlas/features/batch.py,sha256=
-imageatlas/features/cache.py,sha256=
-imageatlas/features/loaders.py,sha256=
-imageatlas/features/metadata.py,sha256=
-imageatlas/features/pipeline.py,sha256=
+imageatlas/features/batch.py,sha256=M94pSbBab8SUT6brEx8QhQ2c_HPl5B-3nWAUJkNKRmw,4943
+imageatlas/features/cache.py,sha256=HpxuhgVP9jdnniuRxoh8yebRhrdItqfzyKb3xQCN8k0,8354
+imageatlas/features/loaders.py,sha256=BEZqhDHNgTg-7q_WBX0_dkDvGGnNwdIQd0wJn6TuT9M,6069
+imageatlas/features/metadata.py,sha256=5U0J2vCEaDsuxkYuu2t8ZqeN60scB5q-asfvCcgBIMI,2468
+imageatlas/features/pipeline.py,sha256=em_2fajOxZghd9xRFAdzcz94Hji18RJ-Gezy_Q51GgQ,11471
 imageatlas/features/extractors/__init__.py,sha256=FmFDGzZeipltlZuGVZjY8ONTDywly85Y9X_GVneFZTA,345
 imageatlas/features/extractors/base.py,sha256=MnpwqNQveWEtWkzujvdasawY5ndT7gHRm2gJzPHrLqA,2297
 imageatlas/features/extractors/clip.py,sha256=Hq64t1fKTmWtMunAYaRVGX_-DjC5AD67Jwtq-2680H4,965
@@ -35,8 +35,8 @@ imageatlas/reduction/tsne.py,sha256=Ra5vq8sWfGQ_0nfLL_QcO0zg6BIe_nhn7D948Hy_LAY,
 imageatlas/reduction/umap_reducer.py,sha256=Lwu5_lDZt9CKBFmu7qpKYwyY2grChUMKwHi4129fxcQ,2951
 imageatlas/visualization/__init__.py,sha256=sWZUMQn3p3s9IYuksZ0tInifnM4QwYTQIZpvL0GrwOc,171
 imageatlas/visualization/grids.py,sha256=MITFnFo81yua7VG2tIsC8obXr7Tf24XxYw2oCNFQFnU,5456
-imageatlas-0.1.
-imageatlas-0.1.
-imageatlas-0.1.
-imageatlas-0.1.
-imageatlas-0.1.
+imageatlas-0.1.2.dist-info/licenses/LICENSE,sha256=FM0ees3eP8Mm6C2J9euHxj8RjBIPQk5EWLxk4bYfez0,1068
+imageatlas-0.1.2.dist-info/METADATA,sha256=_NflQfHMcM_zVKQkUhx3lZh0Avs07adFgAmxNV1h_AE,8282
+imageatlas-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+imageatlas-0.1.2.dist-info/top_level.txt,sha256=jB6Ct7oH-wRZOSCZpFKo-yXZtkYcfq3ucb6eqI3JWig,11
+imageatlas-0.1.2.dist-info/RECORD,,
{imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/licenses/LICENSE
File without changes
{imageatlas-0.1.0.dist-info → imageatlas-0.1.2.dist-info}/top_level.txt
File without changes