imageatlas 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {imageatlas-0.1.0 → imageatlas-0.1.2}/PKG-INFO +8 -1
  2. {imageatlas-0.1.0 → imageatlas-0.1.2}/README.md +7 -0
  3. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/__init__.py +1 -1
  4. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/clustering/__init__.py +10 -0
  5. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/clustering/base.py +33 -0
  6. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/clustering/factory.py +24 -0
  7. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/clustering/gmm.py +42 -1
  8. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/clustering/hdbscan_clustering.py +28 -1
  9. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/clustering/kmeans.py +27 -0
  10. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/batch.py +23 -1
  11. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/cache.py +39 -1
  12. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/loaders.py +52 -0
  13. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/metadata.py +3 -0
  14. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/pipeline.py +50 -0
  15. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas.egg-info/PKG-INFO +8 -1
  16. {imageatlas-0.1.0 → imageatlas-0.1.2}/CHANGELOG.md +0 -0
  17. {imageatlas-0.1.0 → imageatlas-0.1.2}/CONTRIBUTING.md +0 -0
  18. {imageatlas-0.1.0 → imageatlas-0.1.2}/LICENSE +0 -0
  19. {imageatlas-0.1.0 → imageatlas-0.1.2}/MANIFEST.in +0 -0
  20. {imageatlas-0.1.0 → imageatlas-0.1.2}/examples/example_apis.ipynb +0 -0
  21. {imageatlas-0.1.0 → imageatlas-0.1.2}/examples/example_complete_workflow.py +0 -0
  22. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/core/__init__.py +0 -0
  23. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/core/clusterer.py +0 -0
  24. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/core/results.py +0 -0
  25. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/__init__.py +0 -0
  26. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/adapter.py +0 -0
  27. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/__init__.py +0 -0
  28. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/base.py +0 -0
  29. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/clip.py +0 -0
  30. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/convnext.py +0 -0
  31. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/dinov2.py +0 -0
  32. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/efficientnet.py +0 -0
  33. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/factory.py +0 -0
  34. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/mobilenet.py +0 -0
  35. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/resnet.py +0 -0
  36. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/swin.py +0 -0
  37. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/vgg.py +0 -0
  38. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/features/extractors/vit.py +0 -0
  39. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/reduction/__init__.py +0 -0
  40. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/reduction/base.py +0 -0
  41. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/reduction/factory.py +0 -0
  42. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/reduction/pca.py +0 -0
  43. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/reduction/tsne.py +0 -0
  44. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/reduction/umap_reducer.py +0 -0
  45. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/visualization/__init__.py +0 -0
  46. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas/visualization/grids.py +0 -0
  47. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas.egg-info/SOURCES.txt +0 -0
  48. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas.egg-info/dependency_links.txt +0 -0
  49. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas.egg-info/requires.txt +0 -0
  50. {imageatlas-0.1.0 → imageatlas-0.1.2}/imageatlas.egg-info/top_level.txt +0 -0
  51. {imageatlas-0.1.0 → imageatlas-0.1.2}/pyproject.toml +0 -0
  52. {imageatlas-0.1.0 → imageatlas-0.1.2}/requirements.txt +0 -0
  53. {imageatlas-0.1.0 → imageatlas-0.1.2}/setup.cfg +0 -0
  54. {imageatlas-0.1.0 → imageatlas-0.1.2}/tests/test_batch_processing.py +0 -0
  55. {imageatlas-0.1.0 → imageatlas-0.1.2}/tests/test_core_api.py +0 -0
  56. {imageatlas-0.1.0 → imageatlas-0.1.2}/tests/test_features_pipeline.py +0 -0
  57. {imageatlas-0.1.0 → imageatlas-0.1.2}/tests/test_reduction_module.py +0 -0
  58. {imageatlas-0.1.0 → imageatlas-0.1.2}/tests/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: imageatlas
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: ImageAtlas: A toolkit for organizing, cleaning and analysing your image datasets.
5
5
  Author-email: Ahmad Javed <ahmadjaved97@gmail.com>
6
6
  Maintainer-email: Ahmad Javed <ahmadjaved97@gmail.com>
@@ -63,6 +63,7 @@ Requires-Dist: openpyxl; extra == "full"
63
63
  Dynamic: license-file
64
64
 
65
65
  # ImageAtlas
66
+ [![PyPI Downloads](https://static.pepy.tech/personalized-badge/imageatlas?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=Downloads)](https://pepy.tech/projects/imageatlas)
66
67
 
67
68
  ## Overview
68
69
 
@@ -86,6 +87,12 @@ pip install imageatlas
86
87
  pip install imageatlas[full]
87
88
  ```
88
89
 
90
+ **Note on CLIP**: If you wish to use the CLIP model, you must install it manually from GitHub using:
91
+
92
+ ```
93
+ pip install git+https://github.com/openai/CLIP.git
94
+ ```
95
+
89
96
  **From Source**
90
97
  ```
91
98
  git clone https://github.com/ahmadjaved97/ImageAtlas.git
@@ -1,4 +1,5 @@
1
1
  # ImageAtlas
2
+ [![PyPI Downloads](https://static.pepy.tech/personalized-badge/imageatlas?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=Downloads)](https://pepy.tech/projects/imageatlas)
2
3
 
3
4
  ## Overview
4
5
 
@@ -22,6 +23,12 @@ pip install imageatlas
22
23
  pip install imageatlas[full]
23
24
  ```
24
25
 
26
+ **Note on CLIP**: If you wish to use the CLIP model, you must install it manually from GitHub using:
27
+
28
+ ```
29
+ pip install git+https://github.com/openai/CLIP.git
30
+ ```
31
+
25
32
  **From Source**
26
33
  ```
27
34
  git clone https://github.com/ahmadjaved97/ImageAtlas.git
@@ -2,7 +2,7 @@
2
2
  ImageAtlas: A toolkit for organizing, cleaning and analysing your image datasets.
3
3
  """
4
4
 
5
- __version__ = '0.1.0'
5
+ __version__ = '0.1.2'
6
6
 
7
7
 
8
8
  # 1. High level API (The everything tool)
@@ -1,7 +1,15 @@
1
+ """
2
+ Clustering Algorithms module.
3
+
4
+ This module provides various clustering algorithms with a unified interface for clustering
5
+ on image features.
6
+
7
+ """
1
8
  from .base import ClusteringResult, ClusteringAlgorithm
2
9
  from .kmeans import KMeansClustering
3
10
  from .hdbscan_clustering import HDBSCANClustering
4
11
  from .gmm import GMMClustering
12
+ from .factory import create_clustering_algorithm, get_available_algorithms
5
13
 
6
14
 
7
15
 
@@ -11,4 +19,6 @@ __all__ = [
11
19
  'KMeansClustering',
12
20
  'HDBSCANClustering',
13
21
  'GMMClustering',
22
+ 'create_clustering_algorithm',
23
+ 'get_available_algorithms'
14
24
  ]
@@ -8,7 +8,13 @@ import numpy as np
8
8
  class ClusteringResult:
9
9
  """
10
10
  Container for clustering Results.
11
+ Attributes:
12
+ cluster_labels: Array of cluster assignments for each sample.
13
+ cluster_dict: Dictionary mapping cluster IDs to list of sample indices.
14
+ n_clusters: Number of clusters found.
15
+ metadata: Additional algorithm-specific metadata.
11
16
  """
17
+
12
18
  cluster_labels: np.ndarray
13
19
  cluster_dict: Dict[int, List[int]]
14
20
  n_clusters: int
@@ -49,11 +55,18 @@ class ClusteringResult:
49
55
  class ClusteringAlgorithm(ABC):
50
56
  """
51
57
  Abstract base class for all clustering algorithms.
58
+
59
+ All the clustering algorithms must implement the fit_predict method and
60
+ provide a consistent interface for clustering operations.
52
61
  """
53
62
 
54
63
  def __init__(self, random_state=42, **kwargs):
55
64
  """
56
65
  Initialize the clustering algorithm.
66
+
67
+ Args:
68
+ random_state: Random seed for reproducibility.
69
+ **kwargs: Additional algorithm related parameters.
57
70
  """
58
71
  self.random_state = random_state
59
72
  self.params = kwargs
@@ -64,6 +77,12 @@ class ClusteringAlgorithm(ABC):
64
77
  def fit_predict(self, features) -> ClusteringResult:
65
78
  """
66
79
  Fit the clustering algorithm and predict cluster labels.
80
+
81
+ Args:
82
+ features: Feature matrix of shape (n_samples, n_features)
83
+
84
+ Returns:
85
+ ClusteringResult object containing cluster assignments and metadata.
67
86
  """
68
87
  pass
69
88
 
@@ -77,7 +96,14 @@ class ClusteringAlgorithm(ABC):
77
96
  def _validate_features(self, features:np.ndarray) -> None:
78
97
  """
79
98
  Validate the input feature matrix.
99
+
100
+ Args:
101
+ features: Feature matrix of shape (n_samples, n_features) to validate.
102
+
103
+ Raises:
104
+ ValueError: If features are invalid.
80
105
  """
106
+
81
107
  if not isinstance(features, np.ndarray):
82
108
  raise ValueError(f"Feature must be a numpy array, got {type(features)}")
83
109
 
@@ -93,6 +119,13 @@ class ClusteringAlgorithm(ABC):
93
119
  def _create_cluster_dict(self, cluster_labels, filenames=None):
94
120
  """
95
121
  Create a dictionary mapping cluster IDs to indices or filenames
122
+
123
+ Args:
124
+ cluster_labels: Array of cluster assignments.
125
+ filenames: Optional list of filenames corresponding to images.
126
+
127
+ Returns:
128
+ Dictionary mapping cluster IDs to lists of indices or filenames
96
129
  """
97
130
 
98
131
  cluster_dict = {}
@@ -22,6 +22,27 @@ def create_clustering_algorithm(
22
22
  ) -> ClusteringAlgorithm:
23
23
  """
24
24
  Factory function to create clustering algorithms.
25
+
26
+ Args:
27
+ method: Name of the clustering algorithm ('kmeans', 'gmm', 'hdbscan')
28
+ **kwargs: Algorithm specific parameters
29
+
30
+ Returns:
31
+ Instance of the requested clustering algorithm
32
+
33
+ Raises:
34
+ ValueError: If clustering method is not supported.
35
+
36
+
37
+ Examples:
38
+ >>> # Create KMeans with 5 clusters
39
+ >>> clusterer = create_clustering_algorithm('kmeans', n_clusters=5)
40
+
41
+ >>> # Create GMM with full covariance
42
+ >>> clusterer = create_clustering_algorithm('gmm', n_components=8, covariance_type='full')
43
+
44
+ >>> # Create HDBSCAN with auto parameters
45
+ >>> clusterer = create_clustering_algorithm('hdbscan', auto_params=True)
25
46
  """
26
47
 
27
48
  method = method.lower()
@@ -39,5 +60,8 @@ def create_clustering_algorithm(
39
60
  def get_available_algorithms():
40
61
  """
41
62
  Get a list of available clustering algorithms.
63
+
64
+ Returns:
65
+ List of algorithm names.
42
66
  """
43
67
  return sorted(CLUSTERING_ALGORITHMS.keys())
@@ -10,6 +10,14 @@ from .base import ClusteringAlgorithm, ClusteringResult
10
10
  class GMMClustering(ClusteringAlgorithm):
11
11
  """
12
12
  Gaussian Mixture Model clustering algorithm.
13
+
14
+ Args:
15
+ n_components: Number of mixture components (clusters)
16
+ covariance_type: Type of covariance parameters ('full', 'diag', 'tied', 'spherical')
17
+ max_iter: Maximum number of EM iterations
18
+ n_init: Number of initializations to perform
19
+ reg_covar: Regularization added to diagonal of covariance (prevents singular matrices)
20
+ random_state: Random seed for reproducibility
13
21
  """
14
22
 
15
23
  def __init__(
@@ -46,10 +54,16 @@ class GMMClustering(ClusteringAlgorithm):
46
54
 
47
55
  """
48
56
  Fit GMM and predict cluster labels.
57
+
58
+ Args:
59
+ features: Feature matrix of shape (n_samples, n_features)
60
+ filenames: Optional list of filenames for cluster mapping
61
+
62
+ Returns:
63
+ ClusteringResult object with cluster assignments.
49
64
  """
50
65
 
51
66
  self._validate_features(features)
52
- print('fshape: ', features.shape)
53
67
 
54
68
  n_samples = features.shape[0]
55
69
 
@@ -110,6 +124,15 @@ class GMMClustering(ClusteringAlgorithm):
110
124
  def predict(self, features):
111
125
  """
112
126
  Predict cluster label for new samples.
127
+
128
+ Args:
129
+ features: Feature matrix of shape (n_samples, n_features)
130
+
131
+ Returns:
132
+ Array of cluster labels
133
+
134
+ Raises:
135
+ RuntimeError: If model has not been fitted yet.
113
136
  """
114
137
 
115
138
  if not self.is_fitted or self._model is None:
@@ -121,6 +144,15 @@ class GMMClustering(ClusteringAlgorithm):
121
144
  def predict_proba(self, features):
122
145
  """
123
146
  Predict probability of each cluster for new samples.
147
+
148
+ Args:
149
+ features: Feature matrix of shape (n_samples, n_features)
150
+
151
+ Returns:
152
+ Array of cluster probabilities for each sample
153
+
154
+ Raises:
155
+ RuntimeError: If model has not been fitted yet.
124
156
  """
125
157
 
126
158
  if not self.is_fitted or self._model is None:
@@ -132,6 +164,9 @@ class GMMClustering(ClusteringAlgorithm):
132
164
  def get_cluster_means(self):
133
165
  """
134
166
  Get cluster means (centers) if model is fitted.
167
+
168
+ Returns:
169
+ Array of cluster centers or None if not fitted.
135
170
  """
136
171
 
137
172
  if self.is_fitted and self._model is not None:
@@ -142,6 +177,12 @@ class GMMClustering(ClusteringAlgorithm):
142
177
  def score(self, features):
143
178
  """
144
179
  Compute the log-likelihood of the data under the model.
180
+
181
+ Args:
182
+ features: Feature matrix of shape (n_samples, n_features)
183
+
184
+ Returns:
185
+ Log-likelihood score
145
186
  """
146
187
 
147
188
  if not self.is_fitted or self._model is None:
@@ -7,7 +7,16 @@ from .base import ClusteringAlgorithm, ClusteringResult
7
7
 
8
8
  class HDBSCANClustering(ClusteringAlgorithm):
9
9
  """
10
- HDBSCAN algorithm.
10
+ HDBSCAN (Hierarchical Density-Based Spatial Clustering) Algorithm.
11
+
12
+ Args:
13
+ min_cluster_size: Minimum number of samples in a cluster
14
+ min_samples: Number of samples in a neighborhood for core points.
15
+ metric: Distance metric to use
16
+ cluster_selection_method: Method for selecting clusters ('eom' or 'leaf')
17
+ auto_params: Whether to automatically set parameters based on dataset size
18
+ random_state: Random seed (note: HDBSCAN is deterministic, this is for consistency)
19
+
11
20
  """
12
21
 
13
22
  def __init__(
@@ -37,6 +46,12 @@ class HDBSCANClustering(ClusteringAlgorithm):
37
46
  def _auto_select_params(self, n_samples):
38
47
  """
39
48
  Automatically select HDBSCAN parameters based on dataset size.
49
+
50
+ Args:
51
+ n_samples: Number of samples in the dataset.
52
+
53
+ Returns:
54
+ Tuple of (min_cluster_size, min_samples)
40
55
  """
41
56
 
42
57
  if n_samples < 100:
@@ -62,6 +77,10 @@ class HDBSCANClustering(ClusteringAlgorithm):
62
77
 
63
78
  """
64
79
  Fit HDBSCAN and predict cluster labels.
80
+
81
+ Args:
82
+ features: Feature matrix of shape (n_samples, n_features)
83
+ filenames: Optional list of filenames for cluster mapping.
65
84
  """
66
85
 
67
86
  try:
@@ -137,6 +156,11 @@ class HDBSCANClustering(ClusteringAlgorithm):
137
156
  def get_outlier_score(self):
138
157
  """
139
158
  Get outlier score for each sample.
159
+
160
+ Higher scores indicate more likely outliers.
161
+
162
+ Returns:
163
+ Array of outlier scores or None if model is not fitted.
140
164
  """
141
165
 
142
166
  if self.is_fitted and self._model is not None:
@@ -147,6 +171,9 @@ class HDBSCANClustering(ClusteringAlgorithm):
147
171
  def get_condensed_tree(self):
148
172
  """
149
173
  Get condensed cluster hierarchy tree.
174
+
175
+ Returns:
176
+ Condensed cluster hierarchy tree or None if model not fitted.
150
177
  """
151
178
 
152
179
  if self.is_fitted and self._model is not None:
@@ -10,6 +10,14 @@ from typing import Optional
10
10
  class KMeansClustering(ClusteringAlgorithm):
11
11
  """
12
12
  K-Means clustering algorithm.
13
+
14
+ Args:
15
+ n_clusters: Number of clusters to form
16
+ n_init: Number of times to run with different centroid seeds
17
+ max_iter: Maximum number of iterations
18
+ use_minibatch: Whether to use MiniBatchKMeans for large datasets
19
+ batch_size: Batch size for MiniBatchKMeans
20
+ random_state: Random seed for reproducibility
13
21
  """
14
22
 
15
23
  def __init__(
@@ -42,6 +50,13 @@ class KMeansClustering(ClusteringAlgorithm):
42
50
 
43
51
  """
44
52
  Fit K-Means and predict cluster labels.
53
+
54
+ Args:
55
+ features: Feature matrix of shape (n_samples, n_features)
56
+ filenames: Optional list of filenames for cluster mapping
57
+
58
+ Returns:
59
+ ClusteringResult object with cluster assignments.
45
60
  """
46
61
 
47
62
  self._validate_features(features)
@@ -108,6 +123,15 @@ class KMeansClustering(ClusteringAlgorithm):
108
123
  def predict(self, features):
109
124
  """
110
125
  Predict cluster label for new samples.
126
+
127
+ Args:
128
+ features: Feature matrix of shape (n_samples, n_features)
129
+
130
+ Returns:
131
+ Array of cluster labels
132
+
133
+ Raises:
134
+ RuntimeError: If model has not yet been fitted.
111
135
  """
112
136
 
113
137
  if not self.is_fitted or self._model == None:
@@ -119,6 +143,9 @@ class KMeansClustering(ClusteringAlgorithm):
119
143
  def get_cluster_centers(self):
120
144
  """
121
145
  Get cluster centers if model is fitted.
146
+
147
+ Returns:
148
+ Array of cluster centers or None if not fitted.
122
149
  """
123
150
  if self.is_fitted and self._model is not None:
124
151
  return self._model.cluster_centers_
@@ -10,6 +10,8 @@ import warnings
10
10
  class BatchProcessor:
11
11
  """
12
12
  Handles batch processing of images through feature extractors.
13
+
14
+ Manages batching, device placement and memory cleanup.
13
15
  """
14
16
 
15
17
  def __init__(
@@ -20,6 +22,11 @@ class BatchProcessor:
20
22
  ):
21
23
  """
22
24
  Initialize batch processor.
25
+
26
+ Args:
27
+ batch_size: Number of images to process at once.
28
+ device: Device to use ('cpu', 'cuda', 'cuda:0', etc.)
29
+ clear_cache: Whether to clear GPU cache after each batch
23
30
  """
24
31
 
25
32
  self.batch_size = batch_size
@@ -50,8 +57,15 @@ class BatchProcessor:
50
57
  ):
51
58
  """
52
59
  Process a batch of images through the feature extractor.
53
- TODO: use the correct batching method in the feature_extractors module.
60
+ Args:
61
+ images: List of PIL Images.
62
+ extractor: Feature extractor with extract_features method
63
+ return_numpy: Whether to return numpy array (vs torch tensor)
64
+
65
+ Returns:
66
+ Array of feature vectors, shape (batch_size, feature_dim)
54
67
  """
68
+ # TODO: use the correct batching method in the feature_extractors module.
55
69
 
56
70
  if not images:
57
71
  return np.array([])
@@ -108,6 +122,14 @@ class BatchProcessor:
108
122
  ):
109
123
  """
110
124
  Estimate memory usage for a batch.
125
+
126
+ Args:
127
+ n_images: Number of images in a batch
128
+ feature_dim: Dimensions of feature vector
129
+ dtype: Data type of features
130
+
131
+ Returns:
132
+ Estimated memory in GB
111
133
  """
112
134
 
113
135
  bytes_per_element = np.dtype(dtype).itemsize
@@ -70,6 +70,12 @@ class HDF5Cache(FeatureCache):
70
70
  ):
71
71
  """
72
72
  Save features to HDF5 file.
73
+
74
+ Args:
75
+ features: Feature array, shape (n_samples, feature_dim)
76
+ filenames: List of filenames corresponding to features
77
+ metadata: Feature metadata
78
+ path: Path to save HDF5 file
73
79
  """
74
80
 
75
81
  # Make sure path has .h5 extension
@@ -115,13 +121,20 @@ class HDF5Cache(FeatureCache):
115
121
  ):
116
122
  """
117
123
  Load features from HDF5 file.
124
+
125
+ Args:
126
+ path: Path to HDF5 File
127
+ lazy: If True, return memory-mapped array instead of loading to RAM
128
+
129
+ Returns:
130
+ Tuple of (features, filenames, metadata)
118
131
  """
119
132
 
120
133
  if not path.endswith(".h5"):
121
134
  path = path + ".h5"
122
135
 
123
136
  if not self.exists(path):
124
- raise FileNotFoundError("fCache file not found: {path}")
137
+ raise FileNotFoundError(f"Cache file not found: {path}")
125
138
 
126
139
  with h5py.File(path, 'r') as f:
127
140
  # Load filenames
@@ -161,6 +174,14 @@ class HDF5Cache(FeatureCache):
161
174
  ):
162
175
  """
163
176
  Load a subset of features.
177
+
178
+ Args:
179
+ path: Path to HDF5 file
180
+ indices: Indices to load (if provided)
181
+ filenames: Filenames to load (if provided)
182
+
183
+ Returns:
184
+ Tuple of (features, filenames)
164
185
  """
165
186
 
166
187
  if not path.endswith(".h5"):
@@ -193,6 +214,11 @@ class HDF5Cache(FeatureCache):
193
214
  ):
194
215
  """
195
216
  Append new features to existing cache.
217
+
218
+ Args:
219
+ path: Path to the HDF5 file
220
+ new_features: New features to append
221
+ new_filenames: Corresponding filenames
196
222
  """
197
223
 
198
224
  if not path.endswith(".h5"):
@@ -223,6 +249,12 @@ class HDF5Cache(FeatureCache):
223
249
  def get_feature_dict(self, path):
224
250
  """
225
251
  Load features as dictionary (for backward compatibility)
252
+
253
+ Args:
254
+ path: Path to HDF5 file
255
+
256
+ Returns:
257
+ Dictionary mapping filenames to feature vectors
226
258
  """
227
259
  features, filenames, _ = self.load(path)
228
260
  return {fn: feat for fn, feat in zip(filenames, features)}
@@ -230,6 +262,12 @@ class HDF5Cache(FeatureCache):
230
262
  def get_info(self, path):
231
263
  """
232
264
  Get information about the cache without loading data.
265
+
266
+ Args:
267
+ path: Path to HDF5 file
268
+
269
+ Returns:
270
+ Dictionary with cache information
233
271
  """
234
272
 
235
273
  if not path.endswith(".h5"):
@@ -12,6 +12,8 @@ import warnings
12
12
  class ImageLoader:
13
13
  """
14
14
  Image loader with validation and error handling.
15
+
16
+ Handles corrupted images, format conversions, and EXIF orientations.
15
17
  """
16
18
  VALID_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp'}
17
19
 
@@ -23,6 +25,11 @@ class ImageLoader:
23
25
  ):
24
26
  """
25
27
  Initialize image loader.
28
+
29
+ Args:
30
+ max_size: Optional maximum size (width, height) for images
31
+ convert_mode: PIL mode to convert images to ('RGB', 'L', etc.)
32
+ handle_exif: Whether to handle EXIF orientation
26
33
  """
27
34
 
28
35
  self.max_size = max_size
@@ -33,6 +40,12 @@ class ImageLoader:
33
40
  def validate_path(self, path):
34
41
  """
35
42
  Check if path is valid
43
+
44
+ Args:
45
+ path: Path to image file
46
+
47
+ Returns:
48
+ True if valid, False otherwise
36
49
  """
37
50
  if not os.path.exists(path):
38
51
  return False
@@ -43,6 +56,12 @@ class ImageLoader:
43
56
  def load_image(self, path):
44
57
  """
45
58
  Load a single image.
59
+
60
+ Args:
61
+ path: Path to image file
62
+
63
+ Returns:
64
+ PIL Image or None if loadig failed
46
65
  """
47
66
 
48
67
  try:
@@ -78,6 +97,12 @@ class ImageLoader:
78
97
  def load_batch(self, paths):
79
98
  """
80
99
  Load a batch of images.
100
+
101
+ Args:
102
+ paths: List of image paths
103
+
104
+ Returns:
105
+ Tuple of (loaded_images, successful_paths, failed_paths)
81
106
  """
82
107
 
83
108
  images = []
@@ -97,6 +122,12 @@ class ImageLoader:
97
122
  def _handle_orientation(self, image):
98
123
  """
99
124
  Handle EXIF orientation tag.
125
+
126
+ Args:
127
+ image: PIL Image
128
+
129
+ Returns:
130
+ Oriented image
100
131
  """
101
132
 
102
133
  try:
@@ -129,6 +160,12 @@ class ImageLoader:
129
160
  def _resize_if_needed(self, image):
130
161
  """
131
162
  Resize image if it exceeds max size.
163
+
164
+ Args:
165
+ image: PIL Image
166
+
167
+ Returns:
168
+ Resized image
132
169
  """
133
170
  if self.max_size is None:
134
171
  return image
@@ -149,6 +186,14 @@ class ImageLoader:
149
186
  ):
150
187
  """
151
188
  Find all images in a directory.
189
+
190
+ Args:
191
+ directory: Directory to search
192
+ pattern: Glob pattern for filenames
193
+ recursive: Whether to search recursively
194
+
195
+ Returns:
196
+ List of image paths
152
197
  """
153
198
  path = Path(directory)
154
199
 
@@ -180,6 +225,13 @@ class ImageLoader:
180
225
 
181
226
  """
182
227
  Create batches from a list of items.
228
+
229
+ Args:
230
+ items: List of items to batch
231
+ batch_size: Size of each batch
232
+
233
+ Yields:
234
+ Batches of items
183
235
  """
184
236
 
185
237
  for i in range(0, len(items), batch_size):
@@ -12,6 +12,9 @@ import json
12
12
  class FeatureMetadata:
13
13
  """
14
14
  Metadata for extracted features.
15
+
16
+ Tracks information about the feature extractionn process including
17
+ model details, extraction parameters, and statistics.
15
18
  """
16
19
 
17
20
  # Model information
@@ -17,6 +17,18 @@ from .metadata import FeatureMetadata
17
17
  class FeaturePipeline:
18
18
  """
19
19
  Main pipeline for feature extraction.
20
+
21
+ Handles batch processing, caching, progress tracking, and error recovery.
22
+
23
+ Example:
24
+ >>> from features import FeaturePipeline
25
+ >>> from feature_extractors import create_feature_extractor
26
+ >>>
27
+ >>> extractor = create_feature_extractor('dinov2', device='cuda')
28
+ >>> pipeline = FeaturePipeline(extractor, batch_size=32)
29
+ >>>
30
+ >>> result = pipeline.extract_from_directory('./images')
31
+ >>> pipeline.save('./features/features.h5')
20
32
  """
21
33
 
22
34
  def __init__(
@@ -30,6 +42,14 @@ class FeaturePipeline:
30
42
  ):
31
43
  """
32
44
  Initialize feature extraction pipeline.
45
+
46
+ Args:
47
+ extractor: Feature extractor (from feature_extractors module)
48
+ batch_size: Number of images to process at once
49
+ device: Device for processing ('cpu', 'cuda')
50
+ cache_backend: Cache backend to use ('hdf5')
51
+ max_image_size: Optional max size for images (width, height)
52
+ verbose: Whether to show progress bars
33
53
  """
34
54
 
35
55
  self.extractor = extractor
@@ -79,6 +99,16 @@ class FeaturePipeline:
79
99
 
80
100
  """
81
101
  Extract features from all images in a directory.
102
+
103
+ Args:
104
+ directory: Directory containing images
105
+ pattern: Glob pattern for filenames
106
+ recursive: Whether to search recursively
107
+ save_every: Save checkpoint every N images (optional)
108
+ save_path: Path for checkpoint saves (required if save_every is set)
109
+
110
+ Returns:
111
+ Self for method chaining
82
112
  """
83
113
 
84
114
  # Find all images
@@ -112,6 +142,14 @@ class FeaturePipeline:
112
142
 
113
143
  """
114
144
  Extract features from a list of filepaths.
145
+
146
+ Args:
147
+ file_paths: List of image file paths
148
+ save_every: Save checkpoint every N images (optional)
149
+ save_path: Path for checkpoint saves (required if save_every is set)
150
+
151
+ Returns:
152
+ Self for method chaining
115
153
  """
116
154
 
117
155
  if save_every is not None and save_path is None:
@@ -223,6 +261,10 @@ class FeaturePipeline:
223
261
  def save(self, path, format='hdf5'):
224
262
  """
225
263
  Save extracted features to disk.
264
+
265
+ Args:
266
+ path: Path to save features
267
+ format: Format to use ('hdf5')
226
268
  """
227
269
 
228
270
  if self.features is None or self.metadata is None:
@@ -244,6 +286,11 @@ class FeaturePipeline:
244
286
  def load(self, path):
245
287
  """
246
288
  Load features from disk.
289
+
290
+ Args:
291
+ path: Path to feature cache
292
+
293
+ Returns: Self for method chaining
247
294
  """
248
295
 
249
296
  self.features, self.filenames, self.metadata = self.cache.load(path)
@@ -271,6 +318,9 @@ class FeaturePipeline:
271
318
  def get_feature_dict(self):
272
319
  """
273
320
  Get features as dictionary
321
+
322
+ Returns:
323
+ Dictionary mapping filenames to feature vectors
274
324
  """
275
325
 
276
326
  if self.features is None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: imageatlas
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: ImageAtlas: A toolkit for organizing, cleaning and analysing your image datasets.
5
5
  Author-email: Ahmad Javed <ahmadjaved97@gmail.com>
6
6
  Maintainer-email: Ahmad Javed <ahmadjaved97@gmail.com>
@@ -63,6 +63,7 @@ Requires-Dist: openpyxl; extra == "full"
63
63
  Dynamic: license-file
64
64
 
65
65
  # ImageAtlas
66
+ [![PyPI Downloads](https://static.pepy.tech/personalized-badge/imageatlas?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=Downloads)](https://pepy.tech/projects/imageatlas)
66
67
 
67
68
  ## Overview
68
69
 
@@ -86,6 +87,12 @@ pip install imageatlas
86
87
  pip install imageatlas[full]
87
88
  ```
88
89
 
90
+ **Note on CLIP**: If you wish to use the CLIP model, you must install it manually from GitHub using:
91
+
92
+ ```
93
+ pip install git+https://github.com/openai/CLIP.git
94
+ ```
95
+
89
96
  **From Source**
90
97
  ```
91
98
  git clone https://github.com/ahmadjaved97/ImageAtlas.git
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes