linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/plotting/dimensionality_reduction.py (new file, +453 lines)

@@ -0,0 +1,453 @@

```python
"""Dimensionality reduction utilities for embedding visualization."""

import logging
from dataclasses import dataclass
from typing import Dict, Literal, Optional, Tuple, Union

import numpy as np

logger = logging.getLogger(__name__)


@dataclass
class ReductionResult:
    """Container for dimensionality reduction results."""

    coordinates: np.ndarray
    method: str
    parameters: Dict
    explained_variance: Optional[float] = None

    @property
    def n_samples(self) -> int:
        """Number of samples."""
        return len(self.coordinates)

    @property
    def n_components(self) -> int:
        """Number of reduced dimensions."""
        return self.coordinates.shape[1]
```
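A quick note on the container above: `n_components` reads `coordinates.shape[1]`, so it assumes a 2-D array. A minimal sketch of constructing a result by hand (illustrative values only; the import path follows the file list above):

```python
import numpy as np

from linkml_store.plotting.dimensionality_reduction import ReductionResult

# Wrap a (4 samples x 2 dims) coordinate array by hand.
result = ReductionResult(
    coordinates=np.zeros((4, 2)),
    method="pca",
    parameters={"n_components": 2},
)
assert result.n_samples == 4
assert result.n_components == 2
```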
```python
def reduce_dimensions(
    vectors: np.ndarray,
    method: Literal["umap", "tsne", "pca"] = "umap",
    n_components: int = 2,
    random_state: Optional[int] = None,
    **kwargs,
) -> ReductionResult:
    """
    Reduce dimensionality of embedding vectors.

    Args:
        vectors: Input vectors (n_samples, n_features)
        method: Reduction method
        n_components: Number of output dimensions
        random_state: Random seed for reproducibility
        **kwargs: Additional parameters for the reduction method

    Returns:
        ReductionResult with reduced coordinates
    """
    if method == "pca":
        return _reduce_with_pca(vectors, n_components, random_state, **kwargs)
    elif method == "tsne":
        return _reduce_with_tsne(vectors, n_components, random_state, **kwargs)
    elif method == "umap":
        return _reduce_with_umap(vectors, n_components, random_state, **kwargs)
    else:
        raise ValueError(f"Unknown reduction method: {method}")
```
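The dispatcher forwards `**kwargs` untouched to the method-specific helpers, so method-specific options (`perplexity`, `n_neighbors`, and so on) pass straight through; note that only the PCA path fills in `explained_variance`. A hedged usage sketch on synthetic data, assuming the package and scikit-learn are installed:

```python
import numpy as np

from linkml_store.plotting.dimensionality_reduction import reduce_dimensions

rng = np.random.default_rng(0)
vectors = rng.normal(size=(200, 64))  # 200 synthetic 64-d embeddings

# "pca" needs only scikit-learn; "umap" would additionally need umap-learn.
result = reduce_dimensions(vectors, method="pca", n_components=2, random_state=42)
print(result.coordinates.shape)   # (200, 2)
print(result.explained_variance)  # total variance fraction kept by the 2 PCs
```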
```python
def _reduce_with_pca(
    vectors: np.ndarray,
    n_components: int,
    random_state: Optional[int],
    **kwargs,
) -> ReductionResult:
    """Reduce dimensions using PCA."""
    try:
        from sklearn.decomposition import PCA
    except ImportError:
        raise ImportError("scikit-learn is required for PCA. Install with: pip install scikit-learn")

    pca = PCA(n_components=n_components, random_state=random_state, **kwargs)
    coordinates = pca.fit_transform(vectors)

    explained_var = sum(pca.explained_variance_ratio_) if hasattr(pca, "explained_variance_ratio_") else None

    return ReductionResult(
        coordinates=coordinates,
        method="pca",
        parameters={"n_components": n_components, **kwargs},
        explained_variance=explained_var,
    )
```
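One detail worth flagging in `_reduce_with_pca`: the stored `explained_variance` is the *sum* of `explained_variance_ratio_`, i.e. the total fraction of variance retained across all kept components, not a per-component breakdown. If per-component ratios matter, they can be recovered with scikit-learn directly; a minimal sketch on synthetic data:

```python
import numpy as np
from sklearn.decomposition import PCA

vectors = np.random.default_rng(1).normal(size=(100, 32))
pca = PCA(n_components=2).fit(vectors)
print(pca.explained_variance_ratio_)        # per-component fractions
print(pca.explained_variance_ratio_.sum())  # the single number stored above
```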
```python
def _reduce_with_tsne(
    vectors: np.ndarray,
    n_components: int,
    random_state: Optional[int],
    perplexity: float = 30.0,
    learning_rate: Union[float, str] = "auto",
    n_iter: int = 1000,
    max_dimensions: int = 500,
    **kwargs,
) -> ReductionResult:
    """Reduce dimensions using t-SNE."""
    try:
        from sklearn.manifold import TSNE
    except ImportError:
        raise ImportError("scikit-learn is required for t-SNE. Install with: pip install scikit-learn")

    # Validate and debug input vectors
    logger.info(f"Input vector shape: {vectors.shape}")
    logger.info(f"Input vector dtype: {vectors.dtype}")

    # Check for NaN or Inf values
    if np.any(np.isnan(vectors)):
        nan_count = np.sum(np.isnan(vectors))
        nan_rows = np.any(np.isnan(vectors), axis=1)
        logger.warning(f"Found {nan_count} NaN values in {np.sum(nan_rows)} rows")
        # Replace NaNs with zeros as a fallback
        vectors = np.nan_to_num(vectors, nan=0.0)
        logger.info("Replaced NaN values with zeros")

    if np.any(np.isinf(vectors)):
        inf_count = np.sum(np.isinf(vectors))
        inf_rows = np.any(np.isinf(vectors), axis=1)
        logger.warning(f"Found {inf_count} Inf values in {np.sum(inf_rows)} rows")
        # Replace Infs with large finite values
        vectors = np.nan_to_num(vectors, posinf=1e10, neginf=-1e10)
        logger.info("Replaced Inf values with finite values")

    # Check vector statistics
    logger.info(
        f"Vector stats - min: {np.min(vectors):.6f}, max: {np.max(vectors):.6f}, "
        f"mean: {np.mean(vectors):.6f}, std: {np.std(vectors):.6f}"
    )

    # Check if all vectors are identical (can cause issues)
    if np.allclose(vectors, vectors[0]):
        logger.warning("All input vectors are identical! This will cause t-SNE to fail.")
        # Add small random noise to break symmetry
        noise = np.random.RandomState(random_state).normal(0, 1e-8, vectors.shape)
        vectors = vectors + noise
        logger.info("Added small random noise to break symmetry")

    # Check variance per dimension
    dim_variance = np.var(vectors, axis=0)
    zero_var_dims = np.sum(dim_variance == 0)
    if zero_var_dims > 0:
        logger.warning(f"Found {zero_var_dims} dimensions with zero variance")

    # t-SNE specific adjustments
    n_samples = len(vectors)
    perplexity = min(perplexity, n_samples - 1)

    # Additional perplexity validation
    if perplexity < 5:
        logger.warning(f"Perplexity {perplexity} is very low, may cause instability")
    if perplexity > n_samples / 2:
        logger.warning(f"Perplexity {perplexity} is very high relative to sample size {n_samples}")

    logger.info(f"t-SNE parameters: perplexity={perplexity}, learning_rate={learning_rate}, n_iter={n_iter}")

    # Pre-reduce with PCA if dimensions are very high (>max_dimensions)
    n_features = vectors.shape[1]
    if n_features > max_dimensions:
        logger.info(f"High dimensional data ({n_features}D). Pre-reducing with PCA to {max_dimensions}D for t-SNE stability")
        from sklearn.decomposition import PCA

        pca = PCA(n_components=min(max_dimensions, n_samples - 1), random_state=random_state)
        vectors = pca.fit_transform(vectors)
        logger.info(f"PCA reduced to shape: {vectors.shape}")

    # Use max_iter instead of n_iter for newer sklearn versions
    tsne_params = {
        "n_components": n_components,
        "perplexity": perplexity,
        "learning_rate": learning_rate,
        "random_state": random_state,
        "init": "random",  # Use random init to avoid potential issues
        "method": "barnes_hut" if n_samples >= 1000 else "exact",  # Use exact for small datasets
    }

    # Handle deprecated n_iter parameter
    try:
        # Try with max_iter first (newer sklearn)
        tsne_params["max_iter"] = n_iter
        tsne = TSNE(**tsne_params, **kwargs)
    except TypeError:
        # Fall back to n_iter for older sklearn
        tsne_params["n_iter"] = n_iter
        del tsne_params["max_iter"]
        tsne = TSNE(**tsne_params, **kwargs)

    logger.info(f"Starting t-SNE fit_transform with {n_samples} samples, method: {tsne_params.get('method', 'auto')}")
    try:
        coordinates = tsne.fit_transform(vectors)
        logger.info("t-SNE fit transform complete")
    except Exception as e:
        logger.error(f"t-SNE failed with error: {e}")
        logger.error(f"Error type: {type(e).__name__}")
        raise

    return ReductionResult(
        coordinates=coordinates,
        method="tsne",
        parameters={
            "n_components": n_components,
            "perplexity": perplexity,
            "learning_rate": learning_rate,
            "n_iter": n_iter,
            **kwargs,
        },
    )
```
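The `try/except TypeError` block above bridges a scikit-learn API change: newer releases renamed t-SNE's `n_iter` to `max_iter`, and constructing `TSNE` with the name a given version doesn't accept raises `TypeError`. A sketch of an equivalent shim based on signature inspection — `make_tsne` is a hypothetical helper, not part of the package:

```python
import inspect

from sklearn.manifold import TSNE


def make_tsne(n_iter: int = 1000, **params) -> TSNE:
    """Build a TSNE instance across sklearn versions (sketch)."""
    # Newer sklearn exposes max_iter; older versions expose n_iter.
    iter_kw = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
    return TSNE(**{iter_kw: n_iter}, **params)


tsne = make_tsne(n_iter=500, n_components=2, perplexity=10.0, random_state=0)
```

Either approach keeps one code path working across versions; the package's try/except variant avoids depending on `inspect`.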
```python
def _reduce_with_umap(
    vectors: np.ndarray,
    n_components: int,
    random_state: Optional[int],
    n_neighbors: int = 15,
    min_dist: float = 0.1,
    metric: str = "cosine",
    **kwargs,
) -> ReductionResult:
    """Reduce dimensions using UMAP."""
    try:
        import umap
    except ImportError:
        raise ImportError("umap-learn is required for UMAP. Install with: pip install umap-learn")

    # UMAP specific adjustments
    n_samples = len(vectors)
    n_neighbors = min(n_neighbors, n_samples - 1)

    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
        **kwargs,
    )
    coordinates = reducer.fit_transform(vectors)

    return ReductionResult(
        coordinates=coordinates,
        method="umap",
        parameters={
            "n_components": n_components,
            "n_neighbors": n_neighbors,
            "min_dist": min_dist,
            "metric": metric,
            **kwargs,
        },
    )
```
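Unlike the t-SNE path, the UMAP helper does no NaN/Inf scrubbing; it only clamps `n_neighbors` to `n_samples - 1`, so callers with unvetted vectors may want to run `validate_embeddings` (defined below) first. A usage sketch on synthetic data, assuming `umap-learn` is installed (umap-learn typically falls back to a single thread when `random_state` is fixed):

```python
import numpy as np

from linkml_store.plotting.dimensionality_reduction import reduce_dimensions

vectors = np.random.default_rng(2).normal(size=(300, 128))
result = reduce_dimensions(
    vectors,
    method="umap",
    n_components=2,
    random_state=42,
    n_neighbors=10,  # forwarded via **kwargs to _reduce_with_umap
    min_dist=0.05,
)
print(result.parameters)  # records n_neighbors, min_dist, metric, ...
```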
```python
def validate_embeddings(vectors: np.ndarray) -> Dict:
    """
    Validate and analyze embedding vectors for potential issues.

    Args:
        vectors: Input vectors to validate

    Returns:
        Dictionary with validation results and statistics
    """
    results = {
        "shape": vectors.shape,
        "dtype": str(vectors.dtype),
        "has_nan": bool(np.any(np.isnan(vectors))),
        "has_inf": bool(np.any(np.isinf(vectors))),
        "min": float(np.min(vectors)),
        "max": float(np.max(vectors)),
        "mean": float(np.mean(vectors)),
        "std": float(np.std(vectors)),
        "all_identical": bool(np.allclose(vectors, vectors[0])),
        "zero_variance_dims": int(np.sum(np.var(vectors, axis=0) == 0)),
        "n_unique_vectors": int(len(np.unique(vectors, axis=0))),
    }

    if results["has_nan"]:
        results["nan_count"] = int(np.sum(np.isnan(vectors)))
        results["nan_rows"] = int(np.sum(np.any(np.isnan(vectors), axis=1)))

    if results["has_inf"]:
        results["inf_count"] = int(np.sum(np.isinf(vectors)))
        results["inf_rows"] = int(np.sum(np.any(np.isinf(vectors), axis=1)))

    # Check for problematic patterns
    results["warnings"] = []

    if results["has_nan"]:
        results["warnings"].append(f"Contains {results.get('nan_count', 0)} NaN values")

    if results["has_inf"]:
        results["warnings"].append(f"Contains {results.get('inf_count', 0)} Inf values")

    if results["all_identical"]:
        results["warnings"].append("All vectors are identical")

    if results["zero_variance_dims"] > 0:
        results["warnings"].append(f"{results['zero_variance_dims']} dimensions have zero variance")

    if results["n_unique_vectors"] < vectors.shape[0] * 0.1:
        results["warnings"].append(f"Only {results['n_unique_vectors']} unique vectors out of {vectors.shape[0]}")

    if results["std"] < 1e-10:
        results["warnings"].append("Extremely low variance in data")

    return results
```
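The report produced by `validate_embeddings` mirrors the scrubbing the t-SNE path performs internally, which makes it a handy pre-flight check. A sketch that deliberately injects a NaN to show the shape of the output:

```python
import numpy as np

from linkml_store.plotting.dimensionality_reduction import validate_embeddings

vectors = np.random.default_rng(3).normal(size=(50, 8))
vectors[0, 0] = np.nan  # deliberately corrupt one entry

report = validate_embeddings(vectors)
print(report["has_nan"], report["nan_count"], report["nan_rows"])  # True 1 1
for warning in report["warnings"]:
    print("WARNING:", warning)  # e.g. "Contains 1 NaN values"
```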
```python
def get_optimal_parameters(
    method: str,
    n_samples: int,
) -> Dict:
    """
    Get optimal parameters for dimensionality reduction based on data size.

    Args:
        method: Reduction method
        n_samples: Number of samples

    Returns:
        Dictionary of recommended parameters
    """
    if method == "pca":
        return {
            "n_components": min(2, n_samples - 1),
        }
    elif method == "tsne":
        # Adjust perplexity based on sample size
        perplexity = min(30, max(5, n_samples / 100))
        return {
            "n_components": 2,
            "perplexity": perplexity,
            "learning_rate": "auto",
            "n_iter": 1000 if n_samples < 10000 else 500,
        }
    elif method == "umap":
        # Adjust n_neighbors based on sample size
        n_neighbors = min(15, max(2, int(np.sqrt(n_samples))))
        return {
            "n_components": 2,
            "n_neighbors": n_neighbors,
            "min_dist": 0.1,
            "metric": "cosine",
        }
    else:
        return {}
```
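These heuristics are designed to be splatted back into `reduce_dimensions`; note that the PCA branch caps `n_components` at `min(2, n_samples - 1)`, so it never suggests more than two components. A sketch of the intended chaining (my reading of the API, not documented usage):

```python
import numpy as np

from linkml_store.plotting.dimensionality_reduction import (
    get_optimal_parameters,
    reduce_dimensions,
)

vectors = np.random.default_rng(4).normal(size=(500, 64))
params = get_optimal_parameters("umap", n_samples=len(vectors))
# e.g. {"n_components": 2, "n_neighbors": 15, "min_dist": 0.1, "metric": "cosine"}
result = reduce_dimensions(vectors, method="umap", random_state=0, **params)
```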
```python
def cached_reduction(
    vectors: np.ndarray,
    method: str,
    cache_path: Optional[str] = None,
    force_recompute: bool = False,
    **kwargs,
) -> ReductionResult:
    """
    Perform dimensionality reduction with optional caching.

    Args:
        vectors: Input vectors
        method: Reduction method
        cache_path: Path to cache file
        force_recompute: Force recomputation even if cached
        **kwargs: Additional parameters for reduction

    Returns:
        ReductionResult
    """
    import hashlib
    import pickle
    from pathlib import Path

    # Create cache key from vectors and parameters
    cache_key = None
    if cache_path and not force_recompute:
        # Create hash of input data and parameters
        hasher = hashlib.sha256()
        hasher.update(vectors.tobytes())
        hasher.update(method.encode())
        hasher.update(str(sorted(kwargs.items())).encode())
        cache_key = hasher.hexdigest()

        cache_file = Path(cache_path) / f"reduction_{cache_key}.pkl"

        if cache_file.exists():
            try:
                with open(cache_file, "rb") as f:
                    result = pickle.load(f)
                logger.info(f"Loaded cached reduction from {cache_file}")
                return result
            except Exception as e:
                logger.warning(f"Failed to load cache: {e}")

    # Compute reduction
    result = reduce_dimensions(vectors, method=method, **kwargs)

    # Cache result if requested
    if cache_path and cache_key:
        cache_file = Path(cache_path) / f"reduction_{cache_key}.pkl"
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(cache_file, "wb") as f:
                pickle.dump(result, f)
            logger.info(f"Cached reduction to {cache_file}")
        except Exception as e:
            logger.warning(f"Failed to cache result: {e}")

    return result
```
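Two behaviors of `cached_reduction` are easy to miss: the cache key hashes the raw vector bytes plus the method and kwargs, so any change to the input yields a new key; and when `force_recompute=True`, no `cache_key` is computed, so the freshly recomputed result is not written back to the cache. A usage sketch with a throwaway directory:

```python
import tempfile

import numpy as np

from linkml_store.plotting.dimensionality_reduction import cached_reduction

vectors = np.random.default_rng(5).normal(size=(100, 16))
with tempfile.TemporaryDirectory() as cache_dir:
    first = cached_reduction(vectors, method="pca", cache_path=cache_dir)
    # Same vectors + method + kwargs -> served from the pickle cache.
    second = cached_reduction(vectors, method="pca", cache_path=cache_dir)
    assert np.allclose(first.coordinates, second.coordinates)
```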
```python
def align_reductions(
    reductions: Dict[str, ReductionResult],
    reference_key: Optional[str] = None,
) -> Dict[str, ReductionResult]:
    """
    Align multiple reduction results for comparison.

    Uses Procrustes analysis to align coordinates.

    Args:
        reductions: Dictionary of reduction results
        reference_key: Key of reference reduction (default: first)

    Returns:
        Dictionary of aligned reduction results
    """
    if len(reductions) <= 1:
        return reductions

    try:
        from scipy.spatial import procrustes
    except ImportError:
        logger.warning("scipy not available, returning unaligned reductions")
        return reductions

    keys = list(reductions.keys())
    if reference_key is None:
        reference_key = keys[0]

    if reference_key not in reductions:
        raise ValueError(f"Reference key {reference_key} not found")

    ref_coords = reductions[reference_key].coordinates
    aligned = {reference_key: reductions[reference_key]}

    for key in keys:
        if key == reference_key:
            continue

        # Align using Procrustes
        _, aligned_coords, _ = procrustes(ref_coords, reductions[key].coordinates)

        aligned[key] = ReductionResult(
            coordinates=aligned_coords,
            method=reductions[key].method,
            parameters=reductions[key].parameters,
            explained_variance=reductions[key].explained_variance,
        )

    return aligned
```
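Procrustes alignment rotates and rescales each layout onto the reference, which helps when comparing, say, PCA and t-SNE projections side by side. Two caveats: all reductions must share the same rows (scipy's `procrustes` requires equal shapes), and `procrustes` standardizes both inputs, so the aligned layouts live in a normalized frame while the reference entry keeps its original coordinates. A sketch on synthetic data:

```python
import numpy as np

from linkml_store.plotting.dimensionality_reduction import (
    align_reductions,
    reduce_dimensions,
)

vectors = np.random.default_rng(6).normal(size=(120, 32))
reductions = {
    "pca": reduce_dimensions(vectors, method="pca", random_state=0),
    "tsne": reduce_dimensions(vectors, method="tsne", random_state=0, perplexity=15.0),
}
aligned = align_reductions(reductions, reference_key="pca")
# "pca" is returned untouched; "tsne" is rotated/scaled onto a
# standardized copy of the PCA layout for side-by-side plotting.
print(aligned["tsne"].coordinates.shape)  # (120, 2)
```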