linkml_store-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/plotting/dimensionality_reduction.py
@@ -0,0 +1,453 @@
+"""Dimensionality reduction utilities for embedding visualization."""
+
+import logging
+from typing import Dict, Literal, Optional, Tuple, Union
+import numpy as np
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ReductionResult:
+    """Container for dimensionality reduction results."""
+
+    coordinates: np.ndarray
+    method: str
+    parameters: Dict
+    explained_variance: Optional[float] = None
+
+    @property
+    def n_samples(self) -> int:
+        """Number of samples."""
+        return len(self.coordinates)
+
+    @property
+    def n_components(self) -> int:
+        """Number of reduced dimensions."""
+        return self.coordinates.shape[1]
+
+
+def reduce_dimensions(
+    vectors: np.ndarray,
+    method: Literal["umap", "tsne", "pca"] = "umap",
+    n_components: int = 2,
+    random_state: Optional[int] = None,
+    **kwargs
+) -> ReductionResult:
+    """
+    Reduce dimensionality of embedding vectors.
+
+    Args:
+        vectors: Input vectors (n_samples, n_features)
+        method: Reduction method
+        n_components: Number of output dimensions
+        random_state: Random seed for reproducibility
+        **kwargs: Additional parameters for the reduction method
+
+    Returns:
+        ReductionResult with reduced coordinates
+    """
+    if method == "pca":
+        return _reduce_with_pca(vectors, n_components, random_state, **kwargs)
+    elif method == "tsne":
+        return _reduce_with_tsne(vectors, n_components, random_state, **kwargs)
+    elif method == "umap":
+        return _reduce_with_umap(vectors, n_components, random_state, **kwargs)
+    else:
+        raise ValueError(f"Unknown reduction method: {method}")
+
+
+def _reduce_with_pca(
+    vectors: np.ndarray,
+    n_components: int,
+    random_state: Optional[int],
+    **kwargs
+) -> ReductionResult:
+    """Reduce dimensions using PCA."""
+    try:
+        from sklearn.decomposition import PCA
+    except ImportError:
+        raise ImportError("scikit-learn is required for PCA. Install with: pip install scikit-learn")
+
+    pca = PCA(n_components=n_components, random_state=random_state, **kwargs)
+    coordinates = pca.fit_transform(vectors)
+
+    explained_var = sum(pca.explained_variance_ratio_) if hasattr(pca, 'explained_variance_ratio_') else None
+
+    return ReductionResult(
+        coordinates=coordinates,
+        method="pca",
+        parameters={"n_components": n_components, **kwargs},
+        explained_variance=explained_var
+    )
+
+
+def _reduce_with_tsne(
+    vectors: np.ndarray,
+    n_components: int,
+    random_state: Optional[int],
+    perplexity: float = 30.0,
+    learning_rate: Union[float, str] = "auto",
+    n_iter: int = 1000,
+    max_dimensions: int = 500,
+    **kwargs
+) -> ReductionResult:
+    """Reduce dimensions using t-SNE."""
+    try:
+        from sklearn.manifold import TSNE
+    except ImportError:
+        raise ImportError("scikit-learn is required for t-SNE. Install with: pip install scikit-learn")
+
+    # Validate and debug input vectors
+    logger.info(f"Input vector shape: {vectors.shape}")
+    logger.info(f"Input vector dtype: {vectors.dtype}")
+
+    # Check for NaN or Inf values
+    if np.any(np.isnan(vectors)):
+        nan_count = np.sum(np.isnan(vectors))
+        nan_rows = np.any(np.isnan(vectors), axis=1)
+        logger.warning(f"Found {nan_count} NaN values in {np.sum(nan_rows)} rows")
+        # Replace NaNs with zeros as a fallback
+        vectors = np.nan_to_num(vectors, nan=0.0)
+        logger.info("Replaced NaN values with zeros")
+
+    if np.any(np.isinf(vectors)):
+        inf_count = np.sum(np.isinf(vectors))
+        inf_rows = np.any(np.isinf(vectors), axis=1)
+        logger.warning(f"Found {inf_count} Inf values in {np.sum(inf_rows)} rows")
+        # Replace Infs with large finite values
+        vectors = np.nan_to_num(vectors, posinf=1e10, neginf=-1e10)
+        logger.info("Replaced Inf values with finite values")
+
+    # Check vector statistics
+    logger.info(f"Vector stats - min: {np.min(vectors):.6f}, max: {np.max(vectors):.6f}, mean: {np.mean(vectors):.6f}, std: {np.std(vectors):.6f}")
+
+    # Check if all vectors are identical (can cause issues)
+    if np.allclose(vectors, vectors[0]):
+        logger.warning("All input vectors are identical! This will cause t-SNE to fail.")
+        # Add small random noise to break symmetry
+        noise = np.random.RandomState(random_state).normal(0, 1e-8, vectors.shape)
+        vectors = vectors + noise
+        logger.info("Added small random noise to break symmetry")
+
+    # Check variance per dimension
+    dim_variance = np.var(vectors, axis=0)
+    zero_var_dims = np.sum(dim_variance == 0)
+    if zero_var_dims > 0:
+        logger.warning(f"Found {zero_var_dims} dimensions with zero variance")
+
+    # t-SNE specific adjustments
+    n_samples = len(vectors)
+    perplexity = min(perplexity, n_samples - 1)
+
+    # Additional perplexity validation
+    if perplexity < 5:
+        logger.warning(f"Perplexity {perplexity} is very low, may cause instability")
+    if perplexity > n_samples / 2:
+        logger.warning(f"Perplexity {perplexity} is very high relative to sample size {n_samples}")
+
+    logger.info(f"t-SNE parameters: perplexity={perplexity}, learning_rate={learning_rate}, n_iter={n_iter}")
+
+    # Pre-reduce with PCA if dimensions are very high (>max_dimensions)
+    n_features = vectors.shape[1]
+    if n_features > max_dimensions:
+        logger.info(f"High dimensional data ({n_features}D). Pre-reducing with PCA to {max_dimensions}D for t-SNE stability")
+        from sklearn.decomposition import PCA
+        pca = PCA(n_components=min(max_dimensions, n_samples - 1), random_state=random_state)
+        vectors = pca.fit_transform(vectors)
+        logger.info(f"PCA reduced to shape: {vectors.shape}")
+
+    # Use max_iter instead of n_iter for newer sklearn versions
+    tsne_params = {
+        "n_components": n_components,
+        "perplexity": perplexity,
+        "learning_rate": learning_rate,
+        "random_state": random_state,
+        "init": "random",  # Use random init to avoid potential issues
+        "method": "barnes_hut" if n_samples >= 1000 else "exact",  # Use exact for small datasets
+    }
+
+    # Handle deprecated n_iter parameter
+    try:
+        # Try with max_iter first (newer sklearn)
+        tsne_params["max_iter"] = n_iter
+        tsne = TSNE(**tsne_params, **kwargs)
+    except TypeError:
+        # Fall back to n_iter for older sklearn
+        tsne_params["n_iter"] = n_iter
+        del tsne_params["max_iter"]
+        tsne = TSNE(**tsne_params, **kwargs)
+
+    logger.info(f"Starting t-SNE fit_transform with {n_samples} samples, method: {tsne_params.get('method', 'auto')}")
+    try:
+        coordinates = tsne.fit_transform(vectors)
+        logger.info(f"t-SNE fit transform complete")
+    except Exception as e:
+        logger.error(f"t-SNE failed with error: {e}")
+        logger.error(f"Error type: {type(e).__name__}")
+        raise
+
+    return ReductionResult(
+        coordinates=coordinates,
+        method="tsne",
+        parameters={
+            "n_components": n_components,
+            "perplexity": perplexity,
+            "learning_rate": learning_rate,
+            "n_iter": n_iter,
+            **kwargs
+        }
+    )
+
+
+def _reduce_with_umap(
+    vectors: np.ndarray,
+    n_components: int,
+    random_state: Optional[int],
+    n_neighbors: int = 15,
+    min_dist: float = 0.1,
+    metric: str = "cosine",
+    **kwargs
+) -> ReductionResult:
+    """Reduce dimensions using UMAP."""
+    try:
+        import umap
+    except ImportError:
+        raise ImportError("umap-learn is required for UMAP. Install with: pip install umap-learn")
+
+    # UMAP specific adjustments
+    n_samples = len(vectors)
+    n_neighbors = min(n_neighbors, n_samples - 1)
+
+    reducer = umap.UMAP(
+        n_components=n_components,
+        n_neighbors=n_neighbors,
+        min_dist=min_dist,
+        metric=metric,
+        random_state=random_state,
+        **kwargs
+    )
+    coordinates = reducer.fit_transform(vectors)
+
+    return ReductionResult(
+        coordinates=coordinates,
+        method="umap",
+        parameters={
+            "n_components": n_components,
+            "n_neighbors": n_neighbors,
+            "min_dist": min_dist,
+            "metric": metric,
+            **kwargs
+        }
+    )
+
+
+def validate_embeddings(vectors: np.ndarray) -> Dict:
+    """
+    Validate and analyze embedding vectors for potential issues.
+
+    Args:
+        vectors: Input vectors to validate
+
+    Returns:
+        Dictionary with validation results and statistics
+    """
+    results = {
+        "shape": vectors.shape,
+        "dtype": str(vectors.dtype),
+        "has_nan": bool(np.any(np.isnan(vectors))),
+        "has_inf": bool(np.any(np.isinf(vectors))),
+        "min": float(np.min(vectors)),
+        "max": float(np.max(vectors)),
+        "mean": float(np.mean(vectors)),
+        "std": float(np.std(vectors)),
+        "all_identical": bool(np.allclose(vectors, vectors[0])),
+        "zero_variance_dims": int(np.sum(np.var(vectors, axis=0) == 0)),
+        "n_unique_vectors": int(len(np.unique(vectors, axis=0))),
+    }
+
+    if results["has_nan"]:
+        results["nan_count"] = int(np.sum(np.isnan(vectors)))
+        results["nan_rows"] = int(np.sum(np.any(np.isnan(vectors), axis=1)))
+
+    if results["has_inf"]:
+        results["inf_count"] = int(np.sum(np.isinf(vectors)))
+        results["inf_rows"] = int(np.sum(np.any(np.isinf(vectors), axis=1)))
+
+    # Check for problematic patterns
+    results["warnings"] = []
+
+    if results["has_nan"]:
+        results["warnings"].append(f"Contains {results.get('nan_count', 0)} NaN values")
+
+    if results["has_inf"]:
+        results["warnings"].append(f"Contains {results.get('inf_count', 0)} Inf values")
+
+    if results["all_identical"]:
+        results["warnings"].append("All vectors are identical")
+
+    if results["zero_variance_dims"] > 0:
+        results["warnings"].append(f"{results['zero_variance_dims']} dimensions have zero variance")
+
+    if results["n_unique_vectors"] < vectors.shape[0] * 0.1:
+        results["warnings"].append(f"Only {results['n_unique_vectors']} unique vectors out of {vectors.shape[0]}")
+
+    if results["std"] < 1e-10:
+        results["warnings"].append("Extremely low variance in data")
+
+    return results
+
+
+def get_optimal_parameters(
+    method: str,
+    n_samples: int
+) -> Dict:
+    """
+    Get optimal parameters for dimensionality reduction based on data size.
+
+    Args:
+        method: Reduction method
+        n_samples: Number of samples
+
+    Returns:
+        Dictionary of recommended parameters
+    """
+    if method == "pca":
+        return {
+            "n_components": min(2, n_samples - 1)
+        }
+    elif method == "tsne":
+        # Adjust perplexity based on sample size
+        perplexity = min(30, max(5, n_samples / 100))
+        return {
+            "n_components": 2,
+            "perplexity": perplexity,
+            "learning_rate": "auto",
+            "n_iter": 1000 if n_samples < 10000 else 500
+        }
+    elif method == "umap":
+        # Adjust n_neighbors based on sample size
+        n_neighbors = min(15, max(2, int(np.sqrt(n_samples))))
+        return {
+            "n_components": 2,
+            "n_neighbors": n_neighbors,
+            "min_dist": 0.1,
+            "metric": "cosine"
+        }
+    else:
+        return {}
+
+
+def cached_reduction(
+    vectors: np.ndarray,
+    method: str,
+    cache_path: Optional[str] = None,
+    force_recompute: bool = False,
+    **kwargs
+) -> ReductionResult:
+    """
+    Perform dimensionality reduction with optional caching.
+
+    Args:
+        vectors: Input vectors
+        method: Reduction method
+        cache_path: Path to cache file
+        force_recompute: Force recomputation even if cached
+        **kwargs: Additional parameters for reduction
+
+    Returns:
+        ReductionResult
+    """
+    import hashlib
+    import pickle
+    from pathlib import Path
+
+    # Create cache key from vectors and parameters
+    cache_key = None
+    if cache_path and not force_recompute:
+        # Create hash of input data and parameters
+        hasher = hashlib.sha256()
+        hasher.update(vectors.tobytes())
+        hasher.update(method.encode())
+        hasher.update(str(sorted(kwargs.items())).encode())
+        cache_key = hasher.hexdigest()
+
+        cache_file = Path(cache_path) / f"reduction_{cache_key}.pkl"
+
+        if cache_file.exists():
+            try:
+                with open(cache_file, "rb") as f:
+                    result = pickle.load(f)
+                logger.info(f"Loaded cached reduction from {cache_file}")
+                return result
+            except Exception as e:
+                logger.warning(f"Failed to load cache: {e}")
+
+    # Compute reduction
+    result = reduce_dimensions(vectors, method=method, **kwargs)
+
+    # Cache result if requested
+    if cache_path and cache_key:
+        cache_file = Path(cache_path) / f"reduction_{cache_key}.pkl"
+        cache_file.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            with open(cache_file, "wb") as f:
+                pickle.dump(result, f)
+            logger.info(f"Cached reduction to {cache_file}")
+        except Exception as e:
+            logger.warning(f"Failed to cache result: {e}")
+
+    return result
+
+
+def align_reductions(
+    reductions: Dict[str, ReductionResult],
+    reference_key: Optional[str] = None
+) -> Dict[str, ReductionResult]:
+    """
+    Align multiple reduction results for comparison.
+
+    Uses Procrustes analysis to align coordinates.
+
+    Args:
+        reductions: Dictionary of reduction results
+        reference_key: Key of reference reduction (default: first)
+
+    Returns:
+        Dictionary of aligned reduction results
+    """
+    if len(reductions) <= 1:
+        return reductions
+
+    try:
+        from scipy.spatial import procrustes
+    except ImportError:
+        logger.warning("scipy not available, returning unaligned reductions")
+        return reductions
+
+    keys = list(reductions.keys())
+    if reference_key is None:
+        reference_key = keys[0]
+
+    if reference_key not in reductions:
+        raise ValueError(f"Reference key {reference_key} not found")
+
+    ref_coords = reductions[reference_key].coordinates
+    aligned = {reference_key: reductions[reference_key]}
+
+    for key in keys:
+        if key == reference_key:
+            continue
+
+        # Align using Procrustes
+        _, aligned_coords, _ = procrustes(ref_coords, reductions[key].coordinates)
+
+        aligned[key] = ReductionResult(
+            coordinates=aligned_coords,
+            method=reductions[key].method,
+            parameters=reductions[key].parameters,
+            explained_variance=reductions[key].explained_variance
+        )
+
+    return aligned
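
For orientation, a minimal usage sketch of the module above, assuming linkml-store 0.3.0 is installed with scikit-learn available (umap-learn is only needed for method="umap"); the random vectors and parameter choices are illustrative rather than taken from the package:

import numpy as np

from linkml_store.plotting.dimensionality_reduction import (
    get_optimal_parameters,
    reduce_dimensions,
    validate_embeddings,
)

# Stand-in embeddings; in practice these would come from an indexer or embedding model.
rng = np.random.default_rng(42)
vectors = rng.normal(size=(200, 384))

# Check for NaN/Inf values, duplicate vectors, and zero-variance dimensions up front.
report = validate_embeddings(vectors)
if report["warnings"]:
    print("Embedding warnings:", report["warnings"])

# Ask the module for size-appropriate defaults, then reduce to 2D with PCA.
params = get_optimal_parameters("pca", n_samples=len(vectors))
result = reduce_dimensions(vectors, method="pca", random_state=0, **params)
print(result.method, result.coordinates.shape, result.explained_variance)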