linkml-store 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,424 @@
1
+ """Utilities for finding matches between embeddings in collections."""
2
+
3
+ import logging
4
+ from typing import Dict, List, Optional, Tuple, Literal, Any
5
+ from dataclasses import dataclass, field
6
+ import numpy as np
7
+ from enum import Enum
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class DistanceMetric(str, Enum):
    """Names of the vector comparison metrics supported by this module.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example ``DistanceMetric.COSINE == "cosine"``).
    """

    COSINE = "cosine"
    EUCLIDEAN = "euclidean"
    # Alternative spelling of EUCLIDEAN; it is a separate member with its own value.
    L2 = "l2"
    DOT = "dot"
    MANHATTAN = "manhattan"
19
+
20
+
21
@dataclass
class MatchResult:
    """One ranked pairing of a source item with a target item."""

    source_id: str
    source_data: Dict[str, Any]
    target_id: str
    target_data: Dict[str, Any]
    similarity: float
    distance: float
    rank: int

    def to_dict(self) -> Dict:
        """Return this match as a plain dictionary for serialization.

        The scalar fields keep their attribute names; the two metadata
        dictionaries are exposed under the shorter keys ``"source"`` and
        ``"target"``. The metadata dictionaries are referenced, not copied.
        """
        scalar_fields = ("source_id", "target_id", "similarity", "distance", "rank")
        serialized = {name: getattr(self, name) for name in scalar_fields}
        serialized["source"] = self.source_data
        serialized["target"] = self.target_data
        return serialized
43
+
44
+
45
@dataclass
class MatchingConfig:
    """Tunable options for an embedding matching run."""

    # Metric handed to compute_similarity_matrix.
    metric: DistanceMetric = DistanceMetric.COSINE
    # Upper bound on the matches kept for each source item.
    max_matches_per_item: int = 5
    # When set, candidates whose similarity is below this value are dropped.
    similarity_threshold: Optional[float] = None
    # When set, candidates whose distance is above this value are dropped.
    distance_threshold: Optional[float] = None
    # Metadata keys copied into each match; a falsy value keeps the whole metadata dict.
    source_fields: Optional[List[str]] = None
    target_fields: Optional[List[str]] = None
    # Skip candidate pairs whose source and target ids are equal.
    exclude_self_matches: bool = True
    # Scale every vector to (approximately) unit length before comparing.
    normalize_vectors: bool = False
    # NOTE(review): not read by any function visible in this module - confirm intended use.
    batch_size: int = 100
57
+
58
+
59
@dataclass
class MatchingResults:
    """Everything produced by one matching run, plus the settings used."""

    matches: List[MatchResult]
    config: MatchingConfig
    source_collection: str
    target_collection: str
    total_source_items: int
    total_target_items: int

    @property
    def num_matches(self) -> int:
        """Number of match records held in ``matches``."""
        return len(self.matches)

    def get_matches_for_source(self, source_id: str) -> List[MatchResult]:
        """Return the matches whose ``source_id`` equals the given id, in stored order."""
        return list(filter(lambda candidate: candidate.source_id == source_id, self.matches))

    def to_dataframe(self):
        """Build a pandas DataFrame with one row per match.

        Each row holds the five scalar match fields, followed by every source
        metadata entry under a ``source_`` prefix and every target metadata
        entry under a ``target_`` prefix.

        Returns:
            The DataFrame, or ``None`` (after logging a warning) when pandas
            cannot be imported.
        """
        try:
            import pandas as pd
        except ImportError:
            logger.warning("pandas not installed, cannot convert to DataFrame")
            return None

        def flatten(match):
            # Scalar columns first, then prefixed metadata columns.
            record = {
                "source_id": match.source_id,
                "target_id": match.target_id,
                "similarity": match.similarity,
                "distance": match.distance,
                "rank": match.rank
            }
            record.update((f"source_{key}", value) for key, value in match.source_data.items())
            record.update((f"target_{key}", value) for key, value in match.target_data.items())
            return record

        return pd.DataFrame([flatten(match) for match in self.matches])
102
+
103
+
104
def compute_similarity_matrix(
    source_vectors: np.ndarray,
    target_vectors: np.ndarray,
    metric: DistanceMetric = DistanceMetric.COSINE
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute similarity and distance matrices between source and target vectors.

    Per metric:

    * cosine: rows are scaled to unit length, similarity is their dot
      product and distance is ``1 - similarity``.
    * dot: similarity is the raw dot product and distance is
      ``1 - similarity`` (so it is not bounded and may be negative).
    * euclidean / l2 / manhattan: the distance is computed directly and
      similarity is ``1 - distance / max(distance)``, where the maximum is
      taken over the whole matrix. The similarity therefore depends on the
      full set of vectors passed in, not just on the pair being compared.

    Args:
        source_vectors: Source embeddings (n_source, n_dims)
        target_vectors: Target embeddings (n_target, n_dims)
        metric: Distance metric to use

    Returns:
        Tuple of (similarity_matrix, distance_matrix), each of shape
        (n_source, n_target)

    Raises:
        ValueError: If ``metric`` is not one of the supported metrics.
    """
    if metric in (DistanceMetric.COSINE, DistanceMetric.DOT):
        if metric == DistanceMetric.COSINE:
            # The small epsilon keeps all-zero vectors from dividing by zero.
            source_vectors = source_vectors / (np.linalg.norm(source_vectors, axis=1, keepdims=True) + 1e-10)
            target_vectors = target_vectors / (np.linalg.norm(target_vectors, axis=1, keepdims=True) + 1e-10)

        similarity_matrix = np.dot(source_vectors, target_vectors.T)
        return similarity_matrix, 1 - similarity_matrix

    if metric in (DistanceMetric.EUCLIDEAN, DistanceMetric.L2):
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b, evaluated for all pairs at once.
        source_sq = np.sum(source_vectors**2, axis=1, keepdims=True)
        target_sq = np.sum(target_vectors**2, axis=1, keepdims=True)
        dot_product = np.dot(source_vectors, target_vectors.T)
        # Rounding can push the squared distance slightly below zero; clamp before sqrt.
        distance_matrix = np.sqrt(np.maximum(0, source_sq + target_sq.T - 2 * dot_product))
    elif metric == DistanceMetric.MANHATTAN:
        distance_matrix = np.zeros((len(source_vectors), len(target_vectors)))
        if distance_matrix.size:
            # One vectorized pass per source row instead of a Python loop per pair.
            for i, source_vec in enumerate(source_vectors):
                distance_matrix[i] = np.abs(target_vectors - source_vec).sum(axis=1)
    else:
        raise ValueError(f"Unknown metric: {metric}")

    # np.max raises on a zero-size array, so an empty matrix is treated as
    # having no positive distance to scale by.
    max_dist = np.max(distance_matrix) if distance_matrix.size else 0
    if max_dist > 0:
        similarity_matrix = 1 - (distance_matrix / max_dist)
    else:
        similarity_matrix = 1 - distance_matrix

    return similarity_matrix, distance_matrix
164
+
165
+
166
def find_best_matches(
    source_vectors: np.ndarray,
    target_vectors: np.ndarray,
    source_ids: List[str],
    target_ids: List[str],
    source_metadata: List[Dict],
    target_metadata: List[Dict],
    config: MatchingConfig
) -> List[MatchResult]:
    """
    Rank target items against every source item and keep the best ones.

    For each source item the candidates are filtered (self-matches and
    threshold violations removed), ordered by descending similarity, and
    truncated to ``config.max_matches_per_item``. Ranks start at 1.

    Args:
        source_vectors: Source embedding vectors
        target_vectors: Target embedding vectors
        source_ids: IDs for source items, aligned with ``source_vectors``
        target_ids: IDs for target items, aligned with ``target_vectors``
        source_metadata: Metadata for source items
        target_metadata: Metadata for target items
        config: Matching configuration

    Returns:
        List of match results, grouped by source item in input order
    """
    if config.normalize_vectors:
        # The epsilon avoids division by zero for all-zero vectors.
        source_vectors = source_vectors / (np.linalg.norm(source_vectors, axis=1, keepdims=True) + 1e-10)
        target_vectors = target_vectors / (np.linalg.norm(target_vectors, axis=1, keepdims=True) + 1e-10)

    sim_matrix, dist_matrix = compute_similarity_matrix(
        source_vectors, target_vectors, config.metric
    )

    def passes_filters(src_id, tgt_idx, sim, dist):
        # A candidate survives unless one of the configured filters rejects it.
        tgt_id = target_ids[tgt_idx]
        if config.exclude_self_matches and src_id == tgt_id:
            return False
        if config.similarity_threshold is not None and sim < config.similarity_threshold:
            return False
        return not (config.distance_threshold is not None and dist > config.distance_threshold)

    def pick_fields(record, wanted):
        # With no field list configured the metadata dict is passed through as-is.
        if wanted:
            return {name: record.get(name) for name in wanted}
        return record

    found = []
    for src_idx, src_id in enumerate(source_ids):
        scored = zip(sim_matrix[src_idx], dist_matrix[src_idx])
        ranked = sorted(
            (
                (tgt_idx, sim, dist)
                for tgt_idx, (sim, dist) in enumerate(scored)
                if passes_filters(src_id, tgt_idx, sim, dist)
            ),
            key=lambda cand: cand[1],
            reverse=True,
        )

        for position, (tgt_idx, sim, dist) in enumerate(ranked[:config.max_matches_per_item], 1):
            source_data = pick_fields(source_metadata[src_idx], config.source_fields)
            target_data = pick_fields(target_metadata[tgt_idx], config.target_fields)
            found.append(
                MatchResult(
                    source_id=src_id,
                    source_data=source_data,
                    target_id=target_ids[tgt_idx],
                    target_data=target_data,
                    similarity=float(sim),
                    distance=float(dist),
                    rank=position
                )
            )

    return found
255
+
256
+
257
def match_embeddings_between_collections(
    database,
    source_collection: str,
    target_collection: str,
    index_name: str = None,
    config: Optional[MatchingConfig] = None,
    limit: Optional[int] = None
) -> MatchingResults:
    """
    Match the embeddings of one collection against those of another.

    Embeddings (with metadata) are extracted from both collections using the
    same ``index_name`` and ``limit``, then passed to ``find_best_matches``.

    Args:
        database: LinkML database object; must provide ``get_collection``
        source_collection: Name of source collection
        target_collection: Name of target collection
        index_name: Name of index to use; forwarded unchanged to the extractor
        config: Matching configuration; a default ``MatchingConfig`` is used when omitted
        limit: Limit on the number of items extracted from each collection

    Returns:
        MatchingResults object
    """
    settings = MatchingConfig() if config is None else config

    # Imported lazily, at call time rather than module import time.
    from linkml_store.utils.embedding_utils import extract_embeddings_from_collection

    extraction_options = {
        "index_name": index_name,
        "limit": limit,
        "include_metadata": True,
    }

    logger.info(f"Extracting embeddings from source collection: {source_collection}")
    source_embeddings = extract_embeddings_from_collection(
        database.get_collection(source_collection), **extraction_options
    )

    logger.info(f"Extracting embeddings from target collection: {target_collection}")
    target_embeddings = extract_embeddings_from_collection(
        database.get_collection(target_collection), **extraction_options
    )

    logger.info(f"Finding matches between {source_embeddings.n_samples} source and {target_embeddings.n_samples} target items")
    found = find_best_matches(
        source_vectors=source_embeddings.vectors,
        target_vectors=target_embeddings.vectors,
        source_ids=source_embeddings.object_ids,
        target_ids=target_embeddings.object_ids,
        source_metadata=source_embeddings.metadata,
        target_metadata=target_embeddings.metadata,
        config=settings
    )

    return MatchingResults(
        matches=found,
        config=settings,
        source_collection=source_collection,
        target_collection=target_collection,
        total_source_items=source_embeddings.n_samples,
        total_target_items=target_embeddings.n_samples
    )
324
+
325
+
326
def match_embeddings_within_collection(
    database,
    collection_name: str,
    index_name: str = None,
    config: Optional[MatchingConfig] = None,
    limit: Optional[int] = None
) -> MatchingResults:
    """
    Find matches within a single collection (self-similarity).

    The collection is used as both source and target, and self-matches are
    always excluded regardless of what ``config`` says. The caller's
    ``config`` object is never modified: when it allows self-matches, a copy
    with ``exclude_self_matches=True`` is used instead, and that copy is the
    one recorded on the returned results.

    Args:
        database: LinkML database object
        collection_name: Name of collection
        index_name: Name of index to use
        config: Matching configuration; defaults are used when omitted
        limit: Limit number of items

    Returns:
        MatchingResults object
    """
    from dataclasses import replace

    if config is None:
        config = MatchingConfig(exclude_self_matches=True)
    elif not config.exclude_self_matches:
        # Copy rather than assign, so a config shared with other calls keeps its value.
        config = replace(config, exclude_self_matches=True)

    return match_embeddings_between_collections(
        database=database,
        source_collection=collection_name,
        target_collection=collection_name,
        index_name=index_name,
        config=config,
        limit=limit
    )
361
+
362
+
363
def format_matches_report(
    results: MatchingResults,
    max_examples: int = 10
) -> str:
    """
    Format matching results as a human-readable report.

    The report contains a header, the collection names and item counts, the
    configuration, up to ``max_examples`` matches ordered by descending
    similarity, and summary statistics of the similarity values.

    A threshold is listed whenever it is configured (not ``None``), including
    a threshold of ``0``. Metadata entries with falsy values are omitted from
    the per-match listing, and each value is truncated to 100 characters.

    Args:
        results: Matching results
        max_examples: Maximum examples to show

    Returns:
        Formatted report string
    """
    settings = results.config
    # Accept either an enum member or a plain metric name.
    metric_label = getattr(settings.metric, "value", settings.metric)

    lines = [
        "=" * 60,
        "EMBEDDING MATCHING REPORT",
        "=" * 60,
        f"\nSource Collection: {results.source_collection}",
        f"Target Collection: {results.target_collection}",
        f"Source Items: {results.total_source_items}",
        f"Target Items: {results.total_target_items}",
        f"Total Matches: {results.num_matches}",
        "\nConfiguration:",
        f" Metric: {metric_label}",
        f" Max matches per item: {settings.max_matches_per_item}",
    ]
    # Compare against None: 0.0 is a legitimate threshold and must be reported.
    if settings.similarity_threshold is not None:
        lines.append(f" Similarity threshold: {settings.similarity_threshold}")
    if settings.distance_threshold is not None:
        lines.append(f" Distance threshold: {settings.distance_threshold}")

    if not results.matches:
        return "\n".join(lines)

    lines.append(f"\nTop {min(max_examples, len(results.matches))} Matches:")
    lines.append("-" * 60)

    # Best matches first, across all source items.
    sorted_matches = sorted(results.matches, key=lambda m: m.similarity, reverse=True)

    for i, match in enumerate(sorted_matches[:max_examples], 1):
        lines.append(f"\n{i}. Similarity: {match.similarity:.4f} | Distance: {match.distance:.4f}")
        lines.append(f" Source [{match.source_id}]:")
        lines.extend(
            f" {key}: {str(value)[:100]}"
            for key, value in match.source_data.items()
            if value
        )
        lines.append(f" Target [{match.target_id}]:")
        lines.extend(
            f" {key}: {str(value)[:100]}"
            for key, value in match.target_data.items()
            if value
        )

    # Statistics cover every match, not only the examples shown above.
    similarities = [m.similarity for m in results.matches]
    lines.append("\nSummary Statistics:")
    lines.append(f" Mean similarity: {np.mean(similarities):.4f}")
    lines.append(f" Std similarity: {np.std(similarities):.4f}")
    lines.append(f" Min similarity: {np.min(similarities):.4f}")
    lines.append(f" Max similarity: {np.max(similarities):.4f}")

    return "\n".join(lines)