linkml-store 0.2.6__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.

Files changed (28)
  1. linkml_store/api/client.py +2 -3
  2. linkml_store/api/collection.py +63 -8
  3. linkml_store/api/database.py +30 -2
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +165 -3
  5. linkml_store/api/stores/duckdb/duckdb_database.py +3 -3
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/mongodb/mongodb_collection.py +115 -12
  8. linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
  9. linkml_store/api/stores/solr/solr_collection.py +7 -1
  10. linkml_store/cli.py +201 -20
  11. linkml_store/index/implementations/llm_indexer.py +14 -6
  12. linkml_store/index/indexer.py +7 -4
  13. linkml_store/inference/implementations/llm_inference_engine.py +13 -9
  14. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  15. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  16. linkml_store/inference/inference_config.py +1 -0
  17. linkml_store/utils/dat_parser.py +95 -0
  18. linkml_store/utils/enrichment_analyzer.py +217 -0
  19. linkml_store/utils/format_utils.py +124 -3
  20. linkml_store/utils/llm_utils.py +3 -1
  21. linkml_store/utils/pandas_utils.py +1 -1
  22. linkml_store/utils/sql_utils.py +1 -1
  23. linkml_store/utils/vector_utils.py +3 -10
  24. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/METADATA +3 -1
  25. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
  26. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
  27. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
  28. {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/utils/dat_parser.py
@@ -0,0 +1,95 @@
+ from typing import Tuple, Optional, Dict, Any, List
+
+ ENTRY = Dict[str, Any]
+
+
+ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]:
+     """
+     Parse SIB/Swiss-Prot format data into a structured dictionary.
+
+     Args:
+         text (str): The text in SIB/Swiss-Prot format
+
+     Returns:
+         dict: A dictionary with entry IDs as keys and parsed data as values
+     """
+     # Split the text into entries (separated by //)
+     entries = text.split("//\n")
+     header = None
+
+     # Initialize results dictionary
+     results = []
+
+     # Parse each entry
+     for entry in entries:
+         if not entry.strip():
+             continue
+
+         # Initialize dictionary for current entry
+         current_entry = {}
+         current_code = None
+
+         # Process each line
+         for line in entry.strip().split("\n"):
+             if not line.strip():
+                 continue
+
+             # Check if this is a new field (starts with a 2-letter code followed by space)
+             if len(line) > 2 and line[2] == " ":
+                 current_code = line[0:2]
+                 # Remove the code and the following space(s)
+                 value = line[3:].strip()
+
+                 # Initialize as list if needed for multi-line fields
+                 if current_code not in current_entry:
+                     current_entry[current_code] = []
+
+                 current_entry[current_code].append(value)
+
+             # Continuation of previous field
+             elif current_code is not None:
+                 # Handle continuation lines (typically indented)
+                 if current_code == "CC":
+                     # For comments, preserve the indentation
+                     current_entry[current_code].append(line)
+                 else:
+                     # For other fields, strip and append
+                     current_entry[current_code].append(line.strip())
+
+         # Combine multiline comments; e.g
+         # -!- ...
+         # ...
+         # -!- ...
+         ccs = current_entry.get("CC", [])
+         new_ccs = []
+         for cc in ccs:
+             if not cc.startswith("-!-") and new_ccs:
+                 new_ccs[-1] += " " + cc
+             else:
+                 new_ccs.append(cc)
+         current_entry["CC"] = new_ccs
+         for k, vs in current_entry.items():
+             if k != "CC":
+                 combined = "".join(vs)
+                 combined = combined.strip()
+                 if combined.endswith("."):
+                     combined = combined.split(".")
+                     combined = [c.strip() for c in combined if c.strip()]
+                     if k == "DE":
+                         combined = combined[0]
+                 current_entry[k] = combined
+
+         if "ID" in current_entry:
+             results.append(current_entry)
+         else:
+             header = current_entry
+
+     return header, results
+
+
+ # Example usage:
+ # data = parse_sib_format(text)
+ # for entry_id, entry_data in data.items():
+ #     print(f"Entry: {entry_id}")
+ #     for code, values in entry_data.items():
+ #         print(f" {code}: {values}")
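Note that `parse_sib_format` returns a `(header, entries)` tuple, with `entries` a list of per-record dicts keyed by the two-letter line codes. A minimal usage sketch, with an invented Swiss-Prot-style snippet for illustration (real `.dat` files such as ENZYME's `enzyme.dat` use the same ID/DE/CC conventions):

```python
from linkml_store.utils.dat_parser import parse_sib_format

# Hypothetical two-record snippet in SIB/Swiss-Prot line format
TEXT = """\
ID   1.1.1.1
DE   Alcohol dehydrogenase.
CC   -!- Acts on primary or secondary alcohols.
CC       Zinc or iron dependent.
//
ID   1.1.1.2
DE   Alcohol dehydrogenase (NADP(+)).
//
"""

header, entries = parse_sib_format(TEXT)  # header is None here; no preamble block
for entry in entries:
    # After post-processing, ID and DE are plain strings; CC is a list of merged comments
    print(entry["ID"], "-", entry["DE"])
    for comment in entry.get("CC", []):
        print("   ", comment)
```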
linkml_store/utils/enrichment_analyzer.py
@@ -0,0 +1,217 @@
+ import pandas as pd
+ import numpy as np
+ from linkml_store.api import Collection
+ from scipy import stats
+ from typing import Dict, List
+ from pydantic import BaseModel
+
+
+ class EnrichedCategory(BaseModel):
+     """
+     Information about a category enriched in a sample
+     """
+
+     category: str
+     fold_change: float
+     original_p_value: float
+     adjusted_p_value: float
+
+
+ from collections import Counter, defaultdict
+
+
+ class EnrichmentAnalyzer:
+     def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str):
+         """
+         Initialize the analyzer with a DataFrame and key column names.
+         Precomputes category frequencies for the entire dataset.
+
+         Args:
+             df: DataFrame containing the data
+             sample_key: Column name for sample IDs
+             classification_key: Column name for category lists
+         """
+         self.df = df
+         self.sample_key = sample_key
+         self.classification_key = classification_key
+
+         # Precompute global category statistics
+         self.global_stats = self._compute_global_stats()
+
+         # Cache for sample-specific category counts
+         self.sample_cache: Dict[str, Counter] = {}
+
+     @classmethod
+     def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> "EnrichmentAnalyzer":
+         """
+         Initialize the analyzer with a Collection and key column names.
+         Precomputes category frequencies for the entire dataset.
+
+         Args:
+             collection: Collection containing the data
+             sample_key: Column name for sample IDs
+             classification_key: Column name for category lists
+         """
+         column_atts = [sample_key, classification_key]
+         results = collection.find(select_cols=column_atts, limit=-1)
+         df = results.rows_dataframe
+         ea = cls(df, sample_key=sample_key, classification_key=classification_key)
+         return ea
+
+     def _compute_global_stats(self) -> Dict[str, int]:
+         """
+         Compute global category frequencies across all samples.
+         Returns a dictionary of category -> count
+         """
+         global_counter = Counter()
+
+         # Flatten all categories and count
+         for categories in self.df[self.classification_key]:
+             if isinstance(categories, list):
+                 global_counter.update(categories)
+             else:
+                 # Handle case where categories might be a string
+                 global_counter.update([categories])
+
+         return global_counter
+
+     @property
+     def sample_ids(self) -> List[str]:
+         df = self.df
+         return df[self.sample_key].unique().tolist()
+
+     def _get_sample_stats(self, sample_id: str) -> Counter:
+         """
+         Get category frequencies for a specific sample.
+         Uses caching to avoid recomputation.
+         """
+         if sample_id in self.sample_cache:
+             return self.sample_cache[sample_id]
+
+         sample_data = self.df[self.df[self.sample_key] == sample_id]
+         if sample_data.empty:
+             raise KeyError(f"Sample ID '{sample_id}' not found")
+         sample_data = sample_data.dropna()
+         # if sample_data.empty:
+         #     raise ValueError(f"Sample ID '{sample_id}' has missing values after dropping NA")
+         counter = Counter()
+
+         for categories in sample_data[self.classification_key]:
+             if isinstance(categories, list):
+                 counter.update(categories)
+             else:
+                 counter.update([categories])
+
+         self.sample_cache[sample_id] = counter
+         return counter
+
+     def find_enriched_categories(
+         self,
+         sample_id: str,
+         min_occurrences: int = 5,
+         p_value_threshold: float = 0.05,
+         multiple_testing_correction: str = "bh",
+     ) -> List[EnrichedCategory]:
+         """
+         Find categories that are enriched in the given sample.
+
+         Args:
+             sample_id: ID of the sample to analyze
+             min_occurrences: Minimum number of occurrences required for a category
+             p_value_threshold: P-value threshold for significance
+
+         Returns:
+             List of tuples (category, fold_change, p_value) sorted by significance
+         """
+         sample_stats = self._get_sample_stats(sample_id)
+         total_sample_annotations = sum(sample_stats.values())
+         total_global_annotations = sum(self.global_stats.values())
+
+         results = []
+
+         for category, sample_count in sample_stats.items():
+             global_count = self.global_stats[category]
+
+             # Skip rare categories
+             if global_count < min_occurrences:
+                 continue
+
+             # Calculate fold change
+             sample_freq = sample_count / total_sample_annotations
+             global_freq = global_count / total_global_annotations
+             fold_change = sample_freq / global_freq if global_freq > 0 else float("inf")
+
+             # Perform Fisher's exact test
+             contingency_table = np.array(
+                 [
+                     [sample_count, global_count - sample_count],
+                     [
+                         total_sample_annotations - sample_count,
+                         total_global_annotations - total_sample_annotations - (global_count - sample_count),
+                     ],
+                 ]
+             )
+
+             _, p_value = stats.fisher_exact(contingency_table)
+
+             if p_value < p_value_threshold:
+                 results.append((category, fold_change, p_value))
+
+         if not results:
+             return results
+
+         # Sort by p-value
+         results.sort(key=lambda x: x[2])
+
+         # Apply multiple testing correction
+         categories, fold_changes, p_values = zip(*results)
+
+         if multiple_testing_correction.lower() == "bonf":
+             # Bonferroni correction
+             n_tests = len(self.global_stats)  # Total number of categories tested
+             adjusted_p_values = [min(1.0, p * n_tests) for p in p_values]
+
+         elif multiple_testing_correction.lower() == "bh":
+             # Benjamini-Hochberg correction
+             n = len(p_values)
+             sorted_indices = np.argsort(p_values)
+             sorted_p_values = np.array(p_values)[sorted_indices]
+
+             # Calculate BH adjusted p-values
+             adjusted_p_values = np.zeros(n)
+             for i, p in enumerate(sorted_p_values):
+                 adjusted_p_values[i] = p * n / (i + 1)
+
+             # Ensure monotonicity
+             for i in range(n - 2, -1, -1):
+                 adjusted_p_values[i] = min(adjusted_p_values[i], adjusted_p_values[i + 1])
+
+             # Restore original order
+             inverse_indices = np.argsort(sorted_indices)
+             adjusted_p_values = adjusted_p_values[inverse_indices]
+
+             # Ensure we don't exceed 1.0
+             adjusted_p_values = np.minimum(adjusted_p_values, 1.0)
+
+         else:
+             # No correction
+             adjusted_p_values = p_values
+
+         # Filter by adjusted p-value threshold and create final results
+         # Create EnrichedCategory objects
+         final_results = [
+             EnrichedCategory(category=cat, fold_change=fc, original_p_value=p, adjusted_p_value=adj_p)
+             for cat, fc, p, adj_p in zip(categories, fold_changes, p_values, adjusted_p_values)
+             if adj_p < p_value_threshold
+         ]
+
+         # Sort by adjusted p-value
+         final_results.sort(key=lambda x: x.adjusted_p_value)
+         return final_results
+
+
+ # Example usage:
+ # analyzer = EnrichmentAnalyzer(df, 'sample_id', 'categories')
+ # enriched = analyzer.find_enriched_categories('sample1')
+ # for category, fold_change, p_value in enriched:
+ #     print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})")
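`find_enriched_categories` returns a list of `EnrichedCategory` models rather than bare tuples, so a minimal usage sketch looks like the following; the toy DataFrame and column names are invented for illustration:

```python
import pandas as pd

from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer

# Hypothetical toy data: one row per object, with a sample ID and a list of categories
df = pd.DataFrame(
    [
        {"sample_id": "s1", "categories": ["GO:0008150", "GO:0003674"]},
        {"sample_id": "s1", "categories": ["GO:0008150"]},
        {"sample_id": "s2", "categories": ["GO:0005575"]},
    ]
)

analyzer = EnrichmentAnalyzer(df, sample_key="sample_id", classification_key="categories")
# With a table this small nothing is likely to clear the default 0.05 threshold,
# but the call shape and the result type are what matter here.
for ec in analyzer.find_enriched_categories("s1", min_occurrences=1):
    print(f"{ec.category}: {ec.fold_change:.2f}x (adjusted p={ec.adjusted_p_value:.2e})")
```

`EnrichmentAnalyzer.from_collection(collection, sample_key, classification_key)` builds the same analyzer directly from a linkml-store `Collection`.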
linkml_store/utils/format_utils.py
@@ -1,5 +1,6 @@
  import csv
  import gzip
+ import hashlib
  import io
  import json
  import logging
@@ -29,13 +30,24 @@ class Format(Enum):
  JSONL = "jsonl"
  YAML = "yaml"
  YAMLL = "yamll"
+ TOML = "toml"
  TSV = "tsv"
  CSV = "csv"
  XML = "xml"
+ TURTLE = "turtle"
+ RDFXML = "rdfxml"
+ TEXT = "text"
+ TEXTLINES = "textlines"
  OBO = "obo"
+ FASTA = "fasta"
+ GMT = "gmt"
+ DAT = "dat"
+ MARKDOWN = "markdown"
  PKL = "pkl"
  PYTHON = "python"
  PARQUET = "parquet"
+ HDF5 = "hdf5"
+ NETCDF = "netcdf"
  FORMATTED = "formatted"
  TABLE = "table"
  XLSX = "xlsx"
@@ -55,7 +67,12 @@ class Format(Enum):
  ".yamll": cls.YAMLL,
  ".tsv": cls.TSV,
  ".csv": cls.CSV,
+ ".txt": cls.TEXT,
  ".xml": cls.XML,
+ ".owx": cls.XML,
+ ".owl": cls.RDFXML,
+ ".ttl": cls.TURTLE,
+ ".md": cls.MARKDOWN,
  ".py": cls.PYTHON,
  ".parquet": cls.PARQUET,
  ".pq": cls.PARQUET,
@@ -122,12 +139,25 @@ def clean_nested_structure(obj):
  else:
  return clean_pandas_value(obj)

+
  def process_file(
- f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
+ f: IO,
+ format: Format,
+ expected_type: Optional[Type] = None,
+ header_comment_token: Optional[str] = None,
+ format_options: Optional[Dict[str, Any]] = None,
  ) -> List[Dict[str, Any]]:
  """
  Process a single file and return a list of objects.
+
+ :param f: The file object.
+ :param format: The format of the file.
+ :param expected_type: The expected type of the objects.
+ :param header_comment_token: Token used for header comments to be skipped
+ :return:
  """
+ if format_options is None:
+ format_options = {}
  if format == Format.YAMLL:
  format = Format.YAML
  expected_type = list
@@ -142,6 +172,14 @@ def process_file(
  objs = [obj for obj in objs if obj is not None]
  else:
  objs = yaml.safe_load(f)
+ elif format == Format.TOML:
+ import toml
+
+ objs = toml.load(f)
+ if not isinstance(objs, list):
+ objs = [objs]
+ elif format == Format.TEXTLINES:
+ objs = f.readlines()
  elif format in [Format.TSV, Format.CSV]:
  if header_comment_token:
  while True:
@@ -160,14 +198,82 @@ def process_file(
  elif format == Format.XLSX:
  xls = pd.ExcelFile(f)
  objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
+ elif format == Format.TEXT:
+ txt = f.read()
+ objs = [
+ {
+ "name": Path(f.name).name,
+ "path": f.name,
+ "content": txt,
+ "size": len(txt),
+ "lines": txt.count("\n") + 1,
+ "md5": hashlib.md5(txt.encode()).hexdigest(),
+ }
+ ]
+ elif format == Format.GMT:
+ objs = []
+ lib_name = Path(f.name).name
+ for line in f:
+ parts = line.strip().split("\t")
+ desc = parts[1]
+ objs.append(
+ {
+ "library": lib_name,
+ "uid": f"{lib_name}.{parts[0]}",
+ "name": parts[0],
+ "description": desc if desc else None,
+ "genes": parts[2:],
+ }
+ )
+ elif format == Format.FASTA:
+ objs = []
+ current_obj = None
+ for line in f:
+ line = line.strip()
+ if line.startswith(">"):
+ if current_obj:
+ objs.append(current_obj)
+ current_obj = {"id": line[1:], "sequence": ""}
+ else:
+ current_obj["sequence"] += line
+ if current_obj:
+ objs.append(current_obj)
  elif format == Format.OBO:
  blocks = split_document(f.read(), "\n\n")
  id_pattern = re.compile(r"id: (\S+)")
+
  def get_id(block):
  m = id_pattern.search(block)
  return m.group(1) if m else None
+
  objs = [{"id": get_id(block), "content": block} for block in blocks]
  objs = [obj for obj in objs if obj["id"]]
+ elif format == Format.DAT:
+ from linkml_store.utils.dat_parser import parse_sib_format
+
+ _, objs = parse_sib_format(f.read())
+ elif format in (Format.RDFXML, Format.TURTLE):
+ import lightrdf
+
+ parser = lightrdf.Parser()
+ objs = []
+ ext_fmt = "rdfxml"
+ if format == Format.TURTLE:
+ ext_fmt = "ttl"
+ bytesio = io.BytesIO(f.read().encode("utf-8"))
+ buffer = io.BufferedReader(bytesio)
+ for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt):
+ obj = {
+ "subject": s,
+ "predicate": p,
+ "object": o,
+ }
+ if format_options.get("pivot", False):
+ obj = {
+ "subject": s,
+ p: o,
+ }
+ objs.append(obj)
  elif format == Format.PARQUET:
  import pyarrow.parquet as pq

@@ -202,6 +308,7 @@ def load_objects(
  :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
  :param expected_type: The target type to load the objects into, e.g. list
  :param header_comment_token: Token used for header comments to be skipped
+ :param select_query: JSONPath query to select specific objects from the loaded data.
  :return: A list of dictionaries representing the loaded objects.
  """
  if isinstance(file_path, Path):
@@ -290,7 +397,8 @@ def write_output(


  def render_output(
- data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Optional[Union[Format, str]] = Format.YAML
+ data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]],
+ format: Optional[Union[Format, str]] = Format.YAML,
  ) -> str:
  """
  Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -323,6 +431,12 @@ def render_output(
  if isinstance(data, pd.DataFrame):
  data = data.to_dict(orient="records")

+ if isinstance(data, BaseModel):
+ data = data.model_dump()
+
+ if data and isinstance(data, list) and isinstance(data[0], BaseModel):
+ data = [d.model_dump() if isinstance(d, BaseModel) else d for d in data]
+
  if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
  data = [data]

@@ -335,8 +449,15 @@ def render_output(
  return "\n".join(json.dumps(obj) for obj in data)
  elif format == Format.PYTHON:
  return str(data)
+ elif format == Format.MARKDOWN:
+
+ def as_markdown(obj: dict):
+ return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()])
+
+ return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data)
  elif format == Format.TABLE:
  from tabulate import tabulate
+
  return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
  elif format == Format.YAML:
  if isinstance(data, list):
@@ -401,4 +522,4 @@ def split_document(doc: str, delimiter: str):
  :param delimiter: The delimiter.
  :return: The parts of the document.
  """
- return doc.split(delimiter)
+ return doc.split(delimiter)
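Among the new input formats, the `Format.FASTA` branch of `process_file` yields one `{"id": ..., "sequence": ...}` dict per record. A minimal sketch of calling it directly on an in-memory handle; the records are invented for illustration (note that other new formats such as TEXT and GMT expect a named file, since they read `f.name`):

```python
import io

from linkml_store.utils.format_utils import Format, process_file

# Hypothetical two-record FASTA payload
fasta = io.StringIO(">seq1 demo\nACGTACGT\nACGT\n>seq2 demo\nTTTTCCCC\n")

objs = process_file(fasta, Format.FASTA)
for obj in objs:
    print(obj["id"], len(obj["sequence"]))
# -> seq1 demo 12
# -> seq2 demo 8
```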
linkml_store/utils/llm_utils.py
@@ -76,6 +76,7 @@ def render_formatted_text(
  return text
  if not values:
  raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+ # remove last element and try again
  return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)


@@ -104,6 +105,7 @@ def get_token_limit(model_name: str) -> int:

  def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
  import yaml
+
  if "```" in yaml_str:
  yaml_str = yaml_str.split("```")[1].strip()
  if yaml_str.startswith("yaml"):
@@ -114,4 +116,4 @@ def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
  if strict:
  raise e
  logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
- return None
+ return None
linkml_store/utils/pandas_utils.py
@@ -56,7 +56,7 @@ def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:


  def facet_summary_to_dataframe_unmelted(
- facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]]
+ facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]],
  ) -> pd.DataFrame:
  rows = []

linkml_store/utils/sql_utils.py
@@ -116,7 +116,7 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], mul
  modified_where = " AND ".join(conditions)

  def make_col_safe(col):
- return '"' + quoted_name(col, True) + '"' if ' ' in col else col
+ return '"' + quoted_name(col, True) + '"' if " " in col else col

  if isinstance(facet_column, str):
  facet_column = make_col_safe(facet_column)
linkml_store/utils/vector_utils.py
@@ -8,6 +8,7 @@ logger = logging.getLogger(__name__)

  LOL = List[List[float]]

+
  def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
  """
  Calculate the cosine similarity between two vectors.
@@ -77,9 +78,7 @@ def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.nd
  return top_match_indices, top_match_values


- def top_n_matches(
- cosine_similarity_matrix: np.ndarray, n: int = 10
- ) -> Tuple[np.ndarray, np.ndarray]:
+ def top_n_matches(cosine_similarity_matrix: np.ndarray, n: int = 10) -> Tuple[np.ndarray, np.ndarray]:
  # Find the indices that would sort each row in descending order
  sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1)

@@ -136,10 +135,7 @@ def mmr_diversified_search(
  max_sim_to_selected = max(
  [
  np.dot(document_vectors[idx], document_vectors[s])
- / (
- np.linalg.norm(document_vectors[idx])
- * np.linalg.norm(document_vectors[s])
- )
+ / (np.linalg.norm(document_vectors[idx]) * np.linalg.norm(document_vectors[s]))
  for s in selected_indices
  ]
  )
@@ -160,6 +156,3 @@ def mmr_diversified_search(
  selected_indices.add(best_index)

  return result_indices
-
-
-
{linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: linkml-store
- Version: 0.2.6
+ Version: 0.2.9
  Summary: linkml-store
  License: MIT
  Author: Author 1
@@ -24,6 +24,7 @@ Provides-Extra: map
  Provides-Extra: mongodb
  Provides-Extra: neo4j
  Provides-Extra: pyarrow
+ Provides-Extra: rdf
  Provides-Extra: renderer
  Provides-Extra: scipy
  Provides-Extra: tests
@@ -39,6 +40,7 @@ Requires-Dist: h5py ; extra == "h5py"
  Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
  Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
  Requires-Dist: jsonpatch (>=1.33)
+ Requires-Dist: lightrdf ; extra == "rdf"
  Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
  Requires-Dist: linkml-runtime (>=1.8.0)
  Requires-Dist: linkml_map ; extra == "map"