linkml-store 0.2.6__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +2 -3
- linkml_store/api/collection.py +63 -8
- linkml_store/api/database.py +30 -2
- linkml_store/api/stores/duckdb/duckdb_collection.py +165 -3
- linkml_store/api/stores/duckdb/duckdb_database.py +3 -3
- linkml_store/api/stores/filesystem/__init__.py +1 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +115 -12
- linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
- linkml_store/api/stores/solr/solr_collection.py +7 -1
- linkml_store/cli.py +201 -20
- linkml_store/index/implementations/llm_indexer.py +14 -6
- linkml_store/index/indexer.py +7 -4
- linkml_store/inference/implementations/llm_inference_engine.py +13 -9
- linkml_store/inference/implementations/rag_inference_engine.py +13 -10
- linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/format_utils.py +124 -3
- linkml_store/utils/llm_utils.py +3 -1
- linkml_store/utils/pandas_utils.py +1 -1
- linkml_store/utils/sql_utils.py +1 -1
- linkml_store/utils/vector_utils.py +3 -10
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/METADATA +3 -1
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/utils/dat_parser.py ADDED

@@ -0,0 +1,95 @@
+from typing import Tuple, Optional, Dict, Any, List
+
+ENTRY = Dict[str, Any]
+
+
+def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]:
+    """
+    Parse SIB/Swiss-Prot format data into a structured dictionary.
+
+    Args:
+        text (str): The text in SIB/Swiss-Prot format
+
+    Returns:
+        dict: A dictionary with entry IDs as keys and parsed data as values
+    """
+    # Split the text into entries (separated by //)
+    entries = text.split("//\n")
+    header = None
+
+    # Initialize results dictionary
+    results = []
+
+    # Parse each entry
+    for entry in entries:
+        if not entry.strip():
+            continue
+
+        # Initialize dictionary for current entry
+        current_entry = {}
+        current_code = None
+
+        # Process each line
+        for line in entry.strip().split("\n"):
+            if not line.strip():
+                continue
+
+            # Check if this is a new field (starts with a 2-letter code followed by space)
+            if len(line) > 2 and line[2] == " ":
+                current_code = line[0:2]
+                # Remove the code and the following space(s)
+                value = line[3:].strip()
+
+                # Initialize as list if needed for multi-line fields
+                if current_code not in current_entry:
+                    current_entry[current_code] = []
+
+                current_entry[current_code].append(value)
+
+            # Continuation of previous field
+            elif current_code is not None:
+                # Handle continuation lines (typically indented)
+                if current_code == "CC":
+                    # For comments, preserve the indentation
+                    current_entry[current_code].append(line)
+                else:
+                    # For other fields, strip and append
+                    current_entry[current_code].append(line.strip())
+
+        # Combine multiline comments; e.g
+        # -!- ...
+        #     ...
+        # -!- ...
+        ccs = current_entry.get("CC", [])
+        new_ccs = []
+        for cc in ccs:
+            if not cc.startswith("-!-") and new_ccs:
+                new_ccs[-1] += " " + cc
+            else:
+                new_ccs.append(cc)
+        current_entry["CC"] = new_ccs
+        for k, vs in current_entry.items():
+            if k != "CC":
+                combined = "".join(vs)
+                combined = combined.strip()
+                if combined.endswith("."):
+                    combined = combined.split(".")
+                    combined = [c.strip() for c in combined if c.strip()]
+                    if k == "DE":
+                        combined = combined[0]
+                current_entry[k] = combined
+
+        if "ID" in current_entry:
+            results.append(current_entry)
+        else:
+            header = current_entry
+
+    return header, results
+
+
+# Example usage:
+# data = parse_sib_format(text)
+# for entry_id, entry_data in data.items():
+#     print(f"Entry: {entry_id}")
+#     for code, values in entry_data.items():
+#         print(f"  {code}: {values}")
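The trailing usage comment in dat_parser.py still describes an earlier dict-keyed design; as implemented, parse_sib_format returns a (header, entries) tuple in which entries is a list of dicts keyed by two-letter field codes. A minimal sketch of the current call shape (the .dat file name is illustrative only):

    from linkml_store.utils.dat_parser import parse_sib_format

    # "enzyme.dat" is a hypothetical SIB/Swiss-Prot style file
    with open("enzyme.dat") as f:
        header, entries = parse_sib_format(f.read())

    for entry in entries:
        # each entry is keyed by two-letter codes such as "ID", "DE", "CC"
        print(entry.get("ID"), entry.get("DE"))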
linkml_store/utils/enrichment_analyzer.py ADDED

@@ -0,0 +1,217 @@
+import pandas as pd
+import numpy as np
+from linkml_store.api import Collection
+from scipy import stats
+from typing import Dict, List
+from pydantic import BaseModel
+
+
+class EnrichedCategory(BaseModel):
+    """
+    Information about a category enriched in a sample
+    """
+
+    category: str
+    fold_change: float
+    original_p_value: float
+    adjusted_p_value: float
+
+
+from collections import Counter, defaultdict
+
+
+class EnrichmentAnalyzer:
+    def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str):
+        """
+        Initialize the analyzer with a DataFrame and key column names.
+        Precomputes category frequencies for the entire dataset.
+
+        Args:
+            df: DataFrame containing the data
+            sample_key: Column name for sample IDs
+            classification_key: Column name for category lists
+        """
+        self.df = df
+        self.sample_key = sample_key
+        self.classification_key = classification_key
+
+        # Precompute global category statistics
+        self.global_stats = self._compute_global_stats()
+
+        # Cache for sample-specific category counts
+        self.sample_cache: Dict[str, Counter] = {}
+
+    @classmethod
+    def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> "EnrichmentAnalyzer":
+        """
+        Initialize the analyzer with a Collection and key column names.
+        Precomputes category frequencies for the entire dataset.
+
+        Args:
+            collection: Collection containing the data
+            sample_key: Column name for sample IDs
+            classification_key: Column name for category lists
+        """
+        column_atts = [sample_key, classification_key]
+        results = collection.find(select_cols=column_atts, limit=-1)
+        df = results.rows_dataframe
+        ea = cls(df, sample_key=sample_key, classification_key=classification_key)
+        return ea
+
+    def _compute_global_stats(self) -> Dict[str, int]:
+        """
+        Compute global category frequencies across all samples.
+        Returns a dictionary of category -> count
+        """
+        global_counter = Counter()
+
+        # Flatten all categories and count
+        for categories in self.df[self.classification_key]:
+            if isinstance(categories, list):
+                global_counter.update(categories)
+            else:
+                # Handle case where categories might be a string
+                global_counter.update([categories])
+
+        return global_counter
+
+    @property
+    def sample_ids(self) -> List[str]:
+        df = self.df
+        return df[self.sample_key].unique().tolist()
+
+    def _get_sample_stats(self, sample_id: str) -> Counter:
+        """
+        Get category frequencies for a specific sample.
+        Uses caching to avoid recomputation.
+        """
+        if sample_id in self.sample_cache:
+            return self.sample_cache[sample_id]
+
+        sample_data = self.df[self.df[self.sample_key] == sample_id]
+        if sample_data.empty:
+            raise KeyError(f"Sample ID '{sample_id}' not found")
+        sample_data = sample_data.dropna()
+        # if sample_data.empty:
+        #     raise ValueError(f"Sample ID '{sample_id}' has missing values after dropping NA")
+        counter = Counter()
+
+        for categories in sample_data[self.classification_key]:
+            if isinstance(categories, list):
+                counter.update(categories)
+            else:
+                counter.update([categories])
+
+        self.sample_cache[sample_id] = counter
+        return counter
+
+    def find_enriched_categories(
+        self,
+        sample_id: str,
+        min_occurrences: int = 5,
+        p_value_threshold: float = 0.05,
+        multiple_testing_correction: str = "bh",
+    ) -> List[EnrichedCategory]:
+        """
+        Find categories that are enriched in the given sample.
+
+        Args:
+            sample_id: ID of the sample to analyze
+            min_occurrences: Minimum number of occurrences required for a category
+            p_value_threshold: P-value threshold for significance
+
+        Returns:
+            List of tuples (category, fold_change, p_value) sorted by significance
+        """
+        sample_stats = self._get_sample_stats(sample_id)
+        total_sample_annotations = sum(sample_stats.values())
+        total_global_annotations = sum(self.global_stats.values())
+
+        results = []
+
+        for category, sample_count in sample_stats.items():
+            global_count = self.global_stats[category]
+
+            # Skip rare categories
+            if global_count < min_occurrences:
+                continue
+
+            # Calculate fold change
+            sample_freq = sample_count / total_sample_annotations
+            global_freq = global_count / total_global_annotations
+            fold_change = sample_freq / global_freq if global_freq > 0 else float("inf")
+
+            # Perform Fisher's exact test
+            contingency_table = np.array(
+                [
+                    [sample_count, global_count - sample_count],
+                    [
+                        total_sample_annotations - sample_count,
+                        total_global_annotations - total_sample_annotations - (global_count - sample_count),
+                    ],
+                ]
+            )
+
+            _, p_value = stats.fisher_exact(contingency_table)
+
+            if p_value < p_value_threshold:
+                results.append((category, fold_change, p_value))
+
+        if not results:
+            return results
+
+        # Sort by p-value
+        results.sort(key=lambda x: x[2])
+
+        # Apply multiple testing correction
+        categories, fold_changes, p_values = zip(*results)
+
+        if multiple_testing_correction.lower() == "bonf":
+            # Bonferroni correction
+            n_tests = len(self.global_stats)  # Total number of categories tested
+            adjusted_p_values = [min(1.0, p * n_tests) for p in p_values]
+
+        elif multiple_testing_correction.lower() == "bh":
+            # Benjamini-Hochberg correction
+            n = len(p_values)
+            sorted_indices = np.argsort(p_values)
+            sorted_p_values = np.array(p_values)[sorted_indices]
+
+            # Calculate BH adjusted p-values
+            adjusted_p_values = np.zeros(n)
+            for i, p in enumerate(sorted_p_values):
+                adjusted_p_values[i] = p * n / (i + 1)
+
+            # Ensure monotonicity
+            for i in range(n - 2, -1, -1):
+                adjusted_p_values[i] = min(adjusted_p_values[i], adjusted_p_values[i + 1])
+
+            # Restore original order
+            inverse_indices = np.argsort(sorted_indices)
+            adjusted_p_values = adjusted_p_values[inverse_indices]
+
+            # Ensure we don't exceed 1.0
+            adjusted_p_values = np.minimum(adjusted_p_values, 1.0)
+
+        else:
+            # No correction
+            adjusted_p_values = p_values
+
+        # Filter by adjusted p-value threshold and create final results
+        # Create EnrichedCategory objects
+        final_results = [
+            EnrichedCategory(category=cat, fold_change=fc, original_p_value=p, adjusted_p_value=adj_p)
+            for cat, fc, p, adj_p in zip(categories, fold_changes, p_values, adjusted_p_values)
+            if adj_p < p_value_threshold
+        ]
+
+        # Sort by adjusted p-value
+        final_results.sort(key=lambda x: x.adjusted_p_value)
+        return final_results
+
+
+# Example usage:
+# analyzer = EnrichmentAnalyzer(df, 'sample_id', 'categories')
+# enriched = analyzer.find_enriched_categories('sample1')
+# for category, fold_change, p_value in enriched:
+#     print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})")
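As with dat_parser.py, the trailing example comment lags the implementation: find_enriched_categories returns EnrichedCategory objects, not (category, fold_change, p_value) tuples. A minimal sketch of the intended flow, using a hypothetical toy DataFrame (too small to yield significant hits, but it shows the call shape):

    import pandas as pd
    from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer

    # each row carries a sample ID and a list of category labels
    df = pd.DataFrame(
        [
            {"sample_id": "s1", "categories": ["kinase", "membrane"]},
            {"sample_id": "s1", "categories": ["kinase"]},
            {"sample_id": "s2", "categories": ["membrane"]},
        ]
    )
    analyzer = EnrichmentAnalyzer(df, "sample_id", "categories")
    for ec in analyzer.find_enriched_categories("s1", min_occurrences=1):
        print(f"{ec.category}: {ec.fold_change:.2f}x (adj p={ec.adjusted_p_value:.2e})")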
linkml_store/utils/format_utils.py CHANGED

@@ -1,5 +1,6 @@
 import csv
 import gzip
+import hashlib
 import io
 import json
 import logging
@@ -29,13 +30,24 @@ class Format(Enum):
     JSONL = "jsonl"
     YAML = "yaml"
     YAMLL = "yamll"
+    TOML = "toml"
     TSV = "tsv"
     CSV = "csv"
     XML = "xml"
+    TURTLE = "turtle"
+    RDFXML = "rdfxml"
+    TEXT = "text"
+    TEXTLINES = "textlines"
     OBO = "obo"
+    FASTA = "fasta"
+    GMT = "gmt"
+    DAT = "dat"
+    MARKDOWN = "markdown"
     PKL = "pkl"
     PYTHON = "python"
     PARQUET = "parquet"
+    HDF5 = "hdf5"
+    NETCDF = "netcdf"
     FORMATTED = "formatted"
     TABLE = "table"
     XLSX = "xlsx"
@@ -55,7 +67,12 @@ class Format(Enum):
             ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
+            ".txt": cls.TEXT,
             ".xml": cls.XML,
+            ".owx": cls.XML,
+            ".owl": cls.RDFXML,
+            ".ttl": cls.TURTLE,
+            ".md": cls.MARKDOWN,
             ".py": cls.PYTHON,
             ".parquet": cls.PARQUET,
             ".pq": cls.PARQUET,
@@ -122,12 +139,25 @@ def clean_nested_structure(obj):
     else:
         return clean_pandas_value(obj)

+
 def process_file(
-    f: IO,
+    f: IO,
+    format: Format,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+    format_options: Optional[Dict[str, Any]] = None,
 ) -> List[Dict[str, Any]]:
     """
     Process a single file and return a list of objects.
+
+    :param f: The file object.
+    :param format: The format of the file.
+    :param expected_type: The expected type of the objects.
+    :param header_comment_token: Token used for header comments to be skipped
+    :return:
     """
+    if format_options is None:
+        format_options = {}
     if format == Format.YAMLL:
         format = Format.YAML
         expected_type = list
@@ -142,6 +172,14 @@ def process_file(
             objs = [obj for obj in objs if obj is not None]
         else:
             objs = yaml.safe_load(f)
+    elif format == Format.TOML:
+        import toml
+
+        objs = toml.load(f)
+        if not isinstance(objs, list):
+            objs = [objs]
+    elif format == Format.TEXTLINES:
+        objs = f.readlines()
     elif format in [Format.TSV, Format.CSV]:
         if header_comment_token:
             while True:
@@ -160,14 +198,82 @@ def process_file(
     elif format == Format.XLSX:
         xls = pd.ExcelFile(f)
         objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
+    elif format == Format.TEXT:
+        txt = f.read()
+        objs = [
+            {
+                "name": Path(f.name).name,
+                "path": f.name,
+                "content": txt,
+                "size": len(txt),
+                "lines": txt.count("\n") + 1,
+                "md5": hashlib.md5(txt.encode()).hexdigest(),
+            }
+        ]
+    elif format == Format.GMT:
+        objs = []
+        lib_name = Path(f.name).name
+        for line in f:
+            parts = line.strip().split("\t")
+            desc = parts[1]
+            objs.append(
+                {
+                    "library": lib_name,
+                    "uid": f"{lib_name}.{parts[0]}",
+                    "name": parts[0],
+                    "description": desc if desc else None,
+                    "genes": parts[2:],
+                }
+            )
+    elif format == Format.FASTA:
+        objs = []
+        current_obj = None
+        for line in f:
+            line = line.strip()
+            if line.startswith(">"):
+                if current_obj:
+                    objs.append(current_obj)
+                current_obj = {"id": line[1:], "sequence": ""}
+            else:
+                current_obj["sequence"] += line
+        if current_obj:
+            objs.append(current_obj)
     elif format == Format.OBO:
         blocks = split_document(f.read(), "\n\n")
         id_pattern = re.compile(r"id: (\S+)")
+
         def get_id(block):
             m = id_pattern.search(block)
             return m.group(1) if m else None
+
         objs = [{"id": get_id(block), "content": block} for block in blocks]
         objs = [obj for obj in objs if obj["id"]]
+    elif format == Format.DAT:
+        from linkml_store.utils.dat_parser import parse_sib_format
+
+        _, objs = parse_sib_format(f.read())
+    elif format in (Format.RDFXML, Format.TURTLE):
+        import lightrdf
+
+        parser = lightrdf.Parser()
+        objs = []
+        ext_fmt = "rdfxml"
+        if format == Format.TURTLE:
+            ext_fmt = "ttl"
+        bytesio = io.BytesIO(f.read().encode("utf-8"))
+        buffer = io.BufferedReader(bytesio)
+        for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt):
+            obj = {
+                "subject": s,
+                "predicate": p,
+                "object": o,
+            }
+            if format_options.get("pivot", False):
+                obj = {
+                    "subject": s,
+                    p: o,
+                }
+            objs.append(obj)
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq

@@ -202,6 +308,7 @@ def load_objects(
     :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
     :param expected_type: The target type to load the objects into, e.g. list
     :param header_comment_token: Token used for header comments to be skipped
+    :param select_query: JSONPath query to select specific objects from the loaded data.
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(file_path, Path):
@@ -290,7 +397,8 @@ def write_output(


 def render_output(
-    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]],
+    format: Optional[Union[Format, str]] = Format.YAML,
 ) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -323,6 +431,12 @@ def render_output(
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")

+    if isinstance(data, BaseModel):
+        data = data.model_dump()
+
+    if data and isinstance(data, list) and isinstance(data[0], BaseModel):
+        data = [d.model_dump() if isinstance(d, BaseModel) else d for d in data]
+
     if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
         data = [data]

@@ -335,8 +449,15 @@ def render_output(
         return "\n".join(json.dumps(obj) for obj in data)
     elif format == Format.PYTHON:
         return str(data)
+    elif format == Format.MARKDOWN:
+
+        def as_markdown(obj: dict):
+            return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()])
+
+        return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data)
     elif format == Format.TABLE:
         from tabulate import tabulate
+
         return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
@@ -401,4 +522,4 @@ def split_document(doc: str, delimiter: str):
     :param delimiter: The delimiter.
     :return: The parts of the document.
     """
-    return doc.split(delimiter)
+    return doc.split(delimiter)
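Since the new Format members are dispatched inside process_file, the new readers can be exercised directly. A minimal sketch, assuming local FASTA and Turtle files (file names are illustrative; the Turtle branch needs the optional lightrdf dependency):

    from linkml_store.utils.format_utils import Format, process_file

    # FASTA: one object per record, with "id" and "sequence" keys
    with open("sequences.fasta") as f:
        records = process_file(f, Format.FASTA)

    # Turtle: one object per triple; the "pivot" option keys each triple by its predicate
    with open("ontology.ttl") as f:
        triples = process_file(f, Format.TURTLE, format_options={"pivot": True})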
linkml_store/utils/llm_utils.py CHANGED

@@ -76,6 +76,7 @@ def render_formatted_text(
         return text
     if not values:
         raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+    # remove last element and try again
     return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)


@@ -104,6 +105,7 @@ def get_token_limit(model_name: str) -> int:

 def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
     import yaml
+
     if "```" in yaml_str:
         yaml_str = yaml_str.split("```")[1].strip()
         if yaml_str.startswith("yaml"):
@@ -114,4 +116,4 @@ def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
         if strict:
             raise e
         logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
-        return None
+        return None
linkml_store/utils/pandas_utils.py CHANGED

@@ -56,7 +56,7 @@ def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:


 def facet_summary_to_dataframe_unmelted(
-    facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]]
+    facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]],
 ) -> pd.DataFrame:
     rows = []

linkml_store/utils/sql_utils.py CHANGED

@@ -116,7 +116,7 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], mul
     modified_where = " AND ".join(conditions)

     def make_col_safe(col):
-        return '"' + quoted_name(col, True) + '"' if
+        return '"' + quoted_name(col, True) + '"' if " " in col else col

     if isinstance(facet_column, str):
         facet_column = make_col_safe(facet_column)
linkml_store/utils/vector_utils.py CHANGED

@@ -8,6 +8,7 @@ logger = logging.getLogger(__name__)

 LOL = List[List[float]]

+
 def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
     """
     Calculate the cosine similarity between two vectors.
@@ -77,9 +78,7 @@ def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.nd
     return top_match_indices, top_match_values


-def top_n_matches(
-    cosine_similarity_matrix: np.ndarray, n: int = 10
-) -> Tuple[np.ndarray, np.ndarray]:
+def top_n_matches(cosine_similarity_matrix: np.ndarray, n: int = 10) -> Tuple[np.ndarray, np.ndarray]:
     # Find the indices that would sort each row in descending order
     sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1)

@@ -136,10 +135,7 @@ def mmr_diversified_search(
             max_sim_to_selected = max(
                 [
                     np.dot(document_vectors[idx], document_vectors[s])
-                    / (
-                        np.linalg.norm(document_vectors[idx])
-                        * np.linalg.norm(document_vectors[s])
-                    )
+                    / (np.linalg.norm(document_vectors[idx]) * np.linalg.norm(document_vectors[s]))
                     for s in selected_indices
                 ]
             )
@@ -160,6 +156,3 @@ def mmr_diversified_search(
         selected_indices.add(best_index)

     return result_indices
-
-
-
{linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.6
+Version: 0.2.9
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -24,6 +24,7 @@ Provides-Extra: map
 Provides-Extra: mongodb
 Provides-Extra: neo4j
 Provides-Extra: pyarrow
+Provides-Extra: rdf
 Provides-Extra: renderer
 Provides-Extra: scipy
 Provides-Extra: tests
@@ -39,6 +40,7 @@ Requires-Dist: h5py ; extra == "h5py"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: jsonpatch (>=1.33)
+Requires-Dist: lightrdf ; extra == "rdf"
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"