linkml-store 0.2.5__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of linkml-store might be problematic.
- linkml_store/api/client.py +9 -6
- linkml_store/api/collection.py +118 -5
- linkml_store/api/database.py +45 -14
- linkml_store/api/stores/duckdb/duckdb_collection.py +176 -8
- linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
- linkml_store/api/stores/filesystem/__init__.py +1 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +186 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +8 -3
- linkml_store/api/stores/solr/solr_collection.py +7 -1
- linkml_store/cli.py +202 -21
- linkml_store/index/implementations/llm_indexer.py +14 -6
- linkml_store/index/indexer.py +7 -4
- linkml_store/inference/implementations/llm_inference_engine.py +13 -9
- linkml_store/inference/implementations/rag_inference_engine.py +13 -10
- linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/format_utils.py +183 -3
- linkml_store/utils/llm_utils.py +3 -1
- linkml_store/utils/pandas_utils.py +1 -1
- linkml_store/utils/sql_utils.py +7 -1
- linkml_store/utils/vector_utils.py +4 -11
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/METADATA +4 -3
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/utils/dat_parser.py ADDED
@@ -0,0 +1,95 @@
+from typing import Tuple, Optional, Dict, Any, List
+
+ENTRY = Dict[str, Any]
+
+
+def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]:
+    """
+    Parse SIB/Swiss-Prot format data into a structured dictionary.
+
+    Args:
+        text (str): The text in SIB/Swiss-Prot format
+
+    Returns:
+        dict: A dictionary with entry IDs as keys and parsed data as values
+    """
+    # Split the text into entries (separated by //)
+    entries = text.split("//\n")
+    header = None
+
+    # Initialize results dictionary
+    results = []
+
+    # Parse each entry
+    for entry in entries:
+        if not entry.strip():
+            continue
+
+        # Initialize dictionary for current entry
+        current_entry = {}
+        current_code = None
+
+        # Process each line
+        for line in entry.strip().split("\n"):
+            if not line.strip():
+                continue
+
+            # Check if this is a new field (starts with a 2-letter code followed by space)
+            if len(line) > 2 and line[2] == " ":
+                current_code = line[0:2]
+                # Remove the code and the following space(s)
+                value = line[3:].strip()
+
+                # Initialize as list if needed for multi-line fields
+                if current_code not in current_entry:
+                    current_entry[current_code] = []
+
+                current_entry[current_code].append(value)
+
+            # Continuation of previous field
+            elif current_code is not None:
+                # Handle continuation lines (typically indented)
+                if current_code == "CC":
+                    # For comments, preserve the indentation
+                    current_entry[current_code].append(line)
+                else:
+                    # For other fields, strip and append
+                    current_entry[current_code].append(line.strip())
+
+        # Combine multiline comments; e.g.
+        #   -!- ...
+        #       ...
+        #   -!- ...
+        ccs = current_entry.get("CC", [])
+        new_ccs = []
+        for cc in ccs:
+            if not cc.startswith("-!-") and new_ccs:
+                new_ccs[-1] += " " + cc
+            else:
+                new_ccs.append(cc)
+        current_entry["CC"] = new_ccs
+        for k, vs in current_entry.items():
+            if k != "CC":
+                combined = "".join(vs)
+                combined = combined.strip()
+                if combined.endswith("."):
+                    combined = combined.split(".")
+                    combined = [c.strip() for c in combined if c.strip()]
+                    if k == "DE":
+                        combined = combined[0]
+                current_entry[k] = combined
+
+        if "ID" in current_entry:
+            results.append(current_entry)
+        else:
+            header = current_entry
+
+    return header, results
+
+
+# Example usage:
+# data = parse_sib_format(text)
+# for entry_id, entry_data in data.items():
+#     print(f"Entry: {entry_id}")
+#     for code, values in entry_data.items():
+#         print(f"  {code}: {values}")
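To make the new parser concrete, here is a minimal sketch of how parse_sib_format could be called. The two-record input string is illustrative and not taken from the package; note that, despite the docstring, the function returns a (header, entries) tuple rather than a dict.

    from linkml_store.utils.dat_parser import parse_sib_format

    # Two minimal SIB/Swiss-Prot-style records using the usual two-letter field codes (ID, DE, CC).
    text = (
        "ID   1.1.1.1\n"
        "DE   Alcohol dehydrogenase.\n"
        "CC   -!- Acts on primary alcohols.\n"
        "//\n"
        "ID   1.1.1.2\n"
        "DE   Alcohol dehydrogenase (NADP(+)).\n"
        "//\n"
    )

    # header holds any leading block without an ID line (None here);
    # entries is a list of dicts keyed by field code.
    header, entries = parse_sib_format(text)
    for entry in entries:
        print(entry["ID"], entry["DE"])  # e.g. "1.1.1.1 Alcohol dehydrogenase"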
linkml_store/utils/enrichment_analyzer.py ADDED
@@ -0,0 +1,217 @@
+import pandas as pd
+import numpy as np
+from linkml_store.api import Collection
+from scipy import stats
+from typing import Dict, List
+from pydantic import BaseModel
+
+
+class EnrichedCategory(BaseModel):
+    """
+    Information about a category enriched in a sample
+    """
+
+    category: str
+    fold_change: float
+    original_p_value: float
+    adjusted_p_value: float
+
+
+from collections import Counter, defaultdict
+
+
+class EnrichmentAnalyzer:
+    def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str):
+        """
+        Initialize the analyzer with a DataFrame and key column names.
+        Precomputes category frequencies for the entire dataset.
+
+        Args:
+            df: DataFrame containing the data
+            sample_key: Column name for sample IDs
+            classification_key: Column name for category lists
+        """
+        self.df = df
+        self.sample_key = sample_key
+        self.classification_key = classification_key
+
+        # Precompute global category statistics
+        self.global_stats = self._compute_global_stats()
+
+        # Cache for sample-specific category counts
+        self.sample_cache: Dict[str, Counter] = {}
+
+    @classmethod
+    def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> "EnrichmentAnalyzer":
+        """
+        Initialize the analyzer with a Collection and key column names.
+        Precomputes category frequencies for the entire dataset.
+
+        Args:
+            collection: Collection containing the data
+            sample_key: Column name for sample IDs
+            classification_key: Column name for category lists
+        """
+        column_atts = [sample_key, classification_key]
+        results = collection.find(select_cols=column_atts, limit=-1)
+        df = results.rows_dataframe
+        ea = cls(df, sample_key=sample_key, classification_key=classification_key)
+        return ea
+
+    def _compute_global_stats(self) -> Dict[str, int]:
+        """
+        Compute global category frequencies across all samples.
+        Returns a dictionary of category -> count
+        """
+        global_counter = Counter()
+
+        # Flatten all categories and count
+        for categories in self.df[self.classification_key]:
+            if isinstance(categories, list):
+                global_counter.update(categories)
+            else:
+                # Handle case where categories might be a string
+                global_counter.update([categories])
+
+        return global_counter
+
+    @property
+    def sample_ids(self) -> List[str]:
+        df = self.df
+        return df[self.sample_key].unique().tolist()
+
+    def _get_sample_stats(self, sample_id: str) -> Counter:
+        """
+        Get category frequencies for a specific sample.
+        Uses caching to avoid recomputation.
+        """
+        if sample_id in self.sample_cache:
+            return self.sample_cache[sample_id]
+
+        sample_data = self.df[self.df[self.sample_key] == sample_id]
+        if sample_data.empty:
+            raise KeyError(f"Sample ID '{sample_id}' not found")
+        sample_data = sample_data.dropna()
+        # if sample_data.empty:
+        #     raise ValueError(f"Sample ID '{sample_id}' has missing values after dropping NA")
+        counter = Counter()
+
+        for categories in sample_data[self.classification_key]:
+            if isinstance(categories, list):
+                counter.update(categories)
+            else:
+                counter.update([categories])
+
+        self.sample_cache[sample_id] = counter
+        return counter
+
+    def find_enriched_categories(
+        self,
+        sample_id: str,
+        min_occurrences: int = 5,
+        p_value_threshold: float = 0.05,
+        multiple_testing_correction: str = "bh",
+    ) -> List[EnrichedCategory]:
+        """
+        Find categories that are enriched in the given sample.
+
+        Args:
+            sample_id: ID of the sample to analyze
+            min_occurrences: Minimum number of occurrences required for a category
+            p_value_threshold: P-value threshold for significance
+
+        Returns:
+            List of tuples (category, fold_change, p_value) sorted by significance
+        """
+        sample_stats = self._get_sample_stats(sample_id)
+        total_sample_annotations = sum(sample_stats.values())
+        total_global_annotations = sum(self.global_stats.values())
+
+        results = []
+
+        for category, sample_count in sample_stats.items():
+            global_count = self.global_stats[category]
+
+            # Skip rare categories
+            if global_count < min_occurrences:
+                continue
+
+            # Calculate fold change
+            sample_freq = sample_count / total_sample_annotations
+            global_freq = global_count / total_global_annotations
+            fold_change = sample_freq / global_freq if global_freq > 0 else float("inf")
+
+            # Perform Fisher's exact test
+            contingency_table = np.array(
+                [
+                    [sample_count, global_count - sample_count],
+                    [
+                        total_sample_annotations - sample_count,
+                        total_global_annotations - total_sample_annotations - (global_count - sample_count),
+                    ],
+                ]
+            )
+
+            _, p_value = stats.fisher_exact(contingency_table)
+
+            if p_value < p_value_threshold:
+                results.append((category, fold_change, p_value))
+
+        if not results:
+            return results
+
+        # Sort by p-value
+        results.sort(key=lambda x: x[2])
+
+        # Apply multiple testing correction
+        categories, fold_changes, p_values = zip(*results)
+
+        if multiple_testing_correction.lower() == "bonf":
+            # Bonferroni correction
+            n_tests = len(self.global_stats)  # Total number of categories tested
+            adjusted_p_values = [min(1.0, p * n_tests) for p in p_values]
+
+        elif multiple_testing_correction.lower() == "bh":
+            # Benjamini-Hochberg correction
+            n = len(p_values)
+            sorted_indices = np.argsort(p_values)
+            sorted_p_values = np.array(p_values)[sorted_indices]
+
+            # Calculate BH adjusted p-values
+            adjusted_p_values = np.zeros(n)
+            for i, p in enumerate(sorted_p_values):
+                adjusted_p_values[i] = p * n / (i + 1)
+
+            # Ensure monotonicity
+            for i in range(n - 2, -1, -1):
+                adjusted_p_values[i] = min(adjusted_p_values[i], adjusted_p_values[i + 1])
+
+            # Restore original order
+            inverse_indices = np.argsort(sorted_indices)
+            adjusted_p_values = adjusted_p_values[inverse_indices]
+
+            # Ensure we don't exceed 1.0
+            adjusted_p_values = np.minimum(adjusted_p_values, 1.0)
+
+        else:
+            # No correction
+            adjusted_p_values = p_values
+
+        # Filter by adjusted p-value threshold and create final results
+        # Create EnrichedCategory objects
+        final_results = [
+            EnrichedCategory(category=cat, fold_change=fc, original_p_value=p, adjusted_p_value=adj_p)
+            for cat, fc, p, adj_p in zip(categories, fold_changes, p_values, adjusted_p_values)
+            if adj_p < p_value_threshold
+        ]
+
+        # Sort by adjusted p-value
+        final_results.sort(key=lambda x: x.adjusted_p_value)
+        return final_results
+
+
+# Example usage:
+# analyzer = EnrichmentAnalyzer(df, 'sample_id', 'categories')
+# enriched = analyzer.find_enriched_categories('sample1')
+# for category, fold_change, p_value in enriched:
+#     print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})")
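A small sketch of how the new analyzer might be driven from a plain DataFrame. The data and the relaxed thresholds are purely illustrative (the defaults are min_occurrences=5 and p_value_threshold=0.05), and results are EnrichedCategory objects rather than the tuples suggested by the trailing example comment above.

    import pandas as pd
    from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer

    # Illustrative data: one row per observation, with a sample ID and a list of category labels.
    df = pd.DataFrame(
        {
            "sample_id": ["s1", "s1", "s1", "s2", "s2", "s2", "s2", "s2"],
            "categories": [
                ["kinase"], ["kinase"], ["transporter"],
                ["kinase"], ["transporter"], ["transporter"], ["receptor"], ["receptor"],
            ],
        }
    )

    analyzer = EnrichmentAnalyzer(df, sample_key="sample_id", classification_key="categories")
    # Thresholds are relaxed so that this toy dataset can produce output.
    for hit in analyzer.find_enriched_categories("s1", min_occurrences=1, p_value_threshold=0.5):
        print(hit.category, round(hit.fold_change, 2), round(hit.adjusted_p_value, 3))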
linkml_store/utils/format_utils.py CHANGED
@@ -1,8 +1,10 @@
 import csv
 import gzip
+import hashlib
 import io
 import json
 import logging
+import re
 import sys
 import tarfile
 from enum import Enum
@@ -28,13 +30,27 @@ class Format(Enum):
     JSONL = "jsonl"
     YAML = "yaml"
     YAMLL = "yamll"
+    TOML = "toml"
     TSV = "tsv"
     CSV = "csv"
     XML = "xml"
+    TURTLE = "turtle"
+    RDFXML = "rdfxml"
+    TEXT = "text"
+    TEXTLINES = "textlines"
+    OBO = "obo"
+    FASTA = "fasta"
+    GMT = "gmt"
+    DAT = "dat"
+    MARKDOWN = "markdown"
+    PKL = "pkl"
     PYTHON = "python"
     PARQUET = "parquet"
+    HDF5 = "hdf5"
+    NETCDF = "netcdf"
     FORMATTED = "formatted"
     TABLE = "table"
+    XLSX = "xlsx"
     SQLDUMP_DUCKDB = "duckdb"
     SQLDUMP_POSTGRES = "postgres"
     DUMP_MONGODB = "mongodb"
@@ -51,7 +67,12 @@ class Format(Enum):
             ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
+            ".txt": cls.TEXT,
             ".xml": cls.XML,
+            ".owx": cls.XML,
+            ".owl": cls.RDFXML,
+            ".ttl": cls.TURTLE,
+            ".md": cls.MARKDOWN,
             ".py": cls.PYTHON,
             ".parquet": cls.PARQUET,
             ".pq": cls.PARQUET,
@@ -67,6 +88,9 @@ class Format(Enum):
     def is_dump_format(self):
         return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
 
+    def is_binary_format(self):
+        return self in [Format.PARQUET, Format.XLSX]
+
     def is_xsv(self):
         return self in [Format.TSV, Format.CSV]
 
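A short sketch of how the expanded enum can be exercised, assuming only the suffix map and helper methods shown above:

    from linkml_store.utils.format_utils import Format

    fmt = Format.guess_format("ontology.ttl")  # ".ttl" now maps to Format.TURTLE
    print(fmt, fmt.is_binary_format())         # Turtle is text-based, so False
    print(Format.XLSX.is_binary_format())      # True: XLSX (like Parquet) is opened in binary mode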
@@ -95,12 +119,45 @@ def load_objects_from_url(
     return objs
 
 
+def clean_pandas_value(v):
+    """Clean a single value from pandas."""
+    import math
+
+    if isinstance(v, float):
+        if math.isnan(v) or math.isinf(v):
+            return None
+        return float(v)  # Ensures proper float type
+    return v
+
+
+def clean_nested_structure(obj):
+    """Recursively clean a nested structure of dicts/lists from pandas."""
+    if isinstance(obj, dict):
+        return {k: clean_nested_structure(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [clean_nested_structure(item) for item in obj]  # Fixed: using 'item' instead of 'v'
+    else:
+        return clean_pandas_value(obj)
+
+
 def process_file(
-    f: IO,
+    f: IO,
+    format: Format,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+    format_options: Optional[Dict[str, Any]] = None,
 ) -> List[Dict[str, Any]]:
     """
     Process a single file and return a list of objects.
+
+    :param f: The file object.
+    :param format: The format of the file.
+    :param expected_type: The expected type of the objects.
+    :param header_comment_token: Token used for header comments to be skipped
+    :return:
     """
+    if format_options is None:
+        format_options = {}
     if format == Format.YAMLL:
         format = Format.YAML
         expected_type = list
@@ -115,6 +172,14 @@ def process_file(
             objs = [obj for obj in objs if obj is not None]
         else:
             objs = yaml.safe_load(f)
+    elif format == Format.TOML:
+        import toml
+
+        objs = toml.load(f)
+        if not isinstance(objs, list):
+            objs = [objs]
+    elif format == Format.TEXTLINES:
+        objs = f.readlines()
     elif format in [Format.TSV, Format.CSV]:
         if header_comment_token:
             while True:
@@ -128,6 +193,87 @@ def process_file(
         objs = list(reader)
     elif format == Format.XML:
         objs = xmltodict.parse(f.read())
+    elif format == Format.PKL:
+        objs = pd.read_pickle(f).to_dict(orient="records")
+    elif format == Format.XLSX:
+        xls = pd.ExcelFile(f)
+        objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
+    elif format == Format.TEXT:
+        txt = f.read()
+        objs = [
+            {
+                "name": Path(f.name).name,
+                "path": f.name,
+                "content": txt,
+                "size": len(txt),
+                "lines": txt.count("\n") + 1,
+                "md5": hashlib.md5(txt.encode()).hexdigest(),
+            }
+        ]
+    elif format == Format.GMT:
+        objs = []
+        lib_name = Path(f.name).name
+        for line in f:
+            parts = line.strip().split("\t")
+            desc = parts[1]
+            objs.append(
+                {
+                    "library": lib_name,
+                    "uid": f"{lib_name}.{parts[0]}",
+                    "name": parts[0],
+                    "description": desc if desc else None,
+                    "genes": parts[2:],
+                }
+            )
+    elif format == Format.FASTA:
+        objs = []
+        current_obj = None
+        for line in f:
+            line = line.strip()
+            if line.startswith(">"):
+                if current_obj:
+                    objs.append(current_obj)
+                current_obj = {"id": line[1:], "sequence": ""}
+            else:
+                current_obj["sequence"] += line
+        if current_obj:
+            objs.append(current_obj)
+    elif format == Format.OBO:
+        blocks = split_document(f.read(), "\n\n")
+        id_pattern = re.compile(r"id: (\S+)")
+
+        def get_id(block):
+            m = id_pattern.search(block)
+            return m.group(1) if m else None
+
+        objs = [{"id": get_id(block), "content": block} for block in blocks]
+        objs = [obj for obj in objs if obj["id"]]
+    elif format == Format.DAT:
+        from linkml_store.utils.dat_parser import parse_sib_format
+
+        _, objs = parse_sib_format(f.read())
+    elif format in (Format.RDFXML, Format.TURTLE):
+        import lightrdf
+
+        parser = lightrdf.Parser()
+        objs = []
+        ext_fmt = "rdfxml"
+        if format == Format.TURTLE:
+            ext_fmt = "ttl"
+        bytesio = io.BytesIO(f.read().encode("utf-8"))
+        buffer = io.BufferedReader(bytesio)
+        for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt):
+            obj = {
+                "subject": s,
+                "predicate": p,
+                "object": o,
+            }
+            if format_options.get("pivot", False):
+                obj = {
+                    "subject": s,
+                    p: o,
+                }
+            objs.append(obj)
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq
 
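For orientation, a minimal sketch of process_file on one of the newly supported formats; the in-memory FASTA string is illustrative:

    import io
    from linkml_store.utils.format_utils import Format, process_file

    fasta = ">seq1 demo\nMKT\nAILV\n>seq2\nGGG\n"
    objs = process_file(io.StringIO(fasta), Format.FASTA)
    # -> [{"id": "seq1 demo", "sequence": "MKTAILV"}, {"id": "seq2", "sequence": "GGG"}]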
@@ -162,11 +308,20 @@ def load_objects(
     :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
     :param expected_type: The target type to load the objects into, e.g. list
     :param header_comment_token: Token used for header comments to be skipped
+    :param select_query: JSONPath query to select specific objects from the loaded data.
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(file_path, Path):
         file_path = str(file_path)
 
+    for url_scheme in ["http", "https", "ftp"]:
+        if file_path.startswith(f"{url_scheme}://"):
+            return load_objects_from_url(
+                file_path,
+                format=format,
+                expected_type=expected_type,
+            )
+
     if isinstance(format, str):
         format = Format(format)
 
@@ -185,9 +340,9 @@ def load_objects(
     else:
         if Path(file_path).is_dir():
             raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
-    mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
     open_func = gzip.open if compression == "gz" else open
     format = Format.guess_format(file_path) if not format else format
+    mode = "rb" if (format and format.is_binary_format()) or compression == "gz" else "r"
     with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
         if compression == "gz" and mode == "r":
             f = io.TextIOWrapper(f)
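A hedged sketch of the two behavioral changes to load_objects above: remote URLs are now dispatched to load_objects_from_url, and binary-mode opening is driven by Format.is_binary_format rather than a Parquet-only check. Both paths below are illustrative.

    from linkml_store.utils.format_utils import Format, load_objects

    # http/https/ftp paths short-circuit to load_objects_from_url.
    remote = load_objects("https://example.org/data/enzymes.dat", format=Format.DAT)

    # Binary formats such as Parquet are now opened in "rb" mode automatically.
    rows = load_objects("results.parquet")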
@@ -242,7 +397,8 @@ def write_output(
 
 
 def render_output(
-    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]],
+    format: Optional[Union[Format, str]] = Format.YAML,
 ) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -275,6 +431,12 @@ def render_output(
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")
 
+    if isinstance(data, BaseModel):
+        data = data.model_dump()
+
+    if data and isinstance(data, list) and isinstance(data[0], BaseModel):
+        data = [d.model_dump() if isinstance(d, BaseModel) else d for d in data]
+
     if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
         data = [data]
 
@@ -287,8 +449,15 @@ def render_output(
         return "\n".join(json.dumps(obj) for obj in data)
     elif format == Format.PYTHON:
         return str(data)
+    elif format == Format.MARKDOWN:
+
+        def as_markdown(obj: dict):
+            return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()])
+
+        return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data)
     elif format == Format.TABLE:
         from tabulate import tabulate
+
         return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
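To illustrate the render_output changes, a small sketch combining the new Pydantic handling with the MARKDOWN renderer; the Gene model is a hypothetical stand-in, not part of the package:

    from pydantic import BaseModel
    from linkml_store.utils.format_utils import Format, render_output

    class Gene(BaseModel):  # hypothetical model, only for illustration
        symbol: str
        taxon: str

    genes = [Gene(symbol="TP53", taxon="human"), Gene(symbol="Trp53", taxon="mouse")]
    # BaseModel instances are dumped to dicts first, then each object becomes an
    # "## Object" section with one " * key: value" line per field.
    print(render_output(genes, format=Format.MARKDOWN))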
@@ -343,3 +512,14 @@ def guess_format(path: str) -> Optional[Format]:
     :return: The guessed format.
     """
     return Format.guess_format(path)
+
+
+def split_document(doc: str, delimiter: str):
+    """
+    Split a document into parts based on a delimiter.
+
+    :param doc: The document to split.
+    :param delimiter: The delimiter.
+    :return: The parts of the document.
+    """
+    return doc.split(delimiter)
linkml_store/utils/llm_utils.py CHANGED
@@ -76,6 +76,7 @@ def render_formatted_text(
         return text
     if not values:
         raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+    # remove last element and try again
     return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)
 
 
@@ -104,6 +105,7 @@ def get_token_limit(model_name: str) -> int:
 
 def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
     import yaml
+
     if "```" in yaml_str:
         yaml_str = yaml_str.split("```")[1].strip()
         if yaml_str.startswith("yaml"):
@@ -114,4 +116,4 @@ def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
         if strict:
             raise e
         logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
-        return None
+        return None
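A sketch of the fenced-payload handling visible in parse_yaml_payload above; the model reply is invented, and the final parse step is assumed to be the function's existing YAML loading rather than anything added in this release:

    from linkml_store.utils.llm_utils import parse_yaml_payload

    reply = "```yaml\nname: example\nscore: 3\n```"
    # The code above strips the ``` fence and the leading "yaml" tag before parsing.
    print(parse_yaml_payload(reply))  # expected: {'name': 'example', 'score': 3}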
linkml_store/utils/pandas_utils.py CHANGED
@@ -56,7 +56,7 @@ def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
 
 
 def facet_summary_to_dataframe_unmelted(
-    facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]]
+    facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]],
 ) -> pd.DataFrame:
     rows = []
 
linkml_store/utils/sql_utils.py CHANGED
@@ -5,7 +5,7 @@ import sqlalchemy
 import sqlalchemy.sql.sqltypes as sqlt
 from linkml_runtime.linkml_model import SchemaDefinition, SlotDefinition
 from linkml_runtime.utils.schema_builder import SchemaBuilder
-from sqlalchemy import MetaData
+from sqlalchemy import MetaData, quoted_name
 
 from linkml_store.api.queries import Query
 
@@ -115,7 +115,13 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], mul
     conditions = [cond for cond in where_clause_sql.split(" AND ") if not cond.startswith(f"{facet_column} ")]
     modified_where = " AND ".join(conditions)
 
+    def make_col_safe(col):
+        return '"' + quoted_name(col, True) + '"' if " " in col else col
+
+    if isinstance(facet_column, str):
+        facet_column = make_col_safe(facet_column)
     if isinstance(facet_column, tuple):
+        facet_column = [make_col_safe(col) for col in facet_column]
         if multivalued:
             raise NotImplementedError("Multivalued facets are not supported for multiple columns")
         facet_column = ", ".join(facet_column)