linkml-store 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/collection.py +48 -5
- linkml_store/api/database.py +7 -1
- linkml_store/api/queries.py +3 -1
- linkml_store/api/stores/duckdb/duckdb_collection.py +8 -2
- linkml_store/cli.py +44 -18
- linkml_store/index/implementations/llm_indexer.py +20 -2
- linkml_store/index/indexer.py +51 -1
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/rag_inference_engine.py +120 -33
- linkml_store/inference/implementations/rule_based_inference_engine.py +15 -4
- linkml_store/inference/implementations/sklearn_inference_engine.py +20 -2
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/inference/inference_engine.py +53 -19
- linkml_store/utils/format_utils.py +6 -0
- linkml_store/utils/llm_utils.py +2 -0
- linkml_store/utils/object_utils.py +100 -1
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/METADATA +9 -1
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/RECORD +21 -20
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/entry_points.txt +0 -0

linkml_store/inference/implementations/rag_inference_engine.py CHANGED

````diff
@@ -1,13 +1,17 @@
+import json
 import logging
 from dataclasses import dataclass
-from …
+from pathlib import Path
+from typing import ClassVar, List, Optional, TextIO, Union

 import yaml
 from llm import get_key
+from pydantic import BaseModel

 from linkml_store.api.collection import OBJECT, Collection
 from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
-from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
+from linkml_store.utils.object_utils import select_nested

 logger = logging.getLogger(__name__)

@@ -22,6 +26,12 @@ You should return ONLY valid YAML in your response.
 """


+class TrainedModel(BaseModel, extra="forbid"):
+    rag_collection_rows: List[OBJECT]
+    index_rows: List[OBJECT]
+    config: Optional[InferenceConfig] = None
+
+
 @dataclass
 class RAGInferenceEngine(InferenceEngine):
     """
@@ -48,14 +58,23 @@ class RAGInferenceEngine(InferenceEngine):
     >>> prediction.predicted_object
     {'capital': 'Montevideo', 'code': 'UY', 'continent': 'South America', 'languages': ['Spanish']}

+    The "model" can be saved for later use:
+
+    >>> ie.export_model("tests/output/countries.rag_model.json")
+
+    Note in this case the model is not the underlying LLM, but the "RAG Model" which is the vectorized
+    representation of training set objects.
+
     """

-    classifier: Any = None
-    encoders: dict = None
     _model: "llm.Model" = None  # noqa: F821

     rag_collection: Collection = None

+    PERSIST_COLS: ClassVar[List[str]] = [
+        "config",
+    ]
+
     def __post_init__(self):
         if not self.config:
             self.config = InferenceConfig()
@@ -75,18 +94,11 @@ class RAGInferenceEngine(InferenceEngine):
         return self._model

     def initialize_model(self, **kwargs):
-        …
-        …
-        …
-        rag_collection = …
-        …
-        base_collection = td.collection
-        objs = base_collection.find({}, offset=s[0], limit=s[1] - s[0]).rows
-        db = base_collection.parent
-        rag_collection = db.get_collection(f"{base_collection.alias}__rag_{s[0]}_{s[1]}", create_if_not_exists=True)
-        rag_collection.insert(objs)
-        rag_collection.attach_indexer("llm", auto_index=False)
-        self.rag_collection = rag_collection
+        logger.info(f"Initializing model {self.model}")
+        if self.training_data:
+            rag_collection = self.training_data.collection
+            rag_collection.attach_indexer("llm", auto_index=False)
+            self.rag_collection = rag_collection

     def object_to_text(self, object: OBJECT) -> str:
         return yaml.dump(object)
@@ -103,24 +115,34 @@ class RAGInferenceEngine(InferenceEngine):
         target_attributes = self.config.target_attributes
         num_examples = self.config.llm_config.number_of_few_shot_examples or 5
         query_text = self.object_to_text(object)
-        if not self.rag_collection…
-        …
-        …
-        …
-        …
-        …
+        if not self.rag_collection:
+            # TODO: zero-shot mode
+            examples = []
+        else:
+            if not self.rag_collection.indexers:
+                raise ValueError("RAG collection must have an indexer attached")
+            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm")
+            examples = rs.rows
+            if not examples:
+                raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
+        query_obj = select_nested(object, feature_attributes)
+        query_text = self.object_to_text(query_obj)
         for example in examples:
-            input_obj = …
-            …
+            input_obj = select_nested(example, feature_attributes)
+            input_obj_text = self.object_to_text(input_obj)
+            if input_obj_text == query_text:
+                raise ValueError(
+                    f"Query object {query_text} is the same as example object {input_obj_text}\n"
+                    "This indicates possible test data leakage\n."
+                    "TODO: allow an option that allows user to treat this as a basic lookup\n"
+                )
+            output_obj = select_nested(example, target_attributes)
             prompt_clause = (
-                "---\nExample:\n"
-                f"## INPUT:\n{self.object_to_text(input_obj)}\n"
-                f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
+                "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
             )
             prompt_clauses.append(prompt_clause)
-        …
-        query_text = self.object_to_text(query_obj)
+
         prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)

@@ -137,9 +159,74 @@ class RAGInferenceEngine(InferenceEngine):
         response = model.prompt(prompt, system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
+        return Inference(predicted_object=self._parse_yaml_payload(yaml_str))
+
+    def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
+        if "```" in yaml_str:
+            yaml_str = yaml_str.split("```")[1].strip()
+            if yaml_str.startswith("yaml"):
+                yaml_str = yaml_str[4:].strip()
         try:
-            …
-            …
-            …
-            …
+            return yaml.safe_load(yaml_str)
+        except Exception as e:
+            if strict:
+                raise e
+            logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
             return None
+
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        self.save_model(output)
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the trained model and related data to a file.
+
+        :param output: Path to save the model
+        """
+
+        # trigger index
+        _qr = self.rag_collection.search("*", limit=1)
+        assert len(_qr.ranked_rows) > 0
+
+        rows = self.rag_collection.find(limit=-1).rows
+
+        indexers = self.rag_collection.indexers
+        assert len(indexers) == 1
+        ix = self.rag_collection.indexers["llm"]
+        ix_coll = self.rag_collection.parent.get_collection(self.rag_collection.get_index_collection_name(ix))
+
+        ix_rows = ix_coll.find(limit=-1).rows
+        assert len(ix_rows) > 0
+        tm = TrainedModel(rag_collection_rows=rows, index_rows=ix_rows, config=self.config)
+        # tm = TrainedModel(rag_collection_rows=rows, index_rows=ix_rows)
+        with open(output, "w", encoding="utf-8") as f:
+            json.dump(tm.model_dump(), f)
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "RAGInferenceEngine":
+        """
+        Load a trained model and related data from a file.
+
+        :param file_path: Path to the saved model
+        :return: SklearnInferenceEngine instance with loaded model
+        """
+        with open(file_path, "r", encoding="utf-8") as f:
+            model_data = json.load(f)
+        tm = TrainedModel(**model_data)
+        from linkml_store.api import Client
+
+        client = Client()
+        db = client.attach_database("duckdb", alias="training")
+        db.store({"data": tm.rag_collection_rows})
+        collection = db.get_collection("data")
+        ix = collection.attach_indexer("llm", auto_index=False)
+        assert ix.name
+        ix_coll_name = collection.get_index_collection_name(ix)
+        assert ix_coll_name
+        ix_coll = db.get_collection(ix_coll_name, create_if_not_exists=True)
+        ix_coll.insert(tm.index_rows)
+        ie = cls(config=tm.config)
+        ie.rag_collection = collection
+        return ie
````

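The save/load path added above serializes the vectorized RAG collection (not the LLM itself) as a `TrainedModel` JSON document. A minimal sketch of the round trip, assuming an already-trained engine `ie` such as the countries example in the class docstring (the output path is illustrative):

```python
from linkml_store.inference.implementations.rag_inference_engine import RAGInferenceEngine

# `ie` is assumed to be a trained RAGInferenceEngine (see the class docstring above)
ie.export_model("tests/output/countries.rag_model.json")  # delegates to save_model()

# load_model() rebuilds an in-memory duckdb collection from the serialized rows,
# re-attaches the "llm" indexer, and restores the index rows alongside it
restored = RAGInferenceEngine.load_model("tests/output/countries.rag_model.json")
assert restored.rag_collection is not None
```
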
linkml_store/inference/implementations/rule_based_inference_engine.py CHANGED

````diff
@@ -13,7 +13,7 @@ from linkml_runtime.utils.formatutils import underscore
 from pydantic import BaseModel

 from linkml_store.api.collection import OBJECT, Collection
-from linkml_store.inference.inference_config import Inference
+from linkml_store.inference.inference_config import Inference, InferenceConfig
 from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization

 logger = logging.getLogger(__name__)
@@ -111,11 +111,16 @@ class RuleBasedInferenceEngine(InferenceEngine):
         object = {underscore(k): v for k, v in object.items()}
         if self.slot_expressions:
             for slot, expr in self.slot_expressions.items():
-                print(f"EVAL {object}")
                 v = eval_expr(expr, **object)
                 if v is not None:
                     object[slot] = v
-        …
+        if self.config and self.config.target_attributes:
+            predicted_object = {k: object.get(k, None) for k in self.config.target_attributes}
+        else:
+            predicted_object = object
+        if all(v is None for v in predicted_object.values()):
+            return None
+        return Inference(predicted_object=predicted_object)

     def import_model_from(self, inference_engine: InferenceEngine, **kwargs):
         io = StringIO()
@@ -127,6 +132,8 @@ class RuleBasedInferenceEngine(InferenceEngine):
         if self.slot_expressions is None:
             self.slot_expressions = {}
         self.slot_expressions[target_attribute] = io.getvalue()
+        if not self.config:
+            self.config = inference_engine.config

     def save_model(self, output: Union[str, Path]) -> None:
         """
@@ -148,7 +155,11 @@ class RuleBasedInferenceEngine(InferenceEngine):
     def load_model(cls, file_path: Union[str, Path]) -> "RuleBasedInferenceEngine":
         model_data = yaml.safe_load(open(file_path))

-        …
+        if model_data["config"]:
+            config = InferenceConfig(**model_data["config"])
+        else:
+            config = None
+        engine = cls(config=config)
         for k, v in model_data.items():
             if k == "config":
                 continue
````

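The new post-processing in the rule-based engine is ordinary dictionary filtering; the same logic in isolation, with made-up attribute names:

```python
# mirror of the added logic: keep only config.target_attributes, and treat an
# all-None result as "no prediction" (the engine then returns None instead of an Inference)
evaluated = {"x": 1, "y": 2, "z": 3}   # object after eval_expr has filled in slots
target_attributes = ["z", "w"]         # "w" is absent, so it maps to None

predicted_object = {k: evaluated.get(k, None) for k in target_attributes}
assert predicted_object == {"z": 3, "w": None}
assert not all(v is None for v in predicted_object.values())  # at least one value, so a prediction is kept
```
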
linkml_store/inference/implementations/sklearn_inference_engine.py CHANGED

````diff
@@ -153,7 +153,7 @@ class SklearnInferenceEngine(InferenceEngine):
         y = y_encoder.fit_transform(y.values.ravel())  # Convert to 1D numpy array
         self.transformed_targets = y_encoder.classes_

-        …
+        # print(f"Fitting model with features: {X.columns}")
         clf = DecisionTreeClassifier(random_state=42)
         clf.fit(X, y)
         self.classifier = clf
@@ -174,6 +174,7 @@ class SklearnInferenceEngine(InferenceEngine):
             if col in self.encoders:
                 encoder = self.encoders[col]
                 if isinstance(encoder, OneHotEncoder):
+                    print(f"Encoding: {col} v={object[col]} df={new_X[[col]]} encoder={encoder}")
                     encoded = encoder.transform(new_X[[col]])
                     feature_names = encoder.get_feature_names_out([col])
                     for i, name in enumerate(feature_names):
@@ -216,7 +217,24 @@ class SklearnInferenceEngine(InferenceEngine):
         return Inference(predicted_object=predicted_object, confidence=self.confidence)

     def _normalize(self, object: OBJECT) -> OBJECT:
-        …
+        """
+        Normalize the input object to ensure it has all the expected attributes.
+
+        Also remove any numpy/pandas oddities
+
+        :param object:
+        :return:
+        """
+        np_map = {np.nan: None}
+
+        def _tr(x: Any):
+            # TODO: figure a more elegant way to do this
+            try:
+                return np_map.get(x, x)
+            except TypeError:
+                return x
+
+        return {k: _tr(object.get(k, None)) for k in self.config.feature_attributes}

     def export_model(
         self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
````

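The new `_normalize` body hinges on a small NaN-to-None mapping; a standalone sketch of that trick using only numpy (values are illustrative):

```python
import numpy as np

np_map = {np.nan: None}  # np.nan is a single float object, so the dict lookup matches by identity

def _tr(x):
    try:
        return np_map.get(x, x)
    except TypeError:  # unhashable values (lists, dicts) pass through unchanged
        return x

assert _tr(np.nan) is None
assert _tr(5) == 5
assert _tr(["a", "b"]) == ["a", "b"]
```
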
linkml_store/inference/inference_config.py CHANGED

````diff
@@ -35,6 +35,7 @@ class InferenceConfig(BaseModel, extra="forbid"):
     feature_attributes: Optional[List[str]] = None
     train_test_split: Optional[Tuple[float, float]] = None
     llm_config: Optional[LLMConfig] = None
+    random_seed: Optional[int] = None

     @classmethod
     def from_file(cls, file_path: str, format: Optional[Format] = None) -> "InferenceConfig":
````

linkml_store/inference/inference_engine.py CHANGED

````diff
@@ -1,4 +1,5 @@
 import logging
+import random
 from abc import ABC
 from dataclasses import dataclass
 from enum import Enum
@@ -6,7 +7,7 @@ from pathlib import Path
 from typing import Optional, TextIO, Tuple, Union

 import pandas as pd
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict

 from linkml_store.api.collection import OBJECT, Collection
 from linkml_store.inference.inference_config import Inference, InferenceConfig
@@ -28,6 +29,7 @@ class ModelSerialization(str, Enum):
     PNG = "png"
     LINKML_EXPRESSION = "linkml_expression"
     RULE_BASED = "rulebased"
+    RAG_INDEX = "rag_index"

     @classmethod
     def from_filepath(cls, file_path: str) -> Optional["ModelSerialization"]:
@@ -57,11 +59,36 @@


 class CollectionSlice(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    …
-    …
-    …
-    …
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
+
+    name: Optional[str] = None
+    base_collection: Optional[Collection] = None
+    # _dataframe: Optional[pd.DataFrame] = None
+    # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
+    indices: Optional[Tuple[int, ...]] = None
+    _collection: Optional[Collection] = None
+
+    @property
+    def collection(self) -> Collection:
+        if not self._collection and not self.indices:
+            return self.base_collection
+        if not self._collection:
+            rows = self.base_collection.find({}, limit=-1).rows
+            subset = [rows[i] for i in self.indices]
+            db = self.base_collection.parent
+            subset_name = self.slice_alias
+            subset_collection = db.get_collection(subset_name, create_if_not_exists=True)
+            # ensure the collection has the same schema type as the base collection;
+            # this ensures that column/attribute types are preserved
+            subset_collection.metadata.type = self.base_collection.target_class_name
+            subset_collection.delete_where({})
+            subset_collection.insert(subset)
+            self._collection = subset_collection
+        return self._collection
+
+    @property
+    def slice_alias(self) -> str:
+        return f"{self.base_collection.alias}__rag_{self.name}"

     def as_dataframe(self, flattened=False) -> pd.DataFrame:
         """
@@ -69,17 +96,11 @@ class CollectionSlice(BaseModel):

         :return:
         """
-        …
-        …
-        return …
-        elif self.collection is not None:
-            rs = self.collection.find({}, offset=self.slice[0], limit=self.slice[1] - self.slice[0])
-            if flattened:
-                return nested_objects_to_dataframe(rs.rows)
-            else:
-                return rs.rows_dataframe
+        rs = self.collection.find({}, limit=-1)
+        if flattened:
+            return nested_objects_to_dataframe(rs.rows)
         else:
-            …
+            return rs.rows_dataframe


 @dataclass
@@ -96,21 +117,34 @@ class InferenceEngine(ABC):
     training_data: Optional[CollectionSlice] = None
     testing_data: Optional[CollectionSlice] = None

-    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None):
+    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None, randomize=True):
         """
         Load the data and split it into training and testing sets.

         :param collection:
         :param split:
+        :param randomize:
         :return:
         """
+        local_random = random.Random(self.config.random_seed) if self.config.random_seed else random.Random()
         split = split or self.config.train_test_split
         if not split:
             split = (0.7, 0.3)
+        if split[0] == 1.0:
+            self.training_data = CollectionSlice(name="train", base_collection=collection, indices=None)
+            self.testing_data = None
+            return
         logger.info(f"Loading and splitting data from collection {collection.alias}")
         size = collection.size()
-        …
-        …
+        indices = range(size)
+        if randomize:
+            train_indices = local_random.sample(indices, int(size * split[0]))
+            test_indices = set(indices) - set(train_indices)
+        else:
+            train_indices = indices[: int(size * split[0])]
+            test_indices = indices[int(size * split[0]) :]
+        self.training_data = CollectionSlice(name="train", base_collection=collection, indices=train_indices)
+        self.testing_data = CollectionSlice(name="test", base_collection=collection, indices=test_indices)

     def initialize_model(self, **kwargs):
         """
````

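Combined with the `random_seed` field added to `InferenceConfig`, `load_and_split_data` now yields reproducible index-based splits. A sketch against the in-memory duckdb backend; the database alias and the `name`/`label` attributes are invented for illustration:

```python
from linkml_store.api import Client
from linkml_store.inference.inference_config import InferenceConfig
from linkml_store.inference.implementations.rag_inference_engine import RAGInferenceEngine

client = Client()
db = client.attach_database("duckdb", alias="demo")
db.store({"data": [{"name": f"item{i}", "label": i % 2} for i in range(10)]})
collection = db.get_collection("data")

config = InferenceConfig(feature_attributes=["name"], target_attributes=["label"], random_seed=42)
ie = RAGInferenceEngine(config=config)
ie.load_and_split_data(collection, split=(0.7, 0.3), randomize=True)

# the seeded random.Random makes the 7/3 index split deterministic across runs
assert len(ie.training_data.indices) == 7
assert len(ie.testing_data.indices) == 3
```
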
linkml_store/utils/format_utils.py CHANGED

````diff
@@ -47,6 +47,7 @@ class Format(Enum):
             ".jsonl": cls.JSONL,
             ".yaml": cls.YAML,
             ".yml": cls.YAML,
+            ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
             ".py": cls.PYTHON,
@@ -98,6 +99,9 @@ def process_file(
     """
     Process a single file and return a list of objects.
     """
+    if format == Format.YAMLL:
+        format = Format.YAML
+        expected_type = list
     if format == Format.JSON:
         objs = json.load(f)
     elif format == Format.JSONL:
@@ -105,6 +109,8 @@ def process_file(
     elif format == Format.YAML:
         if expected_type and expected_type == list:  # noqa E721
             objs = list(yaml.safe_load_all(f))
+            # allow YAML with a `---` with no object before it
+            objs = [obj for obj in objs if obj is not None]
         else:
             objs = yaml.safe_load(f)
     elif format in [Format.TSV, Format.CSV]:
````

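The `.yamll` handling above just routes a multi-document YAML stream through `safe_load_all` and drops empty documents; roughly equivalent standalone behaviour (stream contents invented for illustration):

```python
import io
import yaml

# two `---` markers up front yield an empty first document, which is filtered out
stream = io.StringIO("---\n---\nname: a\n---\nname: b\n")

objs = list(yaml.safe_load_all(stream))
objs = [obj for obj in objs if obj is not None]  # drop documents with no object
assert objs == [{"name": "a"}, {"name": "b"}]
```
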
linkml_store/utils/llm_utils.py CHANGED

````diff
@@ -20,6 +20,7 @@ MODEL_TOKEN_MAPPING = {
     "gpt-3.5-turbo-instruct": 4096,
     "text-ada-001": 2049,
     "ada": 2049,
+    "ada-002": 8192,
     "text-babbage-001": 2040,
     "babbage": 2049,
     "text-curie-001": 2049,
@@ -32,6 +33,7 @@ MODEL_TOKEN_MAPPING = {
     "code-cushman-002": 2048,
     "code-cushman-001": 2048,
     "claude": 200_000,
+    "llama-3": 200_000,
 }


````

linkml_store/utils/object_utils.py CHANGED

````diff
@@ -1,6 +1,6 @@
 import json
 from copy import deepcopy
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union

 from pydantic import BaseModel

@@ -60,6 +60,41 @@ def object_path_update(
     return ret_obj


+def object_path_get(obj: Union[BaseModel, Dict[str, Any]], path: str, default_value=None) -> Any:
+    """
+    Retrieves a value from a nested object based on a path description. The path to the
+    desired field is given in dot and bracket notation (e.g., 'a[0].b.c[1]').
+
+    :param obj: The dictionary object to be updated.
+    :type obj: Dict[str, Any]
+    :param path: The path string indicating where to place the value within the object.
+    :type path: str
+    :return: The value at the specified path.
+    :rtype: Any
+
+    **Example**::
+
+        >>> data = {'persons': [{'foo': {'bar': 1}}]}
+        >>> object_path_get(data, 'persons[0].foo.bar')
+        1
+        >>> object_path_get(data, 'persons[0].foo')
+        {'bar': 1}
+        >>> object_path_get({}, 'not there', "NA")
+        'NA'
+    """
+    if isinstance(obj, BaseModel):
+        obj = obj.dict()
+    parts = path.split(".")
+    for part in parts:
+        if "[" in part:
+            key, index = part[:-1].split("[")
+            index = int(index)
+            obj = obj[key][index]
+        else:
+            obj = obj.get(part, default_value)
+    return obj
+
+
 def parse_update_expression(expr: str) -> Union[tuple[str, Any], None]:
     """
     Parse a string expression of the form 'path.to.field=value' into a path and a value.
@@ -81,3 +116,67 @@ def clean_empties(value: Union[Dict, List]) -> Any:
     elif isinstance(value, list):
         value = [v for v in (clean_empties(v) for v in value) if v is not None]
     return value
+
+
+def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=None) -> Optional[dict]:
+    """
+    Select nested attributes from a complex dictionary based on selector strings.
+
+    Args:
+        data (dict): The input nested dictionary.
+        selectors (list): A list of selector strings.
+
+    Returns:
+        dict: A new dictionary with the same structure, but only the selected attributes.
+
+    Example:
+        >>> data = {
+        ...     "person": {
+        ...         "name": "John Doe",
+        ...         "age": 30,
+        ...         "address": {
+        ...             "street": "123 Main St",
+        ...             "city": "Anytown",
+        ...             "country": "USA"
+        ...         },
+        ...         "phones": [
+        ...             {"type": "home", "number": "555-1234"},
+        ...             {"type": "work", "number": "555-5678"}
+        ...         ]
+        ...     },
+        ...     "company": {
+        ...         "name": "Acme Inc",
+        ...         "location": "New York"
+        ...     }
+        ... }
+        >>> select_nested(data, ["person.address.street", "person.address.city"])
+        {'person': {'address': {'street': '123 Main St', 'city': 'Anytown'}}}
+        >>> select_nested(data, ["person.phones.number", "person.phones.type"])
+        {'person': {'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person"])
+        {'person': {'name': 'John Doe', 'age': 30, 'address': {'street': '123 Main St', 'city': 'Anytown',
+        'country': 'USA'}, 'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person.phones.type"])
+        {'person': {'phones': [{'type': 'home'}, {'type': 'work'}]}}
+    """
+    if current_path is None:
+        current_path = []
+    matching_paths = []
+    for path in paths:
+        if isinstance(path, str):
+            path = path.split(".")
+        if path == current_path:
+            return data
+        if path[: len(current_path)] == current_path:
+            matching_paths.append(path)
+    if not matching_paths:
+        return None
+    if isinstance(data, dict):
+        new_obj = {k: select_nested(v, matching_paths, current_path + [k]) for k, v in data.items()}
+        new_obj = {k: v for k, v in new_obj.items() if v is not None}
+        return new_obj
+    if isinstance(data, list):
+        new_obj = [select_nested(v, matching_paths, current_path + []) for i, v in enumerate(data)]
+        new_obj = [v for v in new_obj if v is not None]
+        return new_obj
+    return data
````

{linkml_store-0.1.14.dist-info → linkml_store-0.2.1.dist-info}/METADATA CHANGED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.1.14
+Version: 0.2.1
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -18,6 +18,7 @@ Provides-Extra: chromadb
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
+Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
@@ -34,7 +35,9 @@ Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
+Requires-Dist: gcsfs ; extra == "ibis"
 Requires-Dist: h5py ; extra == "h5py"
+Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
@@ -43,6 +46,7 @@ Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
 Requires-Dist: llm ; extra == "llm"
 Requires-Dist: matplotlib ; extra == "analytics"
+Requires-Dist: multipledispatch ; extra == "ibis"
 Requires-Dist: neo4j ; extra == "neo4j"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
@@ -52,6 +56,7 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
@@ -70,6 +75,8 @@ common query, index, and storage operations.

 For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)

+See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
+
 __Warning__ LinkML-Store is still undergoing changes and refactoring,
 APIs and command line options are subject to change!

@@ -196,3 +203,4 @@ make app

 See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details

+
````