linkml-store 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- linkml_store/api/client.py +35 -8
- linkml_store/api/collection.py +40 -5
- linkml_store/api/config.py +20 -3
- linkml_store/api/database.py +24 -3
- linkml_store/api/stores/duckdb/duckdb_collection.py +3 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +4 -0
- linkml_store/cli.py +149 -13
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +189 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/rag_inference_engine.py +145 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +308 -0
- linkml_store/inference/inference_config.py +62 -0
- linkml_store/inference/inference_engine.py +200 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/utils/format_utils.py +27 -90
- linkml_store/utils/llm_utils.py +96 -0
- linkml_store/utils/object_utils.py +103 -2
- linkml_store/utils/pandas_utils.py +55 -2
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/stats_utils.py +53 -0
- {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/METADATA +28 -2
- {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/RECORD +27 -15
- {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.13.dist-info → linkml_store-0.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, ClassVar, Dict, List, Optional, TextIO, Type, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from sklearn.model_selection import cross_val_score
|
|
9
|
+
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, OneHotEncoder
|
|
10
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
11
|
+
|
|
12
|
+
from linkml_store.api.collection import OBJECT
|
|
13
|
+
from linkml_store.inference.implementations.rule_based_inference_engine import RuleBasedInferenceEngine
|
|
14
|
+
from linkml_store.inference.inference_config import Inference, InferenceConfig
|
|
15
|
+
from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
|
|
16
|
+
from linkml_store.utils.sklearn_utils import tree_to_nested_expression, visualize_decision_tree
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class SklearnInferenceEngine(InferenceEngine):
    """
    Inference engine backed by a scikit-learn ``DecisionTreeClassifier``.

    Training data is flattened to a dataframe, high-cardinality features are
    dropped, remaining non-numeric features are encoded, and a single target
    attribute is predicted.
    """

    config: InferenceConfig
    # The fitted classifier; None until initialize_model() (or load_model()) has run.
    classifier: Any = None
    # Fitted encoders keyed by column name (feature columns and, if encoded, the target column).
    encoders: Dict[str, Any] = field(default_factory=dict)
    # Names of the columns actually fed to the classifier, in training order.
    transformed_features: List[str] = field(default_factory=list)
    # Classes of the target encoder (set from ``classes_``, i.e. an array, when the target is encoded).
    transformed_targets: List[str] = field(default_factory=list)
    # Feature columns dropped at training time because they had too many distinct values.
    skip_features: List[str] = field(default_factory=list)
    # Optional override for the encoder used on object/category columns.
    categorical_encoder_class: Optional[Type[Union[OneHotEncoder, MultiLabelBinarizer]]] = None
    # A feature is skipped when its distinct-value count exceeds this fraction of the row count.
    maximum_proportion_distinct_features: float = 0.2
    # Mean cross-validation score computed in initialize_model().
    confidence: float = 0.0

    # Not referenced by any method in this class.
    strict: bool = False

    # Attributes written by save_model() and restored by load_model().
    PERSIST_COLS: ClassVar = [
        "config",
        "classifier",
        "encoders",
        "transformed_features",
        "transformed_targets",
        "skip_features",
        "confidence",
    ]

    def _get_encoder(self, v: Union[List[Any], Any]) -> Any:
        """
        Choose an encoder for a column of values.

        :param v: a plain list, or an object with a ``dtype`` (e.g. a pandas Series)
        :return: an unfitted encoder, or None when the values are numeric and need no encoding
        :raises ValueError: if the list mixes types or no encoder can be determined
        """
        if isinstance(v, list):
            if all(isinstance(x, list) for x in v):
                return MultiLabelBinarizer()
            elif all(isinstance(x, str) for x in v):
                return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
            elif all(isinstance(x, (int, float)) for x in v):
                return None
            else:
                raise ValueError("Mixed data types in the list are not supported")
        if hasattr(v, "dtype"):
            if v.dtype == "object" or v.dtype.name == "category":
                # Only the first value is inspected to decide whether the column is multivalued.
                if isinstance(v.iloc[0], list):
                    return MultiLabelBinarizer()
                encoder_class = self.categorical_encoder_class
                if encoder_class:
                    # MultiLabelBinarizer has no ``handle_unknown`` parameter, so only
                    # pass it to encoder classes that can accept it.
                    if isinstance(encoder_class, type) and issubclass(encoder_class, MultiLabelBinarizer):
                        return encoder_class()
                    return encoder_class(handle_unknown="ignore")
                return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
            elif v.dtype.kind in "biufc":
                # bool / int / unsigned / float / complex: already numeric
                return None
        raise ValueError("Unable to determine appropriate encoder for the input data")

    def _is_complex_column(self, column: pd.Series) -> bool:
        """Check if the column contains list values (dicts are deliberately not counted)."""
        MV_TYPE = (list,)
        return (column.dtype == "object" or column.dtype == "category") and any(
            isinstance(x, MV_TYPE) for x in column.dropna()
        )

    def _get_unique_values(self, column: pd.Series) -> set:
        """Get unique values from a column, handling list-type data."""
        if self._is_complex_column(column):
            # For columns with lists, flatten the lists and get unique values
            return set(
                item for sublist in column.dropna() for item in (sublist if isinstance(sublist, list) else [sublist])
            )
        else:
            return set(column.unique())

    def initialize_model(self, **kwargs):
        """
        Train the decision tree on ``self.training_data``.

        Populates ``skip_features``, ``encoders``, ``transformed_features``,
        ``transformed_targets``, ``classifier`` and ``confidence``. If the config
        lists no feature attributes, every non-target column is used and the
        config is updated accordingly.

        :param kwargs: accepted for interface compatibility; not used
        :raises ValueError: if the config does not name exactly one target attribute
        """
        logger.info(f"Initializing model with config: {self.config}")
        df = self.training_data.as_dataframe(flattened=True)
        logger.info(f"Training data shape: {df.shape}")
        target_cols = self.config.target_attributes
        feature_cols = self.config.feature_attributes
        # ``not target_cols`` also covers None, which previously surfaced as a TypeError from len().
        if not target_cols or len(target_cols) != 1:
            raise ValueError("Only one target column is supported")
        if not feature_cols:
            feature_cols = df.columns.difference(target_cols).tolist()
            self.config.feature_attributes = feature_cols
        target_col = target_cols[0]
        logger.info(f"Feature columns: {feature_cols}")
        X = df[feature_cols].copy()
        logger.info(f"Target column: {target_col}")
        y = df[target_col].copy()

        # find list of features to skip (too many distinct values relative to row count)
        skip_features = []
        for col in X.columns:
            unique_values = self._get_unique_values(X[col])
            if len(unique_values) > self.maximum_proportion_distinct_features * len(X[col]):
                skip_features.append(col)
        self.skip_features = skip_features
        X = X.drop(skip_features, axis=1)
        logger.info(f"Skipping features: {skip_features}")

        # Encode features
        encoded_features = []
        for col in X.columns:
            logger.info(f"Checking whether to encode: {col}")
            col_encoder = self._get_encoder(X[col])
            if col_encoder:
                self.encoders[col] = col_encoder
                if isinstance(col_encoder, OneHotEncoder):
                    encoded = col_encoder.fit_transform(X[[col]])
                    feature_names = col_encoder.get_feature_names_out([col])
                    encoded_df = pd.DataFrame(encoded, columns=feature_names, index=X.index)
                    X = pd.concat([X.drop(col, axis=1), encoded_df], axis=1)
                    encoded_features.extend(feature_names)
                elif isinstance(col_encoder, MultiLabelBinarizer):
                    encoded = col_encoder.fit_transform(X[col])
                    feature_names = [f"{col}_{c}" for c in col_encoder.classes_]
                    encoded_df = pd.DataFrame(encoded, columns=feature_names, index=X.index)
                    X = pd.concat([X.drop(col, axis=1), encoded_df], axis=1)
                    encoded_features.extend(feature_names)
                else:
                    X[col] = col_encoder.fit_transform(X[col])
                    encoded_features.append(col)
            else:
                encoded_features.append(col)

        self.transformed_features = encoded_features
        logger.info(f"Encoded features: {self.transformed_features}")
        logger.info(f"Number of features after encoding: {len(self.transformed_features)}")

        # Encode target; a one-hot encoder makes no sense for a single label column,
        # so it is swapped for a LabelEncoder.
        y_encoder = self._get_encoder(y)
        if isinstance(y_encoder, OneHotEncoder):
            y_encoder = LabelEncoder()
        if y_encoder:
            self.encoders[target_col] = y_encoder
            y = y_encoder.fit_transform(y.values.ravel())  # Convert to 1D numpy array
            self.transformed_targets = y_encoder.classes_

        logger.info(f"Fitting model with features: {X.columns}")
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X, y)
        self.classifier = clf
        logger.info("Model fit complete")
        cv_scores = cross_val_score(self.classifier, X, y, cv=5)
        self.confidence = cv_scores.mean()
        logger.info(f"Cross-validation scores: {cv_scores}")

    def derive(self, object: OBJECT) -> Optional[Inference]:
        """
        Predict the target attribute for a single object.

        :param object: input object; only the configured feature attributes are read
        :return: an Inference holding ``{target_attribute: value}`` and the stored confidence
        """
        object = self._normalize(object)
        new_X = pd.DataFrame([object])

        # Apply the encoders fitted at training time
        encoded_features = {}
        for col in self.config.feature_attributes:
            if col in self.skip_features:
                continue
            if col in self.encoders:
                encoder = self.encoders[col]
                if isinstance(encoder, OneHotEncoder):
                    logger.debug("Encoding: %s v=%s encoder=%s", col, object[col], encoder)
                    encoded = encoder.transform(new_X[[col]])
                    feature_names = encoder.get_feature_names_out([col])
                    for i, name in enumerate(feature_names):
                        encoded_features[name] = encoded[0, i]
                elif isinstance(encoder, MultiLabelBinarizer):
                    encoded = encoder.transform(new_X[col])
                    feature_names = [f"{col}_{c}" for c in encoder.classes_]
                    for i, name in enumerate(feature_names):
                        encoded_features[name] = encoded[0, i]
                else:  # LabelEncoder or similar
                    encoded_features[col] = encoder.transform(new_X[col].astype(str))[0]
            else:
                encoded_features[col] = new_X[col].iloc[0]

        # Ensure all expected features are present and in the training order;
        # anything missing is filled with 0.
        final_features = [encoded_features.get(feature, 0) for feature in self.transformed_features]

        # Create the final input array
        new_X_array = np.array(final_features).reshape(1, -1)

        logger.info(f"Input features: {self.transformed_features}")
        logger.info(f"Number of input features: {len(self.transformed_features)}")

        predictions = self.classifier.predict(new_X_array)
        target_attribute = self.config.target_attributes[0]
        y_encoder = self.encoders.get(target_attribute)

        if y_encoder:
            v = y_encoder.inverse_transform(predictions)
        else:
            v = predictions

        predicted_object = {target_attribute: v[0]}
        logger.info(f"Predicted object: {predicted_object}")
        return Inference(predicted_object=predicted_object, confidence=self.confidence)

    def _normalize(self, object: OBJECT) -> OBJECT:
        """
        Normalize the input object to ensure it has all the expected attributes.

        Attributes missing from the input are set to None, and float NaN values
        are replaced by None.

        :param object: input object
        :return: dict with exactly the configured feature attributes as keys
        """

        def _tr(x: Any):
            # NaN never compares equal to itself, so a dict lookup only matches the
            # np.nan singleton by identity; test the value explicitly instead.
            if isinstance(x, (float, np.floating)) and np.isnan(x):
                return None
            return x

        return {k: _tr(object.get(k, None)) for k in self.config.feature_attributes}

    def export_model(
        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
    ):
        """
        Export the trained model in the requested serialization.

        When ``model_serialization`` is not given it is guessed from the file
        name (for path-like outputs), falling back to JOBLIB.

        :param output: destination path, or an open text stream for LINKML_EXPRESSION
        :param model_serialization: serialization to use
        :param kwargs: accepted for interface compatibility; not used
        :raises ValueError: if the serialization is not supported by this engine
        """
        is_path = isinstance(output, (str, Path))
        if model_serialization is None:
            if is_path:
                # str() so that pathlib.Path outputs are handled as well as plain strings
                model_serialization = ModelSerialization.from_filepath(str(output))
            if model_serialization is None:
                model_serialization = ModelSerialization.JOBLIB

        if model_serialization == ModelSerialization.LINKML_EXPRESSION:
            expr = tree_to_nested_expression(
                self.classifier,
                self.transformed_features,
                self.encoders.keys(),
                feature_encoders=self.encoders,
                target_encoder=self.encoders.get(self.config.target_attributes[0]),
            )
            if is_path:
                # We opened the file, so we are responsible for closing it.
                with open(output, "w") as stream:
                    stream.write(expr)
            else:
                # Caller-supplied stream: write but leave it open for the caller.
                output.write(expr)
        elif model_serialization == ModelSerialization.JOBLIB:
            self.save_model(output)
        elif model_serialization == ModelSerialization.RULE_BASED:
            rbie = RuleBasedInferenceEngine(config=self.config)
            rbie.import_model_from(self)
            rbie.save_model(output)
        elif model_serialization == ModelSerialization.PNG:
            visualize_decision_tree(self.classifier, self.transformed_features, self.transformed_targets, output)
        else:
            raise ValueError(f"Unsupported model serialization: {model_serialization}")

    def save_model(self, output: Union[str, Path]) -> None:
        """
        Save the trained model and related data to a file.

        :param output: Path to save the model
        :raises ValueError: if no classifier has been trained yet
        """
        import joblib

        if self.classifier is None:
            raise ValueError("Model has not been trained. Call initialize_model() first.")

        model_data = {k: getattr(self, k) for k in self.PERSIST_COLS}

        joblib.dump(model_data, output)

    @classmethod
    def load_model(cls, file_path: Union[str, Path]) -> "SklearnInferenceEngine":
        """
        Load a trained model and related data from a file.

        :param file_path: Path to the saved model
        :return: SklearnInferenceEngine instance with loaded model
        """
        import joblib

        # SECURITY: joblib.load unpickles arbitrary objects; only load files from trusted sources.
        model_data = joblib.load(file_path)

        engine = cls(config=model_data["config"])
        for k, v in model_data.items():
            if k == "config":
                continue
            setattr(engine, k, v)

        logger.info(f"Model loaded from {file_path}")
        return engine
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
5
|
+
|
|
6
|
+
from linkml_store.api.collection import OBJECT
|
|
7
|
+
from linkml_store.utils.format_utils import Format, load_objects
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LLMConfig(BaseModel):
    """
    Configuration for the LLM indexer.

    Unknown fields are rejected, and pydantic's protected ``model_`` namespace
    is disabled so that the ``model_name`` field can be declared.
    """

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    # Model identifier and prompt-related settings
    model_name: str = "gpt-4o-mini"
    token_limit: Optional[int] = None
    number_of_few_shot_examples: Optional[int] = None
    role: str = "Domain Expert"

    # Embedding cache location (presumably database/collection names; confirm against callers)
    cached_embeddings_database: Optional[str] = None
    cached_embeddings_collection: Optional[str] = None

    # Optional text template and the syntax it is written in
    text_template: Optional[str] = None
    text_template_syntax: Optional[str] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class InferenceConfig(BaseModel, extra="forbid"):
    """
    Configuration for inference engines.

    Unknown fields are rejected.
    """

    # Attributes to predict
    target_attributes: Optional[List[str]] = None
    # Attributes used as predictors
    feature_attributes: Optional[List[str]] = None
    # (train, test) proportions
    train_test_split: Optional[Tuple[float, float]] = None
    # Settings for LLM-backed engines
    llm_config: Optional[LLMConfig] = None

    @classmethod
    def from_file(cls, file_path: str, format: Optional[Format] = None) -> "InferenceConfig":
        """
        Build an inference config from the single object stored in a file.

        :param file_path: Path to the file.
        :param format: Format of the file (YAML is recommended).
        :return: InferenceConfig
        :raises ValueError: if the file does not contain exactly one object
        """
        is_tabular = bool(format) and format.is_xsv()
        if is_tabular:
            logger.warning("XSV format is not recommended for inference config files")
        loaded = load_objects(file_path, format=format)
        count = len(loaded)
        if count == 1:
            return cls(**loaded[0])
        raise ValueError(f"Expected 1 object, got {count}")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class Inference(BaseModel, extra="forbid"):
    """
    Result of an inference derivation.

    Holds the predicted object and an optional confidence constrained to [0.0, 1.0].
    """

    predicted_object: OBJECT = Field(..., description="The predicted object.")
    confidence: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="The confidence of the prediction.",
    )
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import random
|
|
3
|
+
from abc import ABC
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, TextIO, Tuple, Union
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from pydantic import BaseModel, ConfigDict
|
|
11
|
+
|
|
12
|
+
from linkml_store.api.collection import OBJECT, Collection
|
|
13
|
+
from linkml_store.inference.inference_config import Inference, InferenceConfig
|
|
14
|
+
from linkml_store.utils.pandas_utils import nested_objects_to_dataframe
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ModelSerialization(str, Enum):
    """
    Enum for model serialization types.
    """

    PICKLE = "pickle"
    ONNX = "onnx"
    PMML = "pmml"
    PFA = "pfa"
    JOBLIB = "joblib"
    PNG = "png"
    LINKML_EXPRESSION = "linkml_expression"
    RULE_BASED = "rulebased"

    @classmethod
    def from_filepath(cls, file_path: Union[str, Path]) -> Optional["ModelSerialization"]:
        """
        Get the serialization type from the file path.

        >>> ModelSerialization.from_filepath("model.onnx")
        <ModelSerialization.ONNX: 'onnx'>
        >>> ModelSerialization.from_filepath("model.pkl")
        <ModelSerialization.PICKLE: 'pickle'>
        >>> ModelSerialization.from_filepath(Path("model.rulebased.yaml"))
        <ModelSerialization.RULE_BASED: 'rulebased'>
        >>> assert ModelSerialization.from_filepath("poor_file_name") is None

        :param file_path: file name or path, as a string or ``pathlib.Path``
        :return: the matching serialization, or None if the suffix is not recognised
        """
        # str() lets callers pass pathlib.Path objects, which have no .split()
        toks = str(file_path).split(".")
        suffix = toks[-1]
        # Compound suffix "<name>.rulebased.yaml" selects the rule-based serialization
        if len(toks) > 2 and suffix == "yaml" and toks[-2] == "rulebased":
            return cls.RULE_BASED
        # Each member's value doubles as its file extension
        extension_mapping = {member.value.lower(): member for member in cls}
        # Add special cases
        extension_mapping["pkl"] = cls.PICKLE
        extension_mapping["py"] = cls.LINKML_EXPRESSION
        return extension_mapping.get(suffix, None)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class CollectionSlice(BaseModel):
    """
    A subset of a base collection, selected by row index.

    The subset is materialised lazily as its own collection the first time
    ``collection`` is accessed.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Label used in the name of the materialised subset collection
    name: Optional[str] = None
    # Collection the rows are drawn from
    base_collection: Optional[Collection] = None
    # Positions (within the base collection's full row list) that belong to this slice
    indices: Optional[Tuple[int, ...]] = None
    # Cache for the materialised subset collection
    _collection: Optional[Collection] = None

    @property
    def collection(self) -> Collection:
        """
        Return the subset as a collection, creating and populating it on first access.

        The subset collection is obtained from the base collection's parent
        database under the name ``<base alias>__rag_<name>``.

        :raises ValueError: if ``base_collection`` or ``indices`` has not been set
        """
        # Compare against None explicitly so the cache does not depend on the
        # truthiness of the collection object.
        if self._collection is None:
            if self.base_collection is None or self.indices is None:
                raise ValueError("CollectionSlice requires both base_collection and indices to be set")
            rows = self.base_collection.find({}, limit=-1).rows
            # subset based on indices
            subset = [rows[i] for i in self.indices]
            db = self.base_collection.parent
            subset_name = f"{self.base_collection.alias}__rag_{self.name}"
            subset_collection = db.get_collection(subset_name, create_if_not_exists=True)
            subset_collection.insert(subset)
            self._collection = subset_collection
        return self._collection

    def as_dataframe(self, flattened=False) -> pd.DataFrame:
        """
        Return the slice of the collection as a dataframe.

        :param flattened: if True, build the frame with ``nested_objects_to_dataframe``;
            otherwise return the result set's own dataframe
        :return: dataframe of the rows in this slice
        """
        rs = self.collection.find({}, limit=-1)
        if flattened:
            return nested_objects_to_dataframe(rs.rows)
        return rs.rows_dataframe
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class InferenceEngine(ABC):
    """
    Base class for all inference engine.

    An InferenceEngine is capable of deriving inferences from input objects and a collection.
    Subclasses override the methods below; the base implementations (other than
    ``load_and_split_data``) raise NotImplementedError.
    """

    # Name the engine was created under (set by the registry)
    predictor_type: Optional[str] = None
    config: Optional[InferenceConfig] = None

    # Slices produced by load_and_split_data()
    training_data: Optional[CollectionSlice] = None
    testing_data: Optional[CollectionSlice] = None

    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None, randomize=True):
        """
        Load the data and split it into training and testing sets.

        The split is taken from the ``split`` argument, then from
        ``config.train_test_split``, and finally defaults to ``(0.7, 0.3)``.
        Only the first element (the training proportion) is used; all remaining
        rows go to the test set.

        :param collection: collection to draw rows from
        :param split: (train, test) proportions
        :param randomize: if True, training rows are sampled at random; otherwise
            the leading rows are used for training
        :return: None; sets ``training_data`` and ``testing_data``
        """
        # config defaults to None, so guard before reading the configured split
        if not split and self.config is not None:
            split = self.config.train_test_split
        if not split:
            split = (0.7, 0.3)
        logger.info(f"Loading and splitting data from collection {collection.alias}")
        size = collection.size()
        indices = range(size)
        n_train = int(size * split[0])
        if randomize:
            train_indices = random.sample(indices, n_train)
            chosen = set(train_indices)
            # Keep test indices in ascending order rather than set iteration order
            test_indices = [i for i in indices if i not in chosen]
        else:
            train_indices = indices[:n_train]
            test_indices = indices[n_train:]
        # CollectionSlice.indices is declared as a tuple of ints, so pass tuples explicitly
        self.training_data = CollectionSlice(name="train", base_collection=collection, indices=tuple(train_indices))
        self.testing_data = CollectionSlice(name="test", base_collection=collection, indices=tuple(test_indices))

    def initialize_model(self, **kwargs):
        """
        Initialize the model.

        :param kwargs:
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Initialize model method must be implemented by subclass")

    def export_model(
        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
    ):
        """
        Export the model to the given output.

        :param model_serialization:
        :param output:
        :param kwargs:
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Export model method must be implemented by subclass")

    def import_model_from(self, inference_engine: "InferenceEngine", **kwargs):
        """
        Import the model from the given inference engine.

        :param inference_engine:
        :param kwargs:
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Import model method must be implemented by subclass")

    def save_model(self, output: Union[str, Path]) -> None:
        """
        Save the model to the given output.

        :param output:
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Save model method must be implemented by subclass")

    @classmethod
    def load_model(cls, file_path: Union[str, Path]) -> "InferenceEngine":
        """
        Load the model from the given file path.

        :param file_path:
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Load model method must be implemented by subclass")

    def derive(self, object: OBJECT) -> Optional[Inference]:
        """
        Derive the prediction for the given object.

        :param object:
        :return:
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Predict method must be implemented by subclass")
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import inspect
|
|
3
|
+
import os
|
|
4
|
+
from typing import Dict, Type
|
|
5
|
+
|
|
6
|
+
from linkml_store.inference.inference_config import InferenceConfig
|
|
7
|
+
from linkml_store.inference.inference_engine import InferenceEngine
|
|
8
|
+
from linkml_store.utils.object_utils import object_path_update
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class InferenceEngineRegistry:
    """Registry mapping engine names to InferenceEngine subclasses."""

    def __init__(self):
        self.engines: Dict[str, Type[InferenceEngine]] = {}

    def register(self, name: str, engine_class: Type[InferenceEngine]):
        """
        Register (or replace) an engine class under the given name.

        :param name: lookup name
        :param engine_class: class to instantiate for that name
        """
        self.engines[name] = engine_class

    def get_engine_class(self, name: str) -> Type[InferenceEngine]:
        """
        Look up a registered engine class.

        :param name: lookup name
        :return: the registered class
        :raises ValueError: if no engine is registered under ``name``
        """
        if name not in self.engines:
            raise ValueError(f"Unknown inference engine type: {name}. Known engines: {list(self.engines.keys())}")
        return self.engines[name]

    def create_engine(self, engine_type: str, config: InferenceConfig = None, **kwargs) -> InferenceEngine:
        """
        Instantiate an engine.

        ``engine_type`` may carry inline configuration after a colon, as a
        comma-separated list of ``key=value`` pairs (e.g. ``name:a=1,b=2``); each
        pair is applied to the config with ``object_path_update``.

        :param engine_type: engine name, optionally followed by ``:key=value,...``
        :param config: base configuration; created empty if inline arguments are given without one
        :param kwargs: extra constructor arguments; entries whose value is None are dropped
        :return: the new engine, with ``predictor_type`` set to the engine name
        :raises ValueError: if the engine is unknown or an inline argument is not ``key=value``
        """
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        if ":" in engine_type:
            engine_type, conf_args = engine_type.split(":", 1)
            if config is None:
                config = InferenceConfig()
            for arg in conf_args.split(","):
                # Split on the first "=" only, so values may themselves contain "="
                key, sep, value = arg.partition("=")
                if not sep:
                    raise ValueError(f"Expected key=value in engine configuration, got: {arg!r}")
                config = object_path_update(config, key, value)

        engine_class = self.get_engine_class(engine_type)
        kwargs["predictor_type"] = engine_type
        return engine_class(config=config, **kwargs)

    @classmethod
    def load_engines(cls, package_path: str):
        """
        Build a registry by importing every module in a package.

        Each InferenceEngine subclass found in a module is registered under its
        lower-cased class name with ``inferenceengine`` removed. Modules that fail
        to import are skipped with a warning.

        :param package_path: dotted name of the package to scan
        :return: the populated registry
        """
        import logging

        log = logging.getLogger(__name__)
        registry = cls()
        package_dir = os.path.dirname(importlib.import_module(package_path).__file__)
        for filename in os.listdir(package_dir):
            if not filename.endswith(".py") or filename.startswith("__"):
                continue
            module_name = f"{package_path}.{filename[:-3]}"
            try:
                module = importlib.import_module(module_name)
            except ImportError as e:
                # An implementation whose dependencies are missing should not break the others
                log.warning(f"Error importing {module_name}: {e}")
                continue
            for name, obj in inspect.getmembers(module):
                if inspect.isclass(obj) and issubclass(obj, InferenceEngine) and obj is not InferenceEngine:
                    engine_name = name.lower().replace("inferenceengine", "")
                    registry.register(engine_name, obj)
        return registry
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Initialize the registry
|
|
56
|
+
# Module-level registry, built at import time by scanning the implementations package.
registry = InferenceEngineRegistry.load_engines("linkml_store.inference.implementations")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Function to get an inference engine (can be used as before)
|
|
60
|
+
def get_inference_engine(engine_type: str, config: InferenceConfig = None, **kwargs) -> InferenceEngine:
    """
    Create an inference engine via the module-level registry.

    >>> from linkml_store.inference import get_inference_engine
    >>> ie = get_inference_engine('sklearn')
    >>> type(ie)
    <class 'linkml_store.inference.implementations.sklearn_inference_engine.SklearnInferenceEngine'>

    :param engine_type: name of the engine to create
    :param config: configuration passed to the engine
    :param kwargs: forwarded to the registry's ``create_engine``
    :return: the engine instance
    """
    engine = registry.create_engine(engine_type, config=config, **kwargs)
    return engine
|