linkml-store 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,173 @@
+import logging
+from abc import ABC
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Optional, TextIO, Tuple, Union
+
+import pandas as pd
+from pydantic import BaseModel, ConfigDict, Field
+
+from linkml_store.api.collection import OBJECT, Collection
+from linkml_store.inference.inference_config import Inference, InferenceConfig
+from linkml_store.utils.pandas_utils import nested_objects_to_dataframe
+
+logger = logging.getLogger(__name__)
+
+
+class ModelSerialization(str, Enum):
+    """
+    Enum for model serialization types.
+    """
+
+    PICKLE = "pickle"
+    ONNX = "onnx"
+    PMML = "pmml"
+    PFA = "pfa"
+    JOBLIB = "joblib"
+    PNG = "png"
+    LINKML_EXPRESSION = "linkml_expression"
+    RULE_BASED = "rulebased"
+
+    @classmethod
+    def from_filepath(cls, file_path: str) -> Optional["ModelSerialization"]:
+        """
+        Get the serialization type from the file path.
+
+        >>> ModelSerialization.from_filepath("model.onnx")
+        <ModelSerialization.ONNX: 'onnx'>
+        >>> ModelSerialization.from_filepath("model.pkl")
+        <ModelSerialization.PICKLE: 'pickle'>
+        >>> assert ModelSerialization.from_filepath("poor_file_name") is None
+
+        :param file_path: path whose suffix determines the serialization type
+        :return: the matching serialization type, or None if unrecognized
+        """
+        toks = file_path.split(".")
+        suffix = toks[-1]
+        if len(toks) > 2:
+            if suffix == "yaml" and toks[-2] == "rulebased":
+                return cls.RULE_BASED
+        # Generate mapping dynamically
+        extension_mapping = {v.lower(): v for v in cls}
+        # Add special cases
+        extension_mapping["pkl"] = cls.PICKLE
+        extension_mapping["py"] = cls.LINKML_EXPRESSION
+        return extension_mapping.get(suffix, None)
+
+
+class CollectionSlice(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    collection: Optional[Collection] = None
+    dataframe: Optional[pd.DataFrame] = None
+    slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
+
+    def as_dataframe(self, flattened=False) -> pd.DataFrame:
+        """
+        Return the slice of the collection as a dataframe.
+
+        :param flattened: if True, flatten nested objects into dotted columns
+        :return: the sliced rows as a pandas DataFrame
+        """
+        if self.dataframe is not None:
+            df = self.dataframe
+            return df.iloc[self.slice[0] : self.slice[1]]
+        elif self.collection is not None:
+            rs = self.collection.find({}, offset=self.slice[0], limit=self.slice[1] - self.slice[0])
+            if flattened:
+                return nested_objects_to_dataframe(rs.rows)
+            else:
+                return rs.rows_dataframe
+        else:
+            raise ValueError("No dataframe or collection provided")
+
+
+@dataclass
+class InferenceEngine(ABC):
+    """
+    Base class for all inference engines.
+
+    An InferenceEngine is capable of deriving inferences from input objects and a collection.
+    """
+
+    predictor_type: Optional[str] = None
+    config: Optional[InferenceConfig] = None
+
+    training_data: Optional[CollectionSlice] = None
+    testing_data: Optional[CollectionSlice] = None
+
+    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None):
+        """
+        Load the data and split it into training and testing sets.
+
+        :param collection: the source collection to split
+        :param split: (train, test) fractions; defaults to the config value, else (0.7, 0.3)
+        """
+        split = split or self.config.train_test_split
+        if not split:
+            split = (0.7, 0.3)
+        logger.info(f"Loading and splitting data from collection {collection.alias}")
+        size = collection.size()
+        self.training_data = CollectionSlice(collection=collection, slice=(0, int(size * split[0])))
+        self.testing_data = CollectionSlice(collection=collection, slice=(int(size * split[0]), size))
+
+    def initialize_model(self, **kwargs):
+        """
+        Initialize the model.
+
+        :param kwargs: engine-specific options
+        """
+        raise NotImplementedError("Initialize model method must be implemented by subclass")
+
+    def export_model(
+        self,
+        output: Optional[Union[str, Path, TextIO]],
+        model_serialization: Optional[ModelSerialization] = None,
+        **kwargs,
+    ):
+        """
+        Export the model to the given output.
+
+        :param output: file path or stream to write to
+        :param model_serialization: target serialization format
+        :param kwargs: engine-specific options
+        """
+        raise NotImplementedError("Export model method must be implemented by subclass")
+
+    def import_model_from(self, inference_engine: "InferenceEngine", **kwargs):
+        """
+        Import the model from the given inference engine.
+
+        :param inference_engine: the engine to copy the model from
+        :param kwargs: engine-specific options
+        """
+        raise NotImplementedError("Import model method must be implemented by subclass")
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the model to the given output.
+
+        :param output: file path to write to
+        """
+        raise NotImplementedError("Save model method must be implemented by subclass")
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "InferenceEngine":
+        """
+        Load the model from the given file path.
+
+        :param file_path: path to a previously saved model
+        :return: the restored InferenceEngine
+        """
+        raise NotImplementedError("Load model method must be implemented by subclass")
+
+    def derive(self, object: OBJECT) -> Optional[Inference]:
+        """
+        Derive the prediction for the given object.
+
+        :param object: the input object to derive an inference for
+        :return: the derived inference, or None
+        """
+        raise NotImplementedError("Predict method must be implemented by subclass")
@@ -0,0 +1,74 @@
+import importlib
+import inspect
+import os
+from typing import Dict, Optional, Type
+
+from linkml_store.inference.inference_config import InferenceConfig
+from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.utils.object_utils import object_path_update
+
+
+class InferenceEngineRegistry:
+    def __init__(self):
+        self.engines: Dict[str, Type[InferenceEngine]] = {}
+
+    def register(self, name: str, engine_class: Type[InferenceEngine]):
+        self.engines[name] = engine_class
+
+    def get_engine_class(self, name: str) -> Type[InferenceEngine]:
+        if name not in self.engines:
+            raise ValueError(f"Unknown inference engine type: {name}. " f"Known engines: {list(self.engines.keys())}")
+        return self.engines[name]
+
+    def create_engine(self, engine_type: str, config: Optional[InferenceConfig] = None, **kwargs) -> InferenceEngine:
+        kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        if ":" in engine_type:
+            engine_type, conf_args = engine_type.split(":", 1)
+            if config is None:
+                config = InferenceConfig()
+            for arg in conf_args.split(","):
+                k, v = arg.split("=")
+                config = object_path_update(config, k, v)
+
+        engine_class = self.get_engine_class(engine_type)
+        kwargs["predictor_type"] = engine_type
+        return engine_class(config=config, **kwargs)
+
+    @classmethod
+    def load_engines(cls, package_path: str):
+        registry = cls()
+        package_dir = os.path.dirname(importlib.import_module(package_path).__file__)
+        for filename in os.listdir(package_dir):
+            if filename.endswith(".py") and not filename.startswith("__"):
+                module_name = f"{package_path}.{filename[:-3]}"
+                try:
+                    module = importlib.import_module(module_name)
+                    for name, obj in inspect.getmembers(module):
+                        if inspect.isclass(obj) and issubclass(obj, InferenceEngine) and obj != InferenceEngine:
+                            engine_name = name.lower().replace("inferenceengine", "")
+                            registry.register(engine_name, obj)
+                except ImportError as e:
+                    print(f"Error importing {module_name}: {e}")
+        return registry
+
+
+# Initialize the registry
+registry = InferenceEngineRegistry.load_engines("linkml_store.inference.implementations")
+
+
+# Function to get an inference engine (can be used as before)
+def get_inference_engine(engine_type: str, config: Optional[InferenceConfig] = None, **kwargs) -> InferenceEngine:
+    """
+    Get an inference engine.
+
+    >>> from linkml_store.inference import get_inference_engine
+    >>> ie = get_inference_engine('sklearn')
+    >>> type(ie)
+    <class 'linkml_store.inference.implementations.sklearn_inference_engine.SklearnInferenceEngine'>
+
+    :param engine_type: registered engine name, optionally with ":key=value,..." config arguments
+    :param config: optional explicit configuration
+    :param kwargs: passed through to the engine constructor
+    :return: a configured InferenceEngine instance
+    """
+    return registry.create_engine(engine_type, config, **kwargs)
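
A sketch of the engine-type string syntax that create_engine parses; the config key target_attributes is an illustrative guess at an InferenceConfig field, not confirmed by this diff.

from linkml_store.inference import get_inference_engine

# plain engine name, resolved via the registry
ie = get_inference_engine("sklearn")

# everything after ":" is split on "," then "=", and each key=value pair is
# applied to the InferenceConfig via object_path_update
ie2 = get_inference_engine("sklearn:target_attributes=species")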
@@ -27,6 +27,7 @@ class Format(Enum):
     JSON = "json"
     JSONL = "jsonl"
     YAML = "yaml"
+    YAMLL = "yamll"
     TSV = "tsv"
     CSV = "csv"
     PYTHON = "python"
@@ -63,6 +64,9 @@ class Format(Enum):
     def is_dump_format(self):
        return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
 
+    def is_xsv(self):
+        return self in [Format.TSV, Format.CSV]
+
 
 def load_objects_from_url(
     url: str,
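
The new is_xsv() helper groups the two delimiter-separated formats. A minimal sketch (the format_utils module path is inferred from context, not shown in this diff):

from linkml_store.utils.format_utils import Format  # module path assumed

assert Format.CSV.is_xsv()
assert Format("tsv").is_xsv()       # Enum lookup by value also works
assert not Format.JSON.is_xsv()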
@@ -135,11 +139,14 @@ def load_objects(
     compression: Optional[str] = None,
     expected_type: Optional[Type] = None,
     header_comment_token: Optional[str] = None,
+    select_query: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """
     Load objects from a file or archive in supported formats.
     For tgz archives, it processes all files and concatenates the results.
 
+    TODO: Add schema hints for CSV/TSV parsing.
+
     :param file_path: The path to the file or archive.
     :param format: The format of the file. Can be a Format enum or a string value.
     :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
@@ -177,98 +184,22 @@ def load_objects(
     all_objects = process_file(f, format, expected_type, header_comment_token)
 
     logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+    if select_query:
+        import jsonpath_ng as jp
+
+        path_expr = jp.parse(select_query)
+        new_objs = []
+        for obj in all_objects:
+            for match in path_expr.find(obj):
+                logger.debug(f"Match: {match.value}")
+                if isinstance(match.value, list):
+                    new_objs.extend(match.value)
+                else:
+                    new_objs.append(match.value)
+        all_objects = new_objs
     return all_objects
 
 
-def xxxload_objects(
-    file_path: Union[str, Path],
-    format: Union[Format, str] = None,
-    compression: Optional[str] = None,
-    expected_type: Type = None,
-    header_comment_token: Optional[str] = None,
-) -> List[Dict[str, Any]]:
-    """
-    Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.
-
-    >>> load_objects("tests/input/test_data/data.csv")
-    [{'id': '1', 'name': 'John', 'age': '30'},
-    {'id': '2', 'name': 'Alice', 'age': '25'}, {'id': '3', 'name': 'Bob', 'age': '35'}]
-
-    :param file_path: The path to the file.
-    :param format: The format of the file. Can be a Format enum or a string value.
-    :param expected_type: The target type to load the objects into, e.g. list
-    :return: A list of dictionaries representing the loaded objects.
-    """
-    if isinstance(format, str):
-        format = Format(format)
-
-    if isinstance(file_path, Path):
-        file_path = str(file_path)
-
-    if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
-        format = Format.PARQUET
-    if not format and file_path.endswith(".tsv"):
-        format = Format.TSV
-    if not format and file_path.endswith(".csv"):
-        format = Format.CSV
-    if not format and file_path.endswith(".py"):
-        format = Format.PYTHON
-
-    mode = "r"
-    if format == Format.PARQUET:
-        mode = "rb"
-
-    if file_path == "-":
-        # set file_path to be a stream from stdin
-        f = sys.stdin
-    else:
-        f = open(file_path, mode)
-
-    if format == Format.JSON or (not format and file_path.endswith(".json")):
-        objs = json.load(f)
-    elif format == Format.JSONL or (not format and file_path.endswith(".jsonl")):
-        objs = [json.loads(line) for line in f]
-    elif format == Format.YAML or (not format and (file_path.endswith(".yaml") or file_path.endswith(".yml"))):
-        if expected_type and expected_type == list:  # noqa E721
-            objs = list(yaml.safe_load_all(f))
-        else:
-            objs = yaml.safe_load(f)
-    elif format == Format.TSV or format == Format.CSV:
-        # Skip initial comment lines if comment_char is set
-        if header_comment_token:
-            # Store the original position
-            original_pos = f.tell()
-
-            # Read and store lines until we find a non-comment line
-            lines = []
-            for line in f:
-                if not line.startswith(header_comment_token):
-                    break
-                lines.append(line)
-
-            # Go back to the original position
-            f.seek(original_pos)
-
-            # Skip the comment lines we found
-            for _ in lines:
-                f.readline()
-        if format == Format.TSV:
-            reader = csv.DictReader(f, delimiter="\t")
-        else:
-            reader = csv.DictReader(f)
-        objs = list(reader)
-    elif format == Format.PARQUET:
-        import pyarrow.parquet as pq
-
-        table = pq.read_table(f)
-        objs = table.to_pandas().to_dict(orient="records")
-    else:
-        raise ValueError(f"Unsupported file format: {file_path}")
-    if not isinstance(objs, list):
-        objs = [objs]
-    return objs
-
-
 def write_output(
     data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame],
     format: Union[Format, str] = Format.YAML,
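
The new select_query parameter filters each loaded object through a jsonpath_ng expression, extending list matches so the result stays a flat list of objects. A sketch with a hypothetical wrapped YAML file:

# Hypothetical input file wrapped.yaml:
#
#   items:
#     - id: 1
#     - id: 2
#
from linkml_store.utils.format_utils import load_objects  # module path assumed

objs = load_objects("wrapped.yaml", select_query="items")
# list matches are extended into the result, so this yields
# [{'id': 1}, {'id': 2}] rather than a single-element list of lists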
@@ -329,7 +260,7 @@ def render_output(
     if format == Format.FORMATTED:
         if not isinstance(data, pd.DataFrame):
             data = pd.DataFrame(data)
-        return str(data)
+        return data.to_string(max_rows=None)
 
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")
@@ -0,0 +1,95 @@
+from typing import Callable, List, Optional
+
+from tiktoken import Encoding
+
+MODEL_TOKEN_MAPPING = {
+    "gpt-4o-mini": 128_000,
+    "gpt-4o": 128_000,
+    "gpt-4o-2024-05-13": 128_000,
+    "gpt-4": 8192,
+    "gpt-4-0314": 8192,
+    "gpt-4-0613": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0314": 32768,
+    "gpt-4-32k-0613": 32768,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-0301": 4096,
+    "gpt-3.5-turbo-0613": 4096,
+    "gpt-3.5-turbo-16k": 16385,
+    "gpt-3.5-turbo-16k-0613": 16385,
+    "gpt-3.5-turbo-instruct": 4096,
+    "text-ada-001": 2049,
+    "ada": 2049,
+    "text-babbage-001": 2040,
+    "babbage": 2049,
+    "text-curie-001": 2049,
+    "curie": 2049,
+    "davinci": 2049,
+    "text-davinci-003": 4097,
+    "text-davinci-002": 4097,
+    "code-davinci-002": 8001,
+    "code-davinci-001": 8001,
+    "code-cushman-002": 2048,
+    "code-cushman-001": 2048,
+    "claude": 200_000,
+}
+
+
+def render_formatted_text(
+    render_func: Callable,
+    values: List[str],
+    encoding: Encoding,
+    token_limit: int,
+    additional_text: Optional[str] = None,
+) -> str:
+    """
+    Render a formatted text string with a given object, encoding, and token limit.
+
+    >>> from tiktoken import encoding_for_model
+    >>> encoding = encoding_for_model("gpt-4o-mini")
+    >>> names = ["Alice", "Bob", "DoctorHippopotamusMcHippopotamusFace"]
+    >>> f = lambda x: f"Hello, {' '.join(x)}!"
+    >>> render_formatted_text(f, names, encoding, 4096)
+    'Hello, Alice Bob DoctorHippopotamusMcHippopotamusFace!'
+    >>> render_formatted_text(f, names, encoding, 5)
+    'Hello, Alice Bob!'
+
+    :param render_func: Rendering function
+    :param values: Values to render
+    :param encoding: Encoding
+    :param token_limit: Token limit
+    :param additional_text: Additional text to budget for alongside the rendered values
+    :return: the rendered text, with trailing values dropped until it fits the limit
+    """
+    text = render_func(values)
+    if additional_text:
+        token_limit -= len(encoding.encode(additional_text))
+    text_length = len(encoding.encode(text))
+    if text_length <= token_limit:
+        return text
+    if not values:
+        raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+    return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)
+
+
+def get_token_limit(model_name: str) -> int:
+    """
+    Estimate the token limit for a model.
+
+    >>> get_token_limit("gpt-4o-mini")
+    128000
+
+    Also works with prefixed names:
+
+    >>> get_token_limit("my/claude-opus")
+    200000
+
+    :param model_name: Model name
+    :return: Estimated token limit
+    """
+    # Sort MODEL_TOKEN_MAPPING by key length so that the longest model names are checked first
+    for model, token_limit in sorted(MODEL_TOKEN_MAPPING.items(), key=lambda x: len(x[0]), reverse=True):
+        if model in model_name:
+            return token_limit
+    return 4096
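
The two helpers compose naturally into a prompt-budgeting loop: look up the model's limit, then let render_formatted_text drop trailing values until the rendered prompt fits. A sketch (the llm_utils module path and the prompt contents are assumptions):

from tiktoken import encoding_for_model

from linkml_store.utils.llm_utils import (  # module path assumed
    get_token_limit,
    render_formatted_text,
)

model = "gpt-4o-mini"
encoding = encoding_for_model(model)
facts = ["fact one", "fact two", "fact three"]  # illustrative data

# trailing facts are dropped until the rendered prompt fits the budget
prompt = render_formatted_text(
    lambda xs: "Answer using only these facts:\n" + "\n".join(xs),
    facts,
    encoding=encoding,
    token_limit=get_token_limit(model),
)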
@@ -29,7 +29,7 @@ def object_path_update(
     """
     if isinstance(obj, BaseModel):
         typ = type(obj)
-        obj = obj.dict()
+        obj = obj.model_dump(exclude_none=True)
         obj = object_path_update(obj, path, value)
         return typ(**obj)
     obj = deepcopy(obj)
@@ -45,6 +45,8 @@ def object_path_update(
                 obj.append({})
             obj = obj[index]
         else:
+            if part in obj and obj[part] is None:
+                del obj[part]
             obj = obj.setdefault(part, {})
     last_part = parts[-1]
     if "[" in last_part:
@@ -1,7 +1,59 @@
-from typing import Dict, List, Tuple, Union
+import logging
+from typing import Any, Dict, List, Tuple, Union
 
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
+
+def flatten_dict(d: Dict[str, Any], parent_key: str = "", sep: str = ".") -> Dict[str, Any]:
+    """
+    Recursively flatten a nested dictionary.
+
+    Args:
+        d (Dict[str, Any]): The dictionary to flatten.
+        parent_key (str): The parent key for nested dictionaries.
+        sep (str): The separator to use between keys.
+
+    Returns:
+        Dict[str, Any]: A flattened dictionary.
+
+    >>> flatten_dict({'a': 1, 'b': {'c': 2, 'd': {'e': 3}}})
+    {'a': 1, 'b.c': 2, 'b.d.e': 3}
+    """
+    items = []
+    for k, v in d.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_dict(v, new_key, sep=sep).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
+def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
+    """
+    Convert a list of nested objects to a flattened pandas DataFrame.
+
+    Args:
+        data (List[Dict[str, Any]]): A list of nested dictionaries.
+
+    Returns:
+        pd.DataFrame: A flattened DataFrame.
+
+    >>> data = [
+    ...     {"person": {"name": "Alice", "age": 30}, "job": {"title": "Engineer", "salary": 75000}},
+    ...     {"person": {"name": "Bob", "age": 35}, "job": {"title": "Manager", "salary": 85000}}
+    ... ]
+    >>> df = nested_objects_to_dataframe(data)
+    >>> df.columns.tolist()
+    ['person.name', 'person.age', 'job.title', 'job.salary']
+    >>> df['person.name'].tolist()
+    ['Alice', 'Bob']
+    """
+    flattened_data = [flatten_dict(item) for item in data]
+    return pd.DataFrame(flattened_data)
+
 
 def facet_summary_to_dataframe_unmelted(
     facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]]
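
Note that flatten_dict recurses only into dict values, so lists pass through intact and become list-valued cells in the DataFrame produced by nested_objects_to_dataframe:

# Only dict values are recursed into; lists are kept as-is as cell values.
flatten_dict({"a": [1, 2], "b": {"c": 3}})
# -> {'a': [1, 2], 'b.c': 3}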
@@ -22,7 +74,8 @@ facet_summary_to_dataframe_unmelted(
         categories, value = cat_val_tuple[:-1], cat_val_tuple[-1]
         row = {"Value": value}
         for i, facet in enumerate(facet_type):
-            row[facet] = categories[i]
+            logger.debug(f"FT={facet_type} i={i} Facet: {facet}, categories: {categories}")
+            row[facet] = categories[i] if len(categories) > i else None
         rows.append(row)
 
     df = pd.DataFrame(rows)