linkml-store 0.1.14__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries, and is provided for informational purposes only.


@@ -1,13 +1,17 @@
+import json
 import logging
 from dataclasses import dataclass
-from typing import Any, Optional
+from pathlib import Path
+from typing import ClassVar, List, Optional, TextIO, Union
 
 import yaml
 from llm import get_key
+from pydantic import BaseModel
 
 from linkml_store.api.collection import OBJECT, Collection
 from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
-from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
+from linkml_store.utils.object_utils import select_nested
 
 logger = logging.getLogger(__name__)
 
@@ -22,6 +26,12 @@ You should return ONLY valid YAML in your response.
 """
 
 
+class TrainedModel(BaseModel, extra="forbid"):
+    rag_collection_rows: List[OBJECT]
+    index_rows: List[OBJECT]
+    config: Optional[InferenceConfig] = None
+
+
 @dataclass
 class RAGInferenceEngine(InferenceEngine):
     """
@@ -48,14 +58,23 @@ class RAGInferenceEngine(InferenceEngine):
     >>> prediction.predicted_object
     {'capital': 'Montevideo', 'code': 'UY', 'continent': 'South America', 'languages': ['Spanish']}
 
+    The "model" can be saved for later use:
+
+    >>> ie.export_model("tests/output/countries.rag_model.json")
+
+    Note in this case the model is not the underlying LLM, but the "RAG Model" which is the vectorized
+    representation of training set objects.
+
     """
 
-    classifier: Any = None
-    encoders: dict = None
     _model: "llm.Model" = None  # noqa: F821
 
    rag_collection: Collection = None
 
+    PERSIST_COLS: ClassVar[List[str]] = [
+        "config",
+    ]
+
     def __post_init__(self):
         if not self.config:
             self.config = InferenceConfig()
@@ -75,18 +94,11 @@ class RAGInferenceEngine(InferenceEngine):
         return self._model
 
     def initialize_model(self, **kwargs):
-        td = self.training_data
-        s = td.slice
-        if not s[0] and not s[1]:
-            rag_collection = td.collection
-        else:
-            base_collection = td.collection
-            objs = base_collection.find({}, offset=s[0], limit=s[1] - s[0]).rows
-            db = base_collection.parent
-            rag_collection = db.get_collection(f"{base_collection.alias}__rag_{s[0]}_{s[1]}", create_if_not_exists=True)
-            rag_collection.insert(objs)
-        rag_collection.attach_indexer("llm", auto_index=False)
-        self.rag_collection = rag_collection
+        logger.info(f"Initializing model {self.model}")
+        if self.training_data:
+            rag_collection = self.training_data.collection
+            rag_collection.attach_indexer("llm", auto_index=False)
+            self.rag_collection = rag_collection
 
     def object_to_text(self, object: OBJECT) -> str:
         return yaml.dump(object)
@@ -103,24 +115,34 @@ class RAGInferenceEngine(InferenceEngine):
         target_attributes = self.config.target_attributes
         num_examples = self.config.llm_config.number_of_few_shot_examples or 5
         query_text = self.object_to_text(object)
-        if not self.rag_collection.indexers:
-            raise ValueError("RAG collection must have an indexer attached")
-        rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm")
-        examples = rs.rows
-        if not examples:
-            raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
+        if not self.rag_collection:
+            # TODO: zero-shot mode
+            examples = []
+        else:
+            if not self.rag_collection.indexers:
+                raise ValueError("RAG collection must have an indexer attached")
+            rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm")
+            examples = rs.rows
+            if not examples:
+                raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
+        query_obj = select_nested(object, feature_attributes)
+        query_text = self.object_to_text(query_obj)
         for example in examples:
-            input_obj = {k: example.get(k, None) for k in feature_attributes}
-            output_obj = {k: example.get(k, None) for k in target_attributes}
+            input_obj = select_nested(example, feature_attributes)
+            input_obj_text = self.object_to_text(input_obj)
+            if input_obj_text == query_text:
+                raise ValueError(
+                    f"Query object {query_text} is the same as example object {input_obj_text}\n"
+                    "This indicates possible test data leakage\n."
+                    "TODO: allow an option that allows user to treat this as a basic lookup\n"
+                )
+            output_obj = select_nested(example, target_attributes)
             prompt_clause = (
-                "---\nExample:\n"
-                f"## INPUT:\n{self.object_to_text(input_obj)}\n"
-                f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
+                "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
             )
             prompt_clauses.append(prompt_clause)
-        query_obj = {k: object.get(k, None) for k in feature_attributes}
-        query_text = self.object_to_text(query_obj)
+
         prompt_end = "---\nQuery:\n" f"## INPUT:\n{query_text}\n" "## OUTPUT:\n"
         system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
 
@@ -137,9 +159,74 @@ class RAGInferenceEngine(InferenceEngine):
         response = model.prompt(prompt, system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
+        return Inference(predicted_object=self._parse_yaml_payload(yaml_str))
+
+    def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
+        if "```" in yaml_str:
+            yaml_str = yaml_str.split("```")[1].strip()
+            if yaml_str.startswith("yaml"):
+                yaml_str = yaml_str[4:].strip()
         try:
-            predicted_object = yaml.safe_load(yaml_str)
-            return Inference(predicted_object=predicted_object)
-        except yaml.parser.ParserError as e:
-            logger.error(f"Error parsing response: {yaml_str}\n{e}")
+            return yaml.safe_load(yaml_str)
+        except Exception as e:
+            if strict:
+                raise e
+            logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
             return None
+
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        self.save_model(output)
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the trained model and related data to a file.
+
+        :param output: Path to save the model
+        """
+
+        # trigger index
+        _qr = self.rag_collection.search("*", limit=1)
+        assert len(_qr.ranked_rows) > 0
+
+        rows = self.rag_collection.find(limit=-1).rows
+
+        indexers = self.rag_collection.indexers
+        assert len(indexers) == 1
+        ix = self.rag_collection.indexers["llm"]
+        ix_coll = self.rag_collection.parent.get_collection(self.rag_collection.get_index_collection_name(ix))
+
+        ix_rows = ix_coll.find(limit=-1).rows
+        assert len(ix_rows) > 0
+        tm = TrainedModel(rag_collection_rows=rows, index_rows=ix_rows, config=self.config)
+        # tm = TrainedModel(rag_collection_rows=rows, index_rows=ix_rows)
+        with open(output, "w", encoding="utf-8") as f:
+            json.dump(tm.model_dump(), f)
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "RAGInferenceEngine":
+        """
+        Load a trained model and related data from a file.
+
+        :param file_path: Path to the saved model
+        :return: SklearnInferenceEngine instance with loaded model
+        """
+        with open(file_path, "r", encoding="utf-8") as f:
+            model_data = json.load(f)
+        tm = TrainedModel(**model_data)
+        from linkml_store.api import Client
+
+        client = Client()
+        db = client.attach_database("duckdb", alias="training")
+        db.store({"data": tm.rag_collection_rows})
+        collection = db.get_collection("data")
+        ix = collection.attach_indexer("llm", auto_index=False)
+        assert ix.name
+        ix_coll_name = collection.get_index_collection_name(ix)
+        assert ix_coll_name
+        ix_coll = db.get_collection(ix_coll_name, create_if_not_exists=True)
+        ix_coll.insert(tm.index_rows)
+        ie = cls(config=tm.config)
+        ie.rag_collection = collection
+        return ie
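
The new export_model/save_model/load_model methods persist the RAG "model" as plain JSON: the example rows, their "llm" index rows, and the config. A minimal sketch of the intended round trip, assuming `ie` is an already-trained RAGInferenceEngine whose rag_collection has been indexed (the path reuses the one from the class doctest):

    # Sketch only: `ie` is assumed to be a trained RAGInferenceEngine.
    path = "tests/output/countries.rag_model.json"
    ie.export_model(path)                       # dumps TrainedModel(rag_collection_rows, index_rows, config) as JSON
    ie2 = RAGInferenceEngine.load_model(path)   # rebuilds a duckdb-backed collection and re-attaches the "llm" indexer
    assert ie2.rag_collection.size() > 0
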
@@ -13,7 +13,7 @@ from linkml_runtime.utils.formatutils import underscore
 from pydantic import BaseModel
 
 from linkml_store.api.collection import OBJECT, Collection
-from linkml_store.inference.inference_config import Inference
+from linkml_store.inference.inference_config import Inference, InferenceConfig
 from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
 
 logger = logging.getLogger(__name__)
@@ -111,11 +111,16 @@ class RuleBasedInferenceEngine(InferenceEngine):
         object = {underscore(k): v for k, v in object.items()}
         if self.slot_expressions:
             for slot, expr in self.slot_expressions.items():
-                print(f"EVAL {object}")
                 v = eval_expr(expr, **object)
                 if v is not None:
                     object[slot] = v
-        return Inference(predicted_object=object)
+        if self.config and self.config.target_attributes:
+            predicted_object = {k: object.get(k, None) for k in self.config.target_attributes}
+        else:
+            predicted_object = object
+        if all(v is None for v in predicted_object.values()):
+            return None
+        return Inference(predicted_object=predicted_object)
 
     def import_model_from(self, inference_engine: InferenceEngine, **kwargs):
         io = StringIO()
@@ -127,6 +132,8 @@ class RuleBasedInferenceEngine(InferenceEngine):
         if self.slot_expressions is None:
             self.slot_expressions = {}
         self.slot_expressions[target_attribute] = io.getvalue()
+        if not self.config:
+            self.config = inference_engine.config
 
     def save_model(self, output: Union[str, Path]) -> None:
         """
@@ -148,7 +155,11 @@ class RuleBasedInferenceEngine(InferenceEngine):
     def load_model(cls, file_path: Union[str, Path]) -> "RuleBasedInferenceEngine":
         model_data = yaml.safe_load(open(file_path))
 
-        engine = cls(config=model_data["config"])
+        if model_data["config"]:
+            config = InferenceConfig(**model_data["config"])
+        else:
+            config = None
+        engine = cls(config=config)
         for k, v in model_data.items():
             if k == "config":
                 continue
@@ -153,7 +153,7 @@ class SklearnInferenceEngine(InferenceEngine):
             y = y_encoder.fit_transform(y.values.ravel())  # Convert to 1D numpy array
             self.transformed_targets = y_encoder.classes_
 
-        logger.info(f"Fitting model with features: {X.columns}")
+        # print(f"Fitting model with features: {X.columns}")
         clf = DecisionTreeClassifier(random_state=42)
         clf.fit(X, y)
         self.classifier = clf
@@ -174,6 +174,7 @@ class SklearnInferenceEngine(InferenceEngine):
             if col in self.encoders:
                 encoder = self.encoders[col]
                 if isinstance(encoder, OneHotEncoder):
+                    print(f"Encoding: {col} v={object[col]} df={new_X[[col]]} encoder={encoder}")
                     encoded = encoder.transform(new_X[[col]])
                     feature_names = encoder.get_feature_names_out([col])
                     for i, name in enumerate(feature_names):
@@ -216,7 +217,24 @@ class SklearnInferenceEngine(InferenceEngine):
         return Inference(predicted_object=predicted_object, confidence=self.confidence)
 
     def _normalize(self, object: OBJECT) -> OBJECT:
-        return {k: object.get(k, None) for k in self.config.feature_attributes}
+        """
+        Normalize the input object to ensure it has all the expected attributes.
+
+        Also remove any numpy/pandas oddities
+
+        :param object:
+        :return:
+        """
+        np_map = {np.nan: None}
+
+        def _tr(x: Any):
+            # TODO: figure a more elegant way to do this
+            try:
+                return np_map.get(x, x)
+            except TypeError:
+                return x
+
+        return {k: _tr(object.get(k, None)) for k in self.config.feature_attributes}
 
     def export_model(
         self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
@@ -35,6 +35,7 @@ class InferenceConfig(BaseModel, extra="forbid"):
     feature_attributes: Optional[List[str]] = None
     train_test_split: Optional[Tuple[float, float]] = None
     llm_config: Optional[LLMConfig] = None
+    random_seed: Optional[int] = None
 
     @classmethod
     def from_file(cls, file_path: str, format: Optional[Format] = None) -> "InferenceConfig":
@@ -1,4 +1,5 @@
 import logging
+import random
 from abc import ABC
 from dataclasses import dataclass
 from enum import Enum
@@ -6,7 +7,7 @@ from pathlib import Path
 from typing import Optional, TextIO, Tuple, Union
 
 import pandas as pd
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict
 
 from linkml_store.api.collection import OBJECT, Collection
 from linkml_store.inference.inference_config import Inference, InferenceConfig
@@ -28,6 +29,7 @@ class ModelSerialization(str, Enum):
     PNG = "png"
     LINKML_EXPRESSION = "linkml_expression"
     RULE_BASED = "rulebased"
+    RAG_INDEX = "rag_index"
 
     @classmethod
     def from_filepath(cls, file_path: str) -> Optional["ModelSerialization"]:
@@ -57,11 +59,36 @@
 
 
 class CollectionSlice(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    collection: Optional[Collection] = None
-    dataframe: Optional[pd.DataFrame] = None
-    slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
+
+    name: Optional[str] = None
+    base_collection: Optional[Collection] = None
+    # _dataframe: Optional[pd.DataFrame] = None
+    # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
+    indices: Optional[Tuple[int, ...]] = None
+    _collection: Optional[Collection] = None
+
+    @property
+    def collection(self) -> Collection:
+        if not self._collection and not self.indices:
+            return self.base_collection
+        if not self._collection:
+            rows = self.base_collection.find({}, limit=-1).rows
+            subset = [rows[i] for i in self.indices]
+            db = self.base_collection.parent
+            subset_name = self.slice_alias
+            subset_collection = db.get_collection(subset_name, create_if_not_exists=True)
+            # ensure the collection has the same schema type as the base collection;
+            # this ensures that column/attribute types are preserved
+            subset_collection.metadata.type = self.base_collection.target_class_name
+            subset_collection.delete_where({})
+            subset_collection.insert(subset)
+            self._collection = subset_collection
+        return self._collection
+
+    @property
+    def slice_alias(self) -> str:
+        return f"{self.base_collection.alias}__rag_{self.name}"
 
     def as_dataframe(self, flattened=False) -> pd.DataFrame:
         """
@@ -69,17 +96,11 @@ class CollectionSlice(BaseModel):
 
         :return:
         """
-        if self.dataframe is not None:
-            df = self.dataframe
-            return df.iloc[self.slice[0] : self.slice[1]]
-        elif self.collection is not None:
-            rs = self.collection.find({}, offset=self.slice[0], limit=self.slice[1] - self.slice[0])
-            if flattened:
-                return nested_objects_to_dataframe(rs.rows)
-            else:
-                return rs.rows_dataframe
+        rs = self.collection.find({}, limit=-1)
+        if flattened:
+            return nested_objects_to_dataframe(rs.rows)
         else:
-            raise ValueError("No dataframe or collection provided")
+            return rs.rows_dataframe
 
 
 @dataclass
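
CollectionSlice has been reworked: instead of holding a (start, end) slice over a dataframe or collection, it now holds a base_collection plus an explicit tuple of row indices, and the derived collection is materialized lazily (under the alias `<base alias>__rag_<name>`) the first time the `collection` property is read. A minimal usage sketch, assuming `countries` is a hypothetical pre-populated Collection:

    # Sketch only: `countries` is a hypothetical Collection with at least 4 rows.
    sl = CollectionSlice(name="train", base_collection=countries, indices=(0, 2, 3))
    subset = sl.collection                 # creates "<countries.alias>__rag_train" holding rows 0, 2 and 3
    df = sl.as_dataframe(flattened=True)   # nested rows flattened into dataframe columns
    # with indices=None, the property returns the base collection itself
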
@@ -96,21 +117,34 @@ class InferenceEngine(ABC):
     training_data: Optional[CollectionSlice] = None
     testing_data: Optional[CollectionSlice] = None
 
-    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None):
+    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None, randomize=True):
         """
         Load the data and split it into training and testing sets.
 
         :param collection:
         :param split:
+        :param randomize:
         :return:
         """
+        local_random = random.Random(self.config.random_seed) if self.config.random_seed else random.Random()
         split = split or self.config.train_test_split
         if not split:
             split = (0.7, 0.3)
+        if split[0] == 1.0:
+            self.training_data = CollectionSlice(name="train", base_collection=collection, indices=None)
+            self.testing_data = None
+            return
         logger.info(f"Loading and splitting data from collection {collection.alias}")
         size = collection.size()
-        self.training_data = CollectionSlice(collection=collection, slice=(0, int(size * split[0])))
-        self.testing_data = CollectionSlice(collection=collection, slice=(int(size * split[0]), size))
+        indices = range(size)
+        if randomize:
+            train_indices = local_random.sample(indices, int(size * split[0]))
+            test_indices = set(indices) - set(train_indices)
+        else:
+            train_indices = indices[: int(size * split[0])]
+            test_indices = indices[int(size * split[0]) :]
+        self.training_data = CollectionSlice(name="train", base_collection=collection, indices=train_indices)
+        self.testing_data = CollectionSlice(name="test", base_collection=collection, indices=test_indices)
 
     def initialize_model(self, **kwargs):
         """
@@ -47,6 +47,7 @@ class Format(Enum):
             ".jsonl": cls.JSONL,
             ".yaml": cls.YAML,
             ".yml": cls.YAML,
+            ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
             ".py": cls.PYTHON,
@@ -98,6 +99,9 @@ def process_file(
     """
     Process a single file and return a list of objects.
     """
+    if format == Format.YAMLL:
+        format = Format.YAML
+        expected_type = list
     if format == Format.JSON:
         objs = json.load(f)
     elif format == Format.JSONL:
@@ -105,6 +109,8 @@
     elif format == Format.YAML:
         if expected_type and expected_type == list:  # noqa E721
             objs = list(yaml.safe_load_all(f))
+            # allow YAML with a `---` with no object before it
+            objs = [obj for obj in objs if obj is not None]
         else:
             objs = yaml.safe_load(f)
     elif format in [Format.TSV, Format.CSV]:
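
The new `.yamll` extension is handled as multi-document YAML: process_file downgrades Format.YAMLL to Format.YAML with an expected type of list, and the added filter drops the empty document that appears when a stream opens with a bare `---`. A small illustration of that filtering using PyYAML directly (not the linkml-store API):

    # Sketch only: shows why None documents are filtered out.
    import yaml

    text = "---\n---\nname: Uruguay\ncode: UY\n"
    docs = list(yaml.safe_load_all(text))      # [None, {'name': 'Uruguay', 'code': 'UY'}]
    docs = [d for d in docs if d is not None]  # [{'name': 'Uruguay', 'code': 'UY'}]
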
@@ -20,6 +20,7 @@ MODEL_TOKEN_MAPPING = {
     "gpt-3.5-turbo-instruct": 4096,
     "text-ada-001": 2049,
     "ada": 2049,
+    "ada-002": 8192,
     "text-babbage-001": 2040,
     "babbage": 2049,
     "text-curie-001": 2049,
@@ -32,6 +33,7 @@ MODEL_TOKEN_MAPPING = {
     "code-cushman-002": 2048,
     "code-cushman-001": 2048,
     "claude": 200_000,
+    "llama-3": 200_000,
 }
 
 
@@ -1,6 +1,6 @@
 import json
 from copy import deepcopy
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel
 
@@ -60,6 +60,41 @@ def object_path_update(
     return ret_obj
 
 
+def object_path_get(obj: Union[BaseModel, Dict[str, Any]], path: str, default_value=None) -> Any:
+    """
+    Retrieves a value from a nested object based on a path description. The path to the
+    desired field is given in dot and bracket notation (e.g., 'a[0].b.c[1]').
+
+    :param obj: The dictionary object to be updated.
+    :type obj: Dict[str, Any]
+    :param path: The path string indicating where to place the value within the object.
+    :type path: str
+    :return: The value at the specified path.
+    :rtype: Any
+
+    **Example**::
+
+        >>> data = {'persons': [{'foo': {'bar': 1}}]}
+        >>> object_path_get(data, 'persons[0].foo.bar')
+        1
+        >>> object_path_get(data, 'persons[0].foo')
+        {'bar': 1}
+        >>> object_path_get({}, 'not there', "NA")
+        'NA'
+    """
+    if isinstance(obj, BaseModel):
+        obj = obj.dict()
+    parts = path.split(".")
+    for part in parts:
+        if "[" in part:
+            key, index = part[:-1].split("[")
+            index = int(index)
+            obj = obj[key][index]
+        else:
+            obj = obj.get(part, default_value)
+    return obj
+
+
 def parse_update_expression(expr: str) -> Union[tuple[str, Any], None]:
     """
     Parse a string expression of the form 'path.to.field=value' into a path and a value.
@@ -81,3 +116,67 @@ def clean_empties(value: Union[Dict, List]) -> Any:
     elif isinstance(value, list):
         value = [v for v in (clean_empties(v) for v in value) if v is not None]
     return value
+
+
+def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=None) -> Optional[dict]:
+    """
+    Select nested attributes from a complex dictionary based on selector strings.
+
+    Args:
+        data (dict): The input nested dictionary.
+        selectors (list): A list of selector strings.
+
+    Returns:
+        dict: A new dictionary with the same structure, but only the selected attributes.
+
+    Example:
+        >>> data = {
+        ...     "person": {
+        ...         "name": "John Doe",
+        ...         "age": 30,
+        ...         "address": {
+        ...             "street": "123 Main St",
+        ...             "city": "Anytown",
+        ...             "country": "USA"
+        ...         },
+        ...         "phones": [
+        ...             {"type": "home", "number": "555-1234"},
+        ...             {"type": "work", "number": "555-5678"}
+        ...         ]
+        ...     },
+        ...     "company": {
+        ...         "name": "Acme Inc",
+        ...         "location": "New York"
+        ...     }
+        ... }
+        >>> select_nested(data, ["person.address.street", "person.address.city"])
+        {'person': {'address': {'street': '123 Main St', 'city': 'Anytown'}}}
+        >>> select_nested(data, ["person.phones.number", "person.phones.type"])
+        {'person': {'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person"])
+        {'person': {'name': 'John Doe', 'age': 30, 'address': {'street': '123 Main St', 'city': 'Anytown',
+        'country': 'USA'}, 'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person.phones.type"])
+        {'person': {'phones': [{'type': 'home'}, {'type': 'work'}]}}
+    """
+    if current_path is None:
+        current_path = []
+    matching_paths = []
+    for path in paths:
+        if isinstance(path, str):
+            path = path.split(".")
+        if path == current_path:
+            return data
+        if path[: len(current_path)] == current_path:
+            matching_paths.append(path)
+    if not matching_paths:
+        return None
+    if isinstance(data, dict):
+        new_obj = {k: select_nested(v, matching_paths, current_path + [k]) for k, v in data.items()}
+        new_obj = {k: v for k, v in new_obj.items() if v is not None}
+        return new_obj
+    if isinstance(data, list):
+        new_obj = [select_nested(v, matching_paths, current_path + []) for i, v in enumerate(data)]
+        new_obj = [v for v in new_obj if v is not None]
        return new_obj
+    return data
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.1.14
+Version: 0.2.1
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -18,6 +18,7 @@ Provides-Extra: chromadb
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
+Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
@@ -34,7 +35,9 @@ Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
+Requires-Dist: gcsfs ; extra == "ibis"
 Requires-Dist: h5py ; extra == "h5py"
+Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
@@ -43,6 +46,7 @@ Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
 Requires-Dist: llm ; extra == "llm"
 Requires-Dist: matplotlib ; extra == "analytics"
+Requires-Dist: multipledispatch ; extra == "ibis"
 Requires-Dist: neo4j ; extra == "neo4j"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
@@ -52,6 +56,7 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
@@ -70,6 +75,8 @@ common query, index, and storage operations.
 
 For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
 
+See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
+
 __Warning__ LinkML-Store is still undergoing changes and refactoring,
 APIs and command line options are subject to change!
 
@@ -196,3 +203,4 @@ make app
 
 See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
 
+