linkml-store 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +35 -8
- linkml_store/api/collection.py +40 -5
- linkml_store/api/config.py +20 -3
- linkml_store/api/database.py +24 -3
- linkml_store/api/stores/mongodb/mongodb_collection.py +4 -0
- linkml_store/cli.py +140 -13
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/rag_inference_engine.py +145 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +158 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +290 -0
- linkml_store/inference/inference_config.py +62 -0
- linkml_store/inference/inference_engine.py +173 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/utils/format_utils.py +21 -90
- linkml_store/utils/llm_utils.py +95 -0
- linkml_store/utils/object_utils.py +3 -1
- linkml_store/utils/pandas_utils.py +55 -2
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/stats_utils.py +53 -0
- {linkml_store-0.1.13.dist-info → linkml_store-0.1.14.dist-info}/METADATA +25 -2
- {linkml_store-0.1.13.dist-info → linkml_store-0.1.14.dist-info}/RECORD +25 -14
- {linkml_store-0.1.13.dist-info → linkml_store-0.1.14.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.13.dist-info → linkml_store-0.1.14.dist-info}/WHEEL +0 -0
- {linkml_store-0.1.13.dist-info → linkml_store-0.1.14.dist-info}/entry_points.txt +0 -0
linkml_store/inference/inference_engine.py (new file)

@@ -0,0 +1,173 @@
+import logging
+from abc import ABC
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Optional, TextIO, Tuple, Union
+
+import pandas as pd
+from pydantic import BaseModel, ConfigDict, Field
+
+from linkml_store.api.collection import OBJECT, Collection
+from linkml_store.inference.inference_config import Inference, InferenceConfig
+from linkml_store.utils.pandas_utils import nested_objects_to_dataframe
+
+logger = logging.getLogger(__name__)
+
+
+class ModelSerialization(str, Enum):
+    """
+    Enum for model serialization types.
+    """
+
+    PICKLE = "pickle"
+    ONNX = "onnx"
+    PMML = "pmml"
+    PFA = "pfa"
+    JOBLIB = "joblib"
+    PNG = "png"
+    LINKML_EXPRESSION = "linkml_expression"
+    RULE_BASED = "rulebased"
+
+    @classmethod
+    def from_filepath(cls, file_path: str) -> Optional["ModelSerialization"]:
+        """
+        Get the serialization type from the file path.
+
+        >>> ModelSerialization.from_filepath("model.onnx")
+        <ModelSerialization.ONNX: 'onnx'>
+        >>> ModelSerialization.from_filepath("model.pkl")
+        <ModelSerialization.PICKLE: 'pickle'>
+        >>> assert ModelSerialization.from_filepath("poor_file_name") is None
+
+        :param file_path:
+        :return:
+        """
+        toks = file_path.split(".")
+        suffix = toks[-1]
+        if len(toks) > 2:
+            if suffix == "yaml" and toks[-2] == "rulebased":
+                return cls.RULE_BASED
+        # Generate mapping dynamically
+        extension_mapping = {v.lower(): v for v in cls}
+        # Add special cases
+        extension_mapping["pkl"] = cls.PICKLE
+        extension_mapping["py"] = cls.LINKML_EXPRESSION
+        return extension_mapping.get(suffix, None)
+
+
+class CollectionSlice(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    collection: Optional[Collection] = None
+    dataframe: Optional[pd.DataFrame] = None
+    slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
+
+    def as_dataframe(self, flattened=False) -> pd.DataFrame:
+        """
+        Return the slice of the collection as a dataframe.
+
+        :return:
+        """
+        if self.dataframe is not None:
+            df = self.dataframe
+            return df.iloc[self.slice[0] : self.slice[1]]
+        elif self.collection is not None:
+            rs = self.collection.find({}, offset=self.slice[0], limit=self.slice[1] - self.slice[0])
+            if flattened:
+                return nested_objects_to_dataframe(rs.rows)
+            else:
+                return rs.rows_dataframe
+        else:
+            raise ValueError("No dataframe or collection provided")
+
+
+@dataclass
+class InferenceEngine(ABC):
+    """
+    Base class for all inference engine.
+
+    An InferenceEngine is capable of deriving inferences from input objects and a collection.
+    """
+
+    predictor_type: Optional[str] = None
+    config: Optional[InferenceConfig] = None
+
+    training_data: Optional[CollectionSlice] = None
+    testing_data: Optional[CollectionSlice] = None
+
+    def load_and_split_data(self, collection: Collection, split: Optional[Tuple[float, float]] = None):
+        """
+        Load the data and split it into training and testing sets.
+
+        :param collection:
+        :param split:
+        :return:
+        """
+        split = split or self.config.train_test_split
+        if not split:
+            split = (0.7, 0.3)
+        logger.info(f"Loading and splitting data from collection {collection.alias}")
+        size = collection.size()
+        self.training_data = CollectionSlice(collection=collection, slice=(0, int(size * split[0])))
+        self.testing_data = CollectionSlice(collection=collection, slice=(int(size * split[0]), size))
+
+    def initialize_model(self, **kwargs):
+        """
+        Initialize the model.
+
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError("Initialize model method must be implemented by subclass")
+
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        """
+        Export the model to the given output.
+
+        :param model_serialization:
+        :param output:
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError("Export model method must be implemented by subclass")
+
+    def import_model_from(self, inference_engine: "InferenceEngine", **kwargs):
+        """
+        Import the model from the given inference engine.
+
+        :param inference_engine:
+        :param kwargs:
+        :return:
+        """
+        raise NotImplementedError("Import model method must be implemented by subclass")
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the model to the given output.
+
+        :param output:
+        :return:
+        """
+        raise NotImplementedError("Save model method must be implemented by subclass")
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "InferenceEngine":
+        """
+        Load the model from the given file path.
+
+        :param file_path:
+        :return:
+        """
+        raise NotImplementedError("Load model method must be implemented by subclass")
+
+    def derive(self, object: OBJECT) -> Optional[Inference]:
+        """
+        Derive the prediction for the given object.
+
+        :param object:
+        :return:
+        """
+        raise NotImplementedError("Predict method must be implemented by subclass")
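The split logic above hands each half of the data to a `CollectionSlice`, which can wrap either a live `Collection` or an in-memory DataFrame. A minimal sketch of the DataFrame path, using only the classes defined in this new module:

```python
import pandas as pd

from linkml_store.inference.inference_engine import CollectionSlice

df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": list("abcde")})

# A 4/1 split expressed as half-open row ranges, mirroring load_and_split_data
train = CollectionSlice(dataframe=df, slice=(0, 4))
test = CollectionSlice(dataframe=df, slice=(4, 5))

assert len(train.as_dataframe()) == 4  # rows 0-3 via df.iloc[0:4]
assert len(test.as_dataframe()) == 1   # row 4
```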
linkml_store/inference/inference_engine_registry.py (new file)

@@ -0,0 +1,74 @@
+import importlib
+import inspect
+import os
+from typing import Dict, Type
+
+from linkml_store.inference.inference_config import InferenceConfig
+from linkml_store.inference.inference_engine import InferenceEngine
+from linkml_store.utils.object_utils import object_path_update
+
+
+class InferenceEngineRegistry:
+    def __init__(self):
+        self.engines: Dict[str, Type[InferenceEngine]] = {}
+
+    def register(self, name: str, engine_class: Type[InferenceEngine]):
+        self.engines[name] = engine_class
+
+    def get_engine_class(self, name: str) -> Type[InferenceEngine]:
+        if name not in self.engines:
+            raise ValueError(f"Unknown inference engine type: {name}" f"Known engines: {list(self.engines.keys())}")
+        return self.engines[name]
+
+    def create_engine(self, engine_type: str, config: InferenceConfig = None, **kwargs) -> InferenceEngine:
+        kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        if ":" in engine_type:
+            engine_type, conf_args = engine_type.split(":", 1)
+            if config is None:
+                config = InferenceConfig()
+            for arg in conf_args.split(","):
+                k, v = arg.split("=")
+                config = object_path_update(config, k, v)
+
+        engine_class = self.get_engine_class(engine_type)
+        kwargs["predictor_type"] = engine_type
+        return engine_class(config=config, **kwargs)
+
+    @classmethod
+    def load_engines(cls, package_path: str):
+        registry = cls()
+        package_dir = os.path.dirname(importlib.import_module(package_path).__file__)
+        for filename in os.listdir(package_dir):
+            if filename.endswith(".py") and not filename.startswith("__"):
+                module_name = f"{package_path}.{filename[:-3]}"
+                try:
+                    module = importlib.import_module(module_name)
+                    for name, obj in inspect.getmembers(module):
+                        if inspect.isclass(obj) and issubclass(obj, InferenceEngine) and obj != InferenceEngine:
+                            engine_name = name.lower().replace("inferenceengine", "")
+                            registry.register(engine_name, obj)
+                except ImportError as e:
+                    print(f"Error importing {module_name}: {e}")
+        return registry
+
+
+# Initialize the registry
+registry = InferenceEngineRegistry.load_engines("linkml_store.inference.implementations")
+
+
+# Function to get an inference engine (can be used as before)
+def get_inference_engine(engine_type: str, config: InferenceConfig = None, **kwargs) -> InferenceEngine:
+    """
+    Get an inference engine.
+
+    >>> from linkml_store.inference import get_inference_engine
+    >>> ie = get_inference_engine('sklearn')
+    >>> type(ie)
+    <class 'linkml_store.inference.implementations.sklearn_inference_engine.SklearnInferenceEngine'>
+
+    :param engine_type:
+    :param config:
+    :param kwargs:
+    :return:
+    """
+    return registry.create_engine(engine_type, config, **kwargs)
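Engine names are derived from class names (lowercased, with the `inferenceengine` suffix stripped), so `SklearnInferenceEngine` registers as `sklearn` and `RuleBasedInferenceEngine` as `rulebased`. A short sketch of the lookup plus the colon-delimited inline-config syntax parsed by `create_engine`; the config key shown is hypothetical, since `InferenceConfig`'s fields are not part of this diff:

```python
from linkml_store.inference import get_inference_engine

# Plain lookup, as in the doctest above
ie = get_inference_engine("sklearn")

# Inline config: everything after ":" is split on "," into key=value pairs
# and applied to an InferenceConfig via object_path_update. The key below
# is illustrative only (hypothetical config field):
# ie = get_inference_engine("sklearn:target_attribute=species")
```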
linkml_store/utils/format_utils.py

@@ -27,6 +27,7 @@ class Format(Enum):
     JSON = "json"
     JSONL = "jsonl"
     YAML = "yaml"
+    YAMLL = "yamll"
     TSV = "tsv"
     CSV = "csv"
     PYTHON = "python"

@@ -63,6 +64,9 @@ class Format(Enum):
     def is_dump_format(self):
         return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
 
+    def is_xsv(self):
+        return self in [Format.TSV, Format.CSV]
+
 
 def load_objects_from_url(
     url: str,

@@ -135,11 +139,14 @@ def load_objects(
     compression: Optional[str] = None,
     expected_type: Optional[Type] = None,
     header_comment_token: Optional[str] = None,
+    select_query: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """
     Load objects from a file or archive in supported formats.
     For tgz archives, it processes all files and concatenates the results.
 
+    TODO: Add schema hints for CSV/TSV parsing.
+
     :param file_path: The path to the file or archive.
     :param format: The format of the file. Can be a Format enum or a string value.
     :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.

@@ -177,98 +184,22 @@ def load_objects(
     all_objects = process_file(f, format, expected_type, header_comment_token)
 
     logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+    if select_query:
+        import jsonpath_ng as jp
+
+        path_expr = jp.parse(select_query)
+        new_objs = []
+        for obj in all_objects:
+            for match in path_expr.find(obj):
+                logging.debug(f"Match: {match.value}")
+                if isinstance(match.value, list):
+                    new_objs.extend(match.value)
+                else:
+                    new_objs.append(match.value)
+        all_objects = new_objs
     return all_objects
 
 
-def xxxload_objects(
-    file_path: Union[str, Path],
-    format: Union[Format, str] = None,
-    compression: Optional[str] = None,
-    expected_type: Type = None,
-    header_comment_token: Optional[str] = None,
-) -> List[Dict[str, Any]]:
-    """
-    Load objects from a file in JSON, JSONLines, YAML, CSV, or TSV format.
-
-    >>> load_objects("tests/input/test_data/data.csv")
-    [{'id': '1', 'name': 'John', 'age': '30'},
-    {'id': '2', 'name': 'Alice', 'age': '25'}, {'id': '3', 'name': 'Bob', 'age': '35'}]
-
-    :param file_path: The path to the file.
-    :param format: The format of the file. Can be a Format enum or a string value.
-    :param expected_type: The target type to load the objects into, e.g. list
-    :return: A list of dictionaries representing the loaded objects.
-    """
-    if isinstance(format, str):
-        format = Format(format)
-
-    if isinstance(file_path, Path):
-        file_path = str(file_path)
-
-    if not format and (file_path.endswith(".parquet") or file_path.endswith(".pq")):
-        format = Format.PARQUET
-    if not format and file_path.endswith(".tsv"):
-        format = Format.TSV
-    if not format and file_path.endswith(".csv"):
-        format = Format.CSV
-    if not format and file_path.endswith(".py"):
-        format = Format.PYTHON
-
-    mode = "r"
-    if format == Format.PARQUET:
-        mode = "rb"
-
-    if file_path == "-":
-        # set file_path to be a stream from stdin
-        f = sys.stdin
-    else:
-        f = open(file_path, mode)
-
-    if format == Format.JSON or (not format and file_path.endswith(".json")):
-        objs = json.load(f)
-    elif format == Format.JSONL or (not format and file_path.endswith(".jsonl")):
-        objs = [json.loads(line) for line in f]
-    elif format == Format.YAML or (not format and (file_path.endswith(".yaml") or file_path.endswith(".yml"))):
-        if expected_type and expected_type == list:  # noqa E721
-            objs = list(yaml.safe_load_all(f))
-        else:
-            objs = yaml.safe_load(f)
-    elif format == Format.TSV or format == Format.CSV:
-        # Skip initial comment lines if comment_char is set
-        if header_comment_token:
-            # Store the original position
-            original_pos = f.tell()
-
-            # Read and store lines until we find a non-comment line
-            lines = []
-            for line in f:
-                if not line.startswith(header_comment_token):
-                    break
-                lines.append(line)
-
-            # Go back to the original position
-            f.seek(original_pos)
-
-            # Skip the comment lines we found
-            for _ in lines:
-                f.readline()
-        if format == Format.TSV:
-            reader = csv.DictReader(f, delimiter="\t")
-        else:
-            reader = csv.DictReader(f)
-        objs = list(reader)
-    elif format == Format.PARQUET:
-        import pyarrow.parquet as pq
-
-        table = pq.read_table(f)
-        objs = table.to_pandas().to_dict(orient="records")
-    else:
-        raise ValueError(f"Unsupported file format: {file_path}")
-    if not isinstance(objs, list):
-        objs = [objs]
-    return objs
-
-
 def write_output(
     data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame],
     format: Union[Format, str] = Format.YAML,
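The new `select_query` hook lets `load_objects` post-filter loaded objects with a JSONPath expression, flattening any list-valued match into individual rows. A small sketch of that behavior using `jsonpath_ng` directly; the `persons` key is illustrative, not part of the linkml-store API:

```python
import jsonpath_ng as jp

doc = {"persons": [{"name": "Alice"}, {"name": "Bob"}]}

# Equivalent to load_objects("data.json", select_query="persons"):
rows = []
for match in jp.parse("persons").find(doc):
    if isinstance(match.value, list):
        rows.extend(match.value)  # list matches are flattened into rows
    else:
        rows.append(match.value)
assert rows == [{"name": "Alice"}, {"name": "Bob"}]
```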
@@ -329,7 +260,7 @@ def render_output(
     if format == Format.FORMATTED:
         if not isinstance(data, pd.DataFrame):
             data = pd.DataFrame(data)
-        return
+        return data.to_string(max_rows=None)
 
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")
linkml_store/utils/llm_utils.py (new file)

@@ -0,0 +1,95 @@
+from typing import Callable, List, Optional
+
+from tiktoken import Encoding
+
+MODEL_TOKEN_MAPPING = {
+    "gpt-4o-mini": 128_000,
+    "gpt-4o": 128_000,
+    "gpt-4o-2024-05-13": 128_000,
+    "gpt-4": 8192,
+    "gpt-4-0314": 8192,
+    "gpt-4-0613": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0314": 32768,
+    "gpt-4-32k-0613": 32768,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-0301": 4096,
+    "gpt-3.5-turbo-0613": 4096,
+    "gpt-3.5-turbo-16k": 16385,
+    "gpt-3.5-turbo-16k-0613": 16385,
+    "gpt-3.5-turbo-instruct": 4096,
+    "text-ada-001": 2049,
+    "ada": 2049,
+    "text-babbage-001": 2040,
+    "babbage": 2049,
+    "text-curie-001": 2049,
+    "curie": 2049,
+    "davinci": 2049,
+    "text-davinci-003": 4097,
+    "text-davinci-002": 4097,
+    "code-davinci-002": 8001,
+    "code-davinci-001": 8001,
+    "code-cushman-002": 2048,
+    "code-cushman-001": 2048,
+    "claude": 200_000,
+}
+
+
+def render_formatted_text(
+    render_func: Callable,
+    values: List[str],
+    encoding: Encoding,
+    token_limit: int,
+    additional_text: Optional[str] = None,
+) -> str:
+    """
+    Render a formatted text string with a given object, encoding, and token limit.
+
+    >>> from tiktoken import encoding_for_model
+    >>> encoding = encoding_for_model("gpt-4o-mini")
+    >>> names = ["Alice", "Bob", "DoctorHippopotamusMcHippopotamusFace"]
+    >>> f = lambda x: f"Hello, {' '.join(x)}!"
+    >>> render_formatted_text(f, names, encoding, 4096)
+    'Hello, Alice Bob DoctorHippopotamusMcHippopotamusFace!'
+    >>> render_formatted_text(f, names, encoding, 5)
+    'Hello, Alice Bob!'
+
+    :param render_func: Rendering function
+    :param values: Values to render
+    :param encoding: Encoding
+    :param token_limit: Token limit
+    :param additional_text: Additional text to consider
+    :return:
+    """
+    text = render_func(values)
+    if additional_text:
+        token_limit -= len(encoding.encode(additional_text))
+    text_length = len(encoding.encode(text))
+    if text_length <= token_limit:
+        return text
+    if not values:
+        raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+    return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)
+
+
+def get_token_limit(model_name: str) -> int:
+    """
+    Estimate the token limit for a model.
+
+    >>> get_token_limit("gpt-4o-mini")
+    128000
+
+    also works with nested names:
+
+    >>> get_token_limit("my/claude-opus")
+    200000
+
+
+    :param model_name: Model name
+    :return: Estimated token limit
+    """
+    # sort MODEL_TOKEN_MAPPING by key length to ensure that the longest model names are checked first
+    for model, token_limit in sorted(MODEL_TOKEN_MAPPING.items(), key=lambda x: len(x[0]), reverse=True):
+        if model in model_name:
+            return token_limit
+    return 4096
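`get_token_limit` does a longest-substring match against `MODEL_TOKEN_MAPPING`, and `render_formatted_text` drops trailing values until the rendered text fits the budget. A sketch of how the two compose, for example when packing few-shot examples into a prompt (the pairing with the new RAG engine is an assumption; only the helper signatures come from this diff):

```python
from tiktoken import encoding_for_model

from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text

model = "gpt-4o-mini"
budget = get_token_limit(model)      # 128000, via substring match
encoding = encoding_for_model(model)

preamble = "You are a classifier. Use the examples below."
examples = [f"input {i} -> label {i % 2}" for i in range(1000)]

# Trailing examples are dropped one at a time until the rendered text,
# plus the reserved preamble, fits under the token budget.
prompt_body = render_formatted_text(
    lambda xs: "\n".join(xs),
    values=examples,
    encoding=encoding,
    token_limit=budget,
    additional_text=preamble,
)
```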
linkml_store/utils/object_utils.py

@@ -29,7 +29,7 @@ def object_path_update(
     """
     if isinstance(obj, BaseModel):
         typ = type(obj)
-        obj = obj.
+        obj = obj.model_dump(exclude_none=True)
         obj = object_path_update(obj, path, value)
         return typ(**obj)
     obj = deepcopy(obj)

@@ -45,6 +45,8 @@ def object_path_update(
                 obj.append({})
             obj = obj[index]
         else:
+            if part in obj and obj[part] is None:
+                del obj[part]
             obj = obj.setdefault(part, {})
     last_part = parts[-1]
     if "[" in last_part:
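The second hunk guards against intermediate keys that exist but hold None: `dict.setdefault` returns the existing None instead of a fresh dict, which breaks the descent. A hedged sketch of the failure mode this fixes, assuming dot-separated paths:

```python
# Before the fix, descending through {"config": None} stalls:
obj = {"config": None}
step = obj.setdefault("config", {})  # returns None, not {}
# step["threshold"] = 0.5 would raise TypeError on NoneType

# With the fix, the None entry is deleted first, so setdefault
# creates a real dict and the nested update can proceed:
if "config" in obj and obj["config"] is None:
    del obj["config"]
step = obj.setdefault("config", {})
step["threshold"] = 0.5
assert obj == {"config": {"threshold": 0.5}}
```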
linkml_store/utils/pandas_utils.py

@@ -1,7 +1,59 @@
-
+import logging
+from typing import Any, Dict, List, Tuple, Union
 
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
+
+def flatten_dict(d: Dict[str, Any], parent_key: str = "", sep: str = ".") -> Dict[str, Any]:
+    """
+    Recursively flatten a nested dictionary.
+
+    Args:
+        d (Dict[str, Any]): The dictionary to flatten.
+        parent_key (str): The parent key for nested dictionaries.
+        sep (str): The separator to use between keys.
+
+    Returns:
+        Dict[str, Any]: A flattened dictionary.
+
+    >>> flatten_dict({'a': 1, 'b': {'c': 2, 'd': {'e': 3}}})
+    {'a': 1, 'b.c': 2, 'b.d.e': 3}
+    """
+    items = []
+    for k, v in d.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_dict(v, new_key, sep=sep).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
+def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
+    """
+    Convert a list of nested objects to a flattened pandas DataFrame.
+
+    Args:
+        data (List[Dict[str, Any]]): A list of nested dictionaries.
+
+    Returns:
+        pd.DataFrame: A flattened DataFrame.
+
+    >>> data = [
+    ...     {"person": {"name": "Alice", "age": 30}, "job": {"title": "Engineer", "salary": 75000}},
+    ...     {"person": {"name": "Bob", "age": 35}, "job": {"title": "Manager", "salary": 85000}}
+    ... ]
+    >>> df = nested_objects_to_dataframe(data)
+    >>> df.columns.tolist()
+    ['person.name', 'person.age', 'job.title', 'job.salary']
+    >>> df['person.name'].tolist()
+    ['Alice', 'Bob']
+    """
+    flattened_data = [flatten_dict(item) for item in data]
+    return pd.DataFrame(flattened_data)
+
 
 def facet_summary_to_dataframe_unmelted(
     facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]]

@@ -22,7 +74,8 @@ def facet_summary_to_dataframe_unmelted(
         categories, value = cat_val_tuple[:-1], cat_val_tuple[-1]
         row = {"Value": value}
         for i, facet in enumerate(facet_type):
-
+            logger.debug(f"FT={facet_type} i={i} Facet: {facet}, categories: {categories}")
+            row[facet] = categories[i] if len(categories) > i else None
         rows.append(row)
 
     df = pd.DataFrame(rows)
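One behavior worth noting in `flatten_dict`: only nested dicts are recursed into, while list values survive intact as single cell values. A quick illustration (the input shape is invented for the example):

```python
from linkml_store.utils.pandas_utils import flatten_dict, nested_objects_to_dataframe

# Lists are not flattened; they are kept whole under their dotted key.
print(flatten_dict({"id": 1, "meta": {"tags": ["a", "b"], "depth": {"x": 2}}}))
# {'id': 1, 'meta.tags': ['a', 'b'], 'meta.depth.x': 2}

df = nested_objects_to_dataframe([{"id": 1, "meta": {"tags": ["a", "b"]}}])
print(df.columns.tolist())  # ['id', 'meta.tags']
```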