linkml-store 0.3.0__py3-none-any.whl
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/utils/format_utils.py
ADDED
@@ -0,0 +1,550 @@
import csv
import gzip
import hashlib
import io
import json
import logging
import re
import sys
import tarfile
from enum import Enum
from io import StringIO
from pathlib import Path
from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union

import pandas as pd
import pystow
import xmltodict
import yaml
from pydantic import BaseModel

logger = logging.getLogger(__name__)


class Format(Enum):
    """
    Supported generic file formats for loading and rendering objects.
    """

    JSON = "json"
    JSONL = "jsonl"
    YAML = "yaml"
    YAMLL = "yamll"
    TOML = "toml"
    TSV = "tsv"
    CSV = "csv"
    XML = "xml"
    TURTLE = "turtle"
    RDFXML = "rdfxml"
    TEXT = "text"
    TEXTLINES = "textlines"
    OBO = "obo"
    FASTA = "fasta"
    GMT = "gmt"
    DAT = "dat"
    MARKDOWN = "markdown"
    PKL = "pkl"
    RDS = "rds"
    PYTHON = "python"
    PARQUET = "parquet"
    HDF5 = "hdf5"
    NETCDF = "netcdf"
    FORMATTED = "formatted"
    TABLE = "table"
    XLSX = "xlsx"
    PNG = "png"
    SQLDUMP_DUCKDB = "duckdb"
    SQLDUMP_POSTGRES = "postgres"
    DUMP_MONGODB = "mongodb"

    @classmethod
    def guess_format(cls, file_name: str) -> Optional["Format"]:
        ext = Path(file_name).suffix.lower()

        format_map = {
            ".json": cls.JSON,
            ".jsonl": cls.JSONL,
            ".yaml": cls.YAML,
            ".yml": cls.YAML,
            ".yamll": cls.YAMLL,
            ".tsv": cls.TSV,
            ".csv": cls.CSV,
            ".txt": cls.TEXT,
            ".xml": cls.XML,
            ".owx": cls.XML,
            ".owl": cls.RDFXML,
            ".ttl": cls.TURTLE,
            ".md": cls.MARKDOWN,
            ".py": cls.PYTHON,
            ".parquet": cls.PARQUET,
            ".pq": cls.PARQUET,
        }
        fmt = format_map.get(ext, None)
        if fmt is None:
            if ext.startswith("."):
                ext = ext[1:]
            if ext in [f.value for f in Format]:
                return Format(ext)
        return fmt

    def is_dump_format(self):
        return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]

    def is_binary_format(self):
        return self in [Format.PARQUET, Format.XLSX]

    def is_xsv(self):
        return self in [Format.TSV, Format.CSV]

def load_objects_from_url(
    url: str,
    format: Union[Format, str] = None,
    expected_type: Type = None,
    local_path: Optional[str] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    """
    Load objects from a URL in JSON, JSONLines, YAML, CSV, or TSV format.

    :param url: The URL to the file.
    :param format: The format of the file. Can be a Format enum or a string value.
    :param expected_type: The target type to load the objects into.
    :param local_path: The local path to save the file to.
    :return: A list of dictionaries representing the loaded objects.
    """
    local_path = pystow.ensure("linkml", "linkml-store", url=url)
    logger.info(f"synced to {local_path}")
    objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
    if not objs:
        raise ValueError(f"No objects loaded from URL: {url}")
    return objs


def clean_pandas_value(v):
    """Clean a single value from pandas."""
    import math

    if isinstance(v, float):
        if math.isnan(v) or math.isinf(v):
            return None
        return float(v)  # Ensures proper float type
    return v


def clean_nested_structure(obj):
    """Recursively clean a nested structure of dicts/lists from pandas."""
    if isinstance(obj, dict):
        return {k: clean_nested_structure(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [clean_nested_structure(item) for item in obj]  # Fixed: using 'item' instead of 'v'
    else:
        return clean_pandas_value(obj)


def process_file(
    f: IO,
    format: Format,
    expected_type: Optional[Type] = None,
    header_comment_token: Optional[str] = None,
    format_options: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """
    Process a single file and return a list of objects.

    :param f: The file object.
    :param format: The format of the file.
    :param expected_type: The expected type of the objects.
    :param header_comment_token: Token used for header comments to be skipped
    :return:
    """
    if format_options is None:
        format_options = {}
    if format == Format.YAMLL:
        format = Format.YAML
        expected_type = list
    if format == Format.JSON:
        objs = json.load(f)
    elif format == Format.JSONL:
        objs = [json.loads(line) for line in f]
    elif format == Format.YAML:
        if expected_type and expected_type == list:  # noqa E721
            objs = list(yaml.safe_load_all(f))
            # allow YAML with a `---` with no object before it
            objs = [obj for obj in objs if obj is not None]
        else:
            objs = yaml.safe_load(f)
    elif format == Format.TOML:
        import toml

        objs = toml.load(f)
        if not isinstance(objs, list):
            objs = [objs]
    elif format == Format.TEXTLINES:
        objs = f.readlines()
    elif format in [Format.TSV, Format.CSV]:
        if header_comment_token:
            while True:
                pos = f.tell()
                line = f.readline()
                if not line.startswith(header_comment_token):
                    f.seek(pos)
                    break
        delimiter = "\t" if format == Format.TSV else ","
        reader = csv.DictReader(f, delimiter=delimiter)
        objs = list(reader)
    elif format == Format.XML:
        objs = xmltodict.parse(f.read())
    elif format == Format.PKL:
        objs = pd.read_pickle(f).to_dict(orient="records")
    elif format == Format.RDS:
        import pyreadr

        objs = pyreadr.read_r(f)
    elif format == Format.XLSX:
        xls = pd.ExcelFile(f)
        objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
    elif format == Format.TEXT:
        txt = f.read()
        objs = [
            {
                "name": Path(f.name).name,
                "path": f.name,
                "content": txt,
                "size": len(txt),
                "lines": txt.count("\n") + 1,
                "md5": hashlib.md5(txt.encode()).hexdigest(),
            }
        ]
    elif format == Format.GMT:
        objs = []
        lib_name = Path(f.name).name
        for line in f:
            parts = line.strip().split("\t")
            desc = parts[1]
            objs.append(
                {
                    "library": lib_name,
                    "uid": f"{lib_name}.{parts[0]}",
                    "name": parts[0],
                    "description": desc if desc else None,
                    "genes": parts[2:],
                }
            )
    elif format == Format.FASTA:
        objs = []
        current_obj = None
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                if current_obj:
                    objs.append(current_obj)
                current_obj = {"id": line[1:], "sequence": ""}
            else:
                current_obj["sequence"] += line
        if current_obj:
            objs.append(current_obj)
    elif format == Format.OBO:
        blocks = split_document(f.read(), "\n\n")
        id_pattern = re.compile(r"id: (\S+)")

        def get_id(block):
            m = id_pattern.search(block)
            return m.group(1) if m else None

        objs = [{"id": get_id(block), "content": block} for block in blocks]
        objs = [obj for obj in objs if obj["id"]]
    elif format == Format.DAT:
        from linkml_store.utils.dat_parser import parse_sib_format

        _, objs = parse_sib_format(f.read())
    elif format in (Format.RDFXML, Format.TURTLE):
        import lightrdf

        parser = lightrdf.Parser()
        objs = []
        ext_fmt = "rdfxml"
        if format == Format.TURTLE:
            ext_fmt = "ttl"
        bytesio = io.BytesIO(f.read().encode("utf-8"))
        buffer = io.BufferedReader(bytesio)
        for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt):
            obj = {
                "subject": s,
                "predicate": p,
                "object": o,
            }
            if format_options.get("pivot", False):
                obj = {
                    "subject": s,
                    p: o,
                }
            objs.append(obj)
    elif format == Format.PARQUET:
        import pyarrow.parquet as pq

        table = pq.read_table(f)
        objs = table.to_pandas().to_dict(orient="records")
    elif format in [Format.PYTHON, Format.FORMATTED, Format.TABLE]:
        raise ValueError(f"Format {format} is not supported for loading objects")
    else:
        raise ValueError(f"Unsupported file format: {format}")

    if not isinstance(objs, list):
        objs = [objs]
    return objs

def load_objects(
    file_path: Union[str, Path],
    format: Optional[Union[Format, str]] = None,
    compression: Optional[str] = None,
    expected_type: Optional[Type] = None,
    header_comment_token: Optional[str] = None,
    select_query: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Load objects from a file or archive in supported formats.
    For tgz archives, it processes all files and concatenates the results.

    TODO: Add schema hints for CSV/TSV parsing.

    :param file_path: The path to the file or archive.
    :param format: The format of the file. Can be a Format enum or a string value.
    :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
    :param expected_type: The target type to load the objects into, e.g. list
    :param header_comment_token: Token used for header comments to be skipped
    :param select_query: JSONPath query to select specific objects from the loaded data.
    :return: A list of dictionaries representing the loaded objects.
    """
    if isinstance(file_path, Path):
        file_path = str(file_path)

    for url_scheme in ["http", "https", "ftp"]:
        if file_path.startswith(f"{url_scheme}://"):
            return load_objects_from_url(
                file_path,
                format=format,
                expected_type=expected_type,
            )

    if isinstance(format, str):
        format = Format(format)

    all_objects = []

    if compression == "tgz":
        with tarfile.open(file_path, "r:gz") as tar:
            for member in tar.getmembers():
                if member.isfile():
                    f = tar.extractfile(member)
                    if f:
                        content = io.TextIOWrapper(f)
                        member_format = Format.guess_format(member.name) if not format else format
                        logger.debug(f"Processing tar member {member.name} with format {member_format}")
                        all_objects.extend(process_file(content, member_format, expected_type, header_comment_token))
    else:
        if Path(file_path).is_dir():
            raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
        open_func = gzip.open if compression == "gz" else open
        format = Format.guess_format(file_path) if not format else format
        mode = "rb" if (format and format.is_binary_format()) or compression == "gz" else "r"
        with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
            if compression == "gz" and mode == "r":
                f = io.TextIOWrapper(f)
            all_objects = process_file(f, format, expected_type, header_comment_token)

    logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
    all_objects = transform_objects(all_objects, select_query)
    return all_objects


def transform_objects(all_objects: List[Dict[str, Any]], select_query: Optional[str]) -> List[Dict[str, Any]]:
    if not select_query:
        return all_objects
    import jsonpath_ng as jp

    path_expr = jp.parse(select_query)
    new_objs = []
    for obj in all_objects:
        for match in path_expr.find(obj):
            logging.debug(f"Match: {match.value}")
            if isinstance(match.value, list):
                new_objs.extend(match.value)
            else:
                new_objs.append(match.value)
    all_objects = new_objs
    return all_objects


def remove_control_chars_from_df(df: pd.DataFrame) -> pd.DataFrame:
    df_clean = df.copy()
    for col in df_clean.select_dtypes(include=['object']).columns:
        df_clean[col] = df_clean[col].astype(str).str.replace(r'[\x00-\x1f\x7f-\x9f]', '', regex=True)
    return df_clean

def write_output(
    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame],
    format: Union[Format, str] = Format.YAML,
    target: Optional[Union[TextIO, str, Path]] = None,
) -> None:
    """
    Write output data to a file in JSON, JSONLines, YAML, CSV, or TSV format.

    >>> write_output([{"a": 1, "b": 2}, {"a": 3, "b": 4}], Format.JSON, sys.stdout)
    [
      {
        "a": 1,
        "b": 2
      },
      {
        "a": 3,
        "b": 4
      }
    ]
    """
    if isinstance(format, str):
        format = Format(format)
    if format == Format.XLSX:
        if not target:
            raise ValueError("XLSX output requires a target file")
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)
        data = remove_control_chars_from_df(data)
        data.to_excel(target, index=False)
        return
    output_str = render_output(data, format)
    if target:
        if isinstance(target, str):
            with open(target, "w") as target:
                target.write(output_str)
        else:
            target.write(output_str)
    else:
        print(output_str)


def render_output(
    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]],
    format: Optional[Union[Format, str]] = Format.YAML,
) -> str:
    """
    Render output data in JSON, JSONLines, YAML, CSV, or TSV format.

    >>> print(render_output([{"a": 1, "b": 2}, {"a": 3, "b": 4}], Format.JSON))
    [
      {
        "a": 1,
        "b": 2
      },
      {
        "a": 3,
        "b": 4
      }
    ]

    :param data: The data to be rendered.
    :param format: The desired output format. Can be a Format enum or a string value.
    :return: The rendered output as a string.
    """
    if isinstance(format, str):
        format = Format(format)

    if format in (Format.FORMATTED,):
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)
        return data.to_string(max_rows=None)

    if isinstance(data, pd.DataFrame):
        data = data.to_dict(orient="records")

    if isinstance(data, BaseModel):
        data = data.model_dump()

    if data and isinstance(data, list) and isinstance(data[0], BaseModel):
        data = [d.model_dump() if isinstance(d, BaseModel) else d for d in data]

    if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
        data = [data]

    if isinstance(data, BaseModel):
        data = data.model_dump()

    if format == Format.JSON:
        return json.dumps(data, indent=2, default=str)
    elif format == Format.JSONL:
        return "\n".join(json.dumps(obj) for obj in data)
    elif format == Format.PYTHON:
        return str(data)
    elif format == Format.MARKDOWN:

        def as_markdown(obj: dict):
            return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()])

        return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data)
    elif format == Format.TABLE:
        from tabulate import tabulate

        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
    elif format == Format.YAML:
        if isinstance(data, list):
            return yaml.safe_dump_all(data, sort_keys=False)
        else:
            return yaml.safe_dump(data, sort_keys=False)
    elif format == Format.TSV:
        output = StringIO()
        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data), delimiter="\t")
        writer.writeheader()
        writer.writerows(data)
        return output.getvalue()
    elif format == Format.CSV:
        output = StringIO()
        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data))
        writer.writeheader()
        writer.writerows(data)
        return output.getvalue()
    else:
        raise ValueError(f"Unsupported output format: {format}")


def get_fieldnames(data: List[Dict[str, Any]]) -> List[str]:
    """
    Get the fieldnames of a list of dictionaries.

    >>> get_fieldnames([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    ['a', 'b']

    :param data: The list of dictionaries.
    :return: The fieldnames.
    """
    fieldnames = []
    for obj in data:
        fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
    return fieldnames


def guess_format(path: str) -> Optional[Format]:
    """
    Guess the format of a file based on its extension.

    >>> guess_format("data.json")
    <Format.JSON: 'json'>
    >>> guess_format("data.jsonl")
    <Format.JSONL: 'jsonl'>
    >>> guess_format("data.yaml")
    <Format.YAML: 'yaml'>
    >>> assert not guess_format("data")

    :param path: The path to the file.
    :return: The guessed format.
    """
    return Format.guess_format(path)


def split_document(doc: str, delimiter: str):
    """
    Split a document into parts based on a delimiter.

    :param doc: The document to split.
    :param delimiter: The delimiter.
    :return: The parts of the document.
    """
    return doc.split(delimiter)
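Taken together, format_utils.py gives a symmetric load/render API driven by the Format enum. A minimal usage sketch, assuming the wheel is installed; the input file name and its contents are illustrative and not part of the package:

from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output

# suffix lookup first, then fall-through to matching enum values (e.g. "duckdb")
assert guess_format("data.csv") == Format.CSV

rows = load_objects("data.csv")          # hypothetical CSV; yields one dict per row
print(render_output(rows, Format.JSON))  # same objects re-serialized as JSON
print(render_output(rows, "yaml"))       # plain string format names are accepted too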
linkml_store/utils/io.py
ADDED
@@ -0,0 +1,38 @@
from pathlib import Path
from typing import Iterable, Iterator, Optional, TextIO, Union

from linkml_runtime import SchemaView

from linkml_store.api.collection import OBJECT


def export_objects(
    objects: Iterable[OBJECT],
    location: Union[Path, str, TextIO],
    output_type: Optional[str],
    target_class: Optional[str] = None,
    schema_view: Optional[SchemaView] = None,
    **kwargs,
):
    """
    Export objects to a file or stream

    :param objects: objects to export
    :param location: location to export to
    :param kwargs:
    :return:
    """
    raise NotImplementedError


def import_objects_iter(
    location: Union[Path, str, TextIO], schema_view: Optional[SchemaView] = None, **kwargs
) -> Iterator[OBJECT]:
    """
    Import objects from a file or stream

    :param location:
    :param kwargs:
    :return:
    """
    raise NotImplementedError
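Both functions in io.py are stubs in 0.3.0 and raise NotImplementedError. A defensive caller-side sketch; the fallback to format_utils is an assumption about intent, not documented behaviour:

from linkml_store.utils.io import import_objects_iter
from linkml_store.utils.format_utils import load_objects

try:
    objs = list(import_objects_iter("data.yaml"))  # raises NotImplementedError in 0.3.0
except NotImplementedError:
    objs = load_objects("data.yaml")               # the concrete loader shipped in this release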
linkml_store/utils/llm_utils.py
ADDED
@@ -0,0 +1,122 @@
import logging
from typing import TYPE_CHECKING, Callable, List, Optional

if TYPE_CHECKING:
    import tiktoken

logger = logging.getLogger(__name__)

MODEL_TOKEN_MAPPING = {
    "gpt-4o-mini": 128_000,
    "gpt-4o": 128_000,
    "gpt-4o-2024-05-13": 128_000,
    "gpt-4": 8192,
    "gpt-4-0314": 8192,
    "gpt-4-0613": 8192,
    "gpt-4-32k": 32768,
    "gpt-4-32k-0314": 32768,
    "gpt-4-32k-0613": 32768,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-0301": 4096,
    "gpt-3.5-turbo-0613": 4096,
    "gpt-3.5-turbo-16k": 16385,
    "gpt-3.5-turbo-16k-0613": 16385,
    "gpt-3.5-turbo-instruct": 4096,
    "text-ada-001": 2049,
    "ada": 2049,
    "ada-002": 8192,
    "text-babbage-001": 2040,
    "babbage": 2049,
    "text-curie-001": 2049,
    "curie": 2049,
    "davinci": 2049,
    "text-davinci-003": 4097,
    "text-davinci-002": 4097,
    "code-davinci-002": 8001,
    "code-davinci-001": 8001,
    "code-cushman-002": 2048,
    "code-cushman-001": 2048,
    "claude": 200_000,
    "llama-3": 200_000,
}


def render_formatted_text(
    render_func: Callable,
    values: List[str],
    encoding: "tiktoken.Encoding",
    token_limit: int,
    additional_text: Optional[str] = None,
) -> str:
    """
    Render a formatted text string with a given object, encoding, and token limit.

    This function safely handles text that may contain special tokens (e.g., <|fim_suffix|>,
    <|endoftext|>) by treating them as normal text rather than raising errors.

    >>> from tiktoken import encoding_for_model
    >>> encoding = encoding_for_model("gpt-4o-mini")
    >>> names = ["Alice", "Bob", "DoctorHippopotamusMcHippopotamusFace"]
    >>> f = lambda x: f"Hello, {' '.join(x)}!"
    >>> render_formatted_text(f, names, encoding, 4096)
    'Hello, Alice Bob DoctorHippopotamusMcHippopotamusFace!'
    >>> render_formatted_text(f, names, encoding, 5)
    'Hello, Alice Bob!'

    :param render_func: Rendering function
    :param values: Values to render
    :param encoding: Encoding
    :param token_limit: Token limit
    :param additional_text: Additional text to consider
    :return:
    """
    text = render_func(values)
    if additional_text:
        token_limit -= len(encoding.encode(additional_text, disallowed_special=()))
    text_length = len(encoding.encode(text, disallowed_special=()))
    logger.debug(f"Encoding length: {text_length} (original: {len(text)})")
    if text_length <= token_limit:
        return text
    if not values:
        raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
    # remove last element and try again
    return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)


def get_token_limit(model_name: str) -> int:
    """
    Estimate the token limit for a model.

    >>> get_token_limit("gpt-4o-mini")
    128000

    also works with nested names:

    >>> get_token_limit("my/claude-opus")
    200000

    :param model_name: Model name
    :return: Estimated token limit
    """
    # sort MODEL_TOKEN_MAPPING by key length to ensure that the longest model names are checked first
    for model, token_limit in sorted(MODEL_TOKEN_MAPPING.items(), key=lambda x: len(x[0]), reverse=True):
        if model in model_name:
            return token_limit
    return 4096


def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
    import yaml

    if "```" in yaml_str:
        yaml_str = yaml_str.split("```")[1].strip()
        if yaml_str.startswith("yaml"):
            yaml_str = yaml_str[4:].strip()
    try:
        return yaml.safe_load(yaml_str)
    except Exception as e:
        if strict:
            raise e
        logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
        return None