linkml-store 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in those registries.
Files changed (101)
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/utils/format_utils.py
@@ -0,0 +1,550 @@
+import csv
+import gzip
+import hashlib
+import io
+import json
+import logging
+import re
+import sys
+import tarfile
+from enum import Enum
+from io import StringIO
+from pathlib import Path
+from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union
+
+import pandas as pd
+import pystow
+import xmltodict
+import yaml
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class Format(Enum):
+    """
+    Supported generic file formats for loading and rendering objects.
+    """
+
+    JSON = "json"
+    JSONL = "jsonl"
+    YAML = "yaml"
+    YAMLL = "yamll"
+    TOML = "toml"
+    TSV = "tsv"
+    CSV = "csv"
+    XML = "xml"
+    TURTLE = "turtle"
+    RDFXML = "rdfxml"
+    TEXT = "text"
+    TEXTLINES = "textlines"
+    OBO = "obo"
+    FASTA = "fasta"
+    GMT = "gmt"
+    DAT = "dat"
+    MARKDOWN = "markdown"
+    PKL = "pkl"
+    RDS = "rds"
+    PYTHON = "python"
+    PARQUET = "parquet"
+    HDF5 = "hdf5"
+    NETCDF = "netcdf"
+    FORMATTED = "formatted"
+    TABLE = "table"
+    XLSX = "xlsx"
+    PNG = "png"
+    SQLDUMP_DUCKDB = "duckdb"
+    SQLDUMP_POSTGRES = "postgres"
+    DUMP_MONGODB = "mongodb"
+
+    @classmethod
+    def guess_format(cls, file_name: str) -> Optional["Format"]:
+        ext = Path(file_name).suffix.lower()
+
+        format_map = {
+            ".json": cls.JSON,
+            ".jsonl": cls.JSONL,
+            ".yaml": cls.YAML,
+            ".yml": cls.YAML,
+            ".yamll": cls.YAMLL,
+            ".tsv": cls.TSV,
+            ".csv": cls.CSV,
+            ".txt": cls.TEXT,
+            ".xml": cls.XML,
+            ".owx": cls.XML,
+            ".owl": cls.RDFXML,
+            ".ttl": cls.TURTLE,
+            ".md": cls.MARKDOWN,
+            ".py": cls.PYTHON,
+            ".parquet": cls.PARQUET,
+            ".pq": cls.PARQUET,
+        }
+        fmt = format_map.get(ext, None)
+        if fmt is None:
+            # fall back to matching the bare extension against enum values
+            if ext.startswith("."):
+                ext = ext[1:]
+            if ext in [f.value for f in Format]:
+                return Format(ext)
+        return fmt
+
+    def is_dump_format(self):
+        return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
+
+    def is_binary_format(self):
+        return self in [Format.PARQUET, Format.XLSX]
+
+    def is_xsv(self):
+        return self in [Format.TSV, Format.CSV]
+
+
+def load_objects_from_url(
+    url: str,
+    format: Optional[Union[Format, str]] = None,
+    expected_type: Optional[Type] = None,
+    local_path: Optional[str] = None,
+    **kwargs,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a URL in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    :param url: The URL to the file.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param expected_type: The target type to load the objects into.
+    :param local_path: The local path to save the file to; defaults to a pystow-managed cache location.
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    if local_path:
+        # honor an explicit download location instead of silently ignoring it
+        import urllib.request
+
+        urllib.request.urlretrieve(url, local_path)
+    else:
+        local_path = pystow.ensure("linkml", "linkml-store", url=url)
+    logger.info(f"synced to {local_path}")
+    objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
+    if not objs:
+        raise ValueError(f"No objects loaded from URL: {url}")
+    return objs
+
+
+def clean_pandas_value(v):
+    """Clean a single value from pandas, mapping NaN/inf to None."""
+    import math
+
+    if isinstance(v, float):
+        if math.isnan(v) or math.isinf(v):
+            return None
+        return float(v)  # ensures proper float type
+    return v
+
+
+def clean_nested_structure(obj):
+    """Recursively clean a nested structure of dicts/lists from pandas."""
+    if isinstance(obj, dict):
+        return {k: clean_nested_structure(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [clean_nested_structure(item) for item in obj]
+    else:
+        return clean_pandas_value(obj)
+
+
+def process_file(
+    f: IO,
+    format: Format,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+    format_options: Optional[Dict[str, Any]] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Process a single file and return a list of objects.
+
+    :param f: The file object.
+    :param format: The format of the file.
+    :param expected_type: The expected type of the objects.
+    :param header_comment_token: Token used for header comments to be skipped.
+    :param format_options: Format-specific options, e.g. ``pivot`` for RDF formats.
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    if format_options is None:
+        format_options = {}
+    if format == Format.YAMLL:
+        format = Format.YAML
+        expected_type = list
+    if format == Format.JSON:
+        objs = json.load(f)
+    elif format == Format.JSONL:
+        objs = [json.loads(line) for line in f]
+    elif format == Format.YAML:
+        if expected_type and expected_type == list:  # noqa E721
+            objs = list(yaml.safe_load_all(f))
+            # allow YAML with a `---` with no object before it
+            objs = [obj for obj in objs if obj is not None]
+        else:
+            objs = yaml.safe_load(f)
+    elif format == Format.TOML:
+        import toml
+
+        objs = toml.load(f)
+        if not isinstance(objs, list):
+            objs = [objs]
+    elif format == Format.TEXTLINES:
+        objs = f.readlines()
+    elif format in [Format.TSV, Format.CSV]:
+        if header_comment_token:
+            # skip leading comment lines, then rewind to the first data line
+            while True:
+                pos = f.tell()
+                line = f.readline()
+                if not line.startswith(header_comment_token):
+                    f.seek(pos)
+                    break
+        delimiter = "\t" if format == Format.TSV else ","
+        reader = csv.DictReader(f, delimiter=delimiter)
+        objs = list(reader)
+    elif format == Format.XML:
+        objs = xmltodict.parse(f.read())
+    elif format == Format.PKL:
+        objs = pd.read_pickle(f).to_dict(orient="records")
+    elif format == Format.RDS:
+        import pyreadr
+
+        # pyreadr expects a filesystem path, not a stream
+        objs = pyreadr.read_r(f.name)
+    elif format == Format.XLSX:
+        xls = pd.ExcelFile(f)
+        objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
+    elif format == Format.TEXT:
+        txt = f.read()
+        objs = [
+            {
+                "name": Path(f.name).name,
+                "path": f.name,
+                "content": txt,
+                "size": len(txt),
+                "lines": txt.count("\n") + 1,
+                "md5": hashlib.md5(txt.encode()).hexdigest(),
+            }
+        ]
+    elif format == Format.GMT:
+        # Gene Matrix Transposed: one gene set per line (name, description, genes...)
+        objs = []
+        lib_name = Path(f.name).name
+        for line in f:
+            parts = line.strip().split("\t")
+            desc = parts[1]
+            objs.append(
+                {
+                    "library": lib_name,
+                    "uid": f"{lib_name}.{parts[0]}",
+                    "name": parts[0],
+                    "description": desc if desc else None,
+                    "genes": parts[2:],
+                }
+            )
+    elif format == Format.FASTA:
+        objs = []
+        current_obj = None
+        for line in f:
+            line = line.strip()
+            if line.startswith(">"):
+                if current_obj:
+                    objs.append(current_obj)
+                current_obj = {"id": line[1:], "sequence": ""}
+            else:
+                current_obj["sequence"] += line
+        if current_obj:
+            objs.append(current_obj)
+    elif format == Format.OBO:
+        blocks = split_document(f.read(), "\n\n")
+        id_pattern = re.compile(r"id: (\S+)")
+
+        def get_id(block):
+            m = id_pattern.search(block)
+            return m.group(1) if m else None
+
+        objs = [{"id": get_id(block), "content": block} for block in blocks]
+        objs = [obj for obj in objs if obj["id"]]
+    elif format == Format.DAT:
+        from linkml_store.utils.dat_parser import parse_sib_format
+
+        _, objs = parse_sib_format(f.read())
+    elif format in (Format.RDFXML, Format.TURTLE):
+        import lightrdf
+
+        parser = lightrdf.Parser()
+        objs = []
+        ext_fmt = "rdfxml"
+        if format == Format.TURTLE:
+            ext_fmt = "ttl"
+        bytesio = io.BytesIO(f.read().encode("utf-8"))
+        buffer = io.BufferedReader(bytesio)
+        for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt):
+            obj = {
+                "subject": s,
+                "predicate": p,
+                "object": o,
+            }
+            if format_options.get("pivot", False):
+                # pivot: emit one {subject, predicate: object} pair per triple
+                obj = {
+                    "subject": s,
+                    p: o,
+                }
+            objs.append(obj)
+    elif format == Format.PARQUET:
+        import pyarrow.parquet as pq
+
+        table = pq.read_table(f)
+        objs = table.to_pandas().to_dict(orient="records")
+    elif format in [Format.PYTHON, Format.FORMATTED, Format.TABLE]:
+        raise ValueError(f"Format {format} is not supported for loading objects")
+    else:
+        raise ValueError(f"Unsupported file format: {format}")
+
+    if not isinstance(objs, list):
+        objs = [objs]
+    return objs
+
+
+def load_objects(
+    file_path: Union[str, Path],
+    format: Optional[Union[Format, str]] = None,
+    compression: Optional[str] = None,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+    select_query: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a file or archive in supported formats.
+    For tgz archives, all member files are processed and the results concatenated.
+
+    TODO: Add schema hints for CSV/TSV parsing.
+
+    :param file_path: The path to the file or archive; "-" reads from stdin.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
+    :param expected_type: The target type to load the objects into, e.g. list.
+    :param header_comment_token: Token used for header comments to be skipped.
+    :param select_query: JSONPath query to select specific objects from the loaded data.
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    if isinstance(file_path, Path):
+        file_path = str(file_path)
+
+    for url_scheme in ["http", "https", "ftp"]:
+        if file_path.startswith(f"{url_scheme}://"):
+            return load_objects_from_url(
+                file_path,
+                format=format,
+                expected_type=expected_type,
+            )
+
+    if isinstance(format, str):
+        format = Format(format)
+
+    all_objects = []
+
+    if compression == "tgz":
+        with tarfile.open(file_path, "r:gz") as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    f = tar.extractfile(member)
+                    if f:
+                        content = io.TextIOWrapper(f)
+                        member_format = Format.guess_format(member.name) if not format else format
+                        logger.debug(f"Processing tar member {member.name} with format {member_format}")
+                        all_objects.extend(process_file(content, member_format, expected_type, header_comment_token))
+    else:
+        if Path(file_path).is_dir():
+            raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
+        open_func = gzip.open if compression == "gz" else open
+        format = Format.guess_format(file_path) if not format else format
+        mode = "rb" if (format and format.is_binary_format()) or compression == "gz" else "r"
+        with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
+            if compression == "gz" and not (format and format.is_binary_format()):
+                # gzip streams open in binary; wrap them for text-based formats
+                f = io.TextIOWrapper(f)
+            all_objects = process_file(f, format, expected_type, header_comment_token)
+
+    logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+    all_objects = transform_objects(all_objects, select_query)
+    return all_objects
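
For orientation, a minimal usage sketch of load_objects (the file names and the JSONPath expression are illustrative, not part of the package):

    from linkml_store.utils.format_utils import load_objects

    # format is guessed from the extension; each CSV row becomes a dict keyed by header
    rows = load_objects("people.csv")

    # gzip-compressed JSON, with a JSONPath selector applied to the loaded object(s)
    items = load_objects("data.json.gz", compression="gz", select_query="items")
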
+
+
+def transform_objects(all_objects: List[Dict[str, Any]], select_query: Optional[str]) -> List[Dict[str, Any]]:
+    """Apply an optional JSONPath ``select_query`` to each loaded object."""
+    if not select_query:
+        return all_objects
+    import jsonpath_ng as jp
+
+    path_expr = jp.parse(select_query)
+    new_objs = []
+    for obj in all_objects:
+        for match in path_expr.find(obj):
+            logger.debug(f"Match: {match.value}")
+            if isinstance(match.value, list):
+                new_objs.extend(match.value)
+            else:
+                new_objs.append(match.value)
+    all_objects = new_objs
+    return all_objects
+
+
+def remove_control_chars_from_df(df: pd.DataFrame) -> pd.DataFrame:
+    """Strip ASCII/Latin-1 control characters from all object columns of a DataFrame."""
+    df_clean = df.copy()
+    for col in df_clean.select_dtypes(include=["object"]).columns:
+        df_clean[col] = df_clean[col].astype(str).str.replace(r"[\x00-\x1f\x7f-\x9f]", "", regex=True)
+    return df_clean
+
+
+def write_output(
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame],
+    format: Union[Format, str] = Format.YAML,
+    target: Optional[Union[TextIO, str, Path]] = None,
+) -> None:
+    """
+    Write output data to a file in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    >>> write_output([{"a": 1, "b": 2}, {"a": 3, "b": 4}], Format.JSON, sys.stdout)
+    [
+      {
+        "a": 1,
+        "b": 2
+      },
+      {
+        "a": 3,
+        "b": 4
+      }
+    ]
+
+    :param data: The data to write.
+    :param format: The desired output format. Can be a Format enum or a string value.
+    :param target: Output stream or file path; defaults to printing to stdout.
+    """
+    if isinstance(format, str):
+        format = Format(format)
+    if format == Format.XLSX:
+        if not target:
+            raise ValueError("XLSX output requires a target file")
+        if not isinstance(data, pd.DataFrame):
+            data = pd.DataFrame(data)
+        data = remove_control_chars_from_df(data)
+        data.to_excel(target, index=False)
+        return
+    output_str = render_output(data, format)
+    if target:
+        if isinstance(target, (str, Path)):
+            with open(target, "w") as stream:
+                stream.write(output_str)
+        else:
+            target.write(output_str)
+    else:
+        print(output_str)
+
+
+def render_output(
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]],
+    format: Optional[Union[Format, str]] = Format.YAML,
+) -> str:
+    """
+    Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
+
+    >>> print(render_output([{"a": 1, "b": 2}, {"a": 3, "b": 4}], Format.JSON))
+    [
+      {
+        "a": 1,
+        "b": 2
+      },
+      {
+        "a": 3,
+        "b": 4
+      }
+    ]
+
+    :param data: The data to be rendered.
+    :param format: The desired output format. Can be a Format enum or a string value.
+    :return: The rendered output as a string.
+    """
+    if isinstance(format, str):
+        format = Format(format)
+
+    if format in (Format.FORMATTED,):
+        if not isinstance(data, pd.DataFrame):
+            data = pd.DataFrame(data)
+        return data.to_string(max_rows=None)
+
+    if isinstance(data, pd.DataFrame):
+        data = data.to_dict(orient="records")
+
+    if isinstance(data, BaseModel):
+        data = data.model_dump()
+
+    if data and isinstance(data, list) and isinstance(data[0], BaseModel):
+        data = [d.model_dump() if isinstance(d, BaseModel) else d for d in data]
+
+    if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
+        data = [data]
+
+    if format == Format.JSON:
+        return json.dumps(data, indent=2, default=str)
+    elif format == Format.JSONL:
+        return "\n".join(json.dumps(obj) for obj in data)
+    elif format == Format.PYTHON:
+        return str(data)
+    elif format == Format.MARKDOWN:
+
+        def as_markdown(obj: dict):
+            return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()])
+
+        return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data)
+    elif format == Format.TABLE:
+        from tabulate import tabulate
+
+        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
+    elif format == Format.YAML:
+        if isinstance(data, list):
+            return yaml.safe_dump_all(data, sort_keys=False)
+        else:
+            return yaml.safe_dump(data, sort_keys=False)
+    elif format == Format.TSV:
+        output = StringIO()
+        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data), delimiter="\t")
+        writer.writeheader()
+        writer.writerows(data)
+        return output.getvalue()
+    elif format == Format.CSV:
+        output = StringIO()
+        writer = csv.DictWriter(output, fieldnames=get_fieldnames(data))
+        writer.writeheader()
+        writer.writerows(data)
+        return output.getvalue()
+    else:
+        raise ValueError(f"Unsupported output format: {format}")
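
A quick sketch of the rendering helpers, mirroring the doctests above (passing sys.stdout as the target is optional; omitting it prints the result):

    import sys

    from linkml_store.utils.format_utils import Format, render_output, write_output

    objs = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
    tsv = render_output(objs, Format.TSV)  # header "a\tb" followed by two data rows
    write_output(objs, Format.JSON, sys.stdout)  # pretty-printed JSON to stdout
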
+
+
+def get_fieldnames(data: List[Dict[str, Any]]) -> List[str]:
+    """
+    Get the fieldnames of a list of dictionaries.
+
+    >>> get_fieldnames([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
+    ['a', 'b']
+
+    :param data: The list of dictionaries.
+    :return: The fieldnames.
+    """
+    fieldnames = []
+    for obj in data:
+        fieldnames.extend([k for k in obj.keys() if k not in fieldnames])
+    return fieldnames
+
+
+def guess_format(path: str) -> Optional[Format]:
+    """
+    Guess the format of a file based on its extension.
+
+    >>> guess_format("data.json")
+    <Format.JSON: 'json'>
+    >>> guess_format("data.jsonl")
+    <Format.JSONL: 'jsonl'>
+    >>> guess_format("data.yaml")
+    <Format.YAML: 'yaml'>
+    >>> assert not guess_format("data")
+
+    :param path: The path to the file.
+    :return: The guessed format.
+    """
+    return Format.guess_format(path)
+
+
+def split_document(doc: str, delimiter: str):
+    """
+    Split a document into parts based on a delimiter.
+
+    :param doc: The document to split.
+    :param delimiter: The delimiter.
+    :return: The parts of the document.
+    """
+    return doc.split(delimiter)
linkml_store/utils/io.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+from typing import Iterable, Iterator, Optional, TextIO, Union
+
+from linkml_runtime import SchemaView
+
+from linkml_store.api.collection import OBJECT
+
+
+def export_objects(
+    objects: Iterable[OBJECT],
+    location: Union[Path, str, TextIO],
+    output_type: Optional[str],
+    target_class: Optional[str] = None,
+    schema_view: Optional[SchemaView] = None,
+    **kwargs,
+):
+    """
+    Export objects to a file or stream (not yet implemented).
+
+    :param objects: objects to export
+    :param location: location to export to
+    :param output_type: name of the output format
+    :param target_class: class of the objects being exported
+    :param schema_view: schema view used to interpret the objects
+    :param kwargs: additional exporter-specific options
+    """
+    raise NotImplementedError
+
+
+def import_objects_iter(
+    location: Union[Path, str, TextIO], schema_view: Optional[SchemaView] = None, **kwargs
+) -> Iterator[OBJECT]:
+    """
+    Import objects from a file or stream (not yet implemented).
+
+    :param location: location to import from
+    :param schema_view: schema view used to interpret the objects
+    :param kwargs: additional importer-specific options
+    :return: an iterator over the imported objects
+    """
+    raise NotImplementedError
linkml_store/utils/llm_utils.py
@@ -0,0 +1,122 @@
+import logging
+from typing import TYPE_CHECKING, Callable, List, Optional
+
+if TYPE_CHECKING:
+    import tiktoken
+
+logger = logging.getLogger(__name__)
+
+MODEL_TOKEN_MAPPING = {
+    "gpt-4o-mini": 128_000,
+    "gpt-4o": 128_000,
+    "gpt-4o-2024-05-13": 128_000,
+    "gpt-4": 8192,
+    "gpt-4-0314": 8192,
+    "gpt-4-0613": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0314": 32768,
+    "gpt-4-32k-0613": 32768,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-0301": 4096,
+    "gpt-3.5-turbo-0613": 4096,
+    "gpt-3.5-turbo-16k": 16385,
+    "gpt-3.5-turbo-16k-0613": 16385,
+    "gpt-3.5-turbo-instruct": 4096,
+    "text-ada-001": 2049,
+    "ada": 2049,
+    "ada-002": 8192,
+    "text-babbage-001": 2040,
+    "babbage": 2049,
+    "text-curie-001": 2049,
+    "curie": 2049,
+    "davinci": 2049,
+    "text-davinci-003": 4097,
+    "text-davinci-002": 4097,
+    "code-davinci-002": 8001,
+    "code-davinci-001": 8001,
+    "code-cushman-002": 2048,
+    "code-cushman-001": 2048,
+    "claude": 200_000,
+    "llama-3": 200_000,
+}
+
+
+def render_formatted_text(
+    render_func: Callable,
+    values: List[str],
+    encoding: "tiktoken.Encoding",
+    token_limit: int,
+    additional_text: Optional[str] = None,
+) -> str:
+    """
+    Render a formatted text string with a given object, encoding, and token limit.
+
+    This function safely handles text that may contain special tokens (e.g., <|fim_suffix|>,
+    <|endoftext|>) by treating them as normal text rather than raising errors.
+
+    >>> from tiktoken import encoding_for_model
+    >>> encoding = encoding_for_model("gpt-4o-mini")
+    >>> names = ["Alice", "Bob", "DoctorHippopotamusMcHippopotamusFace"]
+    >>> f = lambda x: f"Hello, {' '.join(x)}!"
+    >>> render_formatted_text(f, names, encoding, 4096)
+    'Hello, Alice Bob DoctorHippopotamusMcHippopotamusFace!'
+    >>> render_formatted_text(f, names, encoding, 5)
+    'Hello, Alice Bob!'
+
+    :param render_func: Rendering function
+    :param values: Values to render
+    :param encoding: Encoding
+    :param token_limit: Token limit
+    :param additional_text: Additional text whose token count is reserved out of the limit
+    :return: The rendered text, truncated element by element to fit the token limit
+    """
+    text = render_func(values)
+    if additional_text:
+        token_limit -= len(encoding.encode(additional_text, disallowed_special=()))
+    text_length = len(encoding.encode(text, disallowed_special=()))
+    logger.debug(f"Encoding length: {text_length} (original: {len(text)})")
+    if text_length <= token_limit:
+        return text
+    if not values:
+        raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+    # drop the last element and try again (token_limit already accounts for additional_text)
+    return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)
+
+
+def get_token_limit(model_name: str) -> int:
+    """
+    Estimate the token limit for a model.
+
+    >>> get_token_limit("gpt-4o-mini")
+    128000
+
+    Also works with prefixed or path-qualified names:
+
+    >>> get_token_limit("my/claude-opus")
+    200000
+
+    :param model_name: Model name
+    :return: Estimated token limit
+    """
+    # sort MODEL_TOKEN_MAPPING by key length so the longest model names are checked first
+    for model, token_limit in sorted(MODEL_TOKEN_MAPPING.items(), key=lambda x: len(x[0]), reverse=True):
+        if model in model_name:
+            return token_limit
+    return 4096
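
A sketch of how the two helpers above compose for prompt budgeting (requires tiktoken; the document strings and instruction text are invented for illustration):

    from tiktoken import encoding_for_model

    from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text

    model = "gpt-4o-mini"
    encoding = encoding_for_model(model)
    limit = get_token_limit(model)  # 128000, via MODEL_TOKEN_MAPPING

    # drops trailing values until the rendered prompt fits, reserving tokens
    # for the fixed instruction text
    prompt = render_formatted_text(
        lambda docs: "\n".join(docs),
        values=["first document ...", "second document ..."],
        encoding=encoding,
        token_limit=limit,
        additional_text="Summarize the documents above.",
    )
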
+
+
+def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
+    """Parse a YAML payload, tolerating surrounding markdown code fences."""
+    import yaml
+
+    if "```" in yaml_str:
+        # extract the first fenced block and drop an optional leading "yaml" tag
+        yaml_str = yaml_str.split("```")[1].strip()
+        if yaml_str.startswith("yaml"):
+            yaml_str = yaml_str[4:].strip()
+    try:
+        return yaml.safe_load(yaml_str)
+    except Exception as e:
+        if strict:
+            raise e
+        logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
+        return None
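
And a small sketch of parse_yaml_payload on a typical fenced LLM response (the response string is invented):

    from linkml_store.utils.llm_utils import parse_yaml_payload

    response = "Sure:\n```yaml\nname: Alice\nage: 30\n```"
    parse_yaml_payload(response)  # -> {'name': 'Alice', 'age': 30}
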