linkml-store 0.2.2__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic.

Files changed (80):
  1. {linkml_store-0.2.2 → linkml_store-0.2.5}/PKG-INFO +15 -12
  2. {linkml_store-0.2.2 → linkml_store-0.2.5}/pyproject.toml +13 -6
  3. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/client.py +34 -15
  4. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/collection.py +8 -0
  5. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/config.py +5 -1
  6. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/database.py +2 -0
  7. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
  8. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/cli.py +49 -15
  9. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/llm_indexer.py +7 -4
  10. linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py +152 -0
  11. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py +20 -9
  12. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py +6 -4
  13. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/format_utils.py +6 -1
  14. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py +23 -3
  15. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py +3 -1
  16. {linkml_store-0.2.2 → linkml_store-0.2.5}/LICENSE +0 -0
  17. {linkml_store-0.2.2 → linkml_store-0.2.5}/README.md +0 -0
  18. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/__init__.py +0 -0
  19. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/__init__.py +0 -0
  20. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/queries.py +0 -0
  21. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/__init__.py +0 -0
  22. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  23. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  24. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  25. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  26. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
  27. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
  28. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  29. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  30. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  31. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  32. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  33. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  34. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  35. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
  36. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
  37. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  38. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  39. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
  40. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  41. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  42. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  43. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  44. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/types.py +0 -0
  45. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/constants.py +0 -0
  46. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/graphs/__init__.py +0 -0
  47. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/graphs/graph_map.py +0 -0
  48. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/__init__.py +0 -0
  49. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/__init__.py +0 -0
  50. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  51. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/indexer.py +0 -0
  52. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/__init__.py +0 -0
  53. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/evaluation.py +0 -0
  54. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/__init__.py +0 -0
  55. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
  56. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +0 -0
  57. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_config.py +0 -0
  58. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine_registry.py +0 -0
  59. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/__init__.py +0 -0
  60. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/change_utils.py +0 -0
  61. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/file_utils.py +0 -0
  62. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/io.py +0 -0
  63. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/mongodb_utils.py +0 -0
  64. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/neo4j_utils.py +0 -0
  65. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/pandas_utils.py +0 -0
  66. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/patch_utils.py +0 -0
  67. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/query_utils.py +0 -0
  68. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/schema_utils.py +0 -0
  69. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/sklearn_utils.py +0 -0
  70. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/sql_utils.py +0 -0
  71. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/stats_utils.py +0 -0
  72. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/vector_utils.py +0 -0
  73. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/__init__.py +0 -0
  74. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/__init__.py +0 -0
  75. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  76. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  77. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  78. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  79. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  80. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/main.py +0 -0

{linkml_store-0.2.2 → linkml_store-0.2.5}/PKG-INFO

@@ -1,24 +1,24 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.2
+Version: 0.2.5
 Summary: linkml-store
 License: MIT
 Author: Author 1
 Author-email: author@org.org
-Requires-Python: >=3.9, !=2.7.*, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*, !=3.7.*, !=3.8.*
+Requires-Python: >=3.10,<4.0
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: all
 Provides-Extra: analytics
 Provides-Extra: app
-Provides-Extra: chromadb
+Provides-Extra: bigquery
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
-Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
@@ -29,25 +29,25 @@ Provides-Extra: scipy
 Provides-Extra: tests
 Provides-Extra: validation
 Requires-Dist: black (>=24.0.0) ; extra == "tests"
-Requires-Dist: chromadb ; extra == "chromadb"
 Requires-Dist: click
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
-Requires-Dist: gcsfs ; extra == "ibis"
+Requires-Dist: gcsfs
+Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
 Requires-Dist: h5py ; extra == "h5py"
-Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
+Requires-Dist: jsonpatch (>=1.33)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
-Requires-Dist: llm ; extra == "llm"
+Requires-Dist: llm ; extra == "llm" or extra == "all"
 Requires-Dist: matplotlib ; extra == "analytics"
-Requires-Dist: multipledispatch ; extra == "ibis"
-Requires-Dist: neo4j ; extra == "neo4j"
+Requires-Dist: multipledispatch
+Requires-Dist: neo4j ; extra == "neo4j" or extra == "all"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
 Requires-Dist: plotly ; extra == "analytics"
@@ -56,14 +56,17 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
 Requires-Dist: sqlalchemy
 Requires-Dist: streamlit (>=1.32.2,<2.0.0) ; extra == "app"
+Requires-Dist: tabulate
 Requires-Dist: tiktoken ; extra == "llm"
 Requires-Dist: uvicorn ; extra == "fastapi"
+Requires-Dist: xmltodict (>=0.13.0)
 Description-Content-Type: text/markdown
 
 # linkml-store

{linkml_store-0.2.2 → linkml_store-0.2.5}/pyproject.toml

@@ -1,18 +1,19 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.2.2"
+version = "0.2.5"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.9, !=3.9.7"
+python = "^3.10"
 click = "*"
 pydantic = "^2.0.0"
 linkml-runtime = ">=1.8.0"
 streamlit = { version = "^1.32.2", optional = true }
 sqlalchemy = "*"
+google-cloud-bigquery = "*"
 duckdb = ">=0.10.1"
 duckdb-engine = ">=0.11.2"
 matplotlib = { version = "*", optional = true }
@@ -27,7 +28,7 @@ pymongo = { version="*", optional = true }
 neo4j = { version="*", optional = true }
 py2neo = { version="*", optional = true }
 networkx = { version="*", optional = true }
-chromadb = { version="*", optional = true }
+#chromadb = { version="*", optional = true }
 pyarrow = { version="*", optional = true }
 h5py = { version="*", optional = true }
 scipy = { version="*", optional = true }
@@ -36,14 +37,18 @@ linkml = { version=">=1.8.0", optional = true }
 linkml_map = { version="*", optional = true }
 linkml_renderer = { version="*", optional = true }
 frictionless = { version="*", optional = true }
-ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
+#ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
 gcsfs = { version="*", optional = true }
 multipledispatch = { version="*" }
+tabulate = "*"
 pandas = ">=2.2.1"
 jinja2 = "^3.1.4"
 jsonlines = "^4.0.0"
 fastapi = { version="*", optional = true }
 uvicorn = { version="*", optional = true }
+xmltodict = ">=0.13.0"
+jsonpatch = ">=1.33"
+python-dotenv = "^1.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}
@@ -77,7 +82,7 @@ tests = ["black", "ruff"]
 llm = ["llm", "tiktoken"]
 mongodb = ["pymongo"]
 neo4j = ["neo4j", "py2neo", "networkx"]
-chromadb = ["chromadb"]
+#chromadb = ["chromadb"]
 h5py = ["h5py"]
 pyarrow = ["pyarrow"]
 validation = ["linkml"]
@@ -86,7 +91,9 @@ renderer = ["linkml_renderer"]
 fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
 scipy = ["scipy", "scikit-learn"]
-ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+#ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+bigquery = ["google-cloud-bigquery"]
+all = ["llm", "mongodb", "neo4j", "validation", "map", "renderer", "bigquery"]
 
 [tool.poetry.scripts]
 linkml-store = "linkml_store.cli:cli"

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/client.py

@@ -1,3 +1,4 @@
+import importlib
 import logging
 from pathlib import Path
 from typing import Dict, Optional, Union
@@ -7,23 +8,22 @@ from linkml_runtime import SchemaView
 
 from linkml_store.api import Database
 from linkml_store.api.config import ClientConfig
-from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
-from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
-from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
-from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
-from linkml_store.api.stores.neo4j.neo4j_database import Neo4jDatabase
-from linkml_store.api.stores.solr.solr_database import SolrDatabase
 
 logger = logging.getLogger(__name__)
 
 
+
 HANDLE_MAP = {
-    "duckdb": DuckDBDatabase,
-    "solr": SolrDatabase,
-    "mongodb": MongoDBDatabase,
-    "chromadb": ChromaDBDatabase,
-    "neo4j": Neo4jDatabase,
-    "file": FileSystemDatabase,
+    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
+    "solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
+    "mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
+    "chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
+    "neo4j": "linkml_store.api.stores.neo4j.neo4j_database.Neo4jDatabase",
+    "file": "linkml_store.api.stores.filesystem.filesystem_database.FileSystemDatabase",
+}
+
+SUFFIX_MAP = {
+    "ddb": "duckdb:///{path}",
 }
 
 
@@ -155,6 +155,9 @@ class Client:
             if auto_attach:
                 db = self.attach_database(handle, alias=name, **kwargs)
                 db.from_config(db_config)
+                if db_config.source:
+                    db = self.get_database(name)
+                    db.store(db_config.source.data)
 
     def _set_database_config(self, db: Database):
         """
@@ -198,6 +201,12 @@
         :param kwargs:
         :return:
         """
+        if ":" not in handle:
+            if alias is None:
+                alias = handle
+            suffix = handle.split(".")[-1]
+            if suffix in SUFFIX_MAP:
+                handle = SUFFIX_MAP[suffix].format(path=handle)
         if ":" not in handle:
             scheme = handle
             handle = None
@@ -207,14 +216,23 @@
             scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-        cls = HANDLE_MAP[scheme]
+        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        try:
+            module = importlib.import_module(module_path)
+            cls = getattr(module, class_name)
+        except ImportError as e:
+            raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
+
+        #cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
            db.set_schema_view(schema_view)
        if not alias:
            alias = handle
        if not self._databases:
+            logger.info("Initializing databases")
            self._databases = {}
+        logger.info(f"Attaching {alias}")
        self._databases[alias] = db
        db.parent = self
        if db.alias:
@@ -257,8 +275,9 @@
             self._databases[name] = db
         if name not in self._databases:
             if create_if_not_exists:
-                logger.info(f"Creating database: {name}")
-                self.attach_database(name, **kwargs)
+                logger.info(f"Creating/attaching database: {name}")
+                db = self.attach_database(name, **kwargs)
+                name = db.alias
             else:
                 raise ValueError(f"Database {name} does not exist")
         db = self._databases[name]
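
The net effect of these client.py hunks is that backend classes are now resolved lazily from dotted import paths, and a bare filename with a known suffix is expanded into a handle. A minimal usage sketch, assuming the documented Client entry point; the .ddb file path is hypothetical:

    from linkml_store import Client

    client = Client()
    # "my_data.ddb" has no scheme, so SUFFIX_MAP expands it to "duckdb:///my_data.ddb",
    # and because no alias is given the original filename becomes the alias.
    db = client.attach_database("my_data.ddb")
    # The DuckDB backend class is only imported at this point via importlib;
    # a missing optional dependency now surfaces as an ImportError naming the scheme.
    print(db.handle)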

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/collection.py

@@ -470,6 +470,7 @@ class Collection(Generic[DatabaseType]):
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        select_cols: Optional[List[str]] = None,
         mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
@@ -503,6 +504,7 @@
         :param where:
         :param index_name:
         :param limit:
+        :param select_cols:
         :param kwargs:
         :return:
         """
@@ -538,6 +540,11 @@
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
+        if select_cols:
+            new_results = []
+            for r in results:
+                new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
+            results = new_results
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
         new_qr.rows = [r[1] for r in results]
@@ -672,6 +679,7 @@
         """
         yield from self.find({}, limit=-1).rows
 
+    @property
     def rows(self) -> List[OBJECT]:
         """
         Return a list of objects in the collection.
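
A short sketch of the new select_cols filter on search results; collection is assumed to be an already-populated Collection with an indexer attached:

    # Only "id" and "name" survive in each ranked row; other keys are dropped after scoring.
    result = collection.search("east asian countries", limit=5, select_cols=["id", "name"])
    for score, row in result.ranked_rows:
        print(f"{score:.3f}", row)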

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/config.py

@@ -91,7 +91,7 @@ class CollectionConfig(ConfiguredBaseModel):
     )
     source: Optional[CollectionSource] = Field(
         default=None,
-        description="Metadata about the source",
+        description="Source for the collection",
     )
     derived_from: Optional[List[DerivationConfiguration]] = Field(
         default=None,
@@ -154,6 +154,10 @@ class DatabaseConfig(ConfiguredBaseModel):
         default=False,
         description="Whether to ensure referential integrity",
     )
+    source: Optional[CollectionSource] = Field(
+        default=None,
+        description="Source for the database",
+    )
 
 
 class ClientConfig(ConfiguredBaseModel):
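
A hedged sketch of the new database-level source field. The local_path key is taken from the cli.py hunk later in this diff; other CollectionSource fields and the exact DatabaseConfig attributes are assumptions:

    from linkml_store.api.config import DatabaseConfig

    db_config = DatabaseConfig(
        handle="duckdb:///db/countries.db",
        source={"local_path": "data/countries.json"},  # coerced into a CollectionSource by pydantic
    )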

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/database.py

@@ -470,6 +470,7 @@ class Database(ABC, Generic[CollectionType]):
         if not self._schema_view:
             self._initialize_schema()
         if not self._schema_view:
+            logger.info("Inducing schema view")
             self._schema_view = self.induce_schema_view()
         return self._schema_view
 
@@ -505,6 +506,7 @@
         if isinstance(schema_view, str):
             schema_view = SchemaView(schema_view)
         self._schema_view = schema_view
+        logger.info(f"Setting schema view for {self.handle}")
         # self._schema_view = SchemaView(schema_view.materialize_derived_schema())
         if not self._collections:
             return

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_database.py

@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Optional
 
 import yaml
-from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime.utils.schema_builder import SchemaBuilder
 from linkml_runtime import SchemaView
 
 from linkml_store.api import Database
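
The only change here swaps the SchemaBuilder import from the linkml package to linkml-runtime; the class is used the same way, so the filesystem store no longer needs the heavier linkml package at import time. A minimal sketch of the corrected import, with illustrative class and slot names:

    from linkml_runtime.utils.schema_builder import SchemaBuilder

    sb = SchemaBuilder()
    sb.add_class("Person", slots=["name", "age"])
    schema = sb.schema  # a SchemaDefinition that can back a SchemaView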

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/cli.py

@@ -99,6 +99,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal"
 @click.option("--database", "-d", help="Database name")
 @click.option("--collection", "-c", help="Collection name")
 @click.option("--input", "-i", help="Input file (alternative to database/collection)")
+@click.option("--schema", "-S", help="Path to schema (LinkML yaml)")
 @click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
@@ -111,7 +112,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal"
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, input, **kwargs):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, schema, config, set, input, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -135,12 +136,17 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-        stem = underscore(Path(input).stem)
-        database = "duckdb"
-        collection = stem
+        database = "duckdb" # default: store in duckdb
+        if input.startswith("http"):
+            parts = input.split("/")
+            collection = parts[-1]
+            collection = collection.split(".")[0]
+        else:
+            stem = underscore(Path(input).stem)
+            collection = stem
+        logger.info(f"Using input file: {input}, "
+                    f"default storage is {database} and collection is {collection}")
         config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
-        # collection = Path(input).stem
-        # database = f"file:{Path(input).parent}"
     if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
         config = DEFAULT_LOCAL_CONF_PATH
     if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
@@ -153,6 +159,9 @@
     client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
+    if schema:
+        db = settings.database
+        db.set_schema_view(schema)
     if settings.database_name:
         db = client.get_database(database)
         if set:
@@ -178,10 +187,11 @@
 
 @cli.command()
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
+@click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
 @click.pass_context
-def insert(ctx, files, object, format):
+def insert(ctx, files, replace, object, format):
     """Insert objects from files (JSON, YAML, TSV) into the specified collection.
 
     Using a configuration:
@@ -195,7 +205,6 @@ def insert(ctx, files, object, format):
     collection = settings.collection
     if not collection:
         raise ValueError("Collection must be specified.")
-    objects = []
     if not files and not object:
         files = ["-"]
     for file_path in files:
@@ -204,13 +213,19 @@
         else:
             objects = load_objects(file_path)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
-        collection.insert(objects)
+        if replace:
+            collection.replace(objects)
+        else:
+            collection.insert(objects)
         click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
-            collection.insert(objects)
+            if replace:
+                collection.replace(objects)
+            else:
+                collection.insert(objects)
             click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()
 
@@ -523,6 +538,7 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
 @click.option(
     "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
 )
+@click.option("--training-collection", type=click.STRING,help="Collection to use for training")
 @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
 @click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
 @click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
@@ -534,14 +550,17 @@
 @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
 @click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
 @click.option("--query", "-q", type=click.STRING, help="query term")
+@click.option("--where", "-w", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
     ctx,
     inference_config_file,
+    where,
     query,
     evaluation_count,
     evaluation_match_function,
     training_test_data_split,
+    training_collection,
     predictor_type,
     target_attribute,
     feature_attributes,
@@ -579,6 +598,7 @@ def infer(
        linkml-store -i tests/input/iris.csv inference -t sklearn \
          -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
     """
+    where_clause = yaml.safe_load(where) if where else None
     if query:
         query_obj = yaml.safe_load(query)
     else:
@@ -603,6 +623,7 @@
     if model_format:
         model_format = ModelSerialization(model_format)
     if load_model:
+        logger.info(f"Loading predictor from {load_model}")
         predictor = get_inference_engine(predictor_type)
         predictor = type(predictor).load_model(load_model)
     else:
@@ -613,13 +634,18 @@
         if training_test_data_split:
             config.train_test_split = training_test_data_split
         predictor = get_inference_engine(predictor_type, config=config)
-        if collection:
-            predictor.load_and_split_data(collection)
+        training_collection_obj = collection
+        if training_collection:
+            training_collection_obj = ctx.obj["settings"].database.get_collection(training_collection)
+        if training_collection_obj:
+            logger.info(f"Using collection: {training_collection_obj.alias} for inference")
+            split = training_test_data_split or (1.0, 0.0)
+            predictor.load_and_split_data(training_collection_obj, split=split)
     predictor.initialize_model()
     if export_model:
         logger.info(f"Exporting model to {export_model} in {model_format}")
         predictor.export_model(export_model, model_format)
-    if not query_obj:
+    if not query_obj and where_clause is None:
         if not export_model and not evaluation_count:
             raise ValueError("Query or evaluate must be specified if not exporting model")
         if evaluation_count:
@@ -637,6 +663,12 @@
         result = predictor.derive(query_obj)
         dumped_obj = result.model_dump(exclude_none=True)
         write_output([dumped_obj], output_type, target=output)
+    if where_clause is not None:
+        predicted_objs = []
+        for query_obj in collection.find(where_clause).rows:
+            result = predictor.derive(query_obj)
+            predicted_objs.append(result.predicted_object)
+        write_output(predicted_objs, output_type, target=output)
 
 
 @cli.command()
@@ -681,6 +713,7 @@ def schema(ctx, output_type, output):
 @cli.command()
 @click.argument("search_term")
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
+@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -689,13 +722,14 @@
 )
 @index_type_option
 @click.pass_context
-def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
+def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index):
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
     logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
-    result = collection.search(search_term, where=where, limit=limit)
+    select_cols = yaml.safe_load(select) if select else None
+    result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
     if output:
         with open(output, "w") as f:
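
Taken together, the CLI gains a global --schema option, --replace on insert, --training-collection and --where on infer, and --select on search. A hedged sketch of the new search --select option, driven through click's test runner rather than a shell; the input file and data are hypothetical:

    from click.testing import CliRunner
    from linkml_store.cli import cli

    runner = CliRunner()
    # --select is parsed with yaml.safe_load, so "[name, capital]" becomes a list of column names
    result = runner.invoke(
        cli, ["-i", "countries.csv", "search", "europe", "--select", "[name, capital]", "--limit", "3"]
    )
    print(result.output)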

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/llm_indexer.py

@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
-from tiktoken import encoding_for_model
 
 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
@@ -55,7 +54,7 @@ class LLMIndexer(Indexer):
 
     def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
         """
-        Use LLM to embed
+        Use LLM to embed.
 
         >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
         >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
@@ -63,20 +62,24 @@
         :param texts:
         :return:
         """
+        from tiktoken import encoding_for_model
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
-        token_limit = get_token_limit(model.model_id)
+        # TODO: make this more accurate
+        token_limit = get_token_limit(model.model_id) - 200
         encoding = encoding_for_model("gpt-4o")
 
         def truncate_text(text: str) -> str:
             # split into tokens every 1000 chars:
             parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
-            return render_formatted_text(
+            truncated = render_formatted_text(
                 lambda x: "".join(x),
                 parts,
                 encoding,
                 token_limit,
             )
+            logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
+            return truncated
 
         texts = [truncate_text(text) for text in texts]
 

linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py (new file)

@@ -0,0 +1,152 @@
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar, List, Optional, TextIO, Union
+
+import yaml
+from linkml_store.utils.llm_utils import parse_yaml_payload
+from llm import get_key
+from pydantic import BaseModel
+
+from linkml_store.api.collection import OBJECT, Collection
+from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
+from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
+from linkml_store.utils.object_utils import select_nested
+
+logger = logging.getLogger(__name__)
+
+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+
+SYSTEM_PROMPT = """
+Your task is to inference the complete YAML
+object output given the YAML object input. I will provide you
+with contextual information, including the schema,
+to help with the inference. You can use the following
+
+You should return ONLY valid YAML in your response.
+"""
+
+
+class TrainedModel(BaseModel, extra="forbid"):
+    index_rows: List[OBJECT]
+    config: Optional[InferenceConfig] = None
+
+
+class LLMInference(Inference):
+    iterations: int = 0
+
+
+@dataclass
+class LLMInferenceEngine(InferenceEngine):
+    """
+    LLM based predictor.
+
+    Unlike the RAG predictor this performs few-shot inference
+    """
+
+    _model: "llm.Model" = None  # noqa: F821
+
+    PERSIST_COLS: ClassVar[List[str]] = [
+        "config",
+    ]
+
+    def __post_init__(self):
+        if not self.config:
+            self.config = InferenceConfig()
+        if not self.config.llm_config:
+            self.config.llm_config = LLMConfig()
+
+    @property
+    def model(self) -> "llm.Model":  # noqa: F821
+        import llm
+
+        if self._model is None:
+            self._model = llm.get_model(self.config.llm_config.model_name)
+            if self._model.needs_key:
+                key = get_key(None, key_alias=self._model.needs_key)
+                self._model.key = key
+
+        return self._model
+
+    def initialize_model(self, **kwargs):
+        logger.info(f"Initializing model {self.model}")
+
+    def object_to_text(self, object: OBJECT) -> str:
+        return yaml.dump(object)
+
+    def _schema_str(self) -> str:
+        db = self.training_data.base_collection.parent
+        from linkml_runtime.dumpers import json_dumper
+        schema_dict = json_dumper.to_dict(db.schema_view.schema)
+        return yaml.dump(schema_dict)
+
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]:
+        import llm
+
+        model: llm.Model = self.model
+        #model_name = self.config.llm_config.model_name
+        #feature_attributes = self.config.feature_attributes
+        target_attributes = self.config.target_attributes
+        query_text = self.object_to_text(object)
+
+        if not target_attributes:
+            target_attributes = [k for k, v in object.items() if v is None or v == ""]
+        #if not feature_attributes:
+        #    feature_attributes = [k for k, v in object.items() if v is not None and v != ""]
+
+        system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+
+        system_prompt += "\n## SCHEMA:\n\n" + self._schema_str()
+
+        stub = ", ".join([f"{k}: ..." for k in target_attributes])
+        stub = "{" + stub + "}"
+        prompt = (
+            "Provide a YAML object of the form"
+            "```yaml\n"
+            f"{stub}\n"
+            "```\n"
+            "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n"
+        )
+        logger.info(f"Prompt: {prompt}")
+        response = model.prompt(prompt, system=system_prompt)
+        yaml_str = response.text()
+        logger.info(f"Response: {yaml_str}")
+        predicted_object = parse_yaml_payload(yaml_str, strict=True)
+        predicted_object = {**object, **predicted_object}
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+
+
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        self.save_model(output)
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the trained model and related data to a file.
+
+        :param output: Path to save the model
+        """
+        raise NotImplementedError("Does not make sense for this engine")
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine":
+        raise NotImplementedError("Does not make sense for this engine")
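
A hedged usage sketch for the new engine. The database handle, collection name, and model name are assumptions; the call sequence (load data, initialize, derive) follows the methods defined in the file above, and missing values in the query object are treated as the attributes to predict:

    from linkml_store import Client
    from linkml_store.inference.implementations.llm_inference_engine import LLMInferenceEngine
    from linkml_store.inference.inference_config import InferenceConfig, LLMConfig

    client = Client()
    db = client.attach_database("duckdb:///countries.db")
    collection = db.get_collection("countries")

    engine = LLMInferenceEngine(config=InferenceConfig(llm_config=LLMConfig(model_name="gpt-4o-mini")))
    engine.load_and_split_data(collection)  # derive() reads the schema via training_data.base_collection
    engine.initialize_model()
    result = engine.derive({"name": "Uruguay", "capital": None})  # None values become target attributes
    print(result.predicted_object)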

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py

@@ -20,7 +20,7 @@ DEFAULT_NUM_EXAMPLES = 20
 DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
 
 SYSTEM_PROMPT = """
-You are a {llm_config.role}, your task is to inference the YAML
+You are a {llm_config.role}, your task is to infer the YAML
 object output given the YAML object input. I will provide you
 with a collection of examples that will provide guidance both
 on the desired structure of the response, as well as the kind
@@ -130,23 +130,34 @@ class RAGInferenceEngine(InferenceEngine):
         else:
             if not self.rag_collection.indexers:
                 raise ValueError("RAG collection must have an indexer attached")
+            logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}")
             rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
                                             mmr_relevance_factor=mmr_relevance_factor)
             examples = rs.rows
+            logger.info(f"Found {len(examples)} examples")
             if not examples:
                 raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
-        query_obj = select_nested(object, feature_attributes)
+        this_feature_attributes = feature_attributes
+        if not this_feature_attributes:
+            this_feature_attributes = list(set(object.keys()) - set(target_attributes))
+        query_obj = select_nested(object, this_feature_attributes)
         query_text = self.object_to_text(query_obj)
         for example in examples:
-            input_obj = select_nested(example, feature_attributes)
+            this_feature_attributes = feature_attributes
+            if not this_feature_attributes:
+                this_feature_attributes = list(set(example.keys()) - set(target_attributes))
+            if not this_feature_attributes:
+                raise ValueError(f"No feature attributes found in example {example}")
+            input_obj = select_nested(example, this_feature_attributes)
             input_obj_text = self.object_to_text(input_obj)
             if input_obj_text == query_text:
-                raise ValueError(
-                    f"Query object {query_text} is the same as example object {input_obj_text}\n"
-                    "This indicates possible test data leakage\n."
-                    "TODO: allow an option that allows user to treat this as a basic lookup\n"
-                )
+                continue
+                #raise ValueError(
+                #    f"Query object {query_text} is the same as example object {input_obj_text}\n"
+                #    "This indicates possible test data leakage\n."
+                #    "TODO: allow an option that allows user to treat this as a basic lookup\n"
+                #)
             output_obj = select_nested(example, target_attributes)
             prompt_clause = (
                 "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
@@ -169,7 +180,7 @@
                                         encoding=encoding, token_limit=token_limit,
                                         additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
-        response = model.prompt(prompt, system_prompt)
+        response = model.prompt(prompt, system=system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
         predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
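
A minimal illustration of the new fallback introduced above: when no feature attributes are configured, everything except the target attributes is treated as an input feature (shapes are made up):

    example = {"name": "Chile", "capital": "Santiago", "continent": None}
    target_attributes = ["continent"]
    feature_attributes = list(set(example.keys()) - set(target_attributes))
    # -> ["name", "capital"] (order not guaranteed, since it goes through a set)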

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py

@@ -4,7 +4,7 @@ from abc import ABC
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Optional, TextIO, Tuple, Union
+from typing import Optional, TextIO, Tuple, Union, Any
 
 import pandas as pd
 from pydantic import BaseModel, ConfigDict
@@ -67,13 +67,14 @@ class CollectionSlice(BaseModel):
     # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
     indices: Optional[Tuple[int, ...]] = None
     _collection: Optional[Collection] = None
+    where: Any = None
 
     @property
     def collection(self) -> Collection:
         if not self._collection and not self.indices:
             return self.base_collection
         if not self._collection:
-            rows = self.base_collection.find({}, limit=-1).rows
+            rows = self.base_collection.rows
             subset = [rows[i] for i in self.indices]
             db = self.base_collection.parent
             subset_name = self.slice_alias
@@ -94,6 +95,7 @@ class CollectionSlice(BaseModel):
         """
         Return the slice of the collection as a dataframe.
 
+        :param flattened: flattned nested objects to give keys like foo.bar
         :return:
         """
         rs = self.collection.find({}, limit=-1)
@@ -122,7 +124,7 @@
         Load the data and split it into training and testing sets.
 
         :param collection:
-        :param split:
+        :param split: Tuple of training and testing split ratios.
         :param randomize:
         :return:
         """
@@ -134,7 +136,7 @@
             self.training_data = CollectionSlice(name="train", base_collection=collection, indices=None)
             self.testing_data = None
             return
-        logger.info(f"Loading and splitting data from collection {collection.alias}")
+        logger.info(f"Loading and splitting data {split} from collection {collection.alias}")
         size = collection.size()
         indices = range(size)
         if randomize:
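
For reference, the split parameter documented above is a (train, test) ratio pair; the CLI hunk earlier in this diff passes (1.0, 0.0) to keep every row for training. A one-line sketch, with engine and collection assumed to exist:

    engine.load_and_split_data(collection, split=(0.7, 0.3), randomize=True)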

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/format_utils.py

@@ -12,9 +12,9 @@ from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union
 
 import pandas as pd
 import pystow
+import xmltodict
 import yaml
 from pydantic import BaseModel
-from tabulate import tabulate
 
 logger = logging.getLogger(__name__)
 
@@ -30,6 +30,7 @@ class Format(Enum):
     YAMLL = "yamll"
     TSV = "tsv"
     CSV = "csv"
+    XML = "xml"
     PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
@@ -50,6 +51,7 @@
             ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
+            ".xml": cls.XML,
             ".py": cls.PYTHON,
             ".parquet": cls.PARQUET,
             ".pq": cls.PARQUET,
@@ -124,6 +126,8 @@ def process_file(
         delimiter = "\t" if format == Format.TSV else ","
         reader = csv.DictReader(f, delimiter=delimiter)
         objs = list(reader)
+    elif format == Format.XML:
+        objs = xmltodict.parse(f.read())
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq
 
@@ -284,6 +288,7 @@
     elif format == Format.PYTHON:
        return str(data)
    elif format == Format.TABLE:
+        from tabulate import tabulate
        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
    elif format == Format.YAML:
        if isinstance(data, list):
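
The new XML path simply hands the whole document to xmltodict, so the parsed result is a nested dict keyed by the root element. A small sketch; the file name is hypothetical:

    import xmltodict

    with open("records.xml") as f:
        objs = xmltodict.parse(f.read())
    print(list(objs))  # the root element name(s)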

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py

@@ -1,6 +1,10 @@
-from typing import Callable, List, Optional
+import logging
+from typing import Callable, List, Optional, TYPE_CHECKING
 
-from tiktoken import Encoding
+if TYPE_CHECKING:
+    import tiktoken
+
+logger = logging.getLogger(__name__)
 
 MODEL_TOKEN_MAPPING = {
     "gpt-4o-mini": 128_000,
@@ -40,7 +44,7 @@ MODEL_TOKEN_MAPPING = {
 def render_formatted_text(
     render_func: Callable,
     values: List[str],
-    encoding: Encoding,
+    encoding: "tiktoken.Encoding",
     token_limit: int,
     additional_text: Optional[str] = None,
 ) -> str:
@@ -67,6 +71,7 @@
     if additional_text:
         token_limit -= len(encoding.encode(additional_text))
     text_length = len(encoding.encode(text))
+    logger.debug(f"Encoding length: {text_length} (original: {len(text)})")
     if text_length <= token_limit:
         return text
     if not values:
@@ -95,3 +100,18 @@
         if model in model_name:
             return token_limit
     return 4096
+
+
+def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
+    import yaml
+    if "```" in yaml_str:
+        yaml_str = yaml_str.split("```")[1].strip()
+        if yaml_str.startswith("yaml"):
+            yaml_str = yaml_str[4:].strip()
+    try:
+        return yaml.safe_load(yaml_str)
+    except Exception as e:
+        if strict:
+            raise e
+        logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
+        return None
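
A quick illustration of the new parse_yaml_payload helper on a typical fenced LLM reply; the values are made up:

    from linkml_store.utils.llm_utils import parse_yaml_payload

    reply = "```yaml\nname: Uruguay\ncapital: Montevideo\n```"
    print(parse_yaml_payload(reply))  # {'name': 'Uruguay', 'capital': 'Montevideo'}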

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py

@@ -124,7 +124,7 @@ def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=N
 
     Args:
         data (dict): The input nested dictionary.
-        selectors (list): A list of selector strings.
+        paths (list): A list of selector strings.
 
     Returns:
         dict: A new dictionary with the same structure, but only the selected attributes.
@@ -162,6 +162,8 @@
     if current_path is None:
         current_path = []
     matching_paths = []
+    if not paths:
+        raise ValueError("No paths provided")
     for path in paths:
         if isinstance(path, str):
             path = path.split(".")
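
And a small sketch of select_nested with the corrected paths argument; dotted paths select into nested objects, and the data here is made up:

    from linkml_store.utils.object_utils import select_nested

    obj = {"name": "Chile", "capital": {"name": "Santiago", "population": 7_000_000}}
    print(select_nested(obj, ["name", "capital.name"]))
    # expected: {'name': 'Chile', 'capital': {'name': 'Santiago'}}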