linkml-store 0.2.2__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store might be problematic.
- {linkml_store-0.2.2 → linkml_store-0.2.5}/PKG-INFO +15 -12
- {linkml_store-0.2.2 → linkml_store-0.2.5}/pyproject.toml +13 -6
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/client.py +34 -15
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/collection.py +8 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/config.py +5 -1
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/database.py +2 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/cli.py +49 -15
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/llm_indexer.py +7 -4
- linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py +152 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py +20 -9
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py +6 -4
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/format_utils.py +6 -1
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py +23 -3
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py +3 -1
- {linkml_store-0.2.2 → linkml_store-0.2.5}/LICENSE +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/README.md +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/queries.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/types.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/constants.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/graphs/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/graphs/graph_map.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/indexer.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/evaluation.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_config.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine_registry.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/change_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/file_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/io.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/mongodb_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/neo4j_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/pandas_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/patch_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/query_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/schema_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/sklearn_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/sql_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/stats_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/vector_utils.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/__init__.py +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/base.html.j2 +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
- {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/main.py +0 -0
{linkml_store-0.2.2 → linkml_store-0.2.5}/PKG-INFO RENAMED

@@ -1,24 +1,24 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.2
+Version: 0.2.5
 Summary: linkml-store
 License: MIT
 Author: Author 1
 Author-email: author@org.org
-Requires-Python: >=3.9,<4.0
+Requires-Python: >=3.10,<4.0
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: all
 Provides-Extra: analytics
 Provides-Extra: app
-Provides-Extra: chromadb
+Provides-Extra: bigquery
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
-Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb

@@ -29,25 +29,25 @@ Provides-Extra: scipy
 Provides-Extra: tests
 Provides-Extra: validation
 Requires-Dist: black (>=24.0.0) ; extra == "tests"
-Requires-Dist: chromadb ; extra == "chromadb"
 Requires-Dist: click
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
-Requires-Dist: gcsfs ; extra == "ibis"
+Requires-Dist: gcsfs
+Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
 Requires-Dist: h5py ; extra == "h5py"
-Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
+Requires-Dist: jsonpatch (>=1.33)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
-Requires-Dist: llm ; extra == "llm"
+Requires-Dist: llm ; extra == "llm" or extra == "all"
 Requires-Dist: matplotlib ; extra == "analytics"
-Requires-Dist: multipledispatch ; extra == "ibis"
-Requires-Dist: neo4j ; extra == "neo4j"
+Requires-Dist: multipledispatch
+Requires-Dist: neo4j ; extra == "neo4j" or extra == "all"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
 Requires-Dist: plotly ; extra == "analytics"

@@ -56,14 +56,17 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
 Requires-Dist: sqlalchemy
 Requires-Dist: streamlit (>=1.32.2,<2.0.0) ; extra == "app"
+Requires-Dist: tabulate
 Requires-Dist: tiktoken ; extra == "llm"
 Requires-Dist: uvicorn ; extra == "fastapi"
+Requires-Dist: xmltodict (>=0.13.0)
 Description-Content-Type: text/markdown
 
 # linkml-store
{linkml_store-0.2.2 → linkml_store-0.2.5}/pyproject.toml RENAMED

@@ -1,18 +1,19 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.2.2"
+version = "0.2.5"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.10"
 click = "*"
 pydantic = "^2.0.0"
 linkml-runtime = ">=1.8.0"
 streamlit = { version = "^1.32.2", optional = true }
 sqlalchemy = "*"
+google-cloud-bigquery = "*"
 duckdb = ">=0.10.1"
 duckdb-engine = ">=0.11.2"
 matplotlib = { version = "*", optional = true }

@@ -27,7 +28,7 @@ pymongo = { version="*", optional = true }
 neo4j = { version="*", optional = true }
 py2neo = { version="*", optional = true }
 networkx = { version="*", optional = true }
-chromadb = { version="*", optional = true }
+#chromadb = { version="*", optional = true }
 pyarrow = { version="*", optional = true }
 h5py = { version="*", optional = true }
 scipy = { version="*", optional = true }

@@ -36,14 +37,18 @@ linkml = { version=">=1.8.0", optional = true }
 linkml_map = { version="*", optional = true }
 linkml_renderer = { version="*", optional = true }
 frictionless = { version="*", optional = true }
-ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
+#ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
 gcsfs = { version="*", optional = true }
 multipledispatch = { version="*" }
+tabulate = "*"
 pandas = ">=2.2.1"
 jinja2 = "^3.1.4"
 jsonlines = "^4.0.0"
 fastapi = { version="*", optional = true }
 uvicorn = { version="*", optional = true }
+xmltodict = ">=0.13.0"
+jsonpatch = ">=1.33"
+python-dotenv = "^1.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}

@@ -77,7 +82,7 @@ tests = ["black", "ruff"]
 llm = ["llm", "tiktoken"]
 mongodb = ["pymongo"]
 neo4j = ["neo4j", "py2neo", "networkx"]
-chromadb = ["chromadb"]
+#chromadb = ["chromadb"]
 h5py = ["h5py"]
 pyarrow = ["pyarrow"]
 validation = ["linkml"]

@@ -86,7 +91,9 @@ renderer = ["linkml_renderer"]
 fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
 scipy = ["scipy", "scikit-learn"]
-ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+#ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+bigquery = ["google-cloud-bigquery"]
+all = ["llm", "mongodb", "neo4j", "validation", "map", "renderer", "bigquery"]
 
 [tool.poetry.scripts]
 linkml-store = "linkml_store.cli:cli"
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/client.py RENAMED

@@ -1,3 +1,4 @@
+import importlib
 import logging
 from pathlib import Path
 from typing import Dict, Optional, Union
@@ -7,23 +8,22 @@ from linkml_runtime import SchemaView
 
 from linkml_store.api import Database
 from linkml_store.api.config import ClientConfig
-from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
-from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
-from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
-from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
-from linkml_store.api.stores.neo4j.neo4j_database import Neo4jDatabase
-from linkml_store.api.stores.solr.solr_database import SolrDatabase
 
 logger = logging.getLogger(__name__)
 
 
+
 HANDLE_MAP = {
-    "duckdb": DuckDBDatabase,
-    "solr": SolrDatabase,
-    "mongodb": MongoDBDatabase,
-    "chromadb": ChromaDBDatabase,
-    "neo4j": Neo4jDatabase,
-    "file": FileSystemDatabase,
+    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
+    "solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
+    "mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
+    "chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
+    "neo4j": "linkml_store.api.stores.neo4j.neo4j_database.Neo4jDatabase",
+    "file": "linkml_store.api.stores.filesystem.filesystem_database.FileSystemDatabase",
+}
+
+SUFFIX_MAP = {
+    "ddb": "duckdb:///{path}",
 }
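
The switch from eagerly imported classes to dotted-path strings in `HANDLE_MAP` defers each backend import until its scheme is actually used, so drivers such as `pymongo` or `neo4j` are only required when a matching handle is attached. The resolution is ordinary `importlib` machinery; here is a minimal standalone sketch of the same pattern (the `resolve` helper is illustrative, not part of the linkml-store API):

```python
import importlib

# Dotted-path registry: the module is only imported when the scheme is used.
REGISTRY = {
    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
}


def resolve(scheme: str):
    """Resolve a scheme to its implementation class, importing lazily."""
    module_path, class_name = REGISTRY[scheme].rsplit(".", 1)
    try:
        module = importlib.import_module(module_path)
    except ImportError as e:
        # Usually means the matching extra (e.g. linkml-store[mongodb]) is not installed.
        raise ImportError(f"Backend for {scheme!r} is not installed: {e}") from e
    return getattr(module, class_name)
```
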
@@ -155,6 +155,9 @@ class Client:
         if auto_attach:
             db = self.attach_database(handle, alias=name, **kwargs)
             db.from_config(db_config)
+            if db_config.source:
+                db = self.get_database(name)
+                db.store(db_config.source.data)
 
     def _set_database_config(self, db: Database):
         """
@@ -198,6 +201,12 @@ class Client:
         :param kwargs:
         :return:
         """
+        if ":" not in handle:
+            if alias is None:
+                alias = handle
+            suffix = handle.split(".")[-1]
+            if suffix in SUFFIX_MAP:
+                handle = SUFFIX_MAP[suffix].format(path=handle)
         if ":" not in handle:
             scheme = handle
             handle = None
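
The new `SUFFIX_MAP` lets a bare file path act as a handle: when a handle has no scheme, its extension is looked up and the handle is expanded into a full connection string, so a path like `mydata.ddb` resolves to DuckDB. The expansion step, extracted into a runnable sketch (`expand_handle` is an illustrative name):

```python
SUFFIX_MAP = {
    "ddb": "duckdb:///{path}",
}


def expand_handle(handle: str) -> str:
    # Only bare paths (no "scheme:" prefix) are candidates for expansion.
    if ":" not in handle:
        suffix = handle.split(".")[-1]
        if suffix in SUFFIX_MAP:
            return SUFFIX_MAP[suffix].format(path=handle)
    return handle


assert expand_handle("mydata.ddb") == "duckdb:///mydata.ddb"
assert expand_handle("duckdb:///x.db") == "duckdb:///x.db"
```
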
@@ -207,14 +216,23 @@ class Client:
             scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-        cls = HANDLE_MAP[scheme]
+        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        try:
+            module = importlib.import_module(module_path)
+            cls = getattr(module, class_name)
+        except ImportError as e:
+            raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
+
+        #cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
             db.set_schema_view(schema_view)
         if not alias:
             alias = handle
         if not self._databases:
+            logger.info("Initializing databases")
             self._databases = {}
+        logger.info(f"Attaching {alias}")
         self._databases[alias] = db
         db.parent = self
         if db.alias:
@@ -257,8 +275,9 @@ class Client:
             self._databases[name] = db
         if name not in self._databases:
             if create_if_not_exists:
-                logger.info(f"Creating database: {name}")
-                self.attach_database(name, **kwargs)
+                logger.info(f"Creating/attaching database: {name}")
+                db = self.attach_database(name, **kwargs)
+                name = db.alias
             else:
                 raise ValueError(f"Database {name} does not exist")
         db = self._databases[name]
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/collection.py RENAMED

@@ -470,6 +470,7 @@ class Collection(Generic[DatabaseType]):
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        select_cols: Optional[List[str]] = None,
         mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
@@ -503,6 +504,7 @@ class Collection(Generic[DatabaseType]):
         :param where:
         :param index_name:
         :param limit:
+        :param select_cols:
         :param kwargs:
         :return:
         """
@@ -538,6 +540,11 @@ class Collection(Generic[DatabaseType]):
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
+        if select_cols:
+            new_results = []
+            for r in results:
+                new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
+            results = new_results
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
         new_qr.rows = [r[1] for r in results]
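
The new `select_cols` argument projects each ranked hit down to the requested keys after scoring, so the index still sees full documents but callers get trimmed rows. The projection itself is plain dictionary filtering; a self-contained illustration with made-up scores and rows:

```python
results = [
    (0.93, {"id": "P1", "name": "aspirin", "description": "...", "synonyms": ["ASA"]}),
    (0.87, {"id": "P2", "name": "ibuprofen", "description": "...", "synonyms": []}),
]
select_cols = ["id", "name"]

# Same filtering as the loop above, written as a comprehension:
projected = [(score, {k: v for k, v in row.items() if k in select_cols}) for score, row in results]
assert projected[0] == (0.93, {"id": "P1", "name": "aspirin"})
```
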
@@ -672,6 +679,7 @@ class Collection(Generic[DatabaseType]):
         """
         yield from self.find({}, limit=-1).rows
 
+    @property
     def rows(self) -> List[OBJECT]:
         """
         Return a list of objects in the collection.
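
Making `rows` a property is a small calling-convention change: call sites now read `collection.rows` as attribute access rather than invoking `rows()`, which is why the `CollectionSlice` hunk further down switches to `self.base_collection.rows`. A toy illustration of the pattern (a stand-in class, not the real `Collection`):

```python
from typing import Dict, List


class TinyCollection:
    """Illustrative stand-in for Collection, showing the property-based access."""

    def __init__(self, data: List[Dict]):
        self._data = data

    @property
    def rows(self) -> List[Dict]:
        # Accessed as an attribute: coll.rows, not coll.rows()
        return list(self._data)


coll = TinyCollection([{"id": 1}, {"id": 2}])
assert coll.rows == [{"id": 1}, {"id": 2}]
```
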
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/config.py RENAMED

@@ -91,7 +91,7 @@ class CollectionConfig(ConfiguredBaseModel):
     )
     source: Optional[CollectionSource] = Field(
         default=None,
-        description="
+        description="Source for the collection",
     )
     derived_from: Optional[List[DerivationConfiguration]] = Field(
         default=None,
@@ -154,6 +154,10 @@ class DatabaseConfig(ConfiguredBaseModel):
         default=False,
         description="Whether to ensure referential integrity",
     )
+    source: Optional[CollectionSource] = Field(
+        default=None,
+        description="Source for the database",
+    )
 
 
 class ClientConfig(ConfiguredBaseModel):
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/database.py RENAMED

@@ -470,6 +470,7 @@ class Database(ABC, Generic[CollectionType]):
         if not self._schema_view:
             self._initialize_schema()
         if not self._schema_view:
+            logger.info("Inducing schema view")
             self._schema_view = self.induce_schema_view()
         return self._schema_view
@@ -505,6 +506,7 @@ class Database(ABC, Generic[CollectionType]):
         if isinstance(schema_view, str):
             schema_view = SchemaView(schema_view)
         self._schema_view = schema_view
+        logger.info(f"Setting schema view for {self.handle}")
         # self._schema_view = SchemaView(schema_view.materialize_derived_schema())
         if not self._collections:
             return
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_database.py RENAMED

@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Optional
 
 import yaml
-from
+from linkml_runtime.utils.schema_builder import SchemaBuilder
 from linkml_runtime import SchemaView
 
 from linkml_store.api import Database
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/cli.py RENAMED

@@ -99,6 +99,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
 @click.option("--database", "-d", help="Database name")
 @click.option("--collection", "-c", help="Collection name")
 @click.option("--input", "-i", help="Input file (alternative to database/collection)")
+@click.option("--schema", "-S", help="Path to schema (LinkML yaml)")
 @click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
@@ -111,7 +112,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, input, **kwargs):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, schema, config, set, input, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -135,12 +136,17 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-
-
-
+        database = "duckdb"  # default: store in duckdb
+        if input.startswith("http"):
+            parts = input.split("/")
+            collection = parts[-1]
+            collection = collection.split(".")[0]
+        else:
+            stem = underscore(Path(input).stem)
+            collection = stem
+        logger.info(f"Using input file: {input}, "
+                    f"default storage is {database} and collection is {collection}")
         config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
-        # collection = Path(input).stem
-        # database = f"file:{Path(input).parent}"
     if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
         config = DEFAULT_LOCAL_CONF_PATH
     if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
@@ -153,6 +159,9 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
+    if schema:
+        db = settings.database
+        db.set_schema_view(schema)
     if settings.database_name:
         db = client.get_database(database)
     if set:
@@ -178,10 +187,11 @@
 
 @cli.command()
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
+@click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
 @click.pass_context
-def insert(ctx, files, object, format):
+def insert(ctx, files, replace, object, format):
     """Insert objects from files (JSON, YAML, TSV) into the specified collection.
 
     Using a configuration:
@@ -195,7 +205,6 @@ def insert(ctx, files, replace, object, format):
     collection = settings.collection
     if not collection:
         raise ValueError("Collection must be specified.")
-    objects = []
     if not files and not object:
         files = ["-"]
     for file_path in files:
@@ -204,13 +213,19 @@ def insert(ctx, files, replace, object, format):
         else:
             objects = load_objects(file_path)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
-        collection.insert(objects)
+        if replace:
+            collection.replace(objects)
+        else:
+            collection.insert(objects)
         click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
-            collection.insert(objects)
+            if replace:
+                collection.replace(objects)
+            else:
+                collection.insert(objects)
             click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()
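
The `--replace/--no-replace` flag simply routes each parsed batch to `Collection.replace` instead of `Collection.insert`. A hedged usage sketch of the equivalent Python calls, mirroring something like `linkml-store -d duckdb -c persons insert --replace data.yaml` (the collection and object names are made up, and `create_collection` arguments may vary between versions):

```python
from linkml_store import Client

client = Client()
db = client.attach_database("duckdb", alias="duckdb")  # in-memory DuckDB
persons = db.create_collection("Person", alias="persons")

objects = [{"id": "p1", "name": "Alice"}]
persons.insert(objects)   # default: plain insert
persons.replace(objects)  # --replace: replace existing objects instead
```
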
@@ -523,6 +538,7 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
 @click.option(
     "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
 )
+@click.option("--training-collection", type=click.STRING, help="Collection to use for training")
 @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
 @click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
 @click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
@@ -534,14 +550,17 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
 @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
 @click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
 @click.option("--query", "-q", type=click.STRING, help="query term")
+@click.option("--where", "-w", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
     ctx,
     inference_config_file,
+    where,
     query,
     evaluation_count,
     evaluation_match_function,
     training_test_data_split,
+    training_collection,
     predictor_type,
     target_attribute,
     feature_attributes,
@@ -579,6 +598,7 @@ def infer(
     linkml-store -i tests/input/iris.csv inference -t sklearn \
       -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
     """
+    where_clause = yaml.safe_load(where) if where else None
     if query:
         query_obj = yaml.safe_load(query)
     else:
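
Because `-q` and the new `--where` are both parsed with `yaml.safe_load`, inline JSON (as in the docstring example above) and bare YAML mappings are equally accepted, YAML being a superset of JSON:

```python
import yaml

# Both spellings parse to the same dict:
assert yaml.safe_load('{"sepal_length": 5.1}') == {"sepal_length": 5.1}
assert yaml.safe_load("sepal_length: 5.1") == {"sepal_length": 5.1}
```
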
@@ -603,6 +623,7 @@ def infer(
     if model_format:
         model_format = ModelSerialization(model_format)
     if load_model:
+        logger.info(f"Loading predictor from {load_model}")
         predictor = get_inference_engine(predictor_type)
         predictor = type(predictor).load_model(load_model)
     else:
@@ -613,13 +634,18 @@ def infer(
         if training_test_data_split:
             config.train_test_split = training_test_data_split
         predictor = get_inference_engine(predictor_type, config=config)
-
-
+        training_collection_obj = collection
+        if training_collection:
+            training_collection_obj = ctx.obj["settings"].database.get_collection(training_collection)
+        if training_collection_obj:
+            logger.info(f"Using collection: {training_collection_obj.alias} for inference")
+            split = training_test_data_split or (1.0, 0.0)
+            predictor.load_and_split_data(training_collection_obj, split=split)
         predictor.initialize_model()
     if export_model:
         logger.info(f"Exporting model to {export_model} in {model_format}")
         predictor.export_model(export_model, model_format)
-    if not query_obj:
+    if not query_obj and where_clause is None:
         if not export_model and not evaluation_count:
             raise ValueError("Query or evaluate must be specified if not exporting model")
     if evaluation_count:
@@ -637,6 +663,12 @@ def infer(
         result = predictor.derive(query_obj)
         dumped_obj = result.model_dump(exclude_none=True)
         write_output([dumped_obj], output_type, target=output)
+    if where_clause is not None:
+        predicted_objs = []
+        for query_obj in collection.find(where_clause).rows:
+            result = predictor.derive(query_obj)
+            predicted_objs.append(result.predicted_object)
+        write_output(predicted_objs, output_type, target=output)
 
 
 @cli.command()
@@ -681,6 +713,7 @@ def schema(ctx, output_type, output):
 @cli.command()
 @click.argument("search_term")
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
+@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -689,13 +722,14 @@ def schema(ctx, output_type, output):
 )
 @index_type_option
 @click.pass_context
-def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
+def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index):
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
     logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
-    result = collection.search(search_term, where=where, limit=limit)
+    select_cols = yaml.safe_load(select) if select else None
+    result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
     if output:
         with open(output, "w") as f:
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/llm_indexer.py RENAMED

@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
-from tiktoken import encoding_for_model
 
 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
@@ -55,7 +54,7 @@ class LLMIndexer(Indexer):
 
     def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
         """
-        Use LLM to embed
+        Use LLM to embed.
 
         >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
         >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
@@ -63,20 +62,24 @@ class LLMIndexer(Indexer):
         :param texts:
         :return:
         """
+        from tiktoken import encoding_for_model
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
-        token_limit = get_token_limit(model.model_id)
+        # TODO: make this more accurate
+        token_limit = get_token_limit(model.model_id) - 200
         encoding = encoding_for_model("gpt-4o")
 
         def truncate_text(text: str) -> str:
             # split into tokens every 1000 chars:
             parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
-            return render_formatted_text(
+            truncated = render_formatted_text(
                 lambda x: "".join(x),
                 parts,
                 encoding,
                 token_limit,
             )
+            logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
+            return truncated
 
         texts = [truncate_text(text) for text in texts]
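
The 200-token headroom subtracted from the model's limit guards against the rough 1000-character chunking overshooting the context window. The token counting itself is standard tiktoken; a small sketch (the 8192 limit is an assumed example value, not a linkml-store constant):

```python
from tiktoken import encoding_for_model

encoding = encoding_for_model("gpt-4o")
token_limit = 8192 - 200  # assumed model limit, minus the same headroom as above

text = "hello world " * 2000
tokens = encoding.encode(text)
if len(tokens) > token_limit:
    # Crude truncation to the budget; the indexer instead drops whole 1000-char parts.
    text = encoding.decode(tokens[:token_limit])
```
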
linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py ADDED

@@ -0,0 +1,152 @@
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar, List, Optional, TextIO, Union
+
+import yaml
+from linkml_store.utils.llm_utils import parse_yaml_payload
+from llm import get_key
+from pydantic import BaseModel
+
+from linkml_store.api.collection import OBJECT, Collection
+from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
+from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
+from linkml_store.utils.object_utils import select_nested
+
+logger = logging.getLogger(__name__)
+
+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+
+SYSTEM_PROMPT = """
+Your task is to inference the complete YAML
+object output given the YAML object input. I will provide you
+with contextual information, including the schema,
+to help with the inference. You can use the following
+
+You should return ONLY valid YAML in your response.
+"""
+
+
+class TrainedModel(BaseModel, extra="forbid"):
+    index_rows: List[OBJECT]
+    config: Optional[InferenceConfig] = None
+
+
+class LLMInference(Inference):
+    iterations: int = 0
+
+
+@dataclass
+class LLMInferenceEngine(InferenceEngine):
+    """
+    LLM based predictor.
+
+    Unlike the RAG predictor this performs few-shot inference
+    """
+
+    _model: "llm.Model" = None  # noqa: F821
+
+    PERSIST_COLS: ClassVar[List[str]] = [
+        "config",
+    ]
+
+    def __post_init__(self):
+        if not self.config:
+            self.config = InferenceConfig()
+        if not self.config.llm_config:
+            self.config.llm_config = LLMConfig()
+
+    @property
+    def model(self) -> "llm.Model":  # noqa: F821
+        import llm
+
+        if self._model is None:
+            self._model = llm.get_model(self.config.llm_config.model_name)
+            if self._model.needs_key:
+                key = get_key(None, key_alias=self._model.needs_key)
+                self._model.key = key
+
+        return self._model
+
+    def initialize_model(self, **kwargs):
+        logger.info(f"Initializing model {self.model}")
+
+    def object_to_text(self, object: OBJECT) -> str:
+        return yaml.dump(object)
+
+    def _schema_str(self) -> str:
+        db = self.training_data.base_collection.parent
+        from linkml_runtime.dumpers import json_dumper
+        schema_dict = json_dumper.to_dict(db.schema_view.schema)
+        return yaml.dump(schema_dict)
+
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]:
+        import llm
+
+        model: llm.Model = self.model
+        #model_name = self.config.llm_config.model_name
+        #feature_attributes = self.config.feature_attributes
+        target_attributes = self.config.target_attributes
+        query_text = self.object_to_text(object)
+
+        if not target_attributes:
+            target_attributes = [k for k, v in object.items() if v is None or v == ""]
+        #if not feature_attributes:
+        #    feature_attributes = [k for k, v in object.items() if v is not None and v != ""]
+
+        system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+
+        system_prompt += "\n## SCHEMA:\n\n" + self._schema_str()
+
+        stub = ", ".join([f"{k}: ..." for k in target_attributes])
+        stub = "{" + stub + "}"
+        prompt = (
+            "Provide a YAML object of the form"
+            "```yaml\n"
+            f"{stub}\n"
+            "```\n"
+            "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n"
+        )
+        logger.info(f"Prompt: {prompt}")
+        response = model.prompt(prompt, system=system_prompt)
+        yaml_str = response.text()
+        logger.info(f"Response: {yaml_str}")
+        predicted_object = parse_yaml_payload(yaml_str, strict=True)
+        predicted_object = {**object, **predicted_object}
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        self.save_model(output)
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the trained model and related data to a file.
+
+        :param output: Path to save the model
+        """
+        raise NotImplementedError("Does not make sense for this engine")
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine":
+        raise NotImplementedError("Does not make sense for this engine")
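
The heart of `derive` above is a repair loop: parse the model's YAML, validate it against the collection's schema, and on failure re-prompt with the validation errors appended, giving up after `MAX_ITERATIONS`. The control flow in isolation, as a generic runnable sketch (function and parameter names here are illustrative, not part of the library):

```python
MAX_ITERATIONS = 5


def derive_with_repair(ask, validate, obj, iteration=0, feedback=None):
    """Retry-with-feedback loop in the style of LLMInferenceEngine.derive.

    ask(obj, feedback) -> candidate dict; validate(candidate) -> list of error strings.
    """
    candidate = ask(obj, feedback)
    errors = validate(candidate)
    if errors:
        if iteration > MAX_ITERATIONS:
            raise ValueError(f"Validation errors: {errors}")
        # Feed the failures back into the next prompt.
        feedback = [f"Previous answer was invalid: {e}" for e in errors]
        return derive_with_repair(ask, validate, obj, iteration + 1, feedback)
    return candidate
```
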
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py RENAMED

@@ -20,7 +20,7 @@ DEFAULT_NUM_EXAMPLES = 20
 DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
 
 SYSTEM_PROMPT = """
-You are a {llm_config.role}, your task is to
+You are a {llm_config.role}, your task is to infer the YAML
 object output given the YAML object input. I will provide you
 with a collection of examples that will provide guidance both
 on the desired structure of the response, as well as the kind
@@ -130,23 +130,34 @@ class RAGInferenceEngine(InferenceEngine):
         else:
             if not self.rag_collection.indexers:
                 raise ValueError("RAG collection must have an indexer attached")
+            logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}")
             rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
                                             mmr_relevance_factor=mmr_relevance_factor)
             examples = rs.rows
+            logger.info(f"Found {len(examples)} examples")
         if not examples:
             raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
-        query_obj = select_nested(object, feature_attributes)
+        this_feature_attributes = feature_attributes
+        if not this_feature_attributes:
+            this_feature_attributes = list(set(object.keys()) - set(target_attributes))
+        query_obj = select_nested(object, this_feature_attributes)
         query_text = self.object_to_text(query_obj)
         for example in examples:
-            input_obj = select_nested(example, feature_attributes)
+            this_feature_attributes = feature_attributes
+            if not this_feature_attributes:
+                this_feature_attributes = list(set(example.keys()) - set(target_attributes))
+            if not this_feature_attributes:
+                raise ValueError(f"No feature attributes found in example {example}")
+            input_obj = select_nested(example, this_feature_attributes)
             input_obj_text = self.object_to_text(input_obj)
             if input_obj_text == query_text:
-                raise ValueError(
-                    f"Query object {query_text} is the same as example object {input_obj_text}\n"
-                    "This indicates possible test data leakage\n."
-                    "TODO: allow an option that allows user to treat this as a basic lookup\n"
-                )
+                continue
+                #raise ValueError(
+                #    f"Query object {query_text} is the same as example object {input_obj_text}\n"
+                #    "This indicates possible test data leakage\n."
+                #    "TODO: allow an option that allows user to treat this as a basic lookup\n"
+                #)
             output_obj = select_nested(example, target_attributes)
             prompt_clause = (
                 "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
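
When no feature attributes are configured, the engine now falls back to treating every non-target key as a feature, computed per object (hence `this_feature_attributes` inside the loop, since examples may have differing keys). The set arithmetic in isolation:

```python
example = {"name": "aspirin", "smiles": "CC(=O)OC1=CC=CC=C1C(=O)O", "category": None}
target_attributes = ["category"]

feature_attributes = list(set(example.keys()) - set(target_attributes))
assert sorted(feature_attributes) == ["name", "smiles"]
```
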
@@ -169,7 +180,7 @@ class RAGInferenceEngine(InferenceEngine):
                                        encoding=encoding, token_limit=token_limit,
                                        additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
-        response = model.prompt(prompt, system_prompt)
+        response = model.prompt(prompt, system=system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
         predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py RENAMED

@@ -4,7 +4,7 @@ from abc import ABC
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Optional, TextIO, Tuple, Union
+from typing import Optional, TextIO, Tuple, Union, Any
 
 import pandas as pd
 from pydantic import BaseModel, ConfigDict
@@ -67,13 +67,14 @@ class CollectionSlice(BaseModel):
     # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
     indices: Optional[Tuple[int, ...]] = None
     _collection: Optional[Collection] = None
+    where: Any = None
 
     @property
     def collection(self) -> Collection:
         if not self._collection and not self.indices:
             return self.base_collection
         if not self._collection:
-            rows = self.base_collection.rows()
+            rows = self.base_collection.rows
             subset = [rows[i] for i in self.indices]
             db = self.base_collection.parent
             subset_name = self.slice_alias
@@ -94,6 +95,7 @@ class CollectionSlice(BaseModel):
         """
         Return the slice of the collection as a dataframe.
 
+        :param flattened: flatten nested objects to give keys like foo.bar
         :return:
         """
         rs = self.collection.find({}, limit=-1)
@@ -122,7 +124,7 @@ class InferenceEngine(ABC):
         Load the data and split it into training and testing sets.
 
         :param collection:
-        :param split:
+        :param split: Tuple of training and testing split ratios.
         :param randomize:
         :return:
         """
@@ -134,7 +136,7 @@ class InferenceEngine(ABC):
             self.training_data = CollectionSlice(name="train", base_collection=collection, indices=None)
             self.testing_data = None
             return
-        logger.info(f"Loading and splitting data from collection {collection.alias}")
+        logger.info(f"Loading and splitting data {split} from collection {collection.alias}")
         size = collection.size()
         indices = range(size)
         if randomize:
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/format_utils.py RENAMED

@@ -12,9 +12,9 @@ from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union
 
 import pandas as pd
 import pystow
+import xmltodict
 import yaml
 from pydantic import BaseModel
-from tabulate import tabulate
 
 logger = logging.getLogger(__name__)
@@ -30,6 +30,7 @@ class Format(Enum):
     YAMLL = "yamll"
     TSV = "tsv"
     CSV = "csv"
+    XML = "xml"
     PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
@@ -50,6 +51,7 @@ class Format(Enum):
             ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
+            ".xml": cls.XML,
             ".py": cls.PYTHON,
             ".parquet": cls.PARQUET,
             ".pq": cls.PARQUET,
@@ -124,6 +126,8 @@ def process_file(
         delimiter = "\t" if format == Format.TSV else ","
         reader = csv.DictReader(f, delimiter=delimiter)
         objs = list(reader)
+    elif format == Format.XML:
+        objs = xmltodict.parse(f.read())
    elif format == Format.PARQUET:
         import pyarrow.parquet as pq
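
Note that unlike the CSV/TSV branch, `xmltodict.parse` yields a single nested dict keyed by the root element rather than a list of row objects:

```python
import xmltodict

doc = xmltodict.parse("<persons><person><id>p1</id></person></persons>")
assert doc == {"persons": {"person": {"id": "p1"}}}
```
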
@@ -284,6 +288,7 @@ def render_output(
     elif format == Format.PYTHON:
         return str(data)
     elif format == Format.TABLE:
+        from tabulate import tabulate
         return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py RENAMED

@@ -1,6 +1,10 @@
-from typing import Callable, List, Optional
+import logging
+from typing import Callable, List, Optional, TYPE_CHECKING
 
-from tiktoken import Encoding
+if TYPE_CHECKING:
+    import tiktoken
+
+logger = logging.getLogger(__name__)
 
 MODEL_TOKEN_MAPPING = {
     "gpt-4o-mini": 128_000,
@@ -40,7 +44,7 @@ MODEL_TOKEN_MAPPING = {
 def render_formatted_text(
     render_func: Callable,
     values: List[str],
-    encoding: Encoding,
+    encoding: "tiktoken.Encoding",
     token_limit: int,
     additional_text: Optional[str] = None,
 ) -> str:
@@ -67,6 +71,7 @@ def render_formatted_text(
     if additional_text:
         token_limit -= len(encoding.encode(additional_text))
     text_length = len(encoding.encode(text))
+    logger.debug(f"Encoding length: {text_length} (original: {len(text)})")
     if text_length <= token_limit:
         return text
     if not values:
@@ -95,3 +100,18 @@ def get_token_limit(model_name: str) -> int:
         if model in model_name:
             return token_limit
     return 4096
+
+
+def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
+    import yaml
+    if "```" in yaml_str:
+        yaml_str = yaml_str.split("```")[1].strip()
+        if yaml_str.startswith("yaml"):
+            yaml_str = yaml_str[4:].strip()
+    try:
+        return yaml.safe_load(yaml_str)
+    except Exception as e:
+        if strict:
+            raise e
+        logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
+        return None
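
`parse_yaml_payload` strips a Markdown fence (and a leading `yaml` language tag) before parsing, so a typical fenced LLM reply round-trips to a dict; with `strict=False` a parse failure is logged and `None` is returned instead of raising:

```python
from linkml_store.utils.llm_utils import parse_yaml_payload

# A typical LLM reply wrapping YAML in a fenced block:
reply = "Here you go:\n```yaml\nname: aspirin\nsynonyms: [ASA]\n```\n"
assert parse_yaml_payload(reply) == {"name": "aspirin", "synonyms": ["ASA"]}
```
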
{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py RENAMED

@@ -124,7 +124,7 @@ def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=N
 
     Args:
         data (dict): The input nested dictionary.
-
+        paths (list): A list of selector strings.
 
     Returns:
         dict: A new dictionary with the same structure, but only the selected attributes.
@@ -162,6 +162,8 @@ def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=N
     if current_path is None:
         current_path = []
     matching_paths = []
+    if not paths:
+        raise ValueError("No paths provided")
     for path in paths:
         if isinstance(path, str):
             path = path.split(".")