linkml-store 0.1.14__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- {linkml_store-0.1.14 → linkml_store-0.2.1}/PKG-INFO +9 -1
- {linkml_store-0.1.14 → linkml_store-0.2.1}/README.md +3 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/pyproject.toml +9 -18
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/collection.py +48 -5
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/database.py +7 -1
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/queries.py +3 -1
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +8 -2
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/cli.py +44 -18
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/implementations/llm_indexer.py +20 -2
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/indexer.py +51 -1
- linkml_store-0.2.1/src/linkml_store/inference/evaluation.py +195 -0
- linkml_store-0.2.1/src/linkml_store/inference/implementations/rag_inference_engine.py +232 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +15 -4
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +20 -2
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/inference_config.py +1 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/inference_engine.py +53 -19
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/format_utils.py +6 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/llm_utils.py +2 -0
- linkml_store-0.2.1/src/linkml_store/utils/object_utils.py +182 -0
- linkml_store-0.1.14/src/linkml_store/inference/implementations/rag_inference_engine.py +0 -145
- linkml_store-0.1.14/src/linkml_store/utils/object_utils.py +0 -83
- {linkml_store-0.1.14 → linkml_store-0.2.1}/LICENSE +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/client.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/config.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/types.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/constants.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/graphs/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/graphs/graph_map.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/implementations/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/implementations/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/inference_engine_registry.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/change_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/file_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/io.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/mongodb_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/neo4j_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/pandas_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/patch_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/query_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/schema_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/sklearn_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/sql_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/stats_utils.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/__init__.py +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/base.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
- {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/main.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: linkml-store
|
|
3
|
-
Version: 0.1.14
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: linkml-store
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Author 1
|
|
@@ -18,6 +18,7 @@ Provides-Extra: chromadb
|
|
|
18
18
|
Provides-Extra: fastapi
|
|
19
19
|
Provides-Extra: frictionless
|
|
20
20
|
Provides-Extra: h5py
|
|
21
|
+
Provides-Extra: ibis
|
|
21
22
|
Provides-Extra: llm
|
|
22
23
|
Provides-Extra: map
|
|
23
24
|
Provides-Extra: mongodb
|
|
@@ -34,7 +35,9 @@ Requires-Dist: duckdb (>=0.10.1)
|
|
|
34
35
|
Requires-Dist: duckdb-engine (>=0.11.2)
|
|
35
36
|
Requires-Dist: fastapi ; extra == "fastapi"
|
|
36
37
|
Requires-Dist: frictionless ; extra == "frictionless"
|
|
38
|
+
Requires-Dist: gcsfs ; extra == "ibis"
|
|
37
39
|
Requires-Dist: h5py ; extra == "h5py"
|
|
40
|
+
Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
|
|
38
41
|
Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
|
|
39
42
|
Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
|
|
40
43
|
Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
|
|
@@ -43,6 +46,7 @@ Requires-Dist: linkml_map ; extra == "map"
|
|
|
43
46
|
Requires-Dist: linkml_renderer ; extra == "renderer"
|
|
44
47
|
Requires-Dist: llm ; extra == "llm"
|
|
45
48
|
Requires-Dist: matplotlib ; extra == "analytics"
|
|
49
|
+
Requires-Dist: multipledispatch ; extra == "ibis"
|
|
46
50
|
Requires-Dist: neo4j ; extra == "neo4j"
|
|
47
51
|
Requires-Dist: networkx ; extra == "neo4j"
|
|
48
52
|
Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
|
|
@@ -52,6 +56,7 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
|
|
|
52
56
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
|
53
57
|
Requires-Dist: pymongo ; extra == "mongodb"
|
|
54
58
|
Requires-Dist: pystow (>=0.5.4,<0.6.0)
|
|
59
|
+
Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
|
|
55
60
|
Requires-Dist: scikit-learn ; extra == "scipy"
|
|
56
61
|
Requires-Dist: scipy ; extra == "scipy"
|
|
57
62
|
Requires-Dist: seaborn ; extra == "analytics"
|
|
@@ -70,6 +75,8 @@ common query, index, and storage operations.
|
|
|
70
75
|
|
|
71
76
|
For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
|
|
72
77
|
|
|
78
|
+
See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
|
|
79
|
+
|
|
73
80
|
__Warning__ LinkML-Store is still undergoing changes and refactoring,
|
|
74
81
|
APIs and command line options are subject to change!
|
|
75
82
|
|
|
@@ -196,3 +203,4 @@ make app
|
|
|
196
203
|
|
|
197
204
|
See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
|
|
198
205
|
|
|
206
|
+
|
|
@@ -7,6 +7,8 @@ common query, index, and storage operations.
|
|
|
7
7
|
|
|
8
8
|
For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
|
|
9
9
|
|
|
10
|
+
See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
|
|
11
|
+
|
|
10
12
|
__Warning__ LinkML-Store is still undergoing changes and refactoring,
|
|
11
13
|
APIs and command line options are subject to change!
|
|
12
14
|
|
|
@@ -132,3 +134,4 @@ make app
|
|
|
132
134
|
## Background
|
|
133
135
|
|
|
134
136
|
See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
|
|
137
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "linkml-store"
|
|
3
|
-
version = "0.1.14"
|
|
3
|
+
version = "0.2.1"
|
|
4
4
|
description = "linkml-store"
|
|
5
5
|
authors = ["Author 1 <author@org.org>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -20,6 +20,7 @@ seaborn = { version = "*", optional = true }
|
|
|
20
20
|
plotly = { version = "*", optional = true }
|
|
21
21
|
pystow = "^0.5.4"
|
|
22
22
|
black = { version=">=24.0.0", optional = true }
|
|
23
|
+
ruff = { version=">=0.6.2", optional = true }
|
|
23
24
|
llm = { version="*", optional = true }
|
|
24
25
|
tiktoken = { version="*", optional = true }
|
|
25
26
|
pymongo = { version="*", optional = true }
|
|
@@ -35,6 +36,9 @@ linkml = { version=">=1.8.0", optional = true }
|
|
|
35
36
|
linkml_map = { version="*", optional = true }
|
|
36
37
|
linkml_renderer = { version="*", optional = true }
|
|
37
38
|
frictionless = { version="*", optional = true }
|
|
39
|
+
ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
|
|
40
|
+
gcsfs = { version="*", optional = true }
|
|
41
|
+
multipledispatch = { version="*" }
|
|
38
42
|
pandas = ">=2.2.1"
|
|
39
43
|
jinja2 = "^3.1.4"
|
|
40
44
|
jsonlines = "^4.0.0"
|
|
@@ -69,7 +73,7 @@ numpy = [
|
|
|
69
73
|
[tool.poetry.extras]
|
|
70
74
|
analytics = ["pandas", "matplotlib", "seaborn", "plotly"]
|
|
71
75
|
app = ["streamlit"]
|
|
72
|
-
tests = ["black"]
|
|
76
|
+
tests = ["black", "ruff"]
|
|
73
77
|
llm = ["llm", "tiktoken"]
|
|
74
78
|
mongodb = ["pymongo"]
|
|
75
79
|
neo4j = ["neo4j", "py2neo", "networkx"]
|
|
@@ -82,6 +86,7 @@ renderer = ["linkml_renderer"]
|
|
|
82
86
|
fastapi = ["fastapi", "uvicorn"]
|
|
83
87
|
frictionless = ["frictionless"]
|
|
84
88
|
scipy = ["scipy", "scikit-learn"]
|
|
89
|
+
ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
|
|
85
90
|
|
|
86
91
|
[tool.poetry.scripts]
|
|
87
92
|
linkml-store = "linkml_store.cli:cli"
|
|
@@ -119,27 +124,13 @@ extend-exclude = [
|
|
|
119
124
|
]
|
|
120
125
|
force-exclude = true
|
|
121
126
|
line-length = 120
|
|
122
|
-
extend-ignore = ["E203"]
|
|
123
|
-
select = [
|
|
127
|
+
lint.extend-ignore = ["E203"]
|
|
128
|
+
lint.select = [
|
|
124
129
|
"E", # pycodestyle errors
|
|
125
130
|
"F", # Pyflakes
|
|
126
131
|
"I", # isort
|
|
127
132
|
]
|
|
128
|
-
# Assume Python 3.8
|
|
129
|
-
target-version = "py38"
|
|
130
133
|
|
|
131
|
-
[tool.ruff.per-file-ignores]
|
|
132
|
-
# These templates can have long lines
|
|
133
|
-
"linkml/generators/sqlalchemy/sqlalchemy_declarative_template.py" = ["E501"]
|
|
134
|
-
"linkml/generators/sqlalchemy/sqlalchemy_imperative_template.py" = ["E501"]
|
|
135
|
-
|
|
136
|
-
# Notebooks can have unsorted imports
|
|
137
|
-
"tests/test_notebooks/input/*" = ["E402"]
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
[tool.ruff.mccabe]
|
|
141
|
-
# Unlike Flake8, default to a complexity level of 10.
|
|
142
|
-
max-complexity = 10
|
|
143
134
|
|
|
144
135
|
|
|
145
136
|
[tool.codespell]
|
|
@@ -226,6 +226,18 @@ class Collection(Generic[DatabaseType]):
|
|
|
226
226
|
self._initialized = True
|
|
227
227
|
patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
|
|
228
228
|
self._broadcast(patches, **kwargs)
|
|
229
|
+
self._post_modification_hook(**kwargs)
|
|
230
|
+
|
|
231
|
+
def _post_delete_hook(self, **kwargs):
|
|
232
|
+
self._post_modification_hook(**kwargs)
|
|
233
|
+
|
|
234
|
+
def _post_modification_hook(self, **kwargs):
|
|
235
|
+
for indexer in self.indexers.values():
|
|
236
|
+
ix_collection_name = self.get_index_collection_name(indexer)
|
|
237
|
+
ix_collection = self.parent.get_collection(ix_collection_name)
|
|
238
|
+
# Currently updating the source triggers complete reindexing
|
|
239
|
+
# TODO: make this more efficient by only deleting modified
|
|
240
|
+
ix_collection.delete_where({})
|
|
229
241
|
|
|
230
242
|
def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
|
|
231
243
|
"""
|
|
@@ -476,7 +488,7 @@ class Collection(Generic[DatabaseType]):
|
|
|
476
488
|
Now let's index, using the simple trigram-based index
|
|
477
489
|
|
|
478
490
|
>>> index = get_indexer("simple")
|
|
479
|
-
>>> collection.attach_indexer(index)
|
|
491
|
+
>>> _ = collection.attach_indexer(index)
|
|
480
492
|
|
|
481
493
|
Now let's find all objects:
|
|
482
494
|
|
|
@@ -514,7 +526,10 @@ class Collection(Generic[DatabaseType]):
|
|
|
514
526
|
if ix_coll.size() == 0:
|
|
515
527
|
logger.info(f"Index {index_name} is empty; indexing all objects")
|
|
516
528
|
all_objs = self.find(limit=-1).rows
|
|
517
|
-
|
|
529
|
+
if all_objs:
|
|
530
|
+
# print(f"Index {index_name} is empty; indexing all objects {len(all_objs)}")
|
|
531
|
+
self.index_objects(all_objs, index_name, replace=True, **kwargs)
|
|
532
|
+
assert ix_coll.size() > 0
|
|
518
533
|
qr = ix_coll.find(where=where, limit=-1, **kwargs)
|
|
519
534
|
index_col = ix.index_field
|
|
520
535
|
# TODO: optimize this for large indexes
|
|
@@ -648,7 +663,31 @@ class Collection(Generic[DatabaseType]):
|
|
|
648
663
|
"""
|
|
649
664
|
return self.find({}, limit=1).num_rows
|
|
650
665
|
|
|
651
|
-
def
|
|
666
|
+
def rows_iter(self) -> Iterable[OBJECT]:
|
|
667
|
+
"""
|
|
668
|
+
Return an iterator over the objects in the collection.
|
|
669
|
+
|
|
670
|
+
:return:
|
|
671
|
+
"""
|
|
672
|
+
yield from self.find({}, limit=-1).rows
|
|
673
|
+
|
|
674
|
+
def rows(self) -> List[OBJECT]:
|
|
675
|
+
"""
|
|
676
|
+
Return a list of objects in the collection.
|
|
677
|
+
|
|
678
|
+
:return:
|
|
679
|
+
"""
|
|
680
|
+
return list(self.rows_iter())
|
|
681
|
+
|
|
682
|
+
def ranked_rows(self) -> List[Tuple[float, OBJECT]]:
|
|
683
|
+
"""
|
|
684
|
+
Return a list of objects in the collection, with scores.
|
|
685
|
+
"""
|
|
686
|
+
return [(n, obj) for n, obj in enumerate(self.rows_iter())]
|
|
687
|
+
|
|
688
|
+
def attach_indexer(
|
|
689
|
+
self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs
|
|
690
|
+
) -> Indexer:
|
|
652
691
|
"""
|
|
653
692
|
Attach an index to the collection.
|
|
654
693
|
|
|
@@ -669,8 +708,8 @@ class Collection(Generic[DatabaseType]):
|
|
|
669
708
|
>>> full_index.name = "full"
|
|
670
709
|
>>> name_index = get_indexer("simple", text_template="{name}")
|
|
671
710
|
>>> name_index.name = "name"
|
|
672
|
-
>>> collection.attach_indexer(full_index)
|
|
673
|
-
>>> collection.attach_indexer(name_index)
|
|
711
|
+
>>> _ = collection.attach_indexer(full_index)
|
|
712
|
+
>>> _ = collection.attach_indexer(name_index)
|
|
674
713
|
|
|
675
714
|
Now let's find objects using the full index, using the string "France".
|
|
676
715
|
We expect the country France to be the top hit, but the score will
|
|
@@ -713,6 +752,10 @@ class Collection(Generic[DatabaseType]):
|
|
|
713
752
|
all_objs = self.find(limit=-1).rows
|
|
714
753
|
logger.info(f"Auto-indexing {len(all_objs)} objects")
|
|
715
754
|
self.index_objects(all_objs, index_name, replace=True, **kwargs)
|
|
755
|
+
return index
|
|
756
|
+
|
|
757
|
+
def get_index_collection_name(self, indexer: Indexer) -> str:
|
|
758
|
+
return self._index_collection_name(indexer.name)
|
|
716
759
|
|
|
717
760
|
def _index_collection_name(self, index_name: str) -> str:
|
|
718
761
|
"""
|
|
@@ -268,7 +268,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
268
268
|
metadata: Optional[CollectionConfig] = None,
|
|
269
269
|
recreate_if_exists=False,
|
|
270
270
|
**kwargs,
|
|
271
|
-
) ->
|
|
271
|
+
) -> Collection:
|
|
272
272
|
"""
|
|
273
273
|
Create a new collection in the current database.
|
|
274
274
|
|
|
@@ -760,6 +760,12 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
760
760
|
"""
|
|
761
761
|
Export a database to a file or location.
|
|
762
762
|
|
|
763
|
+
>>> from linkml_store.api.client import Client
|
|
764
|
+
>>> client = Client()
|
|
765
|
+
>>> db = client.attach_database("duckdb", alias="test")
|
|
766
|
+
>>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
|
|
767
|
+
>>> db.export_database("/tmp/iris.yaml", Format.YAML)
|
|
768
|
+
|
|
763
769
|
:param location: location of the file
|
|
764
770
|
:param target_format: target format
|
|
765
771
|
:param kwargs: additional arguments
|
{linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/duckdb_collection.py
RENAMED
|
@@ -36,6 +36,9 @@ class DuckDBCollection(Collection):
|
|
|
36
36
|
logger.info(f"Inserting into: {self.alias} // T={table.name}")
|
|
37
37
|
engine = self.parent.engine
|
|
38
38
|
col_names = [c.name for c in table.columns]
|
|
39
|
+
bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
|
|
40
|
+
if bad_objs:
|
|
41
|
+
logger.error(f"Bad objects: {bad_objs}")
|
|
39
42
|
objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
|
|
40
43
|
with engine.connect() as conn:
|
|
41
44
|
with conn.begin():
|
|
@@ -47,8 +50,9 @@ class DuckDBCollection(Collection):
|
|
|
47
50
|
if not isinstance(objs, list):
|
|
48
51
|
objs = [objs]
|
|
49
52
|
cd = self.class_definition()
|
|
50
|
-
if not cd:
|
|
53
|
+
if not cd or not cd.attributes:
|
|
51
54
|
cd = self.induce_class_definition_from_objects(objs)
|
|
55
|
+
assert cd.attributes
|
|
52
56
|
table = self._sqla_table(cd)
|
|
53
57
|
engine = self.parent.engine
|
|
54
58
|
with engine.connect() as conn:
|
|
@@ -58,7 +62,8 @@ class DuckDBCollection(Collection):
|
|
|
58
62
|
stmt = stmt.compile(engine)
|
|
59
63
|
conn.execute(stmt)
|
|
60
64
|
conn.commit()
|
|
61
|
-
|
|
65
|
+
self._post_delete_hook()
|
|
66
|
+
return None
|
|
62
67
|
|
|
63
68
|
def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
|
|
64
69
|
logger.info(f"Deleting from {self.target_class_name} where: {where}")
|
|
@@ -84,6 +89,7 @@ class DuckDBCollection(Collection):
|
|
|
84
89
|
if deleted_rows_count == 0 and not missing_ok:
|
|
85
90
|
raise ValueError(f"No rows found for {where}")
|
|
86
91
|
conn.commit()
|
|
92
|
+
self._post_delete_hook()
|
|
87
93
|
return deleted_rows_count if deleted_rows_count > -1 else None
|
|
88
94
|
|
|
89
95
|
def query_facets(
|
|
@@ -7,6 +7,7 @@ from typing import Optional
|
|
|
7
7
|
import click
|
|
8
8
|
import yaml
|
|
9
9
|
from linkml_runtime.dumpers import json_dumper
|
|
10
|
+
from linkml_runtime.utils.formatutils import underscore
|
|
10
11
|
from pydantic import BaseModel
|
|
11
12
|
|
|
12
13
|
from linkml_store import Client
|
|
@@ -17,6 +18,7 @@ from linkml_store.index import get_indexer
|
|
|
17
18
|
from linkml_store.index.implementations.simple_indexer import SimpleIndexer
|
|
18
19
|
from linkml_store.index.indexer import Indexer
|
|
19
20
|
from linkml_store.inference import get_inference_engine
|
|
21
|
+
from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
|
|
20
22
|
from linkml_store.inference.inference_config import InferenceConfig
|
|
21
23
|
from linkml_store.inference.inference_engine import ModelSerialization
|
|
22
24
|
from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
|
|
@@ -74,6 +76,8 @@ class ContextSettings(BaseModel):
|
|
|
74
76
|
if name is None:
|
|
75
77
|
# if len(self.database.list_collections()) > 1:
|
|
76
78
|
# raise ValueError("Collection must be specified if there are multiple collections.")
|
|
79
|
+
if not self.database:
|
|
80
|
+
return None
|
|
77
81
|
if not self.database.list_collections():
|
|
78
82
|
return None
|
|
79
83
|
name = list(self.database.list_collections())[0]
|
|
@@ -130,7 +134,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
|
|
|
130
134
|
logger.setLevel(logging.ERROR)
|
|
131
135
|
ctx.ensure_object(dict)
|
|
132
136
|
if input:
|
|
133
|
-
stem = Path(input).stem
|
|
137
|
+
stem = underscore(Path(input).stem)
|
|
134
138
|
database = "duckdb"
|
|
135
139
|
collection = stem
|
|
136
140
|
config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
|
|
@@ -216,7 +220,10 @@ def insert(ctx, files, object, format):
|
|
|
216
220
|
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
|
|
217
221
|
@click.pass_context
|
|
218
222
|
def store(ctx, files, object, format):
|
|
219
|
-
"""Store objects from files (JSON, YAML, TSV) into the
|
|
223
|
+
"""Store objects from files (JSON, YAML, TSV) into the database.
|
|
224
|
+
|
|
225
|
+
Note: this is similar to insert, but a collection does not need to be specified
|
|
226
|
+
"""
|
|
220
227
|
settings = ctx.obj["settings"]
|
|
221
228
|
db = settings.database
|
|
222
229
|
if not files and not object:
|
|
@@ -496,12 +503,16 @@ def describe(ctx, where, output_type, output, limit):
|
|
|
496
503
|
@click.option(
|
|
497
504
|
"--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
|
|
498
505
|
)
|
|
506
|
+
@click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
|
|
507
|
+
@click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
|
|
499
508
|
@click.option("--query", "-q", type=click.STRING, help="query term")
|
|
500
509
|
@click.pass_context
|
|
501
510
|
def infer(
|
|
502
511
|
ctx,
|
|
503
512
|
inference_config_file,
|
|
504
513
|
query,
|
|
514
|
+
evaluation_count,
|
|
515
|
+
evaluation_match_function,
|
|
505
516
|
training_test_data_split,
|
|
506
517
|
predictor_type,
|
|
507
518
|
target_attribute,
|
|
@@ -545,25 +556,28 @@ def infer(
|
|
|
545
556
|
else:
|
|
546
557
|
query_obj = None
|
|
547
558
|
collection = ctx.obj["settings"].collection
|
|
548
|
-
|
|
559
|
+
if collection:
|
|
560
|
+
atts = collection.class_definition().attributes.keys()
|
|
561
|
+
else:
|
|
562
|
+
atts = []
|
|
563
|
+
if feature_attributes:
|
|
564
|
+
features = feature_attributes.split(",")
|
|
565
|
+
features = [f.strip() for f in features]
|
|
566
|
+
else:
|
|
567
|
+
if query_obj:
|
|
568
|
+
features = query_obj.keys()
|
|
569
|
+
else:
|
|
570
|
+
features = None
|
|
571
|
+
if target_attribute:
|
|
572
|
+
target_attributes = list(target_attribute)
|
|
573
|
+
else:
|
|
574
|
+
target_attributes = [att for att in atts if att not in features]
|
|
549
575
|
if model_format:
|
|
550
576
|
model_format = ModelSerialization(model_format)
|
|
551
577
|
if load_model:
|
|
552
578
|
predictor = get_inference_engine(predictor_type)
|
|
553
579
|
predictor = type(predictor).load_model(load_model)
|
|
554
580
|
else:
|
|
555
|
-
if feature_attributes:
|
|
556
|
-
features = feature_attributes.split(",")
|
|
557
|
-
features = [f.strip() for f in features]
|
|
558
|
-
else:
|
|
559
|
-
if query_obj:
|
|
560
|
-
features = query_obj.keys()
|
|
561
|
-
else:
|
|
562
|
-
features = None
|
|
563
|
-
if target_attribute:
|
|
564
|
-
target_attributes = list(target_attribute)
|
|
565
|
-
else:
|
|
566
|
-
target_attributes = [att for att in atts if att not in features]
|
|
567
581
|
if inference_config_file:
|
|
568
582
|
config = InferenceConfig.from_file(inference_config_file)
|
|
569
583
|
else:
|
|
@@ -571,14 +585,26 @@ def infer(
|
|
|
571
585
|
if training_test_data_split:
|
|
572
586
|
config.train_test_split = training_test_data_split
|
|
573
587
|
predictor = get_inference_engine(predictor_type, config=config)
|
|
574
|
-
|
|
588
|
+
if collection:
|
|
589
|
+
predictor.load_and_split_data(collection)
|
|
575
590
|
predictor.initialize_model()
|
|
576
591
|
if export_model:
|
|
577
592
|
logger.info(f"Exporting model to {export_model} in {model_format}")
|
|
578
593
|
predictor.export_model(export_model, model_format)
|
|
579
594
|
if not query_obj:
|
|
580
|
-
if not export_model:
|
|
581
|
-
raise ValueError("Query must be specified if not exporting model")
|
|
595
|
+
if not export_model and not evaluation_count:
|
|
596
|
+
raise ValueError("Query or evaluate must be specified if not exporting model")
|
|
597
|
+
if evaluation_count:
|
|
598
|
+
if evaluation_match_function == "score_text_overlap":
|
|
599
|
+
match_function_fn = score_text_overlap
|
|
600
|
+
elif evaluation_match_function is not None:
|
|
601
|
+
raise ValueError(f"Unknown match function: {evaluation_match_function}")
|
|
602
|
+
else:
|
|
603
|
+
match_function_fn = None
|
|
604
|
+
outcome = evaluate_predictor(
|
|
605
|
+
predictor, target_attributes, evaluation_count=evaluation_count, match_function=match_function_fn
|
|
606
|
+
)
|
|
607
|
+
print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
|
|
582
608
|
if query_obj:
|
|
583
609
|
result = predictor.derive(query_obj)
|
|
584
610
|
dumped_obj = result.model_dump(exclude_none=True)
|
{linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/implementations/llm_indexer.py
RENAMED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import TYPE_CHECKING, List
|
|
3
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
|
+
from tiktoken import encoding_for_model
|
|
6
7
|
|
|
7
8
|
from linkml_store.api.config import CollectionConfig
|
|
8
9
|
from linkml_store.index.indexer import INDEX_ITEM, Indexer
|
|
10
|
+
from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
|
|
9
11
|
|
|
10
12
|
if TYPE_CHECKING:
|
|
11
13
|
import llm
|
|
@@ -29,6 +31,7 @@ class LLMIndexer(Indexer):
|
|
|
29
31
|
cached_embeddings_database: str = None
|
|
30
32
|
cached_embeddings_collection: str = None
|
|
31
33
|
cache_queries: bool = False
|
|
34
|
+
truncation_method: Optional[str] = None
|
|
32
35
|
|
|
33
36
|
@property
|
|
34
37
|
def embedding_model(self):
|
|
@@ -62,6 +65,21 @@ class LLMIndexer(Indexer):
|
|
|
62
65
|
"""
|
|
63
66
|
logging.info(f"Converting {len(texts)} texts to vectors")
|
|
64
67
|
model = self.embedding_model
|
|
68
|
+
token_limit = get_token_limit(model.model_id)
|
|
69
|
+
encoding = encoding_for_model("gpt-4o")
|
|
70
|
+
|
|
71
|
+
def truncate_text(text: str) -> str:
|
|
72
|
+
# split into chunks of 1000 chars each:
|
|
73
|
+
parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
|
|
74
|
+
return render_formatted_text(
|
|
75
|
+
lambda x: "".join(x),
|
|
76
|
+
parts,
|
|
77
|
+
encoding,
|
|
78
|
+
token_limit,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
texts = [truncate_text(text) for text in texts]
|
|
82
|
+
|
|
65
83
|
if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
|
|
66
84
|
model_id = model.model_id
|
|
67
85
|
if not model_id:
|
|
@@ -88,7 +106,7 @@ class LLMIndexer(Indexer):
|
|
|
88
106
|
embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
|
|
89
107
|
else:
|
|
90
108
|
embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
|
|
91
|
-
|
|
109
|
+
|
|
92
110
|
embeddings = list([None] * len(texts))
|
|
93
111
|
uncached_texts = []
|
|
94
112
|
n = 0
|
|
@@ -36,6 +36,54 @@ def cosine_similarity(vector1, vector2) -> float:
|
|
|
36
36
|
class Indexer(BaseModel):
|
|
37
37
|
"""
|
|
38
38
|
An indexer operates on a collection in order to search for objects.
|
|
39
|
+
|
|
40
|
+
You should use a subclass of this; this can be looked up dynamically:
|
|
41
|
+
|
|
42
|
+
>>> from linkml_store.index import get_indexer
|
|
43
|
+
>>> indexer = get_indexer("simple")
|
|
44
|
+
|
|
45
|
+
You can customize how objects are indexed by passing in a text template.
|
|
46
|
+
For example, if your collection has objects with "name" and "profession" attributes,
|
|
47
|
+
you can index them as "{name} {profession}".
|
|
48
|
+
|
|
49
|
+
>>> indexer = get_indexer("simple", text_template="{name} :: {profession}")
|
|
50
|
+
|
|
51
|
+
By default, python fstrings are assumed.
|
|
52
|
+
|
|
53
|
+
We can test this works using the :ref:`object_to_text` method (normally
|
|
54
|
+
you would never need to call this directly, but it's useful for testing):
|
|
55
|
+
|
|
56
|
+
>>> obj = {"name": "John", "profession": "doctor"}
|
|
57
|
+
>>> indexer.object_to_text(obj)
|
|
58
|
+
'John :: doctor'
|
|
59
|
+
|
|
60
|
+
You can also use Jinja2 templates; this gives more flexibility and logic,
|
|
61
|
+
e.g. conditional formatting:
|
|
62
|
+
|
|
63
|
+
>>> tmpl = "{{name}}{% if profession %} :: {{profession}}{% endif %}"
|
|
64
|
+
>>> indexer = get_indexer("simple", text_template=tmpl, text_template_syntax=TemplateSyntaxEnum.jinja2)
|
|
65
|
+
>>> indexer.object_to_text(obj)
|
|
66
|
+
'John :: doctor'
|
|
67
|
+
>>> indexer.object_to_text({"name": "John"})
|
|
68
|
+
'John'
|
|
69
|
+
|
|
70
|
+
You can also specify which attributes to index:
|
|
71
|
+
|
|
72
|
+
>>> indexer = get_indexer("simple", index_attributes=["name"])
|
|
73
|
+
>>> indexer.object_to_text(obj)
|
|
74
|
+
'John'
|
|
75
|
+
|
|
76
|
+
The purpose of an indexer is to translate a collection of objects into a collection of objects
|
|
77
|
+
such as vectors for purposes such as search. Unless you are implementing your own indexer, you
|
|
78
|
+
generally don't need to use the methods that return vectors, but we can examine their behavior
|
|
79
|
+
to get a sense of how they work.
|
|
80
|
+
|
|
81
|
+
>>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
|
|
82
|
+
>>> assert cosine_similarity(vectors[0], vectors[1]) > cosine_similarity(vectors[0], vectors[2])
|
|
83
|
+
|
|
84
|
+
Note you should consult the documentation for the specific indexer you are using for more details on
|
|
85
|
+
how text is converted to vectors.
|
|
86
|
+
|
|
39
87
|
"""
|
|
40
88
|
|
|
41
89
|
name: Optional[str] = None
|
|
@@ -122,7 +170,9 @@ class Indexer(BaseModel):
|
|
|
122
170
|
self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
|
|
123
171
|
) -> List[Tuple[float, Any]]:
|
|
124
172
|
"""
|
|
125
|
-
|
|
173
|
+
Use the indexer to search against a database of vectors.
|
|
174
|
+
|
|
175
|
+
Note: this is a low-level method, typically you would use the :ref:`search` method on a :ref:`Collection`.
|
|
126
176
|
|
|
127
177
|
:param query: The query string to search for
|
|
128
178
|
:param vectors: A list of indexed items, where each item is a tuple of (id, vector)
|