linkml-store 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store has been flagged as possibly problematic by the registry diff tool.

Files changed (78)
  1. {linkml_store-0.1.12 → linkml_store-0.1.14}/PKG-INFO +30 -3
  2. {linkml_store-0.1.12 → linkml_store-0.1.14}/README.md +20 -1
  3. {linkml_store-0.1.12 → linkml_store-0.1.14}/pyproject.toml +11 -3
  4. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/client.py +37 -8
  5. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/collection.py +81 -9
  6. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/config.py +28 -1
  7. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/database.py +26 -3
  8. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +4 -0
  9. linkml_store-0.1.14/src/linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  10. linkml_store-0.1.14/src/linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  11. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/cli.py +140 -13
  12. linkml_store-0.1.14/src/linkml_store/graphs/graph_map.py +24 -0
  13. linkml_store-0.1.14/src/linkml_store/inference/__init__.py +13 -0
  14. linkml_store-0.1.14/src/linkml_store/inference/implementations/__init__.py +0 -0
  15. linkml_store-0.1.14/src/linkml_store/inference/implementations/rag_inference_engine.py +145 -0
  16. linkml_store-0.1.14/src/linkml_store/inference/implementations/rule_based_inference_engine.py +158 -0
  17. linkml_store-0.1.14/src/linkml_store/inference/implementations/sklearn_inference_engine.py +290 -0
  18. linkml_store-0.1.14/src/linkml_store/inference/inference_config.py +62 -0
  19. linkml_store-0.1.14/src/linkml_store/inference/inference_engine.py +173 -0
  20. linkml_store-0.1.14/src/linkml_store/inference/inference_engine_registry.py +74 -0
  21. linkml_store-0.1.14/src/linkml_store/utils/__init__.py +0 -0
  22. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/format_utils.py +21 -90
  23. linkml_store-0.1.14/src/linkml_store/utils/llm_utils.py +95 -0
  24. linkml_store-0.1.14/src/linkml_store/utils/neo4j_utils.py +42 -0
  25. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/object_utils.py +3 -1
  26. linkml_store-0.1.14/src/linkml_store/utils/pandas_utils.py +93 -0
  27. linkml_store-0.1.14/src/linkml_store/utils/sklearn_utils.py +193 -0
  28. linkml_store-0.1.14/src/linkml_store/utils/stats_utils.py +53 -0
  29. linkml_store-0.1.14/src/linkml_store/webapi/__init__.py +0 -0
  30. linkml_store-0.1.12/src/linkml_store/utils/pandas_utils.py +0 -40
  31. {linkml_store-0.1.12 → linkml_store-0.1.14}/LICENSE +0 -0
  32. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/__init__.py +0 -0
  33. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/__init__.py +0 -0
  34. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/queries.py +0 -0
  35. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/__init__.py +0 -0
  36. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  37. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  38. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  39. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  40. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
  41. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
  42. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  43. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  44. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  45. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
  46. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  47. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  48. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  49. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  50. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
  51. {linkml_store-0.1.12/src/linkml_store/index/implementations → linkml_store-0.1.14/src/linkml_store/api/stores/neo4j}/__init__.py +0 -0
  52. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  53. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  54. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  55. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  56. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/types.py +0 -0
  57. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/constants.py +0 -0
  58. {linkml_store-0.1.12/src/linkml_store/utils → linkml_store-0.1.14/src/linkml_store/graphs}/__init__.py +0 -0
  59. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/index/__init__.py +0 -0
  60. {linkml_store-0.1.12/src/linkml_store/webapi → linkml_store-0.1.14/src/linkml_store/index/implementations}/__init__.py +0 -0
  61. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
  62. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  63. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/index/indexer.py +0 -0
  64. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/change_utils.py +0 -0
  65. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/file_utils.py +0 -0
  66. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/io.py +0 -0
  67. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/mongodb_utils.py +0 -0
  68. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/patch_utils.py +0 -0
  69. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/query_utils.py +0 -0
  70. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/schema_utils.py +0 -0
  71. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/utils/sql_utils.py +0 -0
  72. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/webapi/html/__init__.py +0 -0
  73. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  74. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  75. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  76. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  77. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  78. {linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/webapi/main.py +0 -0
{linkml_store-0.1.12 → linkml_store-0.1.14}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.1.12
+Version: 0.1.14
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -21,14 +21,16 @@ Provides-Extra: h5py
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
+Provides-Extra: neo4j
 Provides-Extra: pyarrow
 Provides-Extra: renderer
+Provides-Extra: scipy
 Provides-Extra: tests
 Provides-Extra: validation
 Requires-Dist: black (>=24.0.0) ; extra == "tests"
 Requires-Dist: chromadb ; extra == "chromadb"
 Requires-Dist: click
-Requires-Dist: duckdb (>=0.10.1,<0.11.0)
+Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
@@ -41,15 +43,21 @@ Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
 Requires-Dist: llm ; extra == "llm"
 Requires-Dist: matplotlib ; extra == "analytics"
+Requires-Dist: neo4j ; extra == "neo4j"
+Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
 Requires-Dist: plotly ; extra == "analytics"
+Requires-Dist: py2neo ; extra == "neo4j"
 Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: scikit-learn ; extra == "scipy"
+Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
 Requires-Dist: sqlalchemy
 Requires-Dist: streamlit (>=1.32.2,<2.0.0) ; extra == "app"
+Requires-Dist: tiktoken ; extra == "llm"
 Requires-Dist: uvicorn ; extra == "fastapi"
 Description-Content-Type: text/markdown
 
@@ -57,7 +65,7 @@ Description-Content-Type: text/markdown
 
 An AI-ready data management and integration platform. LinkML-Store
 provides an abstraction layer over multiple different backends
-(including DuckDB, MongoDB, and local filesystems), allowing for
+(including DuckDB, MongoDB, Neo4j, and local filesystems), allowing for
 common query, index, and storage operations.
 
 For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
@@ -95,6 +103,23 @@ linkml-store -d duckdb:///db/my.db -c persons validate
 * API
 * Streamlit applications
 
+## The CRUDSI pattern
+
+Most database APIs implement the **CRUD** pattern: Create, Read, Update, Delete.
+LinkML-Store adds **Search** and **Inference** to this pattern, making it **CRUDSI**.
+
+The notion of "Search" and "Inference" is intended to be flexible and extensible,
+including:
+
+* Search
+    * Traditional keyword search
+    * Search using LLM Vector embeddings (*without* a dedicated vector database)
+    * Pluggable specialized search, e.g. genomic sequence (not yet implemented)
+* Inference (encompassing *validation*, *repair*, and inference of missing data)
+    * Classic rule-based inference
+    * Inference using LLM Retrieval Augmented Generation (RAG)
+    * Statistical/ML inference
+
 ## Features
 
 ### Multiple Adapters
@@ -104,6 +129,8 @@ LinkML-Store is designed to work with multiple backends, giving a common abstrac
 * [MongoDB](https://linkml.io/linkml-store/how-to/Use-MongoDB.html)
 * [DuckDB](https://linkml.io/linkml-store/tutorials/Python-Tutorial.html)
 * [Solr](https://linkml.io/linkml-store/how-to/Query-Solr-using-CLI.html)
+* [Neo4j](https://linkml.io/linkml-store/how-to/Use-Neo4j.html)
+
 * Filesystem
 
 Coming soon: any RDBMS, any triplestore, Neo4J, HDF5-based stores, ChromaDB/Vector dbs ...

{linkml_store-0.1.12 → linkml_store-0.1.14}/README.md

@@ -2,7 +2,7 @@
 
 An AI-ready data management and integration platform. LinkML-Store
 provides an abstraction layer over multiple different backends
-(including DuckDB, MongoDB, and local filesystems), allowing for
+(including DuckDB, MongoDB, Neo4j, and local filesystems), allowing for
 common query, index, and storage operations.
 
 For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
@@ -40,6 +40,23 @@ linkml-store -d duckdb:///db/my.db -c persons validate
 * API
 * Streamlit applications
 
+## The CRUDSI pattern
+
+Most database APIs implement the **CRUD** pattern: Create, Read, Update, Delete.
+LinkML-Store adds **Search** and **Inference** to this pattern, making it **CRUDSI**.
+
+The notion of "Search" and "Inference" is intended to be flexible and extensible,
+including:
+
+* Search
+    * Traditional keyword search
+    * Search using LLM Vector embeddings (*without* a dedicated vector database)
+    * Pluggable specialized search, e.g. genomic sequence (not yet implemented)
+* Inference (encompassing *validation*, *repair*, and inference of missing data)
+    * Classic rule-based inference
+    * Inference using LLM Retrieval Augmented Generation (RAG)
+    * Statistical/ML inference
+
 ## Features
 
 ### Multiple Adapters
@@ -49,6 +66,8 @@ LinkML-Store is designed to work with multiple backends, giving a common abstrac
 * [MongoDB](https://linkml.io/linkml-store/how-to/Use-MongoDB.html)
 * [DuckDB](https://linkml.io/linkml-store/tutorials/Python-Tutorial.html)
 * [Solr](https://linkml.io/linkml-store/how-to/Query-Solr-using-CLI.html)
+* [Neo4j](https://linkml.io/linkml-store/how-to/Use-Neo4j.html)
+
 * Filesystem
 
 Coming soon: any RDBMS, any triplestore, Neo4J, HDF5-based stores, ChromaDB/Vector dbs ...
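The CRUDSI additions above map onto the Python client API roughly as follows; the new `linkml_store.inference` package added in this release (see the file list) supplies the Inference leg. A minimal sketch, assuming an in-memory DuckDB backend as in the doctests later in this diff; the collection name and objects are illustrative, and `search` assumes the bundled simple keyword indexer:

```python
from linkml_store.api.client import Client

client = Client()
db = client.attach_database("duckdb", alias="test")          # in-memory DuckDB
collection = db.create_collection("Person", alias="persons")

# Create / Read
collection.insert([{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}])
qr = collection.find({"name": "Alice"})
print(qr.num_rows)

# Search: attach a keyword indexer, then rank stored objects against a query
collection.attach_indexer("simple")
for score, obj in collection.search("alice").ranked_rows:
    print(score, obj)
```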
{linkml_store-0.1.12 → linkml_store-0.1.14}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.1.12"
+version = "0.1.14"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
@@ -13,7 +13,7 @@ pydantic = "^2.0.0"
 linkml-runtime = ">=1.8.0"
 streamlit = { version = "^1.32.2", optional = true }
 sqlalchemy = "*"
-duckdb = "^0.10.1"
+duckdb = ">=0.10.1"
 duckdb-engine = ">=0.11.2"
 matplotlib = { version = "*", optional = true }
 seaborn = { version = "*", optional = true }
@@ -21,10 +21,16 @@ plotly = { version = "*", optional = true }
 pystow = "^0.5.4"
 black = { version=">=24.0.0", optional = true }
 llm = { version="*", optional = true }
+tiktoken = { version="*", optional = true }
 pymongo = { version="*", optional = true }
+neo4j = { version="*", optional = true }
+py2neo = { version="*", optional = true }
+networkx = { version="*", optional = true }
 chromadb = { version="*", optional = true }
 pyarrow = { version="*", optional = true }
 h5py = { version="*", optional = true }
+scipy = { version="*", optional = true }
+scikit-learn = { version="*", optional = true }
 linkml = { version=">=1.8.0", optional = true }
 linkml_map = { version="*", optional = true }
 linkml_renderer = { version="*", optional = true }
@@ -64,8 +70,9 @@ numpy = [
 analytics = ["pandas", "matplotlib", "seaborn", "plotly"]
 app = ["streamlit"]
 tests = ["black"]
-llm = ["llm"]
+llm = ["llm", "tiktoken"]
 mongodb = ["pymongo"]
+neo4j = ["neo4j", "py2neo", "networkx"]
 chromadb = ["chromadb"]
 h5py = ["h5py"]
 pyarrow = ["pyarrow"]
@@ -74,6 +81,7 @@ map = ["linkml_map"]
 renderer = ["linkml_renderer"]
 fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
+scipy = ["scipy", "scikit-learn"]
 
 [tool.poetry.scripts]
 linkml-store = "linkml_store.cli:cli"
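The new extras are installable via pip, e.g. `pip install "linkml-store[neo4j]"` for the graph backend or `pip install "linkml-store[scipy]"` for the scipy/scikit-learn stack used by the new sklearn inference engine. Note also that the `llm` extra now additionally pulls in `tiktoken`, presumably for token counting in the new `llm_utils` module.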
{linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/client.py

@@ -11,6 +11,7 @@ from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
 from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
 from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
 from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
+from linkml_store.api.stores.neo4j.neo4j_database import Neo4jDatabase
 from linkml_store.api.stores.solr.solr_database import SolrDatabase
 
 logger = logging.getLogger(__name__)
@@ -21,6 +22,7 @@ HANDLE_MAP = {
     "solr": SolrDatabase,
     "mongodb": MongoDBDatabase,
     "chromadb": ChromaDBDatabase,
+    "neo4j": Neo4jDatabase,
     "file": FileSystemDatabase,
 }
 
@@ -98,7 +100,7 @@ class Client:
         """
         return self.metadata.base_dir
 
-    def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, **kwargs):
+    def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, auto_attach=False, **kwargs):
         """
         Create a client from a configuration.
 
@@ -107,6 +109,10 @@ class Client:
         >>> from linkml_store.api.config import ClientConfig
         >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}))
         >>> len(client.databases)
+        0
+        >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}),
+        ...                               auto_attach=True)
+        >>> len(client.databases)
         1
         >>> "test" in client.databases
         True
@@ -114,6 +120,8 @@ class Client:
         'duckdb:///:memory:'
 
         :param config:
+        :param base_dir:
+        :param auto_attach:
         :param kwargs:
         :return:
 
@@ -123,17 +131,17 @@ class Client:
         if isinstance(config, Path):
             config = str(config)
         if isinstance(config, str):
-            # if not base_dir:
-            #     base_dir = Path(config).parent
+            if not base_dir:
+                base_dir = Path(config).parent
             parsed_obj = yaml.safe_load(open(config))
             config = ClientConfig(**parsed_obj)
         self.metadata = config
         if base_dir:
             self.metadata.base_dir = base_dir
-        self._initialize_databases(**kwargs)
+        self._initialize_databases(auto_attach=auto_attach, **kwargs)
         return self
 
-    def _initialize_databases(self, **kwargs):
+    def _initialize_databases(self, auto_attach=False, **kwargs):
         for name, db_config in self.metadata.databases.items():
             base_dir = self.base_dir
             logger.info(f"Initializing database: {name}, base_dir: {base_dir}")
@@ -144,8 +152,22 @@
                 db_config.handle = handle
             if db_config.schema_location:
                 db_config.schema_location = db_config.schema_location.format(base_dir=base_dir)
-            db = self.attach_database(handle, alias=name, **kwargs)
-            db.from_config(db_config)
+            if auto_attach:
+                db = self.attach_database(handle, alias=name, **kwargs)
+                db.from_config(db_config)
+
+    def _set_database_config(self, db: Database):
+        """
+        Set the configuration for a database.
+
+        :param name:
+        :param config:
+        :return:
+        """
+        if not self.metadata:
+            return
+        if db.alias in self.metadata.databases:
+            db.from_config(self.metadata.databases[db.alias])
 
     def attach_database(
         self,
@@ -200,6 +222,7 @@ class Client:
                 raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}")
             else:
                 db.metadata.alias = alias
+        self._set_database_config(db)
         return db
 
     def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
@@ -228,13 +251,19 @@
             return list(self._databases.values())[0]
         if not self._databases:
             self._databases = {}
+        if name not in self._databases and name in self.metadata.databases:
+            db_config = self.metadata.databases[name]
+            db = self.attach_database(db_config.handle, alias=name, **kwargs)
+            self._databases[name] = db
         if name not in self._databases:
             if create_if_not_exists:
                 logger.info(f"Creating database: {name}")
                 self.attach_database(name, **kwargs)
             else:
                 raise ValueError(f"Database {name} does not exist")
-        return self._databases[name]
+        db = self._databases[name]
+        self._set_database_config(db)
+        return db
 
     @property
     def databases(self) -> Dict[str, Database]:
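Net effect of the client changes: configuration no longer eagerly attaches every database; `get_database` now attaches a configured database on first access, and `_set_database_config` re-applies the configured collections to it. A minimal sketch of the two modes, using the in-memory DuckDB handle from the doctest above:

```python
from linkml_store.api.client import Client
from linkml_store.api.config import ClientConfig

config = ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}})

# Lazy (new default): nothing is attached at load time
client = Client().from_config(config)
assert len(client.databases) == 0
db = client.get_database("test")   # attached on demand from the config
assert "test" in client.databases

# Eager: restore the old behavior explicitly
eager = Client().from_config(config, auto_attach=True)
assert len(eager.databases) == 1
```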
{linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/collection.py

@@ -4,7 +4,21 @@ import hashlib
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Dict, Generic, Iterator, List, Optional, TextIO, Tuple, Type, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ClassVar,
+    Dict,
+    Generic,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    TextIO,
+    Tuple,
+    Type,
+    Union,
+)
 
 import numpy as np
 from linkml_runtime import SchemaView
@@ -202,6 +216,12 @@ class Collection(Generic[DatabaseType]):
             self._materialize_derivations()
         self._initialized = True
 
+    def _pre_insert_hook(self, objs: List[OBJECT], **kwargs):
+        if self.metadata.validate_modifications:
+            errors = list(self.iter_validate_collection(objs))
+            if errors:
+                raise ValueError(f"Validation errors: {errors}")
+
     def _post_insert_hook(self, objs: List[OBJECT], **kwargs):
         self._initialized = True
         patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
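This `_pre_insert_hook` is the enforcement point for the new `validate_modifications` flag on `CollectionConfig` (added later in this diff): when the flag is set, each insert batch is validated up front and rejected wholesale on any error. A hypothetical sketch, assuming the hook is invoked by `insert`, that a schema is loaded via a `set_schema_view`-style call, and that the schema path is illustrative:

```python
from linkml_store.api.client import Client

client = Client()
db = client.attach_database("duckdb", alias="test")
db.set_schema_view("tests/input/personinfo.yaml")   # assumed loader + schema path
collection = db.create_collection("Person", alias="persons")
collection.metadata.validate_modifications = True

try:
    collection.insert([{"id": "P1", "age": "not-a-number"}])  # violates Person
except ValueError as e:
    print(f"Rejected batch: {e}")
```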
@@ -482,6 +502,7 @@ class Collection(Generic[DatabaseType]):
             index_name = self.default_index_name
         ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
         if index_name not in self.indexers:
+            logger.debug(f"Indexer not found: {index_name} -- creating")
             ix = get_indexer(index_name)
             if not self._indexers:
                 self._indexers = {}
@@ -489,6 +510,11 @@
         ix = self.indexers.get(index_name)
         if not ix:
             raise ValueError(f"No index named {index_name}")
+        logger.debug(f"Using indexer {type(ix)} with name {index_name}")
+        if ix_coll.size() == 0:
+            logger.info(f"Index {index_name} is empty; indexing all objects")
+            all_objs = self.find(limit=-1).rows
+            self.index_objects(all_objs, index_name, replace=True, **kwargs)
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
         # TODO: optimize this for large indexes
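The practical effect of the auto-indexing change: searching against a never-populated index no longer silently returns nothing; the first `search` call indexes whatever is already in the collection. A minimal sketch (same illustrative setup as the earlier CRUDSI example):

```python
from linkml_store.api.client import Client

db = Client().attach_database("duckdb", alias="test")
collection = db.create_collection("Person", alias="persons")
collection.insert([{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}])

collection.attach_indexer("simple", auto_index=False)  # index collection starts empty
qr = collection.search("alice")   # first search sees size() == 0 and indexes all rows
print(qr.num_rows)
```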
@@ -498,6 +524,7 @@
                 del r[1][index_col]
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
+        new_qr.rows = [r[1] for r in results]
         return new_qr
 
     @property
@@ -542,6 +569,7 @@
                 format=source.format,
                 expected_type=source.expected_type,
                 compression=source.compression,
+                select_query=source.select_query,
                 **kwargs,
             )
         elif metadata.source.url:
@@ -550,9 +578,12 @@
                 format=source.format,
                 expected_type=source.expected_type,
                 compression=source.compression,
+                select_query=source.select_query,
                 **kwargs,
             )
-        self.insert(objects)
+        else:
+            raise ValueError("No source local_path or url provided")
+        self.insert(objects)
 
     def _check_if_initialized(self) -> bool:
         return self._initialized
@@ -609,6 +640,14 @@
         self.insert(tr_objs)
         self.commit()
 
+    def size(self) -> int:
+        """
+        Return the number of objects in the collection.
+
+        :return: The number of objects in the collection.
+        """
+        return self.find({}, limit=1).num_rows
+
     def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
         """
         Attach an index to the collection.
@@ -757,6 +796,8 @@
         sv: SchemaView = self.parent.schema_view
         if sv:
             cls = sv.get_class(self.target_class_name)
+            # if not cls:
+            #     logger.warning(f"{self.target_class_name} not in {sv.all_classes().keys()} ")
             # cls = sv.schema.classes[self.target_class_name]
             if cls and not cls.attributes:
                 if not sv.class_induced_slots(cls.name):
@@ -880,11 +921,14 @@
                     exact_dimensions_list.append(v.shape)
                     break
             if isinstance(v, list):
+                # sample first item. TODO: more robust strategy
                 v = v[0] if v else None
                 multivalueds.append(True)
             elif isinstance(v, dict):
-                v = list(v.values())[0]
-                multivalueds.append(True)
+                pass
+                # TODO: check if this is a nested object or key-value list
+                # v = list(v.values())[0]
+                # multivalueds.append(True)
             else:
                 multivalueds.append(False)
             if not v:
@@ -913,10 +957,21 @@
             # raise AssertionError(f"Empty rngs for {k} = {vs}")
             rng = rngs[0] if rngs else None
             for other_rng in rngs:
+                coercions = {
+                    ("integer", "float"): "float",
+                }
                 if rng != other_rng:
-                    raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
+                    if (rng, other_rng) in coercions:
+                        rng = coercions[(rng, other_rng)]
+                    elif (other_rng, rng) in coercions:
+                        rng = coercions[(other_rng, rng)]
+                    else:
+                        raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
             logger.debug(f"Inducing {k} as {rng} {multivalued} {inlined}")
-            cd.attributes[k] = SlotDefinition(k, range=rng, multivalued=multivalued, inlined=inlined)
+            inlined_as_list = inlined and multivalued
+            cd.attributes[k] = SlotDefinition(
+                k, range=rng, multivalued=multivalued, inlined=inlined, inlined_as_list=inlined_as_list
+            )
             if exact_dimensions_list:
                 array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
                 cd.attributes[k].array = array_expr
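Two schema-induction fixes land in this hunk: multivalued inlined slots are now emitted with `inlined_as_list`, and mixed numeric ranges are reconciled through a small coercion table instead of raising immediately. A standalone sketch of the coercion logic (it mirrors the code above but is not itself the library API):

```python
# integer/float promotion, as registered in the coercion table above
COERCIONS = {("integer", "float"): "float"}


def resolve_range(rngs: list) -> str:
    """Pick a single LinkML range from the per-value candidates for a slot."""
    rng = rngs[0] if rngs else None
    for other_rng in rngs:
        if rng != other_rng:
            if (rng, other_rng) in COERCIONS:
                rng = COERCIONS[(rng, other_rng)]
            elif (other_rng, rng) in COERCIONS:
                rng = COERCIONS[(other_rng, rng)]
            else:
                raise ValueError(f"Conflict: {rng} != {other_rng}")
    return rng


assert resolve_range(["integer", "float", "integer"]) == "float"
```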
@@ -978,11 +1033,14 @@
             patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
         return patches_from_objects_lists(src_objs, tgt_objs, primary_key=primary_key)
 
-    def iter_validate_collection(self, **kwargs) -> Iterator["ValidationResult"]:
+    def iter_validate_collection(
+        self, objects: Optional[Iterable[OBJECT]] = None, **kwargs
+    ) -> Iterator["ValidationResult"]:
         """
         Validate the contents of the collection
 
         :param kwargs:
+        :param objects: objects to validate
         :return: iterator over validation results
         """
         from linkml.validator import JsonschemaValidationPlugin, Validator
@@ -992,10 +1050,24 @@
         cd = self.class_definition()
         if not cd:
             raise ValueError(f"Cannot find class definition for {self.target_class_name}")
+        type_designator = None
+        for att in self.parent.schema_view.class_induced_slots(cd.name):
+            if att.designates_type:
+                type_designator = att.name
         class_name = cd.name
-        for obj in self.find_iter(**kwargs):
+        if objects is None:
+            objects = self.find_iter(**kwargs)
+        for obj in objects:
             obj = clean_empties(obj)
-            yield from validator.iter_results(obj, class_name)
+            v_class_name = class_name
+            if type_designator is not None:
+                # TODO: move type designator logic to core linkml
+                this_class_name = obj.get(type_designator)
+                if this_class_name:
+                    if ":" in this_class_name:
+                        this_class_name = this_class_name.split(":")[-1]
+                    v_class_name = this_class_name
+            yield from validator.iter_results(obj, v_class_name)
 
     def commit(self):
         """
{linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/config.py

@@ -1,7 +1,9 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field
 
+from linkml_store.graphs.graph_map import EdgeProjection, NodeProjection
+
 
 class ConfiguredBaseModel(BaseModel, extra="forbid"):
     """
@@ -28,13 +30,30 @@ class CollectionSource(ConfiguredBaseModel):
     """
 
     url: Optional[str] = None
+    """Remote URL to fetch data from"""
+
     local_path: Optional[str] = None
+    """Local path to fetch data from"""
+
     source_location: Optional[str] = None
+
     refresh_interval_days: Optional[float] = None
+    """How often to refresh the data, in days"""
+
     expected_type: Optional[str] = None
+    """The expected type of the data, e.g. list"""
+
     format: Optional[str] = None
+    """The format of the data, e.g., json, yaml, csv"""
+
     compression: Optional[str] = None
+    """The compression of the data, e.g., tgz, gzip, zip"""
+
+    select_query: Optional[str] = None
+    """A jsonpath query to preprocess the objects with"""
+
     arguments: Optional[Dict[str, Any]] = None
+    """Optional arguments to pass to the source"""
 
 
 class CollectionConfig(ConfiguredBaseModel):
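A hedged sketch of a source using the new `select_query` field (the URL and jsonpath expression are illustrative; per the docstring, the query is applied to the loaded objects before they are inserted):

```python
from linkml_store.api.config import CollectionSource

source = CollectionSource(
    url="https://example.org/genes.json",  # illustrative URL
    format="json",
    select_query="$.results[*]",           # keep only the objects under `results`
)
# wired into a collection via CollectionConfig(source=source, ...)
```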
@@ -79,6 +98,14 @@ class CollectionConfig(ConfiguredBaseModel):
         description="LinkML-Map derivations",
     )
     page_size: Optional[int] = Field(default=None, description="Suggested page size (items per page) in apps and APIs")
+    graph_projection: Optional[Union[EdgeProjection, NodeProjection]] = Field(
+        default=None,
+        description="Optional graph projection configuration",
+    )
+    validate_modifications: Optional[bool] = Field(
+        default=False,
+        description="Whether to validate inserts, updates, and deletes",
+    )
 
 
 class DatabaseConfig(ConfiguredBaseModel):

{linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/database.py

@@ -505,8 +505,10 @@ class Database(ABC, Generic[CollectionType]):
         if isinstance(schema_view, str):
             schema_view = SchemaView(schema_view)
         self._schema_view = schema_view
+        # self._schema_view = SchemaView(schema_view.materialize_derived_schema())
         if not self._collections:
             return
+
         # align with induced schema
         roots = [c for c in schema_view.all_classes().values() if c.tree_root]
         if len(roots) == 0:
@@ -705,12 +707,29 @@
         """
         raise NotImplementedError()
 
-    def import_database(self, location: str, source_format: Optional[Union[str, Format]] = None, **kwargs):
+    def import_database(
+        self,
+        location: str,
+        source_format: Optional[Union[str, Format]] = None,
+        collection_name: Optional[str] = None,
+        **kwargs,
+    ):
         """
         Import a database from a file or location.
 
+        >>> from linkml_store.api.client import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
+        >>> db.list_collection_names()
+        ['iris']
+        >>> collection = db.get_collection("iris")
+        >>> collection.find({}).num_rows
+        150
+
         :param location: location of the file
         :param source_format: source format
+        :param collection_name: (Optional) name of the collection, for data that is flat
         :param kwargs: additional arguments
         """
         if isinstance(source_format, str):
@@ -730,8 +749,12 @@
             self.store(obj)
             return
         objects = load_objects(location, format=source_format)
-        for obj in objects:
-            self.store(obj)
+        if collection_name:
+            collection = self.get_collection(collection_name, create_if_not_exists=True)
+            collection.insert(objects)
+        else:
+            for obj in objects:
+                self.store(obj)
 
     def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
         """

{linkml_store-0.1.12 → linkml_store-0.1.14}/src/linkml_store/api/stores/mongodb/mongodb_collection.py

@@ -51,9 +51,13 @@ class MongoDBCollection(Collection):
         if offset and offset >= 0:
             cursor = cursor.skip(offset)
 
+        select_cols = query.select_cols
+
        def _as_row(row: dict):
             row = copy(row)
             del row["_id"]
+            if select_cols:
+                row = {k: row[k] for k in select_cols if k in row}
             return row
 
         rows = [_as_row(row) for row in cursor]
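The MongoDB adapter now honors column projection from the query object: `_id` is always dropped, and when `select_cols` is set only those keys are kept, in the order given. A standalone sketch of the same row-shaping logic (not the adapter itself):

```python
from copy import copy


def as_row(row: dict, select_cols=None) -> dict:
    """Mirror of the projection added above, for illustration only."""
    row = copy(row)
    del row["_id"]                 # internal MongoDB id never leaks into results
    if select_cols:
        row = {k: row[k] for k in select_cols if k in row}
    return row


doc = {"_id": "abc123", "name": "Alice", "age": 30, "email": "a@example.org"}
print(as_row(doc, select_cols=["name", "age"]))  # {'name': 'Alice', 'age': 30}
```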