linkml-store 0.1.13__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic. Click here for more details.

Files changed (78)
  1. {linkml_store-0.1.13 → linkml_store-0.1.14}/PKG-INFO +25 -2
  2. {linkml_store-0.1.13 → linkml_store-0.1.14}/README.md +20 -1
  3. {linkml_store-0.1.13 → linkml_store-0.1.14}/pyproject.toml +6 -2
  4. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/client.py +35 -8
  5. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/collection.py +40 -5
  6. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/config.py +20 -3
  7. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/database.py +24 -3
  8. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +4 -0
  9. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/cli.py +140 -13
  10. linkml_store-0.1.14/src/linkml_store/inference/__init__.py +13 -0
  11. linkml_store-0.1.14/src/linkml_store/inference/implementations/rag_inference_engine.py +145 -0
  12. linkml_store-0.1.14/src/linkml_store/inference/implementations/rule_based_inference_engine.py +158 -0
  13. linkml_store-0.1.14/src/linkml_store/inference/implementations/sklearn_inference_engine.py +290 -0
  14. linkml_store-0.1.14/src/linkml_store/inference/inference_config.py +62 -0
  15. linkml_store-0.1.14/src/linkml_store/inference/inference_engine.py +173 -0
  16. linkml_store-0.1.14/src/linkml_store/inference/inference_engine_registry.py +74 -0
  17. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/format_utils.py +21 -90
  18. linkml_store-0.1.14/src/linkml_store/utils/llm_utils.py +95 -0
  19. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/object_utils.py +3 -1
  20. linkml_store-0.1.14/src/linkml_store/utils/pandas_utils.py +93 -0
  21. linkml_store-0.1.14/src/linkml_store/utils/sklearn_utils.py +193 -0
  22. linkml_store-0.1.14/src/linkml_store/utils/stats_utils.py +53 -0
  23. linkml_store-0.1.14/src/linkml_store/webapi/__init__.py +0 -0
  24. linkml_store-0.1.13/src/linkml_store/utils/pandas_utils.py +0 -40
  25. {linkml_store-0.1.13 → linkml_store-0.1.14}/LICENSE +0 -0
  26. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/__init__.py +0 -0
  27. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/__init__.py +0 -0
  28. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/queries.py +0 -0
  29. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/__init__.py +0 -0
  30. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  31. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  32. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  33. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  34. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
  35. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
  36. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  37. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  38. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  39. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
  40. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  41. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  42. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  43. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  44. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
  45. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  46. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  47. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
  48. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  49. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  50. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  51. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  52. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/api/types.py +0 -0
  53. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/constants.py +0 -0
  54. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/graphs/__init__.py +0 -0
  55. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/graphs/graph_map.py +0 -0
  56. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/index/__init__.py +0 -0
  57. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/index/implementations/__init__.py +0 -0
  58. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
  59. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  60. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/index/indexer.py +0 -0
  61. {linkml_store-0.1.13/src/linkml_store/utils → linkml_store-0.1.14/src/linkml_store/inference/implementations}/__init__.py +0 -0
  62. {linkml_store-0.1.13/src/linkml_store/webapi → linkml_store-0.1.14/src/linkml_store/utils}/__init__.py +0 -0
  63. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/change_utils.py +0 -0
  64. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/file_utils.py +0 -0
  65. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/io.py +0 -0
  66. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/mongodb_utils.py +0 -0
  67. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/neo4j_utils.py +0 -0
  68. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/patch_utils.py +0 -0
  69. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/query_utils.py +0 -0
  70. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/schema_utils.py +0 -0
  71. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/utils/sql_utils.py +0 -0
  72. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/webapi/html/__init__.py +0 -0
  73. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  74. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  75. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  76. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  77. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  78. {linkml_store-0.1.13 → linkml_store-0.1.14}/src/linkml_store/webapi/main.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: linkml-store
3
- Version: 0.1.13
3
+ Version: 0.1.14
4
4
  Summary: linkml-store
5
5
  License: MIT
6
6
  Author: Author 1
@@ -24,6 +24,7 @@ Provides-Extra: mongodb
24
24
  Provides-Extra: neo4j
25
25
  Provides-Extra: pyarrow
26
26
  Provides-Extra: renderer
27
+ Provides-Extra: scipy
27
28
  Provides-Extra: tests
28
29
  Provides-Extra: validation
29
30
  Requires-Dist: black (>=24.0.0) ; extra == "tests"
@@ -51,9 +52,12 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
51
52
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
52
53
  Requires-Dist: pymongo ; extra == "mongodb"
53
54
  Requires-Dist: pystow (>=0.5.4,<0.6.0)
55
+ Requires-Dist: scikit-learn ; extra == "scipy"
56
+ Requires-Dist: scipy ; extra == "scipy"
54
57
  Requires-Dist: seaborn ; extra == "analytics"
55
58
  Requires-Dist: sqlalchemy
56
59
  Requires-Dist: streamlit (>=1.32.2,<2.0.0) ; extra == "app"
60
+ Requires-Dist: tiktoken ; extra == "llm"
57
61
  Requires-Dist: uvicorn ; extra == "fastapi"
58
62
  Description-Content-Type: text/markdown
59
63
 
@@ -61,7 +65,7 @@ Description-Content-Type: text/markdown
61
65
 
62
66
  An AI-ready data management and integration platform. LinkML-Store
63
67
  provides an abstraction layer over multiple different backends
64
- (including DuckDB, MongoDB, and local filesystems), allowing for
68
+ (including DuckDB, MongoDB, Neo4j, and local filesystems), allowing for
65
69
  common query, index, and storage operations.
66
70
 
67
71
  For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
@@ -99,6 +103,23 @@ linkml-store -d duckdb:///db/my.db -c persons validate
99
103
  * API
100
104
  * Streamlit applications
101
105
 
106
+ ## The CRUDSI pattern
107
+
108
+ Most database APIs implement the **CRUD** pattern: Create, Read, Update, Delete.
109
+ LinkML-Store adds **Search** and **Inference** to this pattern, making it **CRUDSI**.
110
+
111
+ The notion of "Search" and "Inference" is intended to be flexible and extensible,
112
+ including:
113
+
114
+ * Search
115
+ * Traditional keyword search
116
+ * Search using LLM Vector embeddings (*without* a dedicated vector database)
117
+ * Pluggable specialized search, e.g. genomic sequence (not yet implemented)
118
+ * Inference (encompassing *validation*, *repair*, and inference of missing data)
119
+ * Classic rule-based inference
120
+ * Inference using LLM Retrieval Augmented Generation (RAG)
121
+ * Statistical/ML inference
122
+
102
123
  ## Features
103
124
 
104
125
  ### Multiple Adapters
@@ -108,6 +129,8 @@ LinkML-Store is designed to work with multiple backends, giving a common abstrac
108
129
  * [MongoDB](https://linkml.io/linkml-store/how-to/Use-MongoDB.html)
109
130
  * [DuckDB](https://linkml.io/linkml-store/tutorials/Python-Tutorial.html)
110
131
  * [Solr](https://linkml.io/linkml-store/how-to/Query-Solr-using-CLI.html)
132
+ * [Neo4j](https://linkml.io/linkml-store/how-to/Use-Neo4j.html)
133
+
111
134
  * Filesystem
112
135
 
113
136
  Coming soon: any RDBMS, any triplestore, Neo4J, HDF5-based stores, ChromaDB/Vector dbs ...
@@ -2,7 +2,7 @@
2
2
 
3
3
  An AI-ready data management and integration platform. LinkML-Store
4
4
  provides an abstraction layer over multiple different backends
5
- (including DuckDB, MongoDB, and local filesystems), allowing for
5
+ (including DuckDB, MongoDB, Neo4j, and local filesystems), allowing for
6
6
  common query, index, and storage operations.
7
7
 
8
8
  For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
@@ -40,6 +40,23 @@ linkml-store -d duckdb:///db/my.db -c persons validate
40
40
  * API
41
41
  * Streamlit applications
42
42
 
43
+ ## The CRUDSI pattern
44
+
45
+ Most database APIs implement the **CRUD** pattern: Create, Read, Update, Delete.
46
+ LinkML-Store adds **Search** and **Inference** to this pattern, making it **CRUDSI**.
47
+
48
+ The notion of "Search" and "Inference" is intended to be flexible and extensible,
49
+ including:
50
+
51
+ * Search
52
+ * Traditional keyword search
53
+ * Search using LLM Vector embeddings (*without* a dedicated vector database)
54
+ * Pluggable specialized search, e.g. genomic sequence (not yet implemented)
55
+ * Inference (encompassing *validation*, *repair*, and inference of missing data)
56
+ * Classic rule-based inference
57
+ * Inference using LLM Retrieval Augmented Generation (RAG)
58
+ * Statistical/ML inference
59
+
43
60
  ## Features
44
61
 
45
62
  ### Multiple Adapters
@@ -49,6 +66,8 @@ LinkML-Store is designed to work with multiple backends, giving a common abstrac
49
66
  * [MongoDB](https://linkml.io/linkml-store/how-to/Use-MongoDB.html)
50
67
  * [DuckDB](https://linkml.io/linkml-store/tutorials/Python-Tutorial.html)
51
68
  * [Solr](https://linkml.io/linkml-store/how-to/Query-Solr-using-CLI.html)
69
+ * [Neo4j](https://linkml.io/linkml-store/how-to/Use-Neo4j.html)
70
+
52
71
  * Filesystem
53
72
 
54
73
  Coming soon: any RDBMS, any triplestore, Neo4J, HDF5-based stores, ChromaDB/Vector dbs ...
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "linkml-store"
3
- version = "0.1.13"
3
+ version = "0.1.14"
4
4
  description = "linkml-store"
5
5
  authors = ["Author 1 <author@org.org>"]
6
6
  license = "MIT"
@@ -21,6 +21,7 @@ plotly = { version = "*", optional = true }
21
21
  pystow = "^0.5.4"
22
22
  black = { version=">=24.0.0", optional = true }
23
23
  llm = { version="*", optional = true }
24
+ tiktoken = { version="*", optional = true }
24
25
  pymongo = { version="*", optional = true }
25
26
  neo4j = { version="*", optional = true }
26
27
  py2neo = { version="*", optional = true }
@@ -28,6 +29,8 @@ networkx = { version="*", optional = true }
28
29
  chromadb = { version="*", optional = true }
29
30
  pyarrow = { version="*", optional = true }
30
31
  h5py = { version="*", optional = true }
32
+ scipy = { version="*", optional = true }
33
+ scikit-learn = { version="*", optional = true }
31
34
  linkml = { version=">=1.8.0", optional = true }
32
35
  linkml_map = { version="*", optional = true }
33
36
  linkml_renderer = { version="*", optional = true }
@@ -67,7 +70,7 @@ numpy = [
67
70
  analytics = ["pandas", "matplotlib", "seaborn", "plotly"]
68
71
  app = ["streamlit"]
69
72
  tests = ["black"]
70
- llm = ["llm"]
73
+ llm = ["llm", "tiktoken"]
71
74
  mongodb = ["pymongo"]
72
75
  neo4j = ["neo4j", "py2neo", "networkx"]
73
76
  chromadb = ["chromadb"]
@@ -78,6 +81,7 @@ map = ["linkml_map"]
78
81
  renderer = ["linkml_renderer"]
79
82
  fastapi = ["fastapi", "uvicorn"]
80
83
  frictionless = ["frictionless"]
84
+ scipy = ["scipy", "scikit-learn"]
81
85
 
82
86
  [tool.poetry.scripts]
83
87
  linkml-store = "linkml_store.cli:cli"
@@ -100,7 +100,7 @@ class Client:
100
100
  """
101
101
  return self.metadata.base_dir
102
102
 
103
- def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, **kwargs):
103
+ def from_config(self, config: Union[ClientConfig, dict, str, Path], base_dir=None, auto_attach=False, **kwargs):
104
104
  """
105
105
  Create a client from a configuration.
106
106
 
@@ -109,6 +109,10 @@ class Client:
109
109
  >>> from linkml_store.api.config import ClientConfig
110
110
  >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}))
111
111
  >>> len(client.databases)
112
+ 0
113
+ >>> client = Client().from_config(ClientConfig(databases={"test": {"handle": "duckdb:///:memory:"}}),
114
+ ... auto_attach=True)
115
+ >>> len(client.databases)
112
116
  1
113
117
  >>> "test" in client.databases
114
118
  True
@@ -116,6 +120,8 @@ class Client:
116
120
  'duckdb:///:memory:'
117
121
 
118
122
  :param config:
123
+ :param base_dir:
124
+ :param auto_attach:
119
125
  :param kwargs:
120
126
  :return:
121
127
 
@@ -125,17 +131,17 @@ class Client:
125
131
  if isinstance(config, Path):
126
132
  config = str(config)
127
133
  if isinstance(config, str):
128
- # if not base_dir:
129
- # base_dir = Path(config).parent
134
+ if not base_dir:
135
+ base_dir = Path(config).parent
130
136
  parsed_obj = yaml.safe_load(open(config))
131
137
  config = ClientConfig(**parsed_obj)
132
138
  self.metadata = config
133
139
  if base_dir:
134
140
  self.metadata.base_dir = base_dir
135
- self._initialize_databases(**kwargs)
141
+ self._initialize_databases(auto_attach=auto_attach, **kwargs)
136
142
  return self
137
143
 
138
- def _initialize_databases(self, **kwargs):
144
+ def _initialize_databases(self, auto_attach=False, **kwargs):
139
145
  for name, db_config in self.metadata.databases.items():
140
146
  base_dir = self.base_dir
141
147
  logger.info(f"Initializing database: {name}, base_dir: {base_dir}")
@@ -146,8 +152,22 @@ class Client:
146
152
  db_config.handle = handle
147
153
  if db_config.schema_location:
148
154
  db_config.schema_location = db_config.schema_location.format(base_dir=base_dir)
149
- db = self.attach_database(handle, alias=name, **kwargs)
150
- db.from_config(db_config)
155
+ if auto_attach:
156
+ db = self.attach_database(handle, alias=name, **kwargs)
157
+ db.from_config(db_config)
158
+
159
+ def _set_database_config(self, db: Database):
160
+ """
161
+ Set the configuration for a database.
162
+
163
+ :param name:
164
+ :param config:
165
+ :return:
166
+ """
167
+ if not self.metadata:
168
+ return
169
+ if db.alias in self.metadata.databases:
170
+ db.from_config(self.metadata.databases[db.alias])
151
171
 
152
172
  def attach_database(
153
173
  self,
@@ -202,6 +222,7 @@ class Client:
202
222
  raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}")
203
223
  else:
204
224
  db.metadata.alias = alias
225
+ self._set_database_config(db)
205
226
  return db
206
227
 
207
228
  def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
@@ -230,13 +251,19 @@ class Client:
230
251
  return list(self._databases.values())[0]
231
252
  if not self._databases:
232
253
  self._databases = {}
254
+ if name not in self._databases and name in self.metadata.databases:
255
+ db_config = self.metadata.databases[name]
256
+ db = self.attach_database(db_config.handle, alias=name, **kwargs)
257
+ self._databases[name] = db
233
258
  if name not in self._databases:
234
259
  if create_if_not_exists:
235
260
  logger.info(f"Creating database: {name}")
236
261
  self.attach_database(name, **kwargs)
237
262
  else:
238
263
  raise ValueError(f"Database {name} does not exist")
239
- return self._databases[name]
264
+ db = self._databases[name]
265
+ self._set_database_config(db)
266
+ return db
240
267
 
241
268
  @property
242
269
  def databases(self) -> Dict[str, Database]:
@@ -502,6 +502,7 @@ class Collection(Generic[DatabaseType]):
502
502
  index_name = self.default_index_name
503
503
  ix_coll = self.parent.get_collection(self._index_collection_name(index_name))
504
504
  if index_name not in self.indexers:
505
+ logger.debug(f"Indexer not found: {index_name} -- creating")
505
506
  ix = get_indexer(index_name)
506
507
  if not self._indexers:
507
508
  self._indexers = {}
@@ -509,6 +510,11 @@ class Collection(Generic[DatabaseType]):
509
510
  ix = self.indexers.get(index_name)
510
511
  if not ix:
511
512
  raise ValueError(f"No index named {index_name}")
513
+ logger.debug(f"Using indexer {type(ix)} with name {index_name}")
514
+ if ix_coll.size() == 0:
515
+ logger.info(f"Index {index_name} is empty; indexing all objects")
516
+ all_objs = self.find(limit=-1).rows
517
+ self.index_objects(all_objs, index_name, replace=True, **kwargs)
512
518
  qr = ix_coll.find(where=where, limit=-1, **kwargs)
513
519
  index_col = ix.index_field
514
520
  # TODO: optimize this for large indexes
@@ -518,6 +524,7 @@ class Collection(Generic[DatabaseType]):
518
524
  del r[1][index_col]
519
525
  new_qr = QueryResult(num_rows=len(results))
520
526
  new_qr.ranked_rows = results
527
+ new_qr.rows = [r[1] for r in results]
521
528
  return new_qr
522
529
 
523
530
  @property
@@ -562,6 +569,7 @@ class Collection(Generic[DatabaseType]):
562
569
  format=source.format,
563
570
  expected_type=source.expected_type,
564
571
  compression=source.compression,
572
+ select_query=source.select_query,
565
573
  **kwargs,
566
574
  )
567
575
  elif metadata.source.url:
@@ -570,9 +578,12 @@ class Collection(Generic[DatabaseType]):
570
578
  format=source.format,
571
579
  expected_type=source.expected_type,
572
580
  compression=source.compression,
581
+ select_query=source.select_query,
573
582
  **kwargs,
574
583
  )
575
- self.insert(objects)
584
+ else:
585
+ raise ValueError("No source local_path or url provided")
586
+ self.insert(objects)
576
587
 
577
588
  def _check_if_initialized(self) -> bool:
578
589
  return self._initialized
@@ -629,6 +640,14 @@ class Collection(Generic[DatabaseType]):
629
640
  self.insert(tr_objs)
630
641
  self.commit()
631
642
 
643
+ def size(self) -> int:
644
+ """
645
+ Return the number of objects in the collection.
646
+
647
+ :return: The number of objects in the collection.
648
+ """
649
+ return self.find({}, limit=1).num_rows
650
+
632
651
  def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
633
652
  """
634
653
  Attach an index to the collection.
@@ -777,6 +796,8 @@ class Collection(Generic[DatabaseType]):
777
796
  sv: SchemaView = self.parent.schema_view
778
797
  if sv:
779
798
  cls = sv.get_class(self.target_class_name)
799
+ # if not cls:
800
+ # logger.warning(f"{self.target_class_name} not in {sv.all_classes().keys()} ")
780
801
  # cls = sv.schema.classes[self.target_class_name]
781
802
  if cls and not cls.attributes:
782
803
  if not sv.class_induced_slots(cls.name):
@@ -900,11 +921,14 @@ class Collection(Generic[DatabaseType]):
900
921
  exact_dimensions_list.append(v.shape)
901
922
  break
902
923
  if isinstance(v, list):
924
+ # sample first item. TODO: more robust strategy
903
925
  v = v[0] if v else None
904
926
  multivalueds.append(True)
905
927
  elif isinstance(v, dict):
906
- v = list(v.values())[0]
907
- multivalueds.append(True)
928
+ pass
929
+ # TODO: check if this is a nested object or key-value list
930
+ # v = list(v.values())[0]
931
+ # multivalueds.append(True)
908
932
  else:
909
933
  multivalueds.append(False)
910
934
  if not v:
@@ -933,10 +957,21 @@ class Collection(Generic[DatabaseType]):
933
957
  # raise AssertionError(f"Empty rngs for {k} = {vs}")
934
958
  rng = rngs[0] if rngs else None
935
959
  for other_rng in rngs:
960
+ coercions = {
961
+ ("integer", "float"): "float",
962
+ }
936
963
  if rng != other_rng:
937
- raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
964
+ if (rng, other_rng) in coercions:
965
+ rng = coercions[(rng, other_rng)]
966
+ elif (other_rng, rng) in coercions:
967
+ rng = coercions[(other_rng, rng)]
968
+ else:
969
+ raise ValueError(f"Conflict: {rng} != {other_rng} for {vs}")
938
970
  logger.debug(f"Inducing {k} as {rng} {multivalued} {inlined}")
939
- cd.attributes[k] = SlotDefinition(k, range=rng, multivalued=multivalued, inlined=inlined)
971
+ inlined_as_list = inlined and multivalued
972
+ cd.attributes[k] = SlotDefinition(
973
+ k, range=rng, multivalued=multivalued, inlined=inlined, inlined_as_list=inlined_as_list
974
+ )
940
975
  if exact_dimensions_list:
941
976
  array_expr = ArrayExpression(exact_number_dimensions=len(exact_dimensions_list[0]))
942
977
  cd.attributes[k].array = array_expr
@@ -1,8 +1,8 @@
1
- from typing import Any, Dict, List, Optional
1
+ from typing import Any, Dict, List, Optional, Union
2
2
 
3
3
  from pydantic import BaseModel, Field
4
4
 
5
- from linkml_store.graphs.graph_map import GraphProjection
5
+ from linkml_store.graphs.graph_map import EdgeProjection, NodeProjection
6
6
 
7
7
 
8
8
  class ConfiguredBaseModel(BaseModel, extra="forbid"):
@@ -30,13 +30,30 @@ class CollectionSource(ConfiguredBaseModel):
30
30
  """
31
31
 
32
32
  url: Optional[str] = None
33
+ """Remote URL to fetch data from"""
34
+
33
35
  local_path: Optional[str] = None
36
+ """Local path to fetch data from"""
37
+
34
38
  source_location: Optional[str] = None
39
+
35
40
  refresh_interval_days: Optional[float] = None
41
+ """How often to refresh the data, in days"""
42
+
36
43
  expected_type: Optional[str] = None
44
+ """The expected type of the data, e.g list"""
45
+
37
46
  format: Optional[str] = None
47
+ """The format of the data, e.g., json, yaml, csv"""
48
+
38
49
  compression: Optional[str] = None
50
+ """The compression of the data, e.g., tgz, gzip, zip"""
51
+
52
+ select_query: Optional[str] = None
53
+ """A jsonpath query to preprocess the objects with"""
54
+
39
55
  arguments: Optional[Dict[str, Any]] = None
56
+ """Optional arguments to pass to the source"""
40
57
 
41
58
 
42
59
  class CollectionConfig(ConfiguredBaseModel):
@@ -81,7 +98,7 @@ class CollectionConfig(ConfiguredBaseModel):
81
98
  description="LinkML-Map derivations",
82
99
  )
83
100
  page_size: Optional[int] = Field(default=None, description="Suggested page size (items per page) in apps and APIs")
84
- graph_projection: Optional[GraphProjection] = Field(
101
+ graph_projection: Optional[Union[EdgeProjection, NodeProjection]] = Field(
85
102
  default=None,
86
103
  description="Optional graph projection configuration",
87
104
  )
@@ -707,12 +707,29 @@ class Database(ABC, Generic[CollectionType]):
707
707
  """
708
708
  raise NotImplementedError()
709
709
 
710
- def import_database(self, location: str, source_format: Optional[Union[str, Format]] = None, **kwargs):
710
+ def import_database(
711
+ self,
712
+ location: str,
713
+ source_format: Optional[Union[str, Format]] = None,
714
+ collection_name: Optional[str] = None,
715
+ **kwargs,
716
+ ):
711
717
  """
712
718
  Import a database from a file or location.
713
719
 
720
+ >>> from linkml_store.api.client import Client
721
+ >>> client = Client()
722
+ >>> db = client.attach_database("duckdb", alias="test")
723
+ >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
724
+ >>> db.list_collection_names()
725
+ ['iris']
726
+ >>> collection = db.get_collection("iris")
727
+ >>> collection.find({}).num_rows
728
+ 150
729
+
714
730
  :param location: location of the file
715
731
  :param source_format: source format
732
+ :param collection_name: (Optional) name of the collection, for data that is flat
716
733
  :param kwargs: additional arguments
717
734
  """
718
735
  if isinstance(source_format, str):
@@ -732,8 +749,12 @@ class Database(ABC, Generic[CollectionType]):
732
749
  self.store(obj)
733
750
  return
734
751
  objects = load_objects(location, format=source_format)
735
- for obj in objects:
736
- self.store(obj)
752
+ if collection_name:
753
+ collection = self.get_collection(collection_name, create_if_not_exists=True)
754
+ collection.insert(objects)
755
+ else:
756
+ for obj in objects:
757
+ self.store(obj)
737
758
 
738
759
  def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
739
760
  """
@@ -51,9 +51,13 @@ class MongoDBCollection(Collection):
51
51
  if offset and offset >= 0:
52
52
  cursor = cursor.skip(offset)
53
53
 
54
+ select_cols = query.select_cols
55
+
54
56
  def _as_row(row: dict):
55
57
  row = copy(row)
56
58
  del row["_id"]
59
+ if select_cols:
60
+ row = {k: row[k] for k in select_cols if k in row}
57
61
  return row
58
62
 
59
63
  rows = [_as_row(row) for row in cursor]