linkml-store 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (80)
  1. {linkml_store-0.2.5 → linkml_store-0.2.6}/PKG-INFO +2 -3
  2. {linkml_store-0.2.5 → linkml_store-0.2.6}/pyproject.toml +2 -2
  3. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/client.py +7 -3
  4. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/collection.py +60 -2
  5. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/database.py +15 -12
  6. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +11 -5
  7. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
  8. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +83 -0
  9. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/mongodb/mongodb_database.py +7 -3
  10. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/cli.py +1 -1
  11. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/format_utils.py +60 -1
  12. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/sql_utils.py +7 -1
  13. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/vector_utils.py +1 -1
  14. {linkml_store-0.2.5 → linkml_store-0.2.6}/LICENSE +0 -0
  15. {linkml_store-0.2.5 → linkml_store-0.2.6}/README.md +0 -0
  16. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/__init__.py +0 -0
  17. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/__init__.py +0 -0
  18. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/config.py +0 -0
  19. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/queries.py +0 -0
  20. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/__init__.py +0 -0
  21. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  22. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  23. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  24. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  25. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  26. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  27. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  28. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
  29. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  30. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  31. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  32. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  33. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  34. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  35. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
  36. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  37. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  38. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  39. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  40. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/types.py +0 -0
  41. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/constants.py +0 -0
  42. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/graphs/__init__.py +0 -0
  43. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/graphs/graph_map.py +0 -0
  44. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/index/__init__.py +0 -0
  45. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/index/implementations/__init__.py +0 -0
  46. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
  47. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  48. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/index/indexer.py +0 -0
  49. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/__init__.py +0 -0
  50. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/evaluation.py +0 -0
  51. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/implementations/__init__.py +0 -0
  52. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/implementations/llm_inference_engine.py +0 -0
  53. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/implementations/rag_inference_engine.py +0 -0
  54. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
  55. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +0 -0
  56. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/inference_config.py +0 -0
  57. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/inference_engine.py +0 -0
  58. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/inference/inference_engine_registry.py +0 -0
  59. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/__init__.py +0 -0
  60. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/change_utils.py +0 -0
  61. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/file_utils.py +0 -0
  62. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/io.py +0 -0
  63. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/llm_utils.py +0 -0
  64. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/mongodb_utils.py +0 -0
  65. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/neo4j_utils.py +0 -0
  66. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/object_utils.py +0 -0
  67. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/pandas_utils.py +0 -0
  68. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/patch_utils.py +0 -0
  69. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/query_utils.py +0 -0
  70. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/schema_utils.py +0 -0
  71. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/sklearn_utils.py +0 -0
  72. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/stats_utils.py +0 -0
  73. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/__init__.py +0 -0
  74. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/html/__init__.py +0 -0
  75. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  76. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  77. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  78. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  79. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  80. {linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/webapi/main.py +0 -0
{linkml_store-0.2.5 → linkml_store-0.2.6}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.5
+Version: 0.2.6
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -34,7 +34,6 @@ Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
-Requires-Dist: gcsfs
 Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
 Requires-Dist: h5py ; extra == "h5py"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -54,7 +53,7 @@ Requires-Dist: plotly ; extra == "analytics"
 Requires-Dist: py2neo ; extra == "neo4j"
 Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
-Requires-Dist: pymongo ; extra == "mongodb"
+Requires-Dist: pymongo (>=4.11,<5.0) ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
 Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
{linkml_store-0.2.5 → linkml_store-0.2.6}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.2.5"
+version = "0.2.6"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
@@ -24,7 +24,7 @@ black = { version=">=24.0.0", optional = true }
 ruff = { version=">=0.6.2", optional = true }
 llm = { version="*", optional = true }
 tiktoken = { version="*", optional = true }
-pymongo = { version="*", optional = true }
+pymongo = "^4.11"
 neo4j = { version="*", optional = true }
 py2neo = { version="*", optional = true }
 networkx = { version="*", optional = true }
{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/client.py
@@ -15,6 +15,7 @@ logger = logging.getLogger(__name__)
 
 HANDLE_MAP = {
     "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
+    "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
     "solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
     "mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
     "chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
@@ -24,6 +25,8 @@ HANDLE_MAP = {
 
 SUFFIX_MAP = {
     "ddb": "duckdb:///{path}",
+    "duckdb": "duckdb:///{path}",
+    "db": "duckdb:///{path}",
 }
 
 
@@ -204,9 +207,10 @@ class Client:
         if ":" not in handle:
             if alias is None:
                 alias = handle
-            suffix = handle.split(".")[-1]
-            if suffix in SUFFIX_MAP:
-                handle = SUFFIX_MAP[suffix].format(path=handle)
+            if "." in handle:
+                suffix = handle.split(".")[-1]
+                if suffix in SUFFIX_MAP:
+                    handle = SUFFIX_MAP[suffix].format(path=handle)
         if ":" not in handle:
             scheme = handle
             handle = None
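
Taken together, the client.py changes route more spellings to DuckDB: a new "sqlite" scheme in HANDLE_MAP, ".duckdb" and ".db" suffixes in SUFFIX_MAP, and a guard so dot-free handles skip the suffix lookup entirely. A rough usage sketch (file names here are hypothetical):

    from linkml_store import Client

    client = Client()
    # a bare path ending in .db or .duckdb now expands to "duckdb:///<path>",
    # as .ddb already did in 0.2.5
    db = client.attach_database("my_data.db")
    # "sqlite:..." handles are served by the same DuckDBDatabase class via SQLAlchemy
    sdb = client.attach_database("sqlite:///my_data.sqlite", alias="sqlite_test")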

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/collection.py
@@ -1,6 +1,7 @@
 """A structure for representing collections of similar objects."""
 
 import hashlib
+import json
 import logging
 from collections import defaultdict
 from pathlib import Path
@@ -210,8 +211,59 @@ class Collection(Generic[DatabaseType]):
         """
         raise NotImplementedError
 
+    def index(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        index_name: Optional[str] = None,
+        replace: bool = False,
+        unique: bool = False,
+        **kwargs,
+    ) -> None:
+        """
+        Index objects in the collection.
+
+        :param objs: object(s) to index
+        :param index_name: name of the index to create
+        :param replace: if True, replace an existing index
+        :param unique: if True, declare the index unique
+        :param kwargs: additional implementation-specific arguments
+        :return: None
+        """
+        raise NotImplementedError
+
+    def upsert(self,
+               objs: Union[OBJECT, List[OBJECT]],
+               filter_fields: List[str],
+               update_fields: Union[List[str], None] = None, **kwargs):
+        """
+        Insert objects into the collection, or update them if they already exist.
+
+        >>> from linkml_store import Client
+        >>> client = Client()
+        >>> db = client.attach_database("mongodb", alias="test")
+        >>> collection = db.create_collection("Person")
+        >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
+        >>> collection.upsert(objs, filter_fields=["id"])
+
+        :param objs: object(s) to insert or update
+        :param filter_fields: list of field names used to match existing objects
+        :param update_fields: list of field names to include in the update; if None, all fields are updated
+        :param kwargs: additional implementation-specific arguments
+
+        :return: None
+        """
+        raise NotImplementedError
+
     def _pre_query_hook(self, query: Optional[Query] = None, **kwargs):
-        logger.info(f"Pre-query hook (state: {self._initialized}; Q= {query}")
+        """
+        Pre-query hook.
+
+        This is called before a query is executed. It is used to materialize derivations and indexes.
+        :param query: the query about to be executed
+        :param kwargs: additional arguments
+        :return: None
+        """
+        logger.debug(f"Pre-query hook (state: {self._initialized}; Q={query})")  # at INFO level this is very noisy
         if not self._initialized:
             self._materialize_derivations()
             self._initialized = True
@@ -536,7 +588,13 @@ class Collection(Generic[DatabaseType]):
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
         # TODO: optimize this for large indexes
-        vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
+        def row2array(row):
+            v = row[index_col]
+            if isinstance(v, str):
+                # sqlite stores arrays as JSON-encoded strings
+                v = json.loads(v)
+            return np.array(v, dtype=float)
+        vector_pairs = [(row, row2array(row)) for row in qr.rows]
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
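
The row2array helper exists because the two SQL backends round-trip vectors differently: DuckDB returns the indexed column as a list, while SQLite returns the JSON text it stored. An illustrative reduction of the two cases:

    import json
    import numpy as np

    for raw in ([0.1, 0.2], "[0.1, 0.2]"):  # duckdb-style list vs sqlite-style string
        v = json.loads(raw) if isinstance(raw, str) else raw
        print(np.array(v, dtype=float))  # both yield array([0.1, 0.2])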

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/database.py
@@ -276,14 +276,15 @@ class Database(ABC, Generic[CollectionType]):
 
     Examples:
 
         >>> from linkml_store.api.client import Client
         >>> client = Client()
         >>> db = client.attach_database("duckdb", alias="test")
         >>> collection = db.create_collection("Person", alias="persons")
         >>> collection.alias
         'persons'
+
         >>> collection.target_class_name
         'Person'
 
     If alias is not provided, it defaults to the name of the type.
 
@@ -419,7 +420,7 @@ class Database(ABC, Generic[CollectionType]):
         >>> from linkml_store.api.client import Client
         >>> from linkml_store.api.queries import Query
         >>> client = Client()
-        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db = client.attach_database("duckdb", alias="test", recreate_if_exists=True)
         >>> collection = db.create_collection("Person")
         >>> collection.insert([{"id": "P1", "name": "John"}, {"id": "P2", "name": "Alice"}])
         >>> query = Query(from_table="Person", where_clause={"name": "John"})
@@ -451,7 +452,7 @@ class Database(ABC, Generic[CollectionType]):
 
         >>> from linkml_store.api.client import Client
         >>> client = Client()
-        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db = client.attach_database("duckdb", alias="test", recreate_if_exists=True)
         >>> collection = db.create_collection("Person", alias="persons")
         >>> collection.insert([{"id": "P1", "name": "John", "age_in_years": 25}])
         >>> schema_view = db.schema_view
@@ -721,7 +722,7 @@ class Database(ABC, Generic[CollectionType]):
 
         >>> from linkml_store.api.client import Client
         >>> client = Client()
-        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db = client.attach_database("duckdb", alias="test", recreate_if_exists=True)
         >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
         >>> db.list_collection_names()
         ['iris']
@@ -741,7 +742,9 @@ class Database(ABC, Generic[CollectionType]):
         # import into a test instance
         tmp_handle = source_format.value
         client = self.parent
-        tmp_db = client.attach_database(tmp_handle, alias="tmp")
+        tmp_alias = "tmp"
+        client.drop_database(tmp_alias, missing_ok=True)
+        tmp_db = client.attach_database(tmp_handle, alias=tmp_alias, recreate_if_exists=True)
         # TODO: check for infinite recursion
         tmp_db.import_database(location, source_format=source_format)
         obj = {}
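
The recreate_if_exists additions keep the doctests and the scratch "tmp" database reproducible across runs; previously a second import in the same session could collide with leftover state. A minimal sketch mirroring the updated doctest:

    from linkml_store.api.client import Client
    from linkml_store.utils.format_utils import Format

    client = Client()
    db = client.attach_database("duckdb", alias="test", recreate_if_exists=True)
    db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
    print(db.list_collection_names())  # ['iris']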

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/duckdb/duckdb_collection.py
@@ -147,16 +147,22 @@ class DuckDBCollection(Collection):
         if self._table_created or self.metadata.is_prepopulated:
             logger.info(f"Already have table for: {cd.name}")
             return
-        query = Query(
-            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
-        )
-        qr = self.parent.query(query)
-        if qr.num_rows > 0:
+        if self.parent._table_exists(self.alias):
             logger.info(f"Table already exists for {cd.name}")
             self._table_created = True
             self._initialized = True
             self.metadata.is_prepopulated = True
             return
+        # query = Query(
+        #     from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+        # )
+        # qr = self.parent.query(query)
+        # if qr.num_rows > 0:
+        #     logger.info(f"Table already exists for {cd.name}")
+        #     self._table_created = True
+        #     self._initialized = True
+        #     self.metadata.is_prepopulated = True
+        #     return
         logger.info(f"Creating table for {cd.name}")
         t = self._sqla_table(cd)
         ct = CreateTable(t)

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/duckdb/duckdb_database.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional, Union
+from typing import List, Optional, Union
 
 import pandas as pd
 import sqlalchemy
@@ -14,7 +14,7 @@ from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
 from linkml_store.utils.format_utils import Format
-from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
+from linkml_store.utils.sql_utils import introspect_schema, query_to_sql, where_clause_to_sql
 
 TYPE_MAP = {
     "VARCHAR": "string",
@@ -62,7 +62,7 @@ class DuckDBDatabase(Database):
     def engine(self) -> sqlalchemy.Engine:
         if not self._engine:
             handle = self.handle
-            if not handle.startswith("duckdb://") and not handle.startswith(":"):
+            if not handle.startswith("duckdb://") and not handle.startswith(":") and "://" not in handle:
                 handle = f"duckdb:///{handle}"
             if ":memory:" not in handle:
                 # TODO: investigate this; duckdb appears to be prematurely caching
@@ -71,6 +71,10 @@ class DuckDBDatabase(Database):
             self._engine = sqlalchemy.create_engine(handle)
         return self._engine
 
+    @property
+    def _is_sqlite(self) -> bool:
+        return bool(self.handle and self.handle.startswith("sqlite:"))
+
     def commit(self, **kwargs):
         with self.engine.connect() as conn:
             conn.commit()
@@ -89,34 +93,60 @@ class DuckDBDatabase(Database):
             if not missing_ok:
                 raise FileNotFoundError(f"Database file not found: {path}")
 
-    def query(self, query: Query, **kwargs) -> QueryResult:
+    def _table_exists(self, table: str) -> bool:
+        if self._is_sqlite:
+            if table == "sqlite_master":
+                return True
+            meta_query = Query(
+                from_table="sqlite_master",
+                where_clause={
+                    # "type": "table",
+                    "name": table,
+                },
+            )
+        else:
+            if table.startswith("information_schema"):
+                return True
+            meta_query = Query(
+                from_table="information_schema.tables",
+                where_clause={
+                    "table_type": "BASE TABLE",
+                    "table_name": table,
+                },
+            )
+
+        qr = self.query(meta_query)
+        if qr.num_rows == 0:
+            logger.debug(f"Table {table} not created yet")
+            return False
+        return True
+
+    def _json_encoded_cols(self, table_name: str) -> Optional[List[str]]:
         json_encoded_cols = []
-        if query.from_table:
-            if not query.from_table.startswith("information_schema"):
-                meta_query = Query(
-                    from_table="information_schema.tables", where_clause={"table_name": query.from_table}
-                )
-                qr = self.query(meta_query)
-                if qr.num_rows == 0:
-                    logger.debug(f"Table {query.from_table} not created yet")
-                    return QueryResult(query=query, num_rows=0, rows=[])
-            if not query.from_table.startswith("information_schema"):
-                sv = self.schema_view
-            else:
-                sv = None
+        if table_name:
+            if table_name.startswith("information_schema") or table_name.startswith("sqlite"):
+                return []
+            sv = self.schema_view
             if sv:
                 cd = None
                 for c in self._collections.values():
-                    # if c.name == query.from_table or c.metadata.alias == query.from_table:
-                    if c.alias == query.from_table or c.target_class_name == query.from_table:
+                    if c.alias == table_name or c.target_class_name == table_name:
                         cd = c.class_definition()
                         break
                 if cd:
                     for att in sv.class_induced_slots(cd.name):
                         if att.inlined or att.inlined_as_list:
                             json_encoded_cols.append(att.name)
+        return json_encoded_cols
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        if not self._table_exists(query.from_table):
+            return QueryResult(query=query, num_rows=0, rows=[])
+        json_encoded_cols = self._json_encoded_cols(query.from_table)
+
         with self.engine.connect() as conn:
             count_query_str = text(query_to_sql(query, count=True))
+            logger.debug(f"count_query_str: {count_query_str}")
             num_rows = list(conn.execute(count_query_str))[0][0]
             logger.debug(f"num_rows: {num_rows}")
             query_str = query_to_sql(query, **kwargs)  # include offset, limit
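
The refactor splits the old monolithic query() into _table_exists and _json_encoded_cols, probing sqlite_master for SQLite handles and information_schema.tables for DuckDB. Behavior is preserved: querying a table that has not been created yet still returns an empty result rather than raising. A rough sketch ("NotYetCreated" is a hypothetical collection name):

    from linkml_store import Client
    from linkml_store.api.queries import Query

    client = Client()
    db = client.attach_database("duckdb", alias="test")
    qr = db.query(Query(from_table="NotYetCreated"))
    print(qr.num_rows)  # 0; _table_exists short-circuits before any SQL is issued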
@@ -167,6 +197,9 @@ class DuckDBDatabase(Database):
         logger.info(f"Inducing schema view for {self.metadata.handle} // {self}")
         sb = SchemaBuilder()
         schema = sb.schema
+        logger.info(f"Checking if {self.metadata.handle} is sqlite: {self._is_sqlite}")
+        if self._is_sqlite:
+            return SchemaView(schema)
         query = Query(from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE"})
         qr = self.query(query)
         logger.info(f"Found {qr.num_rows} information_schema.tables // {qr.rows}")

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/mongodb/mongodb_collection.py
@@ -41,6 +41,90 @@ class MongoDBCollection(Collection):
                 del obj["_id"]
         self._post_insert_hook(objs)
 
+
+    def index(self,
+              objs: Union[OBJECT, List[OBJECT]],
+              index_name: Optional[str] = None,
+              replace: bool = False,
+              unique: bool = False,
+              **kwargs):
+        """
+        Create indexes on the collection.
+
+        :param objs: field(s) to index
+        :param index_name: optional name for the index
+        :param replace: if True, an existing index on the field is dropped and recreated
+        :param unique: if True, create a unique index (default: False)
+        """
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        existing_indexes = self.mongo_collection.index_information()
+
+        for obj in objs:
+            field_exists = False
+            index_to_drop = None
+
+            # Check whether this field is already covered by an existing index
+            for existing_name, index_details in existing_indexes.items():
+                indexed_fields = [field[0] for field in index_details.get("key", [])]
+
+                if obj in indexed_fields:
+                    field_exists = True
+                    index_to_drop = existing_name if replace else None
+
+            # Drop the index if replace=True and a matching index was found
+            if index_to_drop:
+                self.mongo_collection.drop_index(index_to_drop)
+                logging.debug(f"Dropped existing index: {index_to_drop}")
+
+            # Create the new index only if it doesn't exist or was just dropped
+            if not field_exists or replace:
+                self.mongo_collection.create_index(obj, name=index_name, unique=unique)
+                logging.debug(f"Created new index: {index_name} on field {obj}, unique={unique}")
+            else:
+                logging.debug(f"Index already exists for field {obj}, skipping creation.")
+
+    def upsert(self,
+               objs: Union[OBJECT, List[OBJECT]],
+               filter_fields: List[str],
+               update_fields: Optional[List[str]] = None,
+               **kwargs):
+        """
+        Upsert one or more documents into the MongoDB collection.
+
+        :param objs: the document(s) to insert or update
+        :param filter_fields: list of field names to use as the filter for matching existing documents
+        :param update_fields: list of field names to include in the update; if None, all fields are updated
+        """
+        if not isinstance(objs, list):
+            objs = [objs]
+
+        for obj in objs:
+            # Ensure at least one filter field is present in the object
+            filter_criteria = {field: obj[field] for field in filter_fields if field in obj}
+            if not filter_criteria:
+                raise ValueError("At least one valid filter field must be present in each object.")
+
+            # Check whether a matching document already exists
+            existing_doc = self.mongo_collection.find_one(filter_criteria)
+
+            if existing_doc:
+                # Update only the fields that changed; default to all fields when update_fields is None
+                fields_to_update = update_fields if update_fields is not None else list(obj.keys())
+                updates = {key: obj[key] for key in fields_to_update if key in obj and obj[key] != existing_doc.get(key)}
+
+                if updates:
+                    self.mongo_collection.update_one(filter_criteria, {"$set": updates})
+                    logging.debug(f"Updated existing document: {filter_criteria} with {updates}")
+                else:
+                    logging.debug(f"No changes detected for document: {filter_criteria}. Skipping update.")
+            else:
+                # Insert a new document
+                self.mongo_collection.insert_one(obj)
+                logging.debug(f"Inserted new document: {obj}")
+
     def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
         mongo_filter = self._build_mongo_filter(query.where_clause)
         limit = limit or query.limit
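
A hedged usage sketch for the two new MongoDB methods (field and value names are illustrative):

    collection.index("id", index_name="id_idx", unique=True)   # create a unique index on "id"
    collection.index("id", index_name="id_idx", replace=True)  # drop and recreate it
    collection.upsert(
        [{"id": "P1", "name": "John", "age_in_years": 31}],
        filter_fields=["id"],            # match existing documents on "id"
        update_fields=["age_in_years"],  # only write this field, and only if it changed
    )

Note that upsert issues a find_one plus update_one round trip per object; for large batches, pymongo's bulk_write with ReplaceOne(..., upsert=True) operations would reduce round trips, at the cost of the changed-fields comparison.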

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/api/stores/mongodb/mongodb_database.py
@@ -3,6 +3,7 @@
 import logging
 from pathlib import Path
 from typing import Optional, Union
+from urllib.parse import urlparse
 
 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase
@@ -38,10 +39,12 @@ class MongoDBDatabase(Database):
     @property
     def _db_name(self) -> str:
         if self.handle:
-            db = self.handle.split("/")[-1]
+            parsed_url = urlparse(self.handle)
+            path_parts = parsed_url.path.lstrip("/").split("?")[0].split("/")
+            db_name = path_parts[0] if path_parts and path_parts[0] else "default"
         else:
-            db = "default"
-        return db
+            db_name = "default"
+        return db_name
 
     @property
     def native_client(self) -> MongoClient:
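
The urlparse-based _db_name copes with credentials, ports, and query parameters that the old split("/")[-1] mangled. For example (connection string is illustrative):

    from urllib.parse import urlparse

    handle = "mongodb://user:pw@localhost:27017/mydb?authSource=admin"
    path = urlparse(handle).path           # '/mydb' (the query string lands in .query)
    print(path.lstrip("/").split("/")[0])  # 'mydb'; the old code returned 'mydb?authSource=admin'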

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/cli.py
@@ -186,7 +186,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
 
 
 @cli.command()
-@click.argument("files", type=click.Path(exists=True), nargs=-1)
+@click.argument("files", type=click.Path(), nargs=-1)
 @click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/format_utils.py
@@ -3,6 +3,7 @@ import gzip
 import io
 import json
 import logging
+import re
 import sys
 import tarfile
 from enum import Enum
@@ -31,10 +32,13 @@ class Format(Enum):
     TSV = "tsv"
     CSV = "csv"
     XML = "xml"
+    OBO = "obo"
+    PKL = "pkl"
     PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
     TABLE = "table"
+    XLSX = "xlsx"
     SQLDUMP_DUCKDB = "duckdb"
     SQLDUMP_POSTGRES = "postgres"
     DUMP_MONGODB = "mongodb"
@@ -67,6 +71,9 @@ class Format(Enum):
     def is_dump_format(self):
         return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
 
+    def is_binary_format(self):
+        return self in [Format.PARQUET, Format.XLSX]
+
     def is_xsv(self):
         return self in [Format.TSV, Format.CSV]
@@ -95,6 +102,26 @@ def load_objects_from_url(
     return objs
 
 
+def clean_pandas_value(v):
+    """Clean a single value from pandas."""
+    import math
+
+    if isinstance(v, float):
+        if math.isnan(v) or math.isinf(v):
+            return None
+        return float(v)  # ensure a plain Python float rather than a numpy scalar
+    return v
+
+
+def clean_nested_structure(obj):
+    """Recursively clean a nested structure of dicts/lists from pandas."""
+    if isinstance(obj, dict):
+        return {k: clean_nested_structure(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [clean_nested_structure(item) for item in obj]
+    else:
+        return clean_pandas_value(obj)
+
 def process_file(
     f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
 ) -> List[Dict[str, Any]]:
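
These helpers matter because pandas encodes missing cells as NaN (and occasionally inf), which most JSON and document stores reject. An illustrative round trip:

    rows = [{"name": "setosa", "width": 3.5}, {"name": "virginica", "width": float("nan")}]
    print(clean_nested_structure(rows))
    # [{'name': 'setosa', 'width': 3.5}, {'name': 'virginica', 'width': None}]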
@@ -128,6 +155,19 @@ def process_file(
         objs = list(reader)
     elif format == Format.XML:
         objs = xmltodict.parse(f.read())
+    elif format == Format.PKL:
+        objs = pd.read_pickle(f).to_dict(orient="records")
+    elif format == Format.XLSX:
+        xls = pd.ExcelFile(f)
+        objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
+    elif format == Format.OBO:
+        blocks = split_document(f.read(), "\n\n")
+        id_pattern = re.compile(r"id: (\S+)")
+        def get_id(block):
+            m = id_pattern.search(block)
+            return m.group(1) if m else None
+        objs = [{"id": get_id(block), "content": block} for block in blocks]
+        objs = [obj for obj in objs if obj["id"]]
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq
 
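The OBO branch is deliberately shallow: it does not parse stanzas into key-value pairs, it just splits on blank lines and keeps blocks that carry an id. A sketch of the resulting objects for a two-term fragment (terms are illustrative):

    text = "[Term]\nid: GO:0000001\nname: mitochondrion inheritance\n\n[Term]\nid: GO:0000002"
    blocks = text.split("\n\n")
    # -> [{"id": "GO:0000001", "content": "[Term]\nid: GO:0000001\n..."},
    #     {"id": "GO:0000002", "content": "[Term]\nid: GO:0000002"}]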
@@ -167,6 +207,14 @@ def load_objects(
     if isinstance(file_path, Path):
         file_path = str(file_path)
 
+    for url_scheme in ["http", "https", "ftp"]:
+        if file_path.startswith(f"{url_scheme}://"):
+            return load_objects_from_url(
+                file_path,
+                format=format,
+                expected_type=expected_type,
+            )
+
     if isinstance(format, str):
         format = Format(format)
 
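With this guard, load_objects can be handed a remote location directly and defers to load_objects_from_url before any local-path handling runs (URL is hypothetical):

    from linkml_store.utils.format_utils import Format, load_objects

    objs = load_objects("https://example.org/data/pets.csv", format=Format.CSV)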
@@ -185,9 +233,9 @@ def load_objects(
     else:
         if Path(file_path).is_dir():
             raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
-        mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
         open_func = gzip.open if compression == "gz" else open
         format = Format.guess_format(file_path) if not format else format
+        mode = "rb" if (format and format.is_binary_format()) or compression == "gz" else "r"
     with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
         if compression == "gz" and mode == "r":
             f = io.TextIOWrapper(f)
@@ -343,3 +391,14 @@ def guess_format(path: str) -> Optional[Format]:
     :return: The guessed format.
     """
     return Format.guess_format(path)
+
+
+def split_document(doc: str, delimiter: str):
+    """
+    Split a document into parts based on a delimiter.
+
+    :param doc: The document to split.
+    :param delimiter: The delimiter.
+    :return: The parts of the document.
+    """
+    return doc.split(delimiter)

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/sql_utils.py
@@ -5,7 +5,7 @@ import sqlalchemy
 import sqlalchemy.sql.sqltypes as sqlt
 from linkml_runtime.linkml_model import SchemaDefinition, SlotDefinition
 from linkml_runtime.utils.schema_builder import SchemaBuilder
-from sqlalchemy import MetaData
+from sqlalchemy import MetaData, quoted_name
 
 from linkml_store.api.queries import Query
 
@@ -115,7 +115,13 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], multivalued
     conditions = [cond for cond in where_clause_sql.split(" AND ") if not cond.startswith(f"{facet_column} ")]
     modified_where = " AND ".join(conditions)
 
+    def make_col_safe(col):
+        return '"' + quoted_name(col, True) + '"' if ' ' in col else col
+
+    if isinstance(facet_column, str):
+        facet_column = make_col_safe(facet_column)
     if isinstance(facet_column, tuple):
+        facet_column = [make_col_safe(col) for col in facet_column]
         if multivalued:
             raise NotImplementedError("Multivalued facets are not supported for multiple columns")
         facet_column = ", ".join(facet_column)
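
make_col_safe only rewrites column names containing spaces, which previously produced invalid facet SQL. Roughly, in terms of the helper defined above:

    make_col_safe("species")      # -> species (unchanged)
    make_col_safe("petal width")  # -> "petal width" (double-quoted for the GROUP BY)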

{linkml_store-0.2.5 → linkml_store-0.2.6}/src/linkml_store/utils/vector_utils.py
@@ -34,7 +34,7 @@ def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
     dot_product = np.dot(vector1, vector2)
     norm1 = np.linalg.norm(vector1)
     norm2 = np.linalg.norm(vector2)
-    return dot_product / (norm1 * norm2)
+    return float(dot_product / (norm1 * norm2))
 
 
 def compute_cosine_similarity_matrix(list1: LOL, list2: LOL) -> np.ndarray:
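
Wrapping the result in float() returns a plain Python float rather than a numpy scalar, matching the declared return type and keeping np.float64 out of strict serializers. For instance (requires PyYAML):

    import numpy as np
    import yaml

    score = np.dot([1.0, 0.0], [1.0, 0.0])  # np.float64, even though it prints as 1.0
    yaml.safe_dump({"sim": float(score)})   # fine
    # yaml.safe_dump({"sim": score}) raises RepresenterError: np.float64 is not a plain float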