linkml-store 0.0.0__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in those registries.

Potentially problematic release: this version of linkml-store has been flagged as possibly problematic by the registry's advisory tooling (see the registry listing for details).

Files changed (37)
  1. linkml_store/api/__init__.py +2 -2
  2. linkml_store/api/client.py +113 -8
  3. linkml_store/api/collection.py +272 -34
  4. linkml_store/api/config.py +101 -0
  5. linkml_store/api/database.py +282 -18
  6. linkml_store/api/queries.py +12 -1
  7. linkml_store/api/stores/chromadb/__init__.py +3 -0
  8. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  9. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  10. linkml_store/api/stores/duckdb/__init__.py +7 -0
  11. linkml_store/api/stores/duckdb/duckdb_collection.py +47 -14
  12. linkml_store/api/stores/duckdb/duckdb_database.py +38 -47
  13. linkml_store/api/stores/hdf5/__init__.py +0 -0
  14. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  15. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  16. linkml_store/api/stores/mongodb/mongodb_collection.py +92 -40
  17. linkml_store/api/stores/mongodb/mongodb_database.py +58 -67
  18. linkml_store/api/stores/solr/__init__.py +3 -0
  19. linkml_store/api/stores/solr/solr_collection.py +133 -0
  20. linkml_store/api/stores/solr/solr_database.py +83 -0
  21. linkml_store/api/stores/solr/solr_utils.py +0 -0
  22. linkml_store/cli.py +369 -0
  23. linkml_store/index/__init__.py +33 -0
  24. linkml_store/index/implementations/{llm_index.py → llm_indexer.py} +2 -2
  25. linkml_store/index/implementations/{simple_index.py → simple_indexer.py} +6 -3
  26. linkml_store/index/{index.py → indexer.py} +7 -4
  27. linkml_store/utils/format_utils.py +93 -0
  28. linkml_store/utils/object_utils.py +81 -0
  29. linkml_store/utils/sql_utils.py +46 -7
  30. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/METADATA +17 -6
  31. linkml_store-0.1.7.dist-info/RECORD +42 -0
  32. linkml_store-0.1.7.dist-info/entry_points.txt +3 -0
  33. linkml_store/api/metadata.py +0 -5
  34. linkml_store-0.0.0.dist-info/RECORD +0 -29
  35. linkml_store-0.0.0.dist-info/entry_points.txt +0 -3
  36. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/LICENSE +0 -0
  37. {linkml_store-0.0.0.dist-info → linkml_store-0.1.7.dist-info}/WHEEL +0 -0
linkml_store/api/stores/mongodb/mongodb_collection.py
@@ -1,56 +1,108 @@
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+import logging
+from copy import copy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from linkml_runtime.linkml_model import SlotDefinition
+from pymongo.collection import Collection as MongoCollection
 
 from linkml_store.api import Collection
-from linkml_store.api.collection import OBJECT
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+
+logger = logging.getLogger(__name__)
 
 
-@dataclass
 class MongoDBCollection(Collection):
-    """
-    A wrapper around a MongoDB collection
-    """
 
-    def add(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
-        if not isinstance(objs, list):
-            objs = [objs]
-        if not objs:
-            return
-        cd = self.class_definition()
-        if not cd:
-            cd = self.induce_class_definition_from_objects(objs)
-        collection = self.parent.database[self.name]
-        collection.insert_many(objs)
+    @property
+    def mongo_collection(self) -> MongoCollection:
+        if not self.name:
+            raise ValueError("Collection name not set")
+        return self.parent.native_db[self.name]
 
-    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
         if not isinstance(objs, list):
             objs = [objs]
-        cd = self.class_definition()
-        if not cd:
-            cd = self.induce_class_definition_from_objects(objs)
-        collection = self.parent.database[self.name]
-        deleted_count = 0
-        for obj in objs:
-            result = collection.delete_one(obj)
-            deleted_count += result.deleted_count
-        return deleted_count
+        self.mongo_collection.insert_many(objs)
 
-    def delete_where(self, where: Optional[Dict[str, Any]] = None, **kwargs) -> int:
-        collection = self.parent.database[self.name]
-        result = collection.delete_many(where)
-        return result.deleted_count
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        mongo_filter = self._build_mongo_filter(query.where_clause)
+        if query.limit:
+            cursor = self.mongo_collection.find(mongo_filter).limit(query.limit)
+        else:
+            cursor = self.mongo_collection.find(mongo_filter)
+
+        def _as_row(row: dict):
+            row = copy(row)
+            del row["_id"]
+            return row
+
+        rows = [_as_row(row) for row in cursor]
+        count = self.mongo_collection.count_documents(mongo_filter)
+
+        return QueryResult(query=query, num_rows=count, rows=rows)
+
+    def _build_mongo_filter(self, where_clause: Dict[str, Any]) -> Dict[str, Any]:
+        mongo_filter = {}
+        if where_clause:
+            for field, value in where_clause.items():
+                mongo_filter[field] = value
+        return mongo_filter
 
-    def query_facets(self, where: Dict = None, facet_columns: List[str] = None) -> Dict[str, Dict[str, int]]:
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, List[Tuple[Any, int]]]:
         results = {}
-        _cd = self.class_definition()
-        collection = self.parent.database[self.name]
+        cd = self.class_definition()
         if not facet_columns:
             facet_columns = list(self.class_definition().attributes.keys())
+
         for col in facet_columns:
-            facet_pipeline = [
-                {"$match": where} if where else {"$match": {}},
-                {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
-            ]
-            facet_results = list(collection.aggregate(facet_pipeline))
-            results[col] = [(row["_id"], row["count"]) for row in facet_results]
+            logger.debug(f"Faceting on {col}")
+            if isinstance(col, tuple):
+                sd = SlotDefinition(name="PLACEHOLDER")
+            else:
+                sd = cd.attributes[col]
+
+            if sd.multivalued:
+                facet_pipeline = [
+                    {"$match": where} if where else {"$match": {}},
+                    {"$unwind": f"${col}"},
+                    {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                    {"$sort": {"count": -1}},
+                    {"$limit": facet_limit},
+                ]
+            else:
+                facet_pipeline = [
+                    {"$match": where} if where else {"$match": {}},
+                    {"$group": {"_id": f"${col}", "count": {"$sum": 1}}},
+                    {"$sort": {"count": -1}},
+                    {"$limit": facet_limit},
+                ]
+
+            facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
+            results[col] = [(result["_id"], result["count"]) for result in facet_results]
+
         return results
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> int:
+        if not isinstance(objs, list):
+            objs = [objs]
+        filter_conditions = []
+        for obj in objs:
+            filter_condition = {}
+            for key, value in obj.items():
+                filter_condition[key] = value
+            filter_conditions.append(filter_condition)
+        result = self.mongo_collection.delete_many({"$or": filter_conditions})
+        return result.deleted_count
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> int:
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        result = self.mongo_collection.delete_many(where)
+        deleted_rows_count = result.deleted_count
+        if deleted_rows_count == 0 and not missing_ok:
+            raise ValueError(f"No rows found for {where}")
+        return deleted_rows_count
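
Taken together, the mongodb_collection.py changes drop the old dataclass wrapper and its add/delete methods in favor of an insert/query/query_facets/delete_where API that routes everything through the new mongo_collection property. A minimal usage sketch, assuming a local MongoDB at the default handle, that stored objects are plain dicts, that the inherited Database.get_collection behaves like the SolrDatabase implementation later in this diff, and that Query accepts from_table/where_clause/limit as keyword arguments:

    # Hedged sketch; not taken from the package's own tests.
    from linkml_store.api.queries import Query
    from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase

    db = MongoDBDatabase("mongodb://localhost:27017")
    people = db.get_collection("Person")  # assumed to return a MongoDBCollection
    people.insert([{"name": "Ann", "pets": ["cat", "dog"]}])

    # query() strips "_id" from each returned row and counts matches separately
    result = people.query(Query(from_table="Person", where_clause={"name": "Ann"}, limit=10))
    print(result.num_rows, result.rows)

    # query_facets() returns {column: [(value, count), ...]}, sorted and capped at facet_limit;
    # multivalued slots such as "pets" go through the $unwind pipeline shown above
    print(people.query_facets(facet_columns=["pets"]))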
linkml_store/api/stores/mongodb/mongodb_database.py
@@ -1,112 +1,103 @@
-from dataclasses import dataclass
+# mongodb_database.py
+
+import logging
 from typing import Optional
 
 from linkml_runtime import SchemaView
-from linkml_runtime.linkml_model import SlotDefinition
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from linkml_runtime.utils.schema_builder import SchemaBuilder
 from pymongo import MongoClient
+from pymongo.database import Database as NativeDatabase
 
 from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection
 
+logger = logging.getLogger(__name__)
+
 
-@dataclass
 class MongoDBDatabase(Database):
     """
-    A wrapper around a MongoDB database
+    An adapter for MongoDB databases.
+
+    The LinkML-Store Database abstraction combines mongodb Client and Database.
     """
 
-    _client: MongoClient = None
-    _database = None
+    _native_client: MongoClient = None
+    _native_db = None
+    collection_class = MongoDBCollection
 
-    def __post_init__(self):
-        if not self.handle:
-            self.handle = "mongodb://localhost:27017"
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        if handle is None:
+            handle = "mongodb://localhost:27017"
+        super().__init__(handle=handle, **kwargs)
 
     @property
-    def client(self) -> MongoClient:
-        if not self._client:
-            self._client = MongoClient(self.handle)
-        return self._client
+    def native_client(self) -> MongoClient:
+        if self._native_client is None:
+            self._native_client = MongoClient(self.handle)
+        return self._native_client
 
     @property
-    def database(self):
-        if not self._database:
-            db_name = self.handle.split("/")[-1]
-            self._database = self.client[db_name]
-        return self._database
+    def native_db(self) -> NativeDatabase:
+        if self._native_db is None:
+            alias = self.metadata.alias
+            if not alias:
+                alias = "default"
+            self._native_db = self.native_client[alias]
+        return self._native_db
 
     def commit(self, **kwargs):
         pass
 
     def close(self, **kwargs):
-        self.client.close()
+        if self._native_client:
+            self._native_client.close()
+
+    def drop(self, **kwargs):
+        self.native_client.drop_database(self.metadata.alias)
 
     def query(self, query: Query, **kwargs) -> QueryResult:
-        collection = self.database[query.from_table]
-        where_clause = query.where_clause or {}
-        cursor = collection.find(where_clause)
-        if query.limit:
-            cursor = cursor.limit(query.limit)
-        if query.offset:
-            cursor = cursor.skip(query.offset)
-        if query.sort_by:
-            sort_key = [(col, 1) for col in query.sort_by]
-            cursor = cursor.sort(sort_key)
-        rows = list(cursor)
-        num_rows = len(rows)
-        qr = QueryResult(query=query, num_rows=num_rows, rows=rows)
-        return qr
+        if query.from_table:
+            collection = self.get_collection(query.from_table)
+            return collection.query(query, **kwargs)
 
     def init_collections(self):
         if self._collections is None:
             self._collections = {}
-        for collection_name in self.database.list_collection_names():
+
+        for collection_name in self.native_db.list_collection_names():
             if collection_name not in self._collections:
                 collection = MongoDBCollection(name=collection_name, parent=self)
                 self._collections[collection_name] = collection
 
-    def create_collection(self, name: str, alias: Optional[str] = None, **kwargs) -> MongoDBCollection:
-        collection = MongoDBCollection(name=name, parent=self)
-        if not self._collections:
-            self._collections = {}
-        if not alias:
-            alias = name
-        self._collections[alias] = collection
-        return collection
-
     def induce_schema_view(self) -> SchemaView:
+        logger.info(f"Inducing schema view for {self.handle}")
        sb = SchemaBuilder()
        schema = sb.schema
-        collection_names = self.database.list_collection_names()
-        for collection_name in collection_names:
+
+        for collection_name in self.native_db.list_collection_names():
             sb.add_class(collection_name)
-            collection = self.database[collection_name]
-            sample_doc = collection.find_one()
+            mongo_collection = self.native_db[collection_name]
+            sample_doc = mongo_collection.find_one()
             if sample_doc:
-                for key, value in sample_doc.items():
-                    if key == "_id":
+                for field, value in sample_doc.items():
+                    if field == "_id":
                         continue
+                    sd = SlotDefinition(field)
                     if isinstance(value, list):
-                        multivalued = True
-                        if value:
-                            value = value[0]
-                        else:
-                            value = None
-                    else:
-                        multivalued = False
-                    if isinstance(value, str):
-                        rng = "string"
-                    elif isinstance(value, int):
-                        rng = "integer"
-                    elif isinstance(value, float):
-                        rng = "float"
-                    elif isinstance(value, bool):
-                        rng = "boolean"
-                    else:
-                        rng = "string"
-                    sd = SlotDefinition(key, range=rng, multivalued=multivalued)
+                        sd.multivalued = True
+                    if isinstance(value, dict):
+                        sd.inlined = True
                     sb.schema.classes[collection_name].attributes[sd.name] = sd
+
         sb.add_defaults()
+        for cls_name in schema.classes:
+            if cls_name in self.metadata.collections:
+                collection_metadata = self.metadata.collections[cls_name]
+                if collection_metadata.attributes:
+                    del schema.classes[cls_name]
+                    cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
+                    schema.classes[cls.name] = cls
+
         return SchemaView(schema)
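
In the rewritten mongodb_database.py, the handle now only identifies the MongoDB server; the database itself is selected by self.metadata.alias (falling back to "default"), and induce_schema_view infers one LinkML class per collection from a single sample document. A rough sketch of that flow; passing alias through the constructor is an assumption about the base Database config, not something shown in this diff:

    # Hedged sketch; `alias=` is assumed to be forwarded into self.metadata.alias.
    db = MongoDBDatabase(handle="mongodb://localhost:27017", alias="my_database")
    print(db.native_client)   # lazily created pymongo MongoClient for the handle
    print(db.native_db.name)  # database chosen via metadata.alias

    sv = db.induce_schema_view()          # SchemaView with one class per Mongo collection
    for cls in sv.all_classes().values():
        print(cls.name, list(cls.attributes))

    db.drop()  # drops the Mongo database named by metadata.alias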
linkml_store/api/stores/solr/__init__.py
@@ -0,0 +1,3 @@
+"""
+Wrapper for Solr endpoints.
+"""
linkml_store/api/stores/solr/solr_collection.py
@@ -0,0 +1,133 @@
+# solr_collection.py
+
+import logging
+from copy import copy
+from typing import Any, Dict, List, Optional, Union
+
+import requests
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT
+from linkml_store.api.queries import Query, QueryResult
+
+logger = logging.getLogger(__name__)
+
+
+class SolrCollection(Collection):
+
+    @property
+    def _collection_base(self) -> str:
+        if self.parent.use_cores:
+            base_url = f"{self.parent.base_url}/{self.name}"
+        else:
+            base_url = self.parent.base_url
+        return base_url
+
+    def search(
+        self,
+        query: str,
+        where: Optional[Any] = None,
+        index_name: Optional[str] = None,
+        limit: Optional[int] = None,
+        **kwargs,
+    ) -> QueryResult:
+        if index_name is None:
+            index_name = "edismax"
+        qfs = self.parent.metadata.searchable_slots
+        if not qfs:
+            raise ValueError("No searchable slots configured for Solr collection")
+        solr_query = self._build_solr_query(where, search_term=query, extra={"defType": index_name, "qf": qfs})
+        logger.info(f"Querying Solr collection {self.name} with query: {solr_query}")
+
+        response = requests.get(f"{self._collection_base}/select", params=solr_query)
+        response.raise_for_status()
+
+        data = response.json()
+        num_rows = data["response"]["numFound"]
+        rows = data["response"]["docs"]
+        ranked_rows = [(1.0, row) for row in rows]
+        return QueryResult(query=where, search_term=query, num_rows=num_rows, rows=rows, ranked_rows=ranked_rows)
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        solr_query = self._build_solr_query(query)
+        logger.info(f"Querying Solr collection {self.name} with query: {solr_query}")
+
+        response = requests.get(f"{self._collection_base}/select", params=solr_query)
+        response.raise_for_status()
+
+        data = response.json()
+        num_rows = data["response"]["numFound"]
+        rows = data["response"]["docs"]
+
+        return QueryResult(query=query, num_rows=num_rows, rows=rows)
+
+    def query_facets(
+        self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[str, Dict[str, int]]:
+        solr_query = self._build_solr_query(where)
+        solr_query["facet"] = "true"
+        solr_query["facet.field"] = facet_columns
+        solr_query["facet.limit"] = facet_limit
+
+        logger.info(f"Querying Solr collection {self.name} for facets with query: {solr_query}")
+
+        response = requests.get(f"{self._collection_base}/select", params=solr_query)
+        response.raise_for_status()
+
+        data = response.json()
+        facet_counts = data["facet_counts"]["facet_fields"]
+
+        results = {}
+        for facet_field, counts in facet_counts.items():
+            results[facet_field] = list(zip(counts[::2], counts[1::2]))
+
+        return results
+
+    def _build_solr_query(
+        self, query: Union[Query, Dict], search_term="*:*", extra: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        solr_query = {}
+        if query is None:
+            query = {}
+
+        if isinstance(query, Query):
+            where = query.where_clause
+            solr_query["fq"] = self._build_solr_where_clause(where)
+
+            if query.select_cols:
+                solr_query["fl"] = ",".join(query.select_cols)
+
+            if query.limit:
+                solr_query["rows"] = query.limit
+
+            if query.offset:
+                solr_query["start"] = query.offset
+
+        elif isinstance(query, dict):
+            solr_query["fq"] = self._build_solr_where_clause(query)
+
+        solr_query["wt"] = "json"
+        if "q" not in solr_query:
+            solr_query["q"] = search_term
+        if extra:
+            solr_query.update(extra)
+        logger.info(f"Built Solr query: {solr_query}")
+        return solr_query
+
+    def _build_solr_where_clause(self, where_clause: Dict) -> str:
+        if where_clause is None:
+            where_clause = {}
+        conditions = []
+        if self.parent.metadata.collection_type_slot:
+            where_clause = copy(where_clause)
+            where_clause[self.parent.metadata.collection_type_slot] = self.alias
+        for field, value in where_clause.items():
+            if not isinstance(value, (list, tuple)):
+                value = [value]
+            value = [f'"{v}"' if isinstance(v, str) else str(v) for v in value]
+            if len(value) > 1:
+                conditions.append(f"{field}:({' '.join(value)})")
+            else:
+                conditions.append(f"{field}:{value[0]}")
+
+        return " AND ".join(conditions)
linkml_store/api/stores/solr/solr_database.py
@@ -0,0 +1,83 @@
+import logging
+from typing import Optional
+
+import requests
+
+from linkml_store.api import Collection, Database
+from linkml_store.api.config import CollectionConfig
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.stores.solr.solr_collection import SolrCollection
+
+logger = logging.getLogger(__name__)
+
+
+class SolrDatabase(Database):
+    base_url: str
+    collection_class = SolrCollection
+    use_cores: bool = False
+
+    def __init__(self, handle: Optional[str] = None, **kwargs):
+        if handle.startswith("solr:"):
+            self.base_url = handle.replace("solr:", "")
+        else:
+            self.base_url = handle
+        super().__init__(handle=handle, **kwargs)
+
+    def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
+        if not self._collections:
+            self.init_collections()
+
+        if name not in self._collections.keys():
+            if create_if_not_exists:
+                self._collections[name] = self.create_collection(name)
+            else:
+                raise KeyError(f"Collection {name} does not exist")
+
+        return self._collections[name]
+
+    def create_collection(
+        self, name: str, alias: Optional[str] = None, metadata: Optional[CollectionConfig] = None, **kwargs
+    ) -> Collection:
+        if not name:
+            raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
+
+        collection_cls = self.collection_class
+        collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata)
+
+        if not self._collections:
+            self._collections = {}
+
+        if not alias:
+            alias = name
+
+        self._collections[alias] = collection
+        return collection
+
+    def init_collections(self):
+        if self._collections is None:
+            self._collections = {}
+        if self.metadata.collection_type_slot:
+            response = requests.get(
+                f"{self.base_url}/select",
+                params={
+                    "q": "*:*",
+                    "wt": "json",
+                    "rows": 0,
+                    "facet": "true",
+                    "facet.field": self.metadata.collection_type_slot,
+                    "facet.limit": -1,
+                },
+            )
+            response.raise_for_status()
+            data = response.json()
+            coll_names = data["facet_counts"]["facet_fields"][self.metadata.collection_type_slot]
+            coll_names = coll_names[::2]
+            for coll_name in coll_names:
+                self.create_collection(coll_name)
+        else:
+            self.create_collection("default")
+
+    def query(self, query: Query, **kwargs) -> QueryResult:
+        collection_name = query.from_table
+        collection = self.get_collection(collection_name)
+        return collection.query(query, **kwargs)
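
Finally, a short end-to-end sketch of driving the new Solr adapter; the endpoint URL and field names are placeholders, and per the constructor above the "solr:" prefix on the handle is optional (it is stripped when present):

    from linkml_store.api.queries import Query
    from linkml_store.api.stores.solr.solr_database import SolrDatabase

    # Placeholder endpoint; with no collection_type_slot configured, a single
    # "default" collection is created on first access.
    db = SolrDatabase(handle="solr:http://localhost:8983/solr/mycore")
    coll = db.get_collection("default")
    result = coll.query(Query(from_table="default", where_clause={"category": "gene"}, limit=5))
    print(result.num_rows, result.rows[:2])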