linkml-store: linkml_store-0.1.9-py3-none-any.whl → linkml_store-0.1.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- linkml_store/api/client.py +76 -11
- linkml_store/api/collection.py +223 -40
- linkml_store/api/config.py +59 -9
- linkml_store/api/database.py +45 -27
- linkml_store/api/stores/duckdb/duckdb_collection.py +21 -3
- linkml_store/api/stores/duckdb/duckdb_database.py +36 -3
- linkml_store/api/stores/filesystem/filesystem_collection.py +13 -4
- linkml_store/api/stores/filesystem/filesystem_database.py +10 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +80 -34
- linkml_store/api/stores/mongodb/mongodb_database.py +1 -36
- linkml_store/api/stores/solr/solr_collection.py +4 -4
- linkml_store/cli.py +44 -18
- linkml_store/index/__init__.py +21 -5
- linkml_store/index/implementations/llm_indexer.py +2 -1
- linkml_store/index/indexer.py +20 -4
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +69 -8
- linkml_store/utils/pandas_utils.py +40 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sql_utils.py +2 -1
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +46 -0
- linkml_store/webapi/main.py +572 -0
- linkml_store-0.1.11.dist-info/METADATA +171 -0
- linkml_store-0.1.11.dist-info/RECORD +60 -0
- {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/entry_points.txt +1 -0
- linkml_store-0.1.9.dist-info/METADATA +0 -61
- linkml_store-0.1.9.dist-info/RECORD +0 -49
- {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/LICENSE +0 -0
- {linkml_store-0.1.9.dist-info → linkml_store-0.1.11.dist-info}/WHEEL +0 -0
linkml_store/api/database.py
CHANGED
|
@@ -149,26 +149,19 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
149
149
|
def _initialize_collections(self):
|
|
150
150
|
if not self.metadata.collections:
|
|
151
151
|
return
|
|
152
|
-
for
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
# )
|
|
162
|
-
if False and typ is not None:
|
|
163
|
-
if not alias:
|
|
164
|
-
alias = name
|
|
165
|
-
name = typ
|
|
166
|
-
if not collection_config.name:
|
|
167
|
-
collection_config.name = name
|
|
168
|
-
_collection = self.create_collection(name, alias=alias, metadata=collection_config)
|
|
152
|
+
for k, collection_config in self.metadata.collections.items():
|
|
153
|
+
if collection_config.alias:
|
|
154
|
+
if collection_config.alias != k:
|
|
155
|
+
raise ValueError(f"Alias mismatch: {collection_config.alias} != {k}")
|
|
156
|
+
alias = k
|
|
157
|
+
typ = collection_config.type or alias
|
|
158
|
+
_collection = self.create_collection(typ, alias=alias, metadata=collection_config)
|
|
159
|
+
assert _collection.alias == alias
|
|
160
|
+
assert _collection.target_class_name == typ
|
|
169
161
|
if collection_config.attributes:
|
|
162
|
+
# initialize schema
|
|
170
163
|
sv = self.schema_view
|
|
171
|
-
cd = ClassDefinition(
|
|
164
|
+
cd = ClassDefinition(typ, attributes=collection_config.attributes)
|
|
172
165
|
sv.schema.classes[cd.name] = cd
|
|
173
166
|
sv.set_modified()
|
|
174
167
|
# assert collection.class_definition() is not None
|
|
@@ -275,7 +268,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
275
268
|
metadata: Optional[CollectionConfig] = None,
|
|
276
269
|
recreate_if_exists=False,
|
|
277
270
|
**kwargs,
|
|
278
|
-
) ->
|
|
271
|
+
) -> CollectionType:
|
|
279
272
|
"""
|
|
280
273
|
Create a new collection in the current database.
|
|
281
274
|
|
|
@@ -307,8 +300,10 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
307
300
|
if not name:
|
|
308
301
|
raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
|
|
309
302
|
collection_cls = self.collection_class
|
|
310
|
-
collection = collection_cls(name=name,
|
|
311
|
-
if
|
|
303
|
+
collection = collection_cls(name=name, parent=self, metadata=metadata)
|
|
304
|
+
if alias:
|
|
305
|
+
collection.metadata.alias = alias
|
|
306
|
+
if metadata and metadata.source:
|
|
312
307
|
collection.load_from_source()
|
|
313
308
|
if metadata and metadata.attributes:
|
|
314
309
|
sv = self.schema_view
|
|
@@ -321,7 +316,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
321
316
|
alias = name
|
|
322
317
|
self._collections[alias] = collection
|
|
323
318
|
if recreate_if_exists:
|
|
324
|
-
logger.debug(f"Recreating collection {collection.
|
|
319
|
+
logger.debug(f"Recreating collection {collection.alias}")
|
|
325
320
|
collection.delete_where({}, missing_ok=True)
|
|
326
321
|
return collection
|
|
327
322
|
|
|
@@ -339,7 +334,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
339
334
|
>>> collections = db.list_collections()
|
|
340
335
|
>>> len(collections)
|
|
341
336
|
2
|
|
342
|
-
>>> [c.
|
|
337
|
+
>>> [c.target_class_name for c in collections]
|
|
343
338
|
['Person', 'Product']
|
|
344
339
|
|
|
345
340
|
:param include_internal: include internal collections
|
|
@@ -367,7 +362,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
367
362
|
['Person', 'Product']
|
|
368
363
|
|
|
369
364
|
"""
|
|
370
|
-
return [c.
|
|
365
|
+
return [c.alias for c in self.list_collections(**kwargs)]
|
|
371
366
|
|
|
372
367
|
def get_collection(
|
|
373
368
|
self, name: str, type: Optional[str] = None, create_if_not_exists=True, **kwargs
|
|
@@ -410,7 +405,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
410
405
|
"""
|
|
411
406
|
Initialize collections.
|
|
412
407
|
|
|
413
|
-
Not typically called directly: consider making
|
|
408
|
+
TODO: Not typically called directly: consider making this private
|
|
414
409
|
:return:
|
|
415
410
|
"""
|
|
416
411
|
raise NotImplementedError
|
|
@@ -502,7 +497,7 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
502
497
|
>>> sorted(collection.class_definition().slots)
|
|
503
498
|
['capital', 'code', 'continent', 'languages', 'name']
|
|
504
499
|
|
|
505
|
-
:param schema_view:
|
|
500
|
+
:param schema_view: can be either a path to the schema, or a SchemaView object
|
|
506
501
|
:return:
|
|
507
502
|
"""
|
|
508
503
|
if isinstance(schema_view, Path):
|
|
@@ -585,7 +580,15 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
585
580
|
|
|
586
581
|
:return: A schema view
|
|
587
582
|
"""
|
|
588
|
-
|
|
583
|
+
logger.info(f"Inducing schema view for {self.handle}")
|
|
584
|
+
from linkml_runtime.utils.schema_builder import SchemaBuilder
|
|
585
|
+
|
|
586
|
+
sb = SchemaBuilder()
|
|
587
|
+
|
|
588
|
+
for collection_name in self.list_collection_names():
|
|
589
|
+
coll = self.get_collection(collection_name)
|
|
590
|
+
sb.add_class(coll.target_class_name)
|
|
591
|
+
return SchemaView(sb.schema)
|
|
589
592
|
|
|
590
593
|
def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
|
|
591
594
|
"""
|
|
@@ -683,6 +686,21 @@ class Database(ABC, Generic[CollectionType]):
|
|
|
683
686
|
"""
|
|
684
687
|
Drop the database and all collections.
|
|
685
688
|
|
|
689
|
+
>>> from linkml_store.api.client import Client
|
|
690
|
+
>>> client = Client()
|
|
691
|
+
>>> path = Path("/tmp/test.db")
|
|
692
|
+
>>> path.parent.mkdir(exist_ok=True, parents=True)
|
|
693
|
+
>>> db = client.attach_database(f"duckdb:///{path}")
|
|
694
|
+
>>> db.store({"persons": [{"id": "P1", "name": "John", "age_in_years": 30}]})
|
|
695
|
+
>>> coll = db.get_collection("persons")
|
|
696
|
+
>>> coll.find({}).num_rows
|
|
697
|
+
1
|
|
698
|
+
>>> db.drop()
|
|
699
|
+
>>> db = client.attach_database("duckdb:///tmp/test.db", alias="test")
|
|
700
|
+
>>> coll = db.get_collection("persons")
|
|
701
|
+
>>> coll.find({}).num_rows
|
|
702
|
+
0
|
|
703
|
+
|
|
686
704
|
:param kwargs: additional arguments
|
|
687
705
|
"""
|
|
688
706
|
raise NotImplementedError()
|
|
linkml_store/api/stores/duckdb/duckdb_collection.py
CHANGED
|
@@ -18,6 +18,9 @@ logger = logging.getLogger(__name__)
|
|
|
18
18
|
class DuckDBCollection(Collection):
|
|
19
19
|
_table_created: bool = None
|
|
20
20
|
|
|
21
|
+
def __init__(self, *args, **kwargs):
|
|
22
|
+
super().__init__(*args, **kwargs)
|
|
23
|
+
|
|
21
24
|
def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
|
|
22
25
|
logger.debug(f"Inserting {len(objs)}")
|
|
23
26
|
if not isinstance(objs, list):
|
|
@@ -90,7 +93,9 @@ class DuckDBCollection(Collection):
|
|
|
90
93
|
cd = self.class_definition()
|
|
91
94
|
with self.parent.engine.connect() as conn:
|
|
92
95
|
if not facet_columns:
|
|
93
|
-
|
|
96
|
+
if not cd:
|
|
97
|
+
raise ValueError(f"No class definition found for {self.target_class_name}")
|
|
98
|
+
facet_columns = list(cd.attributes.keys())
|
|
94
99
|
for col in facet_columns:
|
|
95
100
|
logger.debug(f"Faceting on {col}")
|
|
96
101
|
if isinstance(col, tuple):
|
|
@@ -101,7 +106,7 @@ class DuckDBCollection(Collection):
|
|
|
101
106
|
facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
|
|
102
107
|
logger.debug(f"Facet query: {facet_query_str}")
|
|
103
108
|
rows = list(conn.execute(text(facet_query_str)))
|
|
104
|
-
results[col] = rows
|
|
109
|
+
results[col] = [tuple(row) for row in rows]
|
|
105
110
|
return results
|
|
106
111
|
|
|
107
112
|
def _sqla_table(self, cd: ClassDefinition) -> Table:
|
|
@@ -110,7 +115,7 @@ class DuckDBCollection(Collection):
|
|
|
110
115
|
cols = []
|
|
111
116
|
for att in schema_view.class_induced_slots(cd.name):
|
|
112
117
|
typ = TMAP.get(att.range, sqla.String)
|
|
113
|
-
if att.inlined:
|
|
118
|
+
if att.inlined or att.inlined_as_list:
|
|
114
119
|
typ = sqla.JSON
|
|
115
120
|
if att.multivalued:
|
|
116
121
|
typ = sqla.ARRAY(typ, dimensions=1)
|
|
@@ -121,6 +126,17 @@ class DuckDBCollection(Collection):
|
|
|
121
126
|
t = Table(self.alias, metadata_obj, *cols)
|
|
122
127
|
return t
|
|
123
128
|
|
|
129
|
+
def _check_if_initialized(self) -> bool:
|
|
130
|
+
# if self._initialized:
|
|
131
|
+
# return True
|
|
132
|
+
query = Query(
|
|
133
|
+
from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
|
|
134
|
+
)
|
|
135
|
+
qr = self.parent.query(query)
|
|
136
|
+
if qr.num_rows > 0:
|
|
137
|
+
return True
|
|
138
|
+
return False
|
|
139
|
+
|
|
124
140
|
def _create_table(self, cd: ClassDefinition):
|
|
125
141
|
if self._table_created or self.metadata.is_prepopulated:
|
|
126
142
|
logger.info(f"Already have table for: {cd.name}")
|
|
@@ -132,6 +148,7 @@ class DuckDBCollection(Collection):
|
|
|
132
148
|
if qr.num_rows > 0:
|
|
133
149
|
logger.info(f"Table already exists for {cd.name}")
|
|
134
150
|
self._table_created = True
|
|
151
|
+
self._initialized = True
|
|
135
152
|
self.metadata.is_prepopulated = True
|
|
136
153
|
return
|
|
137
154
|
logger.info(f"Creating table for {cd.name}")
|
|
@@ -142,4 +159,5 @@ class DuckDBCollection(Collection):
|
|
|
142
159
|
conn.execute(text(ddl))
|
|
143
160
|
conn.commit()
|
|
144
161
|
self._table_created = True
|
|
162
|
+
self._initialized = True
|
|
145
163
|
self.metadata.is_prepopulated = True
|
|
linkml_store/api/stores/duckdb/duckdb_database.py
CHANGED
|
@@ -26,18 +26,32 @@ TYPE_MAP = {
|
|
|
26
26
|
"JSON": "Any",
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
+
MEMORY_HANDLE = "duckdb:///:memory:"
|
|
30
|
+
|
|
29
31
|
|
|
30
32
|
logger = logging.getLogger(__name__)
|
|
31
33
|
|
|
32
34
|
|
|
33
35
|
class DuckDBDatabase(Database):
|
|
36
|
+
"""
|
|
37
|
+
An adapter for DuckDB databases.
|
|
38
|
+
|
|
39
|
+
Note that this adapter does not make use of a LinkML relational model transformation and
|
|
40
|
+
SQL Alchemy ORM layer. Instead, it attempts to map each collection (which is of type
|
|
41
|
+
some LinkML class) to a *single* DuckDB table. New tables are not created for nested references,
|
|
42
|
+
and linking tables are not created for many-to-many relationships.
|
|
43
|
+
|
|
44
|
+
Instead the native DuckDB ARRAY type is used to store multivalued attributes, and DuckDB JSON
|
|
45
|
+
types are used for nested inlined objects.
|
|
46
|
+
"""
|
|
47
|
+
|
|
34
48
|
_connection: DuckDBPyConnection = None
|
|
35
49
|
_engine: sqlalchemy.Engine = None
|
|
36
50
|
collection_class = DuckDBCollection
|
|
37
51
|
|
|
38
52
|
def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs):
|
|
39
53
|
if handle is None:
|
|
40
|
-
handle =
|
|
54
|
+
handle = MEMORY_HANDLE
|
|
41
55
|
if recreate_if_exists:
|
|
42
56
|
path = Path(handle.replace("duckdb:///", ""))
|
|
43
57
|
if path.exists():
|
|
@@ -64,6 +78,17 @@ class DuckDBDatabase(Database):
|
|
|
64
78
|
def close(self, **kwargs):
|
|
65
79
|
self.engine.dispose()
|
|
66
80
|
|
|
81
|
+
def drop(self, missing_ok=True, **kwargs):
|
|
82
|
+
self.close()
|
|
83
|
+
if self.handle == MEMORY_HANDLE:
|
|
84
|
+
return
|
|
85
|
+
path = Path(self.handle.replace("duckdb:///", ""))
|
|
86
|
+
if path.exists():
|
|
87
|
+
path.unlink()
|
|
88
|
+
else:
|
|
89
|
+
if not missing_ok:
|
|
90
|
+
raise FileNotFoundError(f"Database file not found: {path}")
|
|
91
|
+
|
|
67
92
|
def query(self, query: Query, **kwargs) -> QueryResult:
|
|
68
93
|
json_encoded_cols = []
|
|
69
94
|
if query.from_table:
|
|
@@ -82,7 +107,8 @@ class DuckDBDatabase(Database):
|
|
|
82
107
|
if sv:
|
|
83
108
|
cd = None
|
|
84
109
|
for c in self._collections.values():
|
|
85
|
-
if c.name == query.from_table or c.metadata.alias == query.from_table:
|
|
110
|
+
# if c.name == query.from_table or c.metadata.alias == query.from_table:
|
|
111
|
+
if c.alias == query.from_table or c.target_class_name == query.from_table:
|
|
86
112
|
cd = c.class_definition()
|
|
87
113
|
break
|
|
88
114
|
if cd:
|
|
@@ -103,7 +129,14 @@ class DuckDBDatabase(Database):
|
|
|
103
129
|
if row[col]:
|
|
104
130
|
if isinstance(row[col], list):
|
|
105
131
|
for i in range(len(row[col])):
|
|
106
|
-
|
|
132
|
+
try:
|
|
133
|
+
parsed_val = json.loads(row[col][i])
|
|
134
|
+
except json.JSONDecodeError as e:
|
|
135
|
+
logger.error(f"Failed to parse col {col}[{i}] == {row[col][i]}")
|
|
136
|
+
raise e
|
|
137
|
+
row[col][i] = parsed_val
|
|
138
|
+
elif isinstance(row[col], dict):
|
|
139
|
+
pass
|
|
107
140
|
else:
|
|
108
141
|
row[col] = json.loads(row[col])
|
|
109
142
|
qr.set_rows(pd.DataFrame(rows))
|
|
linkml_store/api/stores/filesystem/filesystem_collection.py
CHANGED
|
@@ -31,7 +31,7 @@ class FileSystemCollection(Collection[DatabaseType]):
|
|
|
31
31
|
|
|
32
32
|
@property
|
|
33
33
|
def path_to_file(self):
|
|
34
|
-
return Path(self.parent.directory_path) / f"{self.
|
|
34
|
+
return Path(self.parent.directory_path) / f"{self.alias}.{self.file_format}"
|
|
35
35
|
|
|
36
36
|
@property
|
|
37
37
|
def objects_as_list(self) -> List[OBJECT]:
|
|
@@ -150,13 +150,20 @@ class FileSystemCollection(Collection[DatabaseType]):
|
|
|
150
150
|
curr_objects = [o for o in self.objects_as_list if not matches(o)]
|
|
151
151
|
self._set_objects(curr_objects)
|
|
152
152
|
|
|
153
|
-
def query(self, query: Query, **kwargs) -> QueryResult:
|
|
154
|
-
|
|
153
|
+
def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
|
|
154
|
+
limit = limit or query.limit
|
|
155
|
+
offset = offset or query.offset
|
|
156
|
+
if offset is None:
|
|
157
|
+
offset = 0
|
|
155
158
|
where = query.where_clause or {}
|
|
156
159
|
match = mongo_query_to_match_function(where)
|
|
157
160
|
rows = [o for o in self.objects_as_list if match(o)]
|
|
158
161
|
count = len(rows)
|
|
159
|
-
|
|
162
|
+
if limit is None or limit < 0:
|
|
163
|
+
limit = count
|
|
164
|
+
# TODO: avoid recalculating
|
|
165
|
+
returned_row = rows[offset : offset + limit]
|
|
166
|
+
return QueryResult(query=query, num_rows=count, rows=returned_row)
|
|
160
167
|
|
|
161
168
|
def query_facets(
|
|
162
169
|
self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
|
|
@@ -170,6 +177,8 @@ class FileSystemCollection(Collection[DatabaseType]):
|
|
|
170
177
|
for fc in facet_columns:
|
|
171
178
|
if fc in row:
|
|
172
179
|
v = row[fc]
|
|
180
|
+
if not isinstance(v, str):
|
|
181
|
+
v = str(v)
|
|
173
182
|
if v not in facet_results[fc]:
|
|
174
183
|
facet_results[fc][v] = 1
|
|
175
184
|
else:
|
|
linkml_store/api/stores/filesystem/filesystem_database.py
CHANGED
|
@@ -9,6 +9,7 @@ from linkml_runtime import SchemaView
|
|
|
9
9
|
from linkml_store.api import Database
|
|
10
10
|
from linkml_store.api.config import DatabaseConfig
|
|
11
11
|
from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
|
|
12
|
+
from linkml_store.utils.file_utils import safe_remove_directory
|
|
12
13
|
from linkml_store.utils.format_utils import Format, load_objects
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
@@ -20,6 +21,8 @@ class FileSystemDatabase(Database):
|
|
|
20
21
|
directory_path: Optional[Path] = None
|
|
21
22
|
default_file_format: Optional[str] = None
|
|
22
23
|
|
|
24
|
+
no_backup_on_drop: bool = False
|
|
25
|
+
|
|
23
26
|
def __init__(self, handle: Optional[str] = None, **kwargs):
|
|
24
27
|
handle = handle.replace("file:", "")
|
|
25
28
|
if handle.startswith("//"):
|
|
@@ -43,6 +46,12 @@ class FileSystemDatabase(Database):
|
|
|
43
46
|
def close(self, **kwargs):
|
|
44
47
|
pass
|
|
45
48
|
|
|
49
|
+
def drop(self, no_backup=False, **kwargs):
|
|
50
|
+
self.close()
|
|
51
|
+
path = self.directory_path
|
|
52
|
+
if path.exists():
|
|
53
|
+
safe_remove_directory(path, no_backup=self.no_backup_on_drop or no_backup)
|
|
54
|
+
|
|
46
55
|
def init_collections(self):
|
|
47
56
|
metadata = self.metadata
|
|
48
57
|
if self._collections is None:
|
|
@@ -63,7 +72,7 @@ class FileSystemDatabase(Database):
|
|
|
63
72
|
self._collections[n] = collection
|
|
64
73
|
collection._set_objects(objs)
|
|
65
74
|
|
|
66
|
-
def
|
|
75
|
+
def xxxinduce_schema_view(self) -> SchemaView:
|
|
67
76
|
logger.info(f"Inducing schema view for {self.handle}")
|
|
68
77
|
sb = SchemaBuilder()
|
|
69
78
|
|
|
linkml_store/api/stores/mongodb/mongodb_collection.py
CHANGED
|
@@ -2,7 +2,6 @@ import logging
|
|
|
2
2
|
from copy import copy
|
|
3
3
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
4
4
|
|
|
5
|
-
from linkml_runtime.linkml_model import SlotDefinition
|
|
6
5
|
from pymongo.collection import Collection as MongoCollection
|
|
7
6
|
|
|
8
7
|
from linkml_store.api import Collection
|
|
@@ -24,11 +23,15 @@ class MongoDBCollection(Collection):
|
|
|
24
23
|
|
|
25
24
|
@property
|
|
26
25
|
def mongo_collection(self) -> MongoCollection:
|
|
27
|
-
|
|
26
|
+
# collection_name = self.alias or self.name
|
|
27
|
+
collection_name = self.alias
|
|
28
|
+
if not collection_name:
|
|
28
29
|
raise ValueError("Collection name not set")
|
|
29
|
-
collection_name = self.alias or self.name
|
|
30
30
|
return self.parent.native_db[collection_name]
|
|
31
31
|
|
|
32
|
+
def _check_if_initialized(self) -> bool:
|
|
33
|
+
return self.alias in self.parent.native_db.list_collection_names()
|
|
34
|
+
|
|
32
35
|
def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
|
|
33
36
|
if not isinstance(objs, list):
|
|
34
37
|
objs = [objs]
|
|
@@ -38,13 +41,15 @@ class MongoDBCollection(Collection):
|
|
|
38
41
|
del obj["_id"]
|
|
39
42
|
self._post_insert_hook(objs)
|
|
40
43
|
|
|
41
|
-
def query(self, query: Query, limit: Optional[int] = None, **kwargs) -> QueryResult:
|
|
44
|
+
def query(self, query: Query, limit: Optional[int] = None, offset: Optional[int] = None, **kwargs) -> QueryResult:
|
|
42
45
|
mongo_filter = self._build_mongo_filter(query.where_clause)
|
|
43
46
|
limit = limit or query.limit
|
|
47
|
+
cursor = self.mongo_collection.find(mongo_filter)
|
|
44
48
|
if limit and limit >= 0:
|
|
45
|
-
cursor =
|
|
46
|
-
|
|
47
|
-
|
|
49
|
+
cursor = cursor.limit(limit)
|
|
50
|
+
offset = offset or query.offset
|
|
51
|
+
if offset and offset >= 0:
|
|
52
|
+
cursor = cursor.skip(offset)
|
|
48
53
|
|
|
49
54
|
def _as_row(row: dict):
|
|
50
55
|
row = copy(row)
|
|
@@ -63,46 +68,87 @@ class MongoDBCollection(Collection):
|
|
|
63
68
|
mongo_filter[field] = value
|
|
64
69
|
return mongo_filter
|
|
65
70
|
|
|
71
|
+
from typing import Any, Dict, List, Union
|
|
72
|
+
|
|
66
73
|
def query_facets(
|
|
67
|
-
self,
|
|
68
|
-
|
|
74
|
+
self,
|
|
75
|
+
where: Dict = None,
|
|
76
|
+
facet_columns: List[Union[str, Tuple[str, ...]]] = None,
|
|
77
|
+
facet_limit=DEFAULT_FACET_LIMIT,
|
|
78
|
+
**kwargs,
|
|
79
|
+
) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
|
|
69
80
|
results = {}
|
|
70
|
-
cd = self.class_definition()
|
|
71
81
|
if not facet_columns:
|
|
72
82
|
facet_columns = list(self.class_definition().attributes.keys())
|
|
73
83
|
|
|
74
84
|
for col in facet_columns:
|
|
75
85
|
logger.debug(f"Faceting on {col}")
|
|
86
|
+
|
|
87
|
+
# Handle tuple columns
|
|
88
|
+
if isinstance(col, tuple):
|
|
89
|
+
group_id = {k.replace(".", "_"): f"${k}" for k in col}
|
|
90
|
+
all_fields = col
|
|
91
|
+
else:
|
|
92
|
+
group_id = f"${col}"
|
|
93
|
+
all_fields = [col]
|
|
94
|
+
|
|
95
|
+
# Initial pipeline without unwinding
|
|
96
|
+
facet_pipeline = [
|
|
97
|
+
{"$match": where} if where else {"$match": {}},
|
|
98
|
+
{"$group": {"_id": group_id, "count": {"$sum": 1}}},
|
|
99
|
+
{"$sort": {"count": -1}},
|
|
100
|
+
{"$limit": facet_limit},
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
logger.info(f"Initial facet pipeline: {facet_pipeline}")
|
|
104
|
+
initial_results = list(self.mongo_collection.aggregate(facet_pipeline))
|
|
105
|
+
|
|
106
|
+
# Check if we need to unwind based on the results
|
|
107
|
+
needs_unwinding = False
|
|
76
108
|
if isinstance(col, tuple):
|
|
77
|
-
|
|
109
|
+
needs_unwinding = any(
|
|
110
|
+
isinstance(result["_id"], dict) and any(isinstance(v, list) for v in result["_id"].values())
|
|
111
|
+
for result in initial_results
|
|
112
|
+
)
|
|
113
|
+
else:
|
|
114
|
+
needs_unwinding = any(isinstance(result["_id"], list) for result in initial_results)
|
|
115
|
+
|
|
116
|
+
if needs_unwinding:
|
|
117
|
+
logger.info(f"Detected array values for {col}, unwinding...")
|
|
118
|
+
facet_pipeline = [{"$match": where} if where else {"$match": {}}]
|
|
119
|
+
|
|
120
|
+
# Unwind each field if needed
|
|
121
|
+
for field in all_fields:
|
|
122
|
+
field_parts = field.split(".")
|
|
123
|
+
for i in range(len(field_parts)):
|
|
124
|
+
facet_pipeline.append({"$unwind": f"${'.'.join(field_parts[:i + 1])}"})
|
|
125
|
+
|
|
126
|
+
facet_pipeline.extend(
|
|
127
|
+
[
|
|
128
|
+
{"$group": {"_id": group_id, "count": {"$sum": 1}}},
|
|
129
|
+
{"$sort": {"count": -1}},
|
|
130
|
+
{"$limit": facet_limit},
|
|
131
|
+
]
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
logger.info(f"Updated facet pipeline with unwinding: {facet_pipeline}")
|
|
135
|
+
facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
|
|
78
136
|
else:
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
group = {"$group": {"_id": f"${col}", "count": {"$sum": 1}}}
|
|
137
|
+
facet_results = initial_results
|
|
138
|
+
|
|
139
|
+
logger.info(f"Facet results: {facet_results}")
|
|
140
|
+
|
|
141
|
+
# Process results
|
|
85
142
|
if isinstance(col, tuple):
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
{"$match": where} if where else {"$match": {}},
|
|
91
|
-
{"$unwind": f"${col}"},
|
|
92
|
-
group,
|
|
93
|
-
{"$sort": {"count": -1}},
|
|
94
|
-
{"$limit": facet_limit},
|
|
143
|
+
results[col] = [
|
|
144
|
+
(tuple(result["_id"].values()), result["count"])
|
|
145
|
+
for result in facet_results
|
|
146
|
+
if result["_id"] is not None and all(v is not None for v in result["_id"].values())
|
|
95
147
|
]
|
|
96
148
|
else:
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
group,
|
|
100
|
-
{"$sort": {"count": -1}},
|
|
101
|
-
{"$limit": facet_limit},
|
|
149
|
+
results[col] = [
|
|
150
|
+
(result["_id"], result["count"]) for result in facet_results if result["_id"] is not None
|
|
102
151
|
]
|
|
103
|
-
logger.info(f"Facet pipeline: {facet_pipeline}")
|
|
104
|
-
facet_results = list(self.mongo_collection.aggregate(facet_pipeline))
|
|
105
|
-
results[col] = [(result["_id"], result["count"]) for result in facet_results]
|
|
106
152
|
|
|
107
153
|
return results
|
|
108
154
|
|
|
linkml_store/api/stores/mongodb/mongodb_database.py
CHANGED
|
@@ -3,9 +3,6 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
from typing import Optional
|
|
5
5
|
|
|
6
|
-
from linkml_runtime import SchemaView
|
|
7
|
-
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
|
|
8
|
-
from linkml_runtime.utils.schema_builder import SchemaBuilder
|
|
9
6
|
from pymongo import MongoClient
|
|
10
7
|
from pymongo.database import Database as NativeDatabase
|
|
11
8
|
|
|
@@ -63,10 +60,9 @@ class MongoDBDatabase(Database):
|
|
|
63
60
|
self._native_client.close()
|
|
64
61
|
|
|
65
62
|
def drop(self, **kwargs):
|
|
66
|
-
self.native_client.drop_database(self.
|
|
63
|
+
self.native_client.drop_database(self.native_db.name)
|
|
67
64
|
|
|
68
65
|
def query(self, query: Query, **kwargs) -> QueryResult:
|
|
69
|
-
# TODO: DRY
|
|
70
66
|
if query.from_table:
|
|
71
67
|
collection = self.get_collection(query.from_table)
|
|
72
68
|
return collection.query(query, **kwargs)
|
|
@@ -81,34 +77,3 @@ class MongoDBDatabase(Database):
|
|
|
81
77
|
if collection_name not in self._collections:
|
|
82
78
|
collection = MongoDBCollection(name=collection_name, parent=self)
|
|
83
79
|
self._collections[collection_name] = collection
|
|
84
|
-
|
|
85
|
-
def induce_schema_view(self) -> SchemaView:
|
|
86
|
-
logger.info(f"Inducing schema view for {self.handle}")
|
|
87
|
-
sb = SchemaBuilder()
|
|
88
|
-
schema = sb.schema
|
|
89
|
-
|
|
90
|
-
for collection_name in self.native_db.list_collection_names():
|
|
91
|
-
sb.add_class(collection_name)
|
|
92
|
-
mongo_collection = self.native_db[collection_name]
|
|
93
|
-
sample_doc = mongo_collection.find_one()
|
|
94
|
-
if sample_doc:
|
|
95
|
-
for field, value in sample_doc.items():
|
|
96
|
-
if field == "_id":
|
|
97
|
-
continue
|
|
98
|
-
sd = SlotDefinition(field)
|
|
99
|
-
if isinstance(value, list):
|
|
100
|
-
sd.multivalued = True
|
|
101
|
-
if isinstance(value, dict):
|
|
102
|
-
sd.inlined = True
|
|
103
|
-
sb.schema.classes[collection_name].attributes[sd.name] = sd
|
|
104
|
-
|
|
105
|
-
sb.add_defaults()
|
|
106
|
-
for cls_name in schema.classes:
|
|
107
|
-
if cls_name in self.metadata.collections:
|
|
108
|
-
collection_metadata = self.metadata.collections[cls_name]
|
|
109
|
-
if collection_metadata.attributes:
|
|
110
|
-
del schema.classes[cls_name]
|
|
111
|
-
cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
|
|
112
|
-
schema.classes[cls.name] = cls
|
|
113
|
-
|
|
114
|
-
return SchemaView(schema)
|
|
linkml_store/api/stores/solr/solr_collection.py
CHANGED
|
@@ -18,7 +18,7 @@ class SolrCollection(Collection):
|
|
|
18
18
|
@property
|
|
19
19
|
def _collection_base(self) -> str:
|
|
20
20
|
if self.parent.use_cores:
|
|
21
|
-
base_url = f"{self.parent.base_url}/{self.
|
|
21
|
+
base_url = f"{self.parent.base_url}/{self.alias}"
|
|
22
22
|
else:
|
|
23
23
|
base_url = self.parent.base_url
|
|
24
24
|
return base_url
|
|
@@ -37,7 +37,7 @@ class SolrCollection(Collection):
|
|
|
37
37
|
if not qfs:
|
|
38
38
|
raise ValueError("No searchable slots configured for Solr collection")
|
|
39
39
|
solr_query = self._build_solr_query(where, search_term=query, extra={"defType": index_name, "qf": qfs})
|
|
40
|
-
logger.info(f"Querying Solr collection {self.
|
|
40
|
+
logger.info(f"Querying Solr collection {self.alias} with query: {solr_query}")
|
|
41
41
|
|
|
42
42
|
response = requests.get(f"{self._collection_base}/select", params=solr_query)
|
|
43
43
|
response.raise_for_status()
|
|
@@ -50,7 +50,7 @@ class SolrCollection(Collection):
|
|
|
50
50
|
|
|
51
51
|
def query(self, query: Query, **kwargs) -> QueryResult:
|
|
52
52
|
solr_query = self._build_solr_query(query)
|
|
53
|
-
logger.info(f"Querying Solr collection {self.
|
|
53
|
+
logger.info(f"Querying Solr collection {self.alias} with query: {solr_query}")
|
|
54
54
|
|
|
55
55
|
response = requests.get(f"{self._collection_base}/select", params=solr_query)
|
|
56
56
|
response.raise_for_status()
|
|
@@ -69,7 +69,7 @@ class SolrCollection(Collection):
|
|
|
69
69
|
solr_query["facet.field"] = facet_columns
|
|
70
70
|
solr_query["facet.limit"] = facet_limit
|
|
71
71
|
|
|
72
|
-
logger.info(f"Querying Solr collection {self.
|
|
72
|
+
logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")
|
|
73
73
|
|
|
74
74
|
response = requests.get(f"{self._collection_base}/select", params=solr_query)
|
|
75
75
|
response.raise_for_status()
|