linkml-store 0.1.11__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store has been flagged as potentially problematic in the registry.
- {linkml_store-0.1.11 → linkml_store-0.1.12}/PKG-INFO +1 -1
- {linkml_store-0.1.11 → linkml_store-0.1.12}/pyproject.toml +1 -1
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/collection.py +17 -5
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/config.py +2 -1
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/database.py +32 -3
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/duckdb_database.py +31 -3
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/mongodb/mongodb_database.py +31 -1
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/cli.py +29 -2
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/format_utils.py +132 -14
- linkml_store-0.1.12/src/linkml_store/utils/mongodb_utils.py +145 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/sql_utils.py +7 -2
- linkml_store-0.1.12/src/linkml_store/webapi/html/generic.html.j2 +43 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/main.py +346 -63
- linkml_store-0.1.11/src/linkml_store/webapi/html/generic.html.j2 +0 -46
- {linkml_store-0.1.11 → linkml_store-0.1.12}/LICENSE +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/README.md +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/client.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/queries.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/types.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/constants.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/implementations/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/indexer.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/change_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/file_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/io.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/object_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/pandas_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/patch_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/query_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/schema_utils.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/__init__.py +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/base.html.j2 +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
- {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
{linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/collection.py RENAMED

@@ -346,7 +346,10 @@ class Collection(Generic[DatabaseType]):
         id_field = self.identifier_attribute_name
         if not id_field:
             raise ValueError(f"No identifier for {self.name}")
-
+        if len(ids) == 1:
+            return self.find({id_field: ids[0]})
+        else:
+            return self.find({id_field: {"$in": ids}})
 
     def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
         """
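Note on the hunk above: a single requested id is now a direct equality query, while multiple ids are folded into a MongoDB-style `$in` membership query for the backend to translate. A minimal usage sketch, assuming the linkml-store `Client` API; the collection and field names here are illustrative:

    from linkml_store import Client

    client = Client()
    db = client.attach_database("duckdb", alias="mem")  # in-memory DuckDB
    coll = db.create_collection("Person", alias="persons")
    coll.insert([{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}])

    # one id -> {id_field: id}; several ids -> {id_field: {"$in": ids}}
    qr = coll.find({"id": {"$in": ["P1", "P2"]}})
    print(qr.rows)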
@@ -518,7 +521,7 @@ class Collection(Generic[DatabaseType]):
         :return:
         """
         cd = self.class_definition()
-        return cd is not None
+        return cd is not None and cd.attributes
 
     def load_from_source(self, load_if_exists=False):
         """
@@ -535,11 +538,19 @@
         kwargs = source.arguments or {}
         if source.local_path:
             objects = load_objects(
-                metadata.source.local_path,
+                metadata.source.local_path,
+                format=source.format,
+                expected_type=source.expected_type,
+                compression=source.compression,
+                **kwargs,
             )
         elif metadata.source.url:
             objects = load_objects_from_url(
-                metadata.source.url,
+                metadata.source.url,
+                format=source.format,
+                expected_type=source.expected_type,
+                compression=source.compression,
+                **kwargs,
             )
         self.insert(objects)
 
@@ -746,6 +757,7 @@
         sv: SchemaView = self.parent.schema_view
         if sv:
             cls = sv.get_class(self.target_class_name)
+            # cls = sv.schema.classes[self.target_class_name]
             if cls and not cls.attributes:
                 if not sv.class_induced_slots(cls.name):
                     for att in self._induce_attributes():
@@ -868,7 +880,7 @@
                 exact_dimensions_list.append(v.shape)
                 break
             if isinstance(v, list):
-                v = v[0]
+                v = v[0] if v else None
                 multivalueds.append(True)
             elif isinstance(v, dict):
                 v = list(v.values())[0]
{linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/config.py RENAMED

@@ -33,6 +33,7 @@ class CollectionSource(ConfiguredBaseModel):
     refresh_interval_days: Optional[float] = None
     expected_type: Optional[str] = None
     format: Optional[str] = None
+    compression: Optional[str] = None
     arguments: Optional[Dict[str, Any]] = None
 
 
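The new `compression` field rounds out `CollectionSource`; as the collection.py hunks above show, `load_from_source` now passes it through to `load_objects` together with `format` and `expected_type`. A hedged sketch of constructing such a source in code (the URL is illustrative):

    from linkml_store.api.config import CollectionSource

    src = CollectionSource(
        url="https://example.org/data/countries.yaml.gz",  # illustrative
        format="yaml",
        compression="gz",
    )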
@@ -73,11 +74,11 @@ class CollectionConfig(ConfiguredBaseModel):
         default=None,
         description="Metadata about the source",
     )
-    # TODO: derived_from
     derived_from: Optional[List[DerivationConfiguration]] = Field(
         default=None,
         description="LinkML-Map derivations",
     )
+    page_size: Optional[int] = Field(default=None, description="Suggested page size (items per page) in apps and APIs")
 
 
 class DatabaseConfig(ConfiguredBaseModel):
{linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/database.py RENAMED

@@ -19,7 +19,7 @@ from typing import (
 )
 
 from linkml_store.api.types import CollectionType
-from linkml_store.utils.format_utils import load_objects, render_output
+from linkml_store.utils.format_utils import Format, load_objects, render_output
 from linkml_store.utils.patch_utils import PatchDict
 
 try:
@@ -705,7 +705,7 @@ class Database(ABC, Generic[CollectionType]):
         """
         raise NotImplementedError()
 
-    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+    def import_database(self, location: str, source_format: Optional[Union[str, Format]] = None, **kwargs):
         """
         Import a database from a file or location.
 
@@ -713,11 +713,27 @@
         :param source_format: source format
         :param kwargs: additional arguments
         """
+        if isinstance(source_format, str):
+            source_format = Format(source_format)
+        if isinstance(source_format, Format):
+            if source_format.is_dump_format() and source_format in [Format.SQLDUMP_DUCKDB, Format.DUMP_MONGODB]:
+                # import into a test instance
+                tmp_handle = source_format.value
+                client = self.parent
+                tmp_db = client.attach_database(tmp_handle, alias="tmp")
+                # TODO: check for infinite recursion
+                tmp_db.import_database(location, source_format=source_format)
+                obj = {}
+                for coll in tmp_db.list_collections():
+                    qr = coll.find({}, limit=-1)
+                    obj[coll.alias] = qr.rows
+                self.store(obj)
+                return
         objects = load_objects(location, format=source_format)
         for obj in objects:
             self.store(obj)
 
-    def export_database(self, location: str, target_format: Optional[str] = None, **kwargs):
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
         """
         Export a database to a file or location.
 
@@ -726,10 +742,23 @@
         :param kwargs: additional arguments
         """
         obj = {}
+        if isinstance(target_format, str):
+            target_format = Format(target_format)
         for coll in self.list_collections():
             qr = coll.find({}, limit=-1)
             obj[coll.alias] = qr.rows
         logger.info(f"Exporting object with {len(obj)} collections to {location} in {target_format} format")
+        if isinstance(target_format, Format):
+            if target_format.is_dump_format() and target_format in [Format.SQLDUMP_DUCKDB, Format.DUMP_MONGODB]:
+                tmp_handle = target_format.value
+                client = self.parent
+                tmp_db = client.attach_database(tmp_handle, alias="tmp")
+                tmp_db.store(obj)
+                # TODO: check for infinite recursion
+                tmp_db.export_database(location, target_format=target_format)
+                return
+        if Path(location).is_dir():
+            raise ValueError(f"{location} is a directory; cannot write {target_format} to a dir")
         with open(location, "w", encoding="utf-8") as stream:
             stream.write(render_output(obj, format=target_format))
 
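The dump-format branches above bounce through a temporary database whose handle is the format value itself ("duckdb" or "mongodb"), so the generic Database can delegate native dumps to the matching adapter. A round-trip sketch, assuming a local DuckDB file (paths are illustrative):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("duckdb:///countries.db", alias="src")
    db.export_database("countries.yaml", target_format="yaml")

    copy = client.attach_database("duckdb", alias="dst")  # in-memory target
    copy.import_database("countries.yaml", source_format="yaml")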
{linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/duckdb_database.py RENAMED

@@ -1,11 +1,10 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
 import pandas as pd
 import sqlalchemy
-from duckdb import DuckDBPyConnection
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from linkml_runtime.utils.schema_builder import SchemaBuilder
@@ -14,6 +13,7 @@ from sqlalchemy import NullPool, text
 from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+from linkml_store.utils.format_utils import Format
 from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
 
 TYPE_MAP = {
@@ -45,7 +45,7 @@ class DuckDBDatabase(Database):
     types are used for nested inlined objects.
     """
 
-    _connection: DuckDBPyConnection = None
+    # _connection: DuckDBPyConnection = None
     _engine: sqlalchemy.Engine = None
     collection_class = DuckDBCollection
 
@@ -202,3 +202,31 @@
         cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
         schema.classes[cls.name] = cls
         return SchemaView(schema)
+
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
+        if target_format == "duckdb" or target_format == Format.SQLDUMP_DUCKDB:
+            path = Path(location)
+            if path.exists():
+                if path.is_file():
+                    path.unlink()
+            with self.engine.connect() as conn:
+                sql = text(f"EXPORT DATABASE '{location}'")
+                conn.execute(sql)
+        else:
+            super().export_database(location, target_format=target_format, **kwargs)
+
+    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+        """
+        Import a database from a file or location.
+
+        :param location: location of the file
+        :param source_format: source format
+        :param kwargs: additional arguments
+        """
+        if source_format == Format.SQLDUMP_DUCKDB.value or source_format == Format.SQLDUMP_DUCKDB:
+            with self.engine.connect() as conn:
+                sql = text(f"IMPORT DATABASE '{location}'")
+                conn.execute(sql)
+                conn.commit()
+        else:
+            super().import_database(location, source_format=source_format, **kwargs)
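For the native path, DuckDB's EXPORT DATABASE writes a directory containing schema and load scripts plus the table data, and IMPORT DATABASE replays them. The overrides above are roughly equivalent to the following stand-alone sketch ('countries_dump' is an illustrative directory name):

    import duckdb

    con = duckdb.connect("countries.db")
    con.execute("EXPORT DATABASE 'countries_dump'")  # writes schema.sql, load.sql, data files
    con.close()

    fresh = duckdb.connect(":memory:")
    fresh.execute("IMPORT DATABASE 'countries_dump'")  # replays the dump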
{linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/mongodb/mongodb_database.py RENAMED

@@ -1,7 +1,8 @@
 # mongodb_database.py
 
 import logging
-from typing import Optional
+from pathlib import Path
+from typing import Optional, Union
 
 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase
@@ -9,6 +10,9 @@ from pymongo.database import Database as NativeDatabase
 from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection
+from linkml_store.utils.file_utils import safe_remove_directory
+from linkml_store.utils.format_utils import Format
+from linkml_store.utils.mongodb_utils import import_mongodb
 
 logger = logging.getLogger(__name__)
 
@@ -27,6 +31,8 @@ class MongoDBDatabase(Database):
     def __init__(self, handle: Optional[str] = None, **kwargs):
         if handle is None:
             handle = "mongodb://localhost:27017/test"
+        if handle == "mongodb":
+            handle = "mongodb://localhost:27017/temporary"
         super().__init__(handle=handle, **kwargs)
 
     @property
@@ -77,3 +83,27 @@
         if collection_name not in self._collections:
             collection = MongoDBCollection(name=collection_name, parent=self)
             self._collections[collection_name] = collection
+
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
+        if target_format == Format.DUMP_MONGODB.value or target_format == Format.DUMP_MONGODB:
+            path = Path(location)
+            if path.exists():
+                safe_remove_directory(path, no_backup=True)
+            from linkml_store.utils.mongodb_utils import export_mongodb
+
+            export_mongodb(self.handle, location)
+        else:
+            super().export_database(location, target_format=target_format, **kwargs)
+
+    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+        """
+        Import a database from a file or location.
+
+        :param location: location of the file
+        :param source_format: source format
+        :param kwargs: additional arguments
+        """
+        if source_format == Format.DUMP_MONGODB.value or source_format == Format.DUMP_MONGODB:
+            import_mongodb(self.handle, location, drop=True)
+        else:
+            super().import_database(location, source_format=source_format, **kwargs)
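Unlike the DuckDB adapter, the MongoDB overrides shell out to the MongoDB Database Tools (mongodump/mongorestore) via the new mongodb_utils module, so those binaries must be on PATH. A hedged sketch (handles and paths are illustrative):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("mongodb://localhost:27017/countries", alias="src")
    db.export_database("countries_dump", target_format="mongodb")  # mongodump under the hood

    dst = client.attach_database("mongodb://localhost:27017/countries_copy", alias="dst")
    dst.import_database("countries_dump", source_format="mongodb")  # mongorestore --drop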
{linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/cli.py RENAMED

@@ -228,7 +228,11 @@ def store(ctx, files, object, format):
 @click.pass_context
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
 def import_database(ctx, files, format):
-    """Imports a database from a dump."""
+    """Imports a database from a dump.
+
+    See the `export` command for a full list of supported formats. The same
+    formats are generally supported for imports.
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
@@ -242,7 +246,30 @@
 @click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
 @click.pass_context
 def export(ctx, output_type, output):
-    """Exports a database to a dump."""
+    """Exports a database to a standard dump format.
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -O yaml -o countries.yaml
+
+    Export format will be guessed from extension if not specified
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries.json
+
+    Tree formats such as YAML and JSON can natively store an entire database; each collection
+    will be a distinct key in the database.
+
+    Additionally, native dump formats can be used:
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries -O duckdb
+
+    Here, `countries` is a directory. This is equivalent to running EXPORT DATABASE
+    (see https://duckdb.org/docs/sql/statements/export.html)
+    """
 settings = ctx.obj["settings"]
 db = settings.database
 if output_type is None:
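The import counterpart accepts the same formats; a hypothetical invocation mirroring the examples above (exact command name and flags follow the `import_database` signature earlier in this file):

    linkml-store -d duckdb:///new.db import countries.yaml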
{linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/format_utils.py RENAMED

@@ -1,10 +1,14 @@
 import csv
+import gzip
+import io
 import json
+import logging
 import sys
+import tarfile
 from enum import Enum
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, TextIO, Type, Union
+from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union
 
 import pandas as pd
 import pystow
@@ -12,6 +16,8 @@ import yaml
 from pydantic import BaseModel
 from tabulate import tabulate
 
+logger = logging.getLogger(__name__)
+
 
 class Format(Enum):
     """
@@ -27,6 +33,35 @@ class Format(Enum):
     PARQUET = "parquet"
     FORMATTED = "formatted"
     TABLE = "table"
+    SQLDUMP_DUCKDB = "duckdb"
+    SQLDUMP_POSTGRES = "postgres"
+    DUMP_MONGODB = "mongodb"
+
+    @classmethod
+    def guess_format(cls, file_name: str) -> Optional["Format"]:
+        ext = Path(file_name).suffix.lower()
+
+        format_map = {
+            ".json": cls.JSON,
+            ".jsonl": cls.JSONL,
+            ".yaml": cls.YAML,
+            ".yml": cls.YAML,
+            ".tsv": cls.TSV,
+            ".csv": cls.CSV,
+            ".py": cls.PYTHON,
+            ".parquet": cls.PARQUET,
+            ".pq": cls.PARQUET,
+        }
+        fmt = format_map.get(ext, None)
+        if fmt is None:
+            if ext.startswith("."):
+                ext = ext[1:]
+            if ext in [f.value for f in Format]:
+                return Format(ext)
+        return fmt
+
+    def is_dump_format(self):
+        return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
 
 
 def load_objects_from_url(
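A quick illustration of the new helpers; note that the fallback clause means any bare Format value also resolves when used as a file extension:

    from linkml_store.utils.format_utils import Format

    assert Format.guess_format("countries.yaml") == Format.YAML
    assert Format.guess_format("dump.duckdb") == Format.SQLDUMP_DUCKDB  # value fallback
    assert Format.SQLDUMP_DUCKDB.is_dump_format()
    assert not Format.JSON.is_dump_format()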
@@ -46,15 +81,109 @@ def load_objects_from_url(
     :return: A list of dictionaries representing the loaded objects.
     """
     local_path = pystow.ensure("linkml", "linkml-store", url=url)
+    logger.info(f"synced to {local_path}")
     objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
     if not objs:
         raise ValueError(f"No objects loaded from URL: {url}")
     return objs
 
 
+def process_file(
+    f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
+) -> List[Dict[str, Any]]:
+    """
+    Process a single file and return a list of objects.
+    """
+    if format == Format.JSON:
+        objs = json.load(f)
+    elif format == Format.JSONL:
+        objs = [json.loads(line) for line in f]
+    elif format == Format.YAML:
+        if expected_type and expected_type == list:  # noqa E721
+            objs = list(yaml.safe_load_all(f))
+        else:
+            objs = yaml.safe_load(f)
+    elif format in [Format.TSV, Format.CSV]:
+        if header_comment_token:
+            while True:
+                pos = f.tell()
+                line = f.readline()
+                if not line.startswith(header_comment_token):
+                    f.seek(pos)
+                    break
+        delimiter = "\t" if format == Format.TSV else ","
+        reader = csv.DictReader(f, delimiter=delimiter)
+        objs = list(reader)
+    elif format == Format.PARQUET:
+        import pyarrow.parquet as pq
+
+        table = pq.read_table(f)
+        objs = table.to_pandas().to_dict(orient="records")
+    elif format in [Format.PYTHON, Format.FORMATTED, Format.TABLE]:
+        raise ValueError(f"Format {format} is not supported for loading objects")
+    else:
+        raise ValueError(f"Unsupported file format: {format}")
+
+    if not isinstance(objs, list):
+        objs = [objs]
+    return objs
+
+
 def load_objects(
+    file_path: Union[str, Path],
+    format: Optional[Union[Format, str]] = None,
+    compression: Optional[str] = None,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a file or archive in supported formats.
+    For tgz archives, it processes all files and concatenates the results.
+
+    :param file_path: The path to the file or archive.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
+    :param expected_type: The target type to load the objects into, e.g. list
+    :param header_comment_token: Token used for header comments to be skipped
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    if isinstance(file_path, Path):
+        file_path = str(file_path)
+
+    if isinstance(format, str):
+        format = Format(format)
+
+    all_objects = []
+
+    if compression == "tgz":
+        with tarfile.open(file_path, "r:gz") as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    f = tar.extractfile(member)
+                    if f:
+                        content = io.TextIOWrapper(f)
+                        member_format = Format.guess_format(member.name) if not format else format
+                        logger.debug(f"Processing tar member {member.name} with format {member_format}")
+                        all_objects.extend(process_file(content, member_format, expected_type, header_comment_token))
+    else:
+        if Path(file_path).is_dir():
+            raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
+        mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
+        open_func = gzip.open if compression == "gz" else open
+        format = Format.guess_format(file_path) if not format else format
+        with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
+            if compression == "gz" and mode == "r":
+                f = io.TextIOWrapper(f)
+            all_objects = process_file(f, format, expected_type, header_comment_token)
+
+    logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+    return all_objects
+
+
+def xxxload_objects(
     file_path: Union[str, Path],
     format: Union[Format, str] = None,
+    compression: Optional[str] = None,
     expected_type: Type = None,
     header_comment_token: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
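Usage of the reworked loader, sketched with illustrative file names; note that for a gzipped CSV the format should be given explicitly, since guess_format only sees the final ".gz" suffix:

    from linkml_store.utils.format_utils import load_objects

    rows = load_objects("countries.csv.gz", format="csv", compression="gz")
    # for tgz archives, each member's format is guessed from its own name
    combined = load_objects("bundle.tgz", compression="tgz")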
@@ -172,7 +301,7 @@ def write_output(
 
 
 def render_output(
-    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Union[Format, str] = Format.YAML
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Optional[Union[Format, str]] = Format.YAML
 ) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -271,15 +400,4 @@ def guess_format(path: str) -> Optional[Format]:
     :param path: The path to the file.
     :return: The guessed format.
     """
-    if path.endswith(".json"):
-        return Format.JSON
-    elif path.endswith(".jsonl"):
-        return Format.JSONL
-    elif path.endswith(".yaml") or path.endswith(".yml"):
-        return Format.YAML
-    elif path.endswith(".tsv"):
-        return Format.TSV
-    elif path.endswith(".csv"):
-        return Format.CSV
-    else:
-        return None
+    return Format.guess_format(path)
linkml_store-0.1.12/src/linkml_store/utils/mongodb_utils.py ADDED

@@ -0,0 +1,145 @@
+import logging
+import os
+import subprocess
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+from pymongo import MongoClient
+from pymongo.database import Database
+
+logger = logging.getLogger(__name__)
+
+
+def extract_connection_info(db: Database):
+    client = db.client
+
+    # Get the host and port
+    host_info = client.address
+    if host_info:
+        host, port = host_info
+    else:
+        # For replica sets or sharded clusters, we might need to get this differently
+        host = client.HOST
+        port = client.PORT
+
+    # Get the database name
+    db_name = db.name
+
+    # Get username if available
+    username = None
+    if hasattr(client, "options") and hasattr(client.options, "credentials"):
+        credentials = client.options.credentials
+        if credentials:
+            username = credentials.username
+
+    return {"host": host, "port": port, "db_name": db_name, "username": username}
+
+
+def get_connection_string(client: MongoClient):
+    """
+    Extract a connection string from the MongoClient.
+    This avoids triggering truth value testing on Database objects.
+    """
+    if client.address:
+        host, port = client.address
+        return f"{host}:{port}"
+    if hasattr(client, "address") and client.address:
+        host, port = client.address
+        return f"{host}:{port}"
+    elif client.hosts:
+        # For replica sets, return all hosts
+        return ",".join(f"{host}:{port}" for host, port in client.hosts)
+    elif hasattr(client, "HOST"):
+        # If we can't determine hosts, use the entire URI
+        parsed_uri = urlparse(client.HOST)
+        return f"{parsed_uri.hostname}:{parsed_uri.port}"
+    else:
+        raise ValueError("Unable to determine connection string from client")
+
+
+def get_connection_info(db: Database):
+    """
+    Extract connection information from the Database object.
+    """
+    # Get the name of the database
+    db_name = db.name
+
+    # Get the client's node list (this should work for single nodes and replica sets)
+    node_list = db.client.nodes
+
+    if not node_list:
+        raise ValueError("Unable to determine connection information from database")
+
+    # Use the first node in the list (for single node setups, this will be the only node)
+    first_node = node_list[0]
+    host, port = first_node
+
+    return host, port, db_name
+
+
+def get_auth_from_client(client: MongoClient):
+    """Extract authentication details from MongoClient."""
+    if hasattr(client, "_MongoClient__options"):
+        # For older versions of PyMongo
+        options = client._MongoClient__options
+    elif hasattr(client, "options"):
+        # For newer versions of PyMongo
+        options = client.options
+    else:
+        return None, None, None
+
+    if hasattr(options, "credentials"):
+        creds = options.credentials
+        return creds.username, creds.password, creds.source
+    return None, None, None
+
+
+def connection_from_handle(handle: str):
+    if handle.startswith("mongodb://"):
+        handle = handle.replace("mongodb://", "")
+    host, db = handle.split("/")
+    return host, db
+
+
+def export_mongodb(handle: str, location: str, password: Optional[str] = None):
+    host, db_name = connection_from_handle(handle)
+
+    # Construct the mongodump command
+    cmd = ["mongodump", f"--host={host}", f"--db={db_name}"]
+    logger.info(f"Exporting MongoDB database {db_name} from {host} to {location}")
+    cmd.extend(["--out", location])
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    logger.info(f"MongoDB export completed successfully. Output: {result.stdout}")
+
+
+def import_mongodb(handle: str, dump_dir: str, drop: bool = False):
+    host, db_name = connection_from_handle(handle)
+
+    # list dirs in dump_dir
+    dir_path = Path(dump_dir)
+    if not dir_path.is_dir():
+        raise ValueError(f"{dir_path} is not a dir")
+    directories = [name for name in os.listdir(dump_dir)]
+    if len(directories) != 1:
+        raise ValueError(f"Expected exactly one database in {dump_dir}, got: {directories}")
+    src_db_name = directories[0]
+
+    # Construct the mongorestore command
+    cmd = [
+        "mongorestore",
+        f"--host={host}",
+        f"--nsFrom={src_db_name}.*",
+        f"--nsTo={db_name}.*",
+        str(dump_dir),
+    ]
+
+    # Add drop option if specified
+    if drop:
+        cmd.append("--drop")
+    logger.info(f"CMD={cmd}")
+    # Execute mongorestore
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    if result.stderr:
+        logger.warning(result.stderr)
+    logger.info(f"MongoDB import completed successfully. Output: {result.stdout} // {result.stderr}")
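End-to-end, the two helpers pair as follows; this sketch assumes mongodump/mongorestore are installed, and the handles and paths are illustrative. Note that import_mongodb expects the dump directory to contain exactly one database subdirectory, which it remaps onto the target database name via --nsFrom/--nsTo:

    from linkml_store.utils.mongodb_utils import export_mongodb, import_mongodb

    export_mongodb("mongodb://localhost:27017/countries", "/tmp/countries_dump")
    import_mongodb("mongodb://localhost:27017/countries_copy", "/tmp/countries_dump", drop=True)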