linkml-store 0.1.11__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61):
  1. {linkml_store-0.1.11 → linkml_store-0.1.12}/PKG-INFO +1 -1
  2. {linkml_store-0.1.11 → linkml_store-0.1.12}/pyproject.toml +1 -1
  3. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/collection.py +17 -5
  4. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/config.py +2 -1
  5. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/database.py +32 -3
  6. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/duckdb_database.py +31 -3
  7. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/mongodb/mongodb_database.py +31 -1
  8. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/cli.py +29 -2
  9. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/format_utils.py +132 -14
  10. linkml_store-0.1.12/src/linkml_store/utils/mongodb_utils.py +145 -0
  11. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/sql_utils.py +7 -2
  12. linkml_store-0.1.12/src/linkml_store/webapi/html/generic.html.j2 +43 -0
  13. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/main.py +346 -63
  14. linkml_store-0.1.11/src/linkml_store/webapi/html/generic.html.j2 +0 -46
  15. {linkml_store-0.1.11 → linkml_store-0.1.12}/LICENSE +0 -0
  16. {linkml_store-0.1.11 → linkml_store-0.1.12}/README.md +0 -0
  17. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/__init__.py +0 -0
  18. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/__init__.py +0 -0
  19. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/client.py +0 -0
  20. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/queries.py +0 -0
  21. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/__init__.py +0 -0
  22. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  23. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  24. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  25. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  26. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
  27. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  28. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  29. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  30. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
  31. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  32. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  33. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  34. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  35. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
  36. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  37. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  38. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  39. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  40. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/api/types.py +0 -0
  41. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/constants.py +0 -0
  42. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/__init__.py +0 -0
  43. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/implementations/__init__.py +0 -0
  44. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
  45. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  46. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/index/indexer.py +0 -0
  47. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/__init__.py +0 -0
  48. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/change_utils.py +0 -0
  49. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/file_utils.py +0 -0
  50. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/io.py +0 -0
  51. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/object_utils.py +0 -0
  52. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/pandas_utils.py +0 -0
  53. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/patch_utils.py +0 -0
  54. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/query_utils.py +0 -0
  55. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/utils/schema_utils.py +0 -0
  56. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/__init__.py +0 -0
  57. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/__init__.py +0 -0
  58. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  59. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  60. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  61. {linkml_store-0.1.11 → linkml_store-0.1.12}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.1.11
+Version: 0.1.12
 Summary: linkml-store
 License: MIT
 Author: Author 1
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.1.11"
+version = "0.1.12"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
src/linkml_store/api/collection.py
@@ -346,7 +346,10 @@ class Collection(Generic[DatabaseType]):
         id_field = self.identifier_attribute_name
         if not id_field:
             raise ValueError(f"No identifier for {self.name}")
-        return self.find({id_field: ids})
+        if len(ids) == 1:
+            return self.find({id_field: ids[0]})
+        else:
+            return self.find({id_field: {"$in": ids}})
 
     def get_one(self, id: IDENTIFIER, **kwargs) -> Optional[OBJECT]:
         """
@@ -518,7 +521,7 @@ class Collection(Generic[DatabaseType]):
         :return:
         """
         cd = self.class_definition()
-        return cd is not None
+        return cd is not None and cd.attributes
 
     def load_from_source(self, load_if_exists=False):
         """
@@ -535,11 +538,19 @@ class Collection(Generic[DatabaseType]):
         kwargs = source.arguments or {}
         if source.local_path:
             objects = load_objects(
-                metadata.source.local_path, format=source.format, expected_type=source.expected_type, **kwargs
+                metadata.source.local_path,
+                format=source.format,
+                expected_type=source.expected_type,
+                compression=source.compression,
+                **kwargs,
             )
         elif metadata.source.url:
             objects = load_objects_from_url(
-                metadata.source.url, format=source.format, expected_type=source.expected_type, **kwargs
+                metadata.source.url,
+                format=source.format,
+                expected_type=source.expected_type,
+                compression=source.compression,
+                **kwargs,
             )
         self.insert(objects)
 
@@ -746,6 +757,7 @@ class Collection(Generic[DatabaseType]):
         sv: SchemaView = self.parent.schema_view
         if sv:
             cls = sv.get_class(self.target_class_name)
+            # cls = sv.schema.classes[self.target_class_name]
             if cls and not cls.attributes:
                 if not sv.class_induced_slots(cls.name):
                     for att in self._induce_attributes():
@@ -868,7 +880,7 @@ class Collection(Generic[DatabaseType]):
                 exact_dimensions_list.append(v.shape)
                 break
             if isinstance(v, list):
-                v = v[0]
+                v = v[0] if v else None
                 multivalueds.append(True)
             elif isinstance(v, dict):
                 v = list(v.values())[0]
src/linkml_store/api/config.py
@@ -33,6 +33,7 @@ class CollectionSource(ConfiguredBaseModel):
     refresh_interval_days: Optional[float] = None
     expected_type: Optional[str] = None
     format: Optional[str] = None
+    compression: Optional[str] = None
     arguments: Optional[Dict[str, Any]] = None
 
 
@@ -73,11 +74,11 @@ class CollectionConfig(ConfiguredBaseModel):
         default=None,
         description="Metadata about the source",
     )
-    # TODO: derived_from
     derived_from: Optional[List[DerivationConfiguration]] = Field(
         default=None,
         description="LinkML-Map derivations",
    )
+    page_size: Optional[int] = Field(default=None, description="Suggested page size (items per page) in apps and APIs")
 
 
 class DatabaseConfig(ConfiguredBaseModel):
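
Together, the `compression` and `page_size` additions let a collection declare a compressed source and a preferred page size. A hypothetical configuration exercising both new fields (the path and values are made up, and only the fields shown in this diff are assumed):

    from linkml_store.api.config import CollectionConfig, CollectionSource

    config = CollectionConfig(
        source=CollectionSource(
            local_path="data/countries.csv.gz",  # gzipped CSV source (illustrative path)
            format="csv",
            compression="gz",  # new in 0.1.12
        ),
        page_size=50,  # new in 0.1.12: suggested items per page in apps and APIs
    )
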
src/linkml_store/api/database.py
@@ -19,7 +19,7 @@ from typing import (
 )
 
 from linkml_store.api.types import CollectionType
-from linkml_store.utils.format_utils import load_objects, render_output
+from linkml_store.utils.format_utils import Format, load_objects, render_output
 from linkml_store.utils.patch_utils import PatchDict
 
 try:
@@ -705,7 +705,7 @@ class Database(ABC, Generic[CollectionType]):
         """
         raise NotImplementedError()
 
-    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+    def import_database(self, location: str, source_format: Optional[Union[str, Format]] = None, **kwargs):
         """
         Import a database from a file or location.
 
@@ -713,11 +713,27 @@ class Database(ABC, Generic[CollectionType]):
         :param source_format: source format
         :param kwargs: additional arguments
         """
+        if isinstance(source_format, str):
+            source_format = Format(source_format)
+        if isinstance(source_format, Format):
+            if source_format.is_dump_format() and source_format in [Format.SQLDUMP_DUCKDB, Format.DUMP_MONGODB]:
+                # import into a test instance
+                tmp_handle = source_format.value
+                client = self.parent
+                tmp_db = client.attach_database(tmp_handle, alias="tmp")
+                # TODO: check for infinite recursion
+                tmp_db.import_database(location, source_format=source_format)
+                obj = {}
+                for coll in tmp_db.list_collections():
+                    qr = coll.find({}, limit=-1)
+                    obj[coll.alias] = qr.rows
+                self.store(obj)
+                return
         objects = load_objects(location, format=source_format)
         for obj in objects:
             self.store(obj)
 
-    def export_database(self, location: str, target_format: Optional[str] = None, **kwargs):
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
         """
         Export a database to a file or location.
 
@@ -726,10 +742,23 @@ class Database(ABC, Generic[CollectionType]):
         :param kwargs: additional arguments
         """
         obj = {}
+        if isinstance(target_format, str):
+            target_format = Format(target_format)
         for coll in self.list_collections():
             qr = coll.find({}, limit=-1)
             obj[coll.alias] = qr.rows
         logger.info(f"Exporting object with {len(obj)} collections to {location} in {target_format} format")
+        if isinstance(target_format, Format):
+            if target_format.is_dump_format() and target_format in [Format.SQLDUMP_DUCKDB, Format.DUMP_MONGODB]:
+                tmp_handle = target_format.value
+                client = self.parent
+                tmp_db = client.attach_database(tmp_handle, alias="tmp")
+                tmp_db.store(obj)
+                # TODO: check for infinite recursion
+                tmp_db.export_database(location, target_format=target_format)
+                return
+        if Path(location).is_dir():
+            raise ValueError(f"{location} is a directory; cannot write {target_format} to a dir")
         with open(location, "w", encoding="utf-8") as stream:
             stream.write(render_output(obj, format=target_format))
 
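
The staging logic above gives every backend import/export support for the native dump formats: the format's enum value doubles as a database handle ("duckdb", "mongodb"), a temporary database is attached under that handle, and data is copied through it. A hedged usage sketch (the top-level Client import, handles, and paths are assumptions for illustration):

    from linkml_store import Client

    client = Client()
    db = client.attach_database("duckdb:///countries.db")
    # tree formats are rendered directly to a single file...
    db.export_database("countries.yaml", target_format="yaml")
    # ...while dump formats round-trip through a temporary staging database
    db.export_database("countries_dump", target_format="mongodb")
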
src/linkml_store/api/stores/duckdb/duckdb_database.py
@@ -1,11 +1,10 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
 import pandas as pd
 import sqlalchemy
-from duckdb import DuckDBPyConnection
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
 from linkml_runtime.utils.schema_builder import SchemaBuilder
@@ -14,6 +13,7 @@ from sqlalchemy import NullPool, text
 from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
+from linkml_store.utils.format_utils import Format
 from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
 
 TYPE_MAP = {
@@ -45,7 +45,7 @@ class DuckDBDatabase(Database):
     types are used for nested inlined objects.
     """
 
-    _connection: DuckDBPyConnection = None
+    # _connection: DuckDBPyConnection = None
     _engine: sqlalchemy.Engine = None
     collection_class = DuckDBCollection
 
@@ -202,3 +202,31 @@ class DuckDBDatabase(Database):
         cls = ClassDefinition(name=collection_metadata.type, attributes=collection_metadata.attributes)
         schema.classes[cls.name] = cls
         return SchemaView(schema)
+
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
+        if target_format == "duckdb" or target_format == Format.SQLDUMP_DUCKDB:
+            path = Path(location)
+            if path.exists():
+                if path.is_file():
+                    path.unlink()
+            with self.engine.connect() as conn:
+                sql = text(f"EXPORT DATABASE '{location}'")
+                conn.execute(sql)
+        else:
+            super().export_database(location, target_format=target_format, **kwargs)
+
+    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+        """
+        Import a database from a file or location.
+
+        :param location: location of the file
+        :param source_format: source format
+        :param kwargs: additional arguments
+        """
+        if source_format == Format.SQLDUMP_DUCKDB.value or source_format == Format.SQLDUMP_DUCKDB:
+            with self.engine.connect() as conn:
+                sql = text(f"IMPORT DATABASE '{location}'")
+                conn.execute(sql)
+                conn.commit()
+        else:
+            super().import_database(location, source_format=source_format, **kwargs)
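
For DuckDB the native path is a thin wrapper over DuckDB's own EXPORT DATABASE / IMPORT DATABASE statements, which write and replay a directory of schema and load scripts. A hedged round-trip sketch (handles and directory names are illustrative):

    db = client.attach_database("duckdb:///countries.db")
    db.export_database("countries_dir", target_format="duckdb")        # runs EXPORT DATABASE 'countries_dir'

    restored = client.attach_database("duckdb:///restored.db")
    restored.import_database("countries_dir", source_format="duckdb")  # runs IMPORT DATABASE 'countries_dir'
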
src/linkml_store/api/stores/mongodb/mongodb_database.py
@@ -1,7 +1,8 @@
 # mongodb_database.py
 
 import logging
-from typing import Optional
+from pathlib import Path
+from typing import Optional, Union
 
 from pymongo import MongoClient
 from pymongo.database import Database as NativeDatabase
@@ -9,6 +10,9 @@ from pymongo.database import Database as NativeDatabase
 from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.mongodb.mongodb_collection import MongoDBCollection
+from linkml_store.utils.file_utils import safe_remove_directory
+from linkml_store.utils.format_utils import Format
+from linkml_store.utils.mongodb_utils import import_mongodb
 
 logger = logging.getLogger(__name__)
 
@@ -27,6 +31,8 @@ class MongoDBDatabase(Database):
     def __init__(self, handle: Optional[str] = None, **kwargs):
         if handle is None:
             handle = "mongodb://localhost:27017/test"
+        if handle == "mongodb":
+            handle = "mongodb://localhost:27017/temporary"
         super().__init__(handle=handle, **kwargs)
 
     @property
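
The bare "mongodb" shorthand is what makes the generic dump staging in Database.import_database/export_database work: attaching a database under the format's value now lands on a scratch database instead of failing. Sketch (assumes a local server on the default port):

    # "mongodb" alone expands to mongodb://localhost:27017/temporary
    tmp_db = client.attach_database("mongodb", alias="tmp")
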
@@ -77,3 +83,27 @@ class MongoDBDatabase(Database):
         if collection_name not in self._collections:
             collection = MongoDBCollection(name=collection_name, parent=self)
             self._collections[collection_name] = collection
+
+    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
+        if target_format == Format.DUMP_MONGODB.value or target_format == Format.DUMP_MONGODB:
+            path = Path(location)
+            if path.exists():
+                safe_remove_directory(path, no_backup=True)
+            from linkml_store.utils.mongodb_utils import export_mongodb
+
+            export_mongodb(self.handle, location)
+        else:
+            super().export_database(location, target_format=target_format, **kwargs)
+
+    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
+        """
+        Import a database from a file or location.
+
+        :param location: location of the file
+        :param source_format: source format
+        :param kwargs: additional arguments
+        """
+        if source_format == Format.DUMP_MONGODB.value or source_format == Format.DUMP_MONGODB:
+            import_mongodb(self.handle, location, drop=True)
+        else:
+            super().import_database(location, source_format=source_format, **kwargs)
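
Note that the native branch shells out to mongodump/mongorestore (see mongodb_utils.py below), so those binaries must be on the PATH. A hedged usage sketch (connection strings and paths are illustrative):

    db = client.attach_database("mongodb://localhost:27017/countries")
    db.export_database("countries_dump", target_format="mongodb")    # runs mongodump --out countries_dump

    copy = client.attach_database("mongodb://localhost:27017/countries_copy")
    copy.import_database("countries_dump", source_format="mongodb")  # runs mongorestore --drop
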
src/linkml_store/cli.py
@@ -228,7 +228,11 @@ def store(ctx, files, object, format):
 @click.pass_context
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
 def import_database(ctx, files, format):
-    """Imports a database from a dump."""
+    """Imports a database from a dump.
+
+    See the `export` command for a full list of supported formats. The same
+    formats are generally supported for imports.
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
@@ -242,7 +246,30 @@ def import_database(ctx, files, format):
 @click.option("--output", "-o", required=True, type=click.Path(), help="Output file path")
 @click.pass_context
 def export(ctx, output_type, output):
-    """Exports a database to a dump."""
+    """Exports a database to a standard dump format.
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -O yaml -o countries.yaml
+
+    The export format is guessed from the file extension if not specified.
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries.json
+
+    Tree formats such as YAML and JSON can natively store an entire database; each collection
+    becomes a distinct key in the exported document.
+
+    Additionally, native dump formats can be used:
+
+    Example:
+
+        linkml-store -d duckdb:///countries.db export -o countries -O duckdb
+
+    Here, `countries` is a directory. This is equivalent to running EXPORT DATABASE
+    (see https://duckdb.org/docs/sql/statements/export.html).
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if output_type is None:
src/linkml_store/utils/format_utils.py
@@ -1,10 +1,14 @@
 import csv
+import gzip
+import io
 import json
+import logging
 import sys
+import tarfile
 from enum import Enum
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, TextIO, Type, Union
+from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union
 
 import pandas as pd
 import pystow
@@ -12,6 +16,8 @@ import yaml
 from pydantic import BaseModel
 from tabulate import tabulate
 
+logger = logging.getLogger(__name__)
+
 
 class Format(Enum):
     """
@@ -27,6 +33,35 @@ class Format(Enum):
     PARQUET = "parquet"
     FORMATTED = "formatted"
     TABLE = "table"
+    SQLDUMP_DUCKDB = "duckdb"
+    SQLDUMP_POSTGRES = "postgres"
+    DUMP_MONGODB = "mongodb"
+
+    @classmethod
+    def guess_format(cls, file_name: str) -> Optional["Format"]:
+        ext = Path(file_name).suffix.lower()
+
+        format_map = {
+            ".json": cls.JSON,
+            ".jsonl": cls.JSONL,
+            ".yaml": cls.YAML,
+            ".yml": cls.YAML,
+            ".tsv": cls.TSV,
+            ".csv": cls.CSV,
+            ".py": cls.PYTHON,
+            ".parquet": cls.PARQUET,
+            ".pq": cls.PARQUET,
+        }
+        fmt = format_map.get(ext, None)
+        if fmt is None:
+            if ext.startswith("."):
+                ext = ext[1:]
+            if ext in [f.value for f in Format]:
+                return Format(ext)
+        return fmt
+
+    def is_dump_format(self):
+        return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]
 
 
 def load_objects_from_url(
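
guess_format consults the extension map first, then falls back to matching the bare extension against the enum values, which is how dump extensions resolve. Roughly (interactive reprs assume the standard Enum repr):

    >>> Format.guess_format("data/countries.yaml")
    <Format.YAML: 'yaml'>
    >>> Format.guess_format("dump.duckdb")     # no map entry; matches the enum value "duckdb"
    <Format.SQLDUMP_DUCKDB: 'duckdb'>
    >>> Format.guess_format("notes.txt") is None
    True
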
@@ -46,15 +81,109 @@ def load_objects_from_url(
     :return: A list of dictionaries representing the loaded objects.
     """
     local_path = pystow.ensure("linkml", "linkml-store", url=url)
+    logger.info(f"synced to {local_path}")
     objs = load_objects(local_path, format=format, expected_type=expected_type, **kwargs)
     if not objs:
         raise ValueError(f"No objects loaded from URL: {url}")
     return objs
 
 
+def process_file(
+    f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
+) -> List[Dict[str, Any]]:
+    """
+    Process a single file and return a list of objects.
+    """
+    if format == Format.JSON:
+        objs = json.load(f)
+    elif format == Format.JSONL:
+        objs = [json.loads(line) for line in f]
+    elif format == Format.YAML:
+        if expected_type and expected_type == list:  # noqa E721
+            objs = list(yaml.safe_load_all(f))
+        else:
+            objs = yaml.safe_load(f)
+    elif format in [Format.TSV, Format.CSV]:
+        if header_comment_token:
+            while True:
+                pos = f.tell()
+                line = f.readline()
+                if not line.startswith(header_comment_token):
+                    f.seek(pos)
+                    break
+        delimiter = "\t" if format == Format.TSV else ","
+        reader = csv.DictReader(f, delimiter=delimiter)
+        objs = list(reader)
+    elif format == Format.PARQUET:
+        import pyarrow.parquet as pq
+
+        table = pq.read_table(f)
+        objs = table.to_pandas().to_dict(orient="records")
+    elif format in [Format.PYTHON, Format.FORMATTED, Format.TABLE]:
+        raise ValueError(f"Format {format} is not supported for loading objects")
+    else:
+        raise ValueError(f"Unsupported file format: {format}")
+
+    if not isinstance(objs, list):
+        objs = [objs]
+    return objs
+
+
 def load_objects(
+    file_path: Union[str, Path],
+    format: Optional[Union[Format, str]] = None,
+    compression: Optional[str] = None,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """
+    Load objects from a file or archive in supported formats.
+    For tgz archives, it processes all files and concatenates the results.
+
+    :param file_path: The path to the file or archive.
+    :param format: The format of the file. Can be a Format enum or a string value.
+    :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
+    :param expected_type: The target type to load the objects into, e.g. list
+    :param header_comment_token: Token used for header comments to be skipped
+    :return: A list of dictionaries representing the loaded objects.
+    """
+    if isinstance(file_path, Path):
+        file_path = str(file_path)
+
+    if isinstance(format, str):
+        format = Format(format)
+
+    all_objects = []
+
+    if compression == "tgz":
+        with tarfile.open(file_path, "r:gz") as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    f = tar.extractfile(member)
+                    if f:
+                        content = io.TextIOWrapper(f)
+                        member_format = Format.guess_format(member.name) if not format else format
+                        logger.debug(f"Processing tar member {member.name} with format {member_format}")
+                        all_objects.extend(process_file(content, member_format, expected_type, header_comment_token))
+    else:
+        if Path(file_path).is_dir():
+            raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
+        mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
+        open_func = gzip.open if compression == "gz" else open
+        format = Format.guess_format(file_path) if not format else format
+        with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
+            if compression == "gz" and mode == "r":
+                f = io.TextIOWrapper(f)
+            all_objects = process_file(f, format, expected_type, header_comment_token)
+
+    logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
+    return all_objects
+
+
+def xxxload_objects(
     file_path: Union[str, Path],
     format: Union[Format, str] = None,
+    compression: Optional[str] = None,
     expected_type: Type = None,
     header_comment_token: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
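
The new compression parameter handles two cases: "gz" routes the open through gzip.open (wrapping the stream in a TextIOWrapper for text formats), while "tgz" walks every file in a tar.gz archive and concatenates the results, guessing each member's format from its name when no explicit format is given. Hedged usage (file names illustrative):

    objs = load_objects("countries.csv.gz", format="csv", compression="gz")
    # all members of the archive are parsed and concatenated into one list
    objs = load_objects("bundle.tar.gz", compression="tgz")
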
@@ -172,7 +301,7 @@ def write_output(
 
 
 def render_output(
-    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Union[Format, str] = Format.YAML
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Optional[Union[Format, str]] = Format.YAML
 ) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -271,15 +400,4 @@ def guess_format(path: str) -> Optional[Format]:
     :param path: The path to the file.
     :return: The guessed format.
     """
-    if path.endswith(".json"):
-        return Format.JSON
-    elif path.endswith(".jsonl"):
-        return Format.JSONL
-    elif path.endswith(".yaml") or path.endswith(".yml"):
-        return Format.YAML
-    elif path.endswith(".tsv"):
-        return Format.TSV
-    elif path.endswith(".csv"):
-        return Format.CSV
-    else:
-        return None
+    return Format.guess_format(path)
src/linkml_store/utils/mongodb_utils.py (new file)
@@ -0,0 +1,145 @@
+import logging
+import os
+import subprocess
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+from pymongo import MongoClient
+from pymongo.database import Database
+
+logger = logging.getLogger(__name__)
+
+
+def extract_connection_info(db: Database):
+    client = db.client
+
+    # Get the host and port
+    host_info = client.address
+    if host_info:
+        host, port = host_info
+    else:
+        # For replica sets or sharded clusters, we might need to get this differently
+        host = client.HOST
+        port = client.PORT
+
+    # Get the database name
+    db_name = db.name
+
+    # Get username if available
+    username = None
+    if hasattr(client, "options") and hasattr(client.options, "credentials"):
+        credentials = client.options.credentials
+        if credentials:
+            username = credentials.username
+
+    return {"host": host, "port": port, "db_name": db_name, "username": username}
+
+
+def get_connection_string(client: MongoClient):
+    """
+    Extract a connection string from the MongoClient.
+    This avoids triggering truth value testing on Database objects.
+    """
+    if client.address:
+        host, port = client.address
+        return f"{host}:{port}"
+    if hasattr(client, "address") and client.address:
+        host, port = client.address
+        return f"{host}:{port}"
+    elif client.hosts:
+        # For replica sets, return all hosts
+        return ",".join(f"{host}:{port}" for host, port in client.hosts)
+    elif hasattr(client, "HOST"):
+        # If we can't determine hosts, use the entire URI
+        parsed_uri = urlparse(client.HOST)
+        return f"{parsed_uri.hostname}:{parsed_uri.port}"
+    else:
+        raise ValueError("Unable to determine connection string from client")
+
+
+def get_connection_info(db: Database):
+    """
+    Extract connection information from the Database object.
+    """
+    # Get the name of the database
+    db_name = db.name
+
+    # Get the client's node list (this should work for single nodes and replica sets)
+    node_list = db.client.nodes
+
+    if not node_list:
+        raise ValueError("Unable to determine connection information from database")
+
+    # Use the first node in the list (for single node setups, this will be the only node)
+    first_node = node_list[0]
+    host, port = first_node
+
+    return host, port, db_name
+
+
+def get_auth_from_client(client: MongoClient):
+    """Extract authentication details from MongoClient."""
+    if hasattr(client, "_MongoClient__options"):
+        # For older versions of PyMongo
+        options = client._MongoClient__options
+    elif hasattr(client, "options"):
+        # For newer versions of PyMongo
+        options = client.options
+    else:
+        return None, None, None
+
+    if hasattr(options, "credentials"):
+        creds = options.credentials
+        return creds.username, creds.password, creds.source
+    return None, None, None
+
+
+def connection_from_handle(handle: str):
+    if handle.startswith("mongodb://"):
+        handle = handle.replace("mongodb://", "")
+    host, db = handle.split("/")
+    return host, db
+
+
+def export_mongodb(handle: str, location: str, password: Optional[str] = None):
+    host, db_name = connection_from_handle(handle)
+
+    # Construct the mongodump command
+    cmd = ["mongodump", f"--host={host}", f"--db={db_name}"]
+    logger.info(f"Exporting MongoDB database {db_name} from {host} to {location}")
+    cmd.extend(["--out", location])
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    logger.info(f"MongoDB export completed successfully. Output: {result.stdout}")
+
+
+def import_mongodb(handle: str, dump_dir: str, drop: bool = False):
+    host, db_name = connection_from_handle(handle)
+
+    # list dirs in dump_dir
+    dir_path = Path(dump_dir)
+    if not dir_path.is_dir():
+        raise ValueError(f"{dir_path} is not a dir")
+    directories = [name for name in os.listdir(dump_dir)]
+    if len(directories) != 1:
+        raise ValueError(f"Expected exactly one database in {dump_dir}, got: {directories}")
+    src_db_name = directories[0]
+
+    # Construct the mongorestore command
+    cmd = [
+        "mongorestore",
+        f"--host={host}",
+        f"--nsFrom={src_db_name}.*",
+        f"--nsTo={db_name}.*",
+        str(dump_dir),
+    ]
+
+    # Add drop option if specified
+    if drop:
+        cmd.append("--drop")
+    logger.info(f"CMD={cmd}")
+    # Execute mongorestore
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    if result.stderr:
+        logger.warning(result.stderr)
+    logger.info(f"MongoDB import completed successfully. Output: {result.stdout} // {result.stderr}")