lamindb_setup 1.15.2__py3-none-any.whl → 1.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +6 -3
- lamindb_setup/_check_setup.py +24 -85
- lamindb_setup/_connect_instance.py +9 -23
- lamindb_setup/_delete.py +10 -5
- lamindb_setup/_disconnect.py +12 -9
- lamindb_setup/_init_instance.py +0 -1
- lamindb_setup/_migrate.py +0 -14
- lamindb_setup/_schema_metadata.py +9 -11
- lamindb_setup/_setup_user.py +28 -7
- lamindb_setup/_silence_loggers.py +2 -0
- lamindb_setup/core/_aws_options.py +17 -7
- lamindb_setup/core/_clone.py +1 -1
- lamindb_setup/core/_hub_client.py +1 -2
- lamindb_setup/core/_hub_core.py +9 -6
- lamindb_setup/core/_private_django_api.py +0 -1
- lamindb_setup/core/_settings.py +14 -10
- lamindb_setup/core/_settings_instance.py +28 -6
- lamindb_setup/core/_settings_load.py +25 -7
- lamindb_setup/core/_settings_storage.py +3 -1
- lamindb_setup/core/django.py +48 -18
- lamindb_setup/core/lamin.db.gz +0 -0
- lamindb_setup/core/upath.py +34 -13
- lamindb_setup/errors.py +0 -12
- lamindb_setup/io.py +69 -33
- {lamindb_setup-1.15.2.dist-info → lamindb_setup-1.17.0.dist-info}/METADATA +5 -5
- lamindb_setup-1.17.0.dist-info/RECORD +51 -0
- {lamindb_setup-1.15.2.dist-info → lamindb_setup-1.17.0.dist-info}/WHEEL +1 -1
- lamindb_setup-1.15.2.dist-info/RECORD +0 -50
- {lamindb_setup-1.15.2.dist-info → lamindb_setup-1.17.0.dist-info/licenses}/LICENSE +0 -0
lamindb_setup/core/_settings.py
CHANGED
|
@@ -46,6 +46,12 @@ def _process_cache_path(cache_path: UPathStr | None) -> UPath | None:
|
|
|
46
46
|
return cache_dir
|
|
47
47
|
|
|
48
48
|
|
|
49
|
+
# returned by settings.branch for none/none instance
|
|
50
|
+
class MainBranchMock:
|
|
51
|
+
id = 1
|
|
52
|
+
name = "main"
|
|
53
|
+
|
|
54
|
+
|
|
49
55
|
class SetupSettings:
|
|
50
56
|
"""Setup settings."""
|
|
51
57
|
|
|
@@ -140,6 +146,10 @@ class SetupSettings:
|
|
|
140
146
|
# and we never need a DB request
|
|
141
147
|
def branch(self) -> Branch:
|
|
142
148
|
"""Default branch."""
|
|
149
|
+
# this is needed for .filter() with non-default connections
|
|
150
|
+
if not self._instance_exists:
|
|
151
|
+
return MainBranchMock()
|
|
152
|
+
|
|
143
153
|
if self._branch is None:
|
|
144
154
|
from lamindb import Branch
|
|
145
155
|
|
|
@@ -222,10 +232,9 @@ class SetupSettings:
|
|
|
222
232
|
If `True`, the current instance is connected, meaning that the db and other settings
|
|
223
233
|
are properly configured for use.
|
|
224
234
|
"""
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
return False
|
|
235
|
+
from . import django
|
|
236
|
+
|
|
237
|
+
return self._instance_exists and django.IS_SETUP
|
|
229
238
|
|
|
230
239
|
@property
|
|
231
240
|
def private_django_api(self) -> bool:
|
|
@@ -284,12 +293,7 @@ class SetupSettings:
|
|
|
284
293
|
|
|
285
294
|
@property
|
|
286
295
|
def _instance_exists(self):
|
|
287
|
-
|
|
288
|
-
self.instance # noqa
|
|
289
|
-
return True
|
|
290
|
-
# this is implicit logic that catches if no instance is loaded
|
|
291
|
-
except CurrentInstanceNotConfigured:
|
|
292
|
-
return False
|
|
296
|
+
return self.instance.slug != "none/none"
|
|
293
297
|
|
|
294
298
|
@property
|
|
295
299
|
def cache_dir(self) -> UPath:
|
|
@@ -125,10 +125,11 @@ class InstanceSettings:
|
|
|
125
125
|
if self._local_storage is not None:
|
|
126
126
|
value_local = self.local_storage
|
|
127
127
|
representation += f"\n - local storage: {value_local.root_as_str} ({value_local.region})"
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
128
|
+
if value is not None:
|
|
129
|
+
representation += (
|
|
130
|
+
f"\n - cloud storage: {value.root_as_str} ({value.region})"
|
|
131
|
+
)
|
|
132
|
+
elif value is not None:
|
|
132
133
|
representation += (
|
|
133
134
|
f"\n - storage: {value.root_as_str} ({value.region})"
|
|
134
135
|
)
|
|
@@ -513,17 +514,36 @@ class InstanceSettings:
|
|
|
513
514
|
|
|
514
515
|
@property
|
|
515
516
|
def dialect(self) -> Literal["sqlite", "postgresql"]:
|
|
516
|
-
"""SQL dialect.
|
|
517
|
+
"""SQL dialect.
|
|
518
|
+
|
|
519
|
+
Equivalent to :attr:`vendor`.
|
|
520
|
+
|
|
521
|
+
"vendor" is the Django terminology for the type of database. "dialect" is the SQLAlchemy terminology.
|
|
522
|
+
"""
|
|
517
523
|
if self._db is None or self._db.startswith("sqlite://"):
|
|
518
524
|
return "sqlite"
|
|
519
525
|
else:
|
|
520
526
|
assert self._db.startswith("postgresql"), f"Unexpected DB value: {self._db}"
|
|
521
527
|
return "postgresql"
|
|
522
528
|
|
|
529
|
+
@property
|
|
530
|
+
def vendor(self) -> Literal["sqlite", "postgresql"]:
|
|
531
|
+
"""Database vendor.
|
|
532
|
+
|
|
533
|
+
Equivalent to :attr:`dialect`.
|
|
534
|
+
|
|
535
|
+
"vendor" is the Django terminology for the type of database. "dialect" is the SQLAlchemy terminology.
|
|
536
|
+
"""
|
|
537
|
+
return self.dialect
|
|
538
|
+
|
|
523
539
|
@property
|
|
524
540
|
def _is_cloud_sqlite(self) -> bool:
|
|
525
541
|
"""Is this a cloud instance with sqlite db."""
|
|
526
|
-
return
|
|
542
|
+
return (
|
|
543
|
+
self.dialect == "sqlite"
|
|
544
|
+
and self.storage is not None
|
|
545
|
+
and self.storage.type_is_cloud
|
|
546
|
+
)
|
|
527
547
|
|
|
528
548
|
@property
|
|
529
549
|
def _cloud_sqlite_locker(self):
|
|
@@ -543,6 +563,8 @@ class InstanceSettings:
|
|
|
543
563
|
@property
|
|
544
564
|
def is_remote(self) -> bool:
|
|
545
565
|
"""Boolean indicating if an instance has no local component."""
|
|
566
|
+
if self.storage is None and self.db == "sqlite:///:memory:":
|
|
567
|
+
return False
|
|
546
568
|
return check_is_instance_remote(self.storage.root_as_str, self.db)
|
|
547
569
|
|
|
548
570
|
@property
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
from importlib.util import find_spec
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import TYPE_CHECKING
|
|
6
7
|
from uuid import UUID, uuid4
|
|
@@ -46,19 +47,36 @@ def load_cache_path_from_settings(storage_settings: Path | None = None) -> Path
|
|
|
46
47
|
return None
|
|
47
48
|
|
|
48
49
|
|
|
50
|
+
def find_module_candidates():
|
|
51
|
+
"""Find all local packages that depend on lamindb."""
|
|
52
|
+
candidates = ["bionty", "wetlab"]
|
|
53
|
+
return [c for c in candidates if find_spec(c) is not None]
|
|
54
|
+
|
|
55
|
+
|
|
49
56
|
def load_instance_settings(instance_settings_file: Path | None = None):
|
|
50
57
|
if instance_settings_file is None:
|
|
51
|
-
|
|
52
|
-
|
|
58
|
+
isettings_file = current_instance_settings_file()
|
|
59
|
+
if not isettings_file.exists():
|
|
60
|
+
isettings = InstanceSettings(
|
|
61
|
+
id=UUID("00000000-0000-0000-0000-000000000000"),
|
|
62
|
+
owner="none",
|
|
63
|
+
name="none",
|
|
64
|
+
storage=None,
|
|
65
|
+
modules=",".join(find_module_candidates()),
|
|
66
|
+
)
|
|
67
|
+
return isettings
|
|
68
|
+
else:
|
|
69
|
+
isettings_file = instance_settings_file
|
|
70
|
+
|
|
71
|
+
if not isettings_file.exists():
|
|
72
|
+
# this errors only if the file was explicitly provided
|
|
53
73
|
raise CurrentInstanceNotConfigured
|
|
54
74
|
try:
|
|
55
|
-
settings_store = InstanceSettingsStore(_env_file=
|
|
75
|
+
settings_store = InstanceSettingsStore(_env_file=isettings_file)
|
|
56
76
|
except (ValidationError, TypeError) as error:
|
|
57
|
-
with open(instance_settings_file) as f:
|
|
58
|
-
content = f.read()
|
|
59
77
|
raise SettingsEnvFileOutdated(
|
|
60
|
-
f"\n\n{error}\n\nYour instance settings file with\n\n{
|
|
61
|
-
f" (likely outdated), see validation error. Please delete {
|
|
78
|
+
f"\n\n{error}\n\nYour instance settings file with\n\n{isettings_file.read_text()}\nis invalid"
|
|
79
|
+
f" (likely outdated), see validation error. Please delete {isettings_file} &"
|
|
62
80
|
" reload (remote) or re-initialize (local) the instance with the same name & storage location."
|
|
63
81
|
) from error
|
|
64
82
|
isettings = setup_instance_from_store(settings_store)
|
|
@@ -122,6 +122,7 @@ def init_storage(
|
|
|
122
122
|
access_token: str | None = None,
|
|
123
123
|
region: str | None = None,
|
|
124
124
|
space_uuid: UUID | None = None,
|
|
125
|
+
skip_mark_storage_root: bool = False,
|
|
125
126
|
) -> tuple[
|
|
126
127
|
StorageSettings,
|
|
127
128
|
Literal["hub-record-not-created", "hub-record-retrieved", "hub-record-created"],
|
|
@@ -181,7 +182,8 @@ def init_storage(
|
|
|
181
182
|
space_id=space_uuid,
|
|
182
183
|
)
|
|
183
184
|
# we check the write access here if the storage record has not been retrieved from the hub
|
|
184
|
-
if hub_record_status
|
|
185
|
+
# Sergei: should it in fact still go through if hub_record_status == "hub-record-not-created"?
|
|
186
|
+
if hub_record_status != "hub-record-retrieved" and not skip_mark_storage_root:
|
|
185
187
|
try:
|
|
186
188
|
# (federated) credentials for AWS access are provisioned under-the-hood
|
|
187
189
|
# discussion: https://laminlabs.slack.com/archives/C04FPE8V01W/p1719260587167489
|
lamindb_setup/core/django.py
CHANGED
|
@@ -5,13 +5,15 @@ import builtins
|
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
7
|
import importlib as il
|
|
8
|
+
import gzip
|
|
8
9
|
import jwt
|
|
9
10
|
import time
|
|
10
11
|
import threading
|
|
11
12
|
from pathlib import Path
|
|
13
|
+
import shutil
|
|
12
14
|
from packaging import version
|
|
13
15
|
from ._settings_instance import InstanceSettings, is_local_db_url
|
|
14
|
-
|
|
16
|
+
from ..errors import CurrentInstanceNotConfigured
|
|
15
17
|
from lamin_utils import logger
|
|
16
18
|
|
|
17
19
|
|
|
@@ -21,6 +23,24 @@ IS_MIGRATING = False
|
|
|
21
23
|
CONN_MAX_AGE = 299
|
|
22
24
|
|
|
23
25
|
|
|
26
|
+
def get_connection(connection_name: str):
|
|
27
|
+
from django.db import connections
|
|
28
|
+
|
|
29
|
+
return connections[connection_name]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def error_no_instance_wrapper(execute, sql, params, many, context):
|
|
33
|
+
connection = context["connection"]
|
|
34
|
+
|
|
35
|
+
if (
|
|
36
|
+
connection.vendor == "sqlite"
|
|
37
|
+
and connection.settings_dict.get("NAME") == ":memory:"
|
|
38
|
+
):
|
|
39
|
+
raise CurrentInstanceNotConfigured
|
|
40
|
+
|
|
41
|
+
return execute(sql, params, many, context)
|
|
42
|
+
|
|
43
|
+
|
|
24
44
|
# db token that refreshes on access if needed
|
|
25
45
|
class DBToken:
|
|
26
46
|
def __init__(
|
|
@@ -64,11 +84,6 @@ class DBTokenManager:
|
|
|
64
84
|
|
|
65
85
|
self.tokens: dict[str, DBToken] = {}
|
|
66
86
|
|
|
67
|
-
def get_connection(self, connection_name: str):
|
|
68
|
-
from django.db import connections
|
|
69
|
-
|
|
70
|
-
return connections[connection_name]
|
|
71
|
-
|
|
72
87
|
def set(self, token: DBToken, connection_name: str = "default"):
|
|
73
88
|
if connection_name in self.tokens:
|
|
74
89
|
return
|
|
@@ -77,11 +92,7 @@ class DBTokenManager:
|
|
|
77
92
|
from django.db.backends.signals import connection_created
|
|
78
93
|
|
|
79
94
|
def set_token_wrapper(execute, sql, params, many, context):
|
|
80
|
-
not_in_atomic_block =
|
|
81
|
-
context is None
|
|
82
|
-
or "connection" not in context
|
|
83
|
-
or not context["connection"].in_atomic_block
|
|
84
|
-
)
|
|
95
|
+
not_in_atomic_block = not context["connection"].in_atomic_block
|
|
85
96
|
# ignore atomic blocks
|
|
86
97
|
if not_in_atomic_block:
|
|
87
98
|
sql = token.token_query + sql
|
|
@@ -98,7 +109,7 @@ class DBTokenManager:
|
|
|
98
109
|
result.nextset()
|
|
99
110
|
return result
|
|
100
111
|
|
|
101
|
-
|
|
112
|
+
get_connection(connection_name).execute_wrappers.append(set_token_wrapper)
|
|
102
113
|
|
|
103
114
|
def connection_callback(sender, connection, **kwargs):
|
|
104
115
|
if (
|
|
@@ -124,7 +135,7 @@ class DBTokenManager:
|
|
|
124
135
|
if connection_name in self.tokens:
|
|
125
136
|
# here we don't use the connection from the closure
|
|
126
137
|
# because Atomic is a single class to manage transactions for all connections
|
|
127
|
-
connection =
|
|
138
|
+
connection = get_connection(connection_name)
|
|
128
139
|
if len(connection.atomic_blocks) == 1:
|
|
129
140
|
token = self.tokens[connection_name]
|
|
130
141
|
# use raw psycopg2 connection here
|
|
@@ -142,7 +153,7 @@ class DBTokenManager:
|
|
|
142
153
|
|
|
143
154
|
from django.db.backends.signals import connection_created
|
|
144
155
|
|
|
145
|
-
connection =
|
|
156
|
+
connection = get_connection(connection_name)
|
|
146
157
|
|
|
147
158
|
connection.execute_wrappers = [
|
|
148
159
|
w
|
|
@@ -238,6 +249,8 @@ def setup_django(
|
|
|
238
249
|
if view_schema:
|
|
239
250
|
installed_apps = installed_apps[::-1] # to fix how apps appear
|
|
240
251
|
installed_apps += ["schema_graph", "django.contrib.staticfiles"]
|
|
252
|
+
if isettings.dialect == "postgresql":
|
|
253
|
+
installed_apps.insert(0, "pgtrigger")
|
|
241
254
|
|
|
242
255
|
kwargs = dict(
|
|
243
256
|
INSTALLED_APPS=installed_apps,
|
|
@@ -289,6 +302,9 @@ def setup_django(
|
|
|
289
302
|
django.db.connections._connections = threading.local()
|
|
290
303
|
logger.debug("django.db.connections._connections has been patched")
|
|
291
304
|
|
|
305
|
+
# error if trying to query with the default connection without setting up an instance
|
|
306
|
+
get_connection("default").execute_wrappers.insert(0, error_no_instance_wrapper)
|
|
307
|
+
|
|
292
308
|
if isettings._fine_grained_access and isettings._db_permissions == "jwt":
|
|
293
309
|
db_token = DBToken(isettings)
|
|
294
310
|
db_token_manager.set(db_token) # sets for the default connection
|
|
@@ -309,10 +325,24 @@ def setup_django(
|
|
|
309
325
|
call_command("migrate", app_name, app_number, verbosity=2)
|
|
310
326
|
isettings._update_cloud_sqlite_file(unlock_cloud_sqlite=False)
|
|
311
327
|
elif init:
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
328
|
+
modules_beyond_bionty = isettings.modules.copy()
|
|
329
|
+
compressed_sqlite_path = Path(__file__).parent / "lamin.db.gz"
|
|
330
|
+
if "bionty" in modules_beyond_bionty:
|
|
331
|
+
modules_beyond_bionty.remove("bionty")
|
|
332
|
+
if (
|
|
333
|
+
isettings.dialect == "postgresql"
|
|
334
|
+
or os.getenv("LAMINDB_INIT_FROM_SCRATCH", "false") == "true"
|
|
335
|
+
or len(modules_beyond_bionty) > 0
|
|
336
|
+
or not compressed_sqlite_path.exists()
|
|
337
|
+
):
|
|
338
|
+
global IS_MIGRATING
|
|
339
|
+
IS_MIGRATING = True
|
|
340
|
+
call_command("migrate", verbosity=0)
|
|
341
|
+
IS_MIGRATING = False
|
|
342
|
+
else:
|
|
343
|
+
with gzip.open(compressed_sqlite_path, "rb") as f_in:
|
|
344
|
+
with open(isettings._sqlite_file_local, "wb") as f_out:
|
|
345
|
+
shutil.copyfileobj(f_in, f_out)
|
|
316
346
|
|
|
317
347
|
global IS_SETUP
|
|
318
348
|
IS_SETUP = True
|
|
Binary file
|
lamindb_setup/core/upath.py
CHANGED
|
@@ -93,10 +93,12 @@ def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
|
|
|
93
93
|
else:
|
|
94
94
|
return suffix
|
|
95
95
|
|
|
96
|
-
|
|
96
|
+
suffixes = path.suffixes
|
|
97
|
+
|
|
98
|
+
if len(suffixes) <= 1:
|
|
97
99
|
return process_digits(path.suffix)
|
|
98
100
|
|
|
99
|
-
total_suffix = "".join(
|
|
101
|
+
total_suffix = "".join(suffixes)
|
|
100
102
|
if total_suffix in VALID_SIMPLE_SUFFIXES:
|
|
101
103
|
return total_suffix
|
|
102
104
|
elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
|
|
@@ -115,14 +117,24 @@ def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
|
|
|
115
117
|
# in COMPRESSION_SUFFIXES to detect something like .random.gz and then
|
|
116
118
|
# add ".random.gz" but concluded it's too dangerous; it's safer to just
|
|
117
119
|
# use ".gz" in such a case
|
|
118
|
-
if
|
|
119
|
-
suffix = "".join(
|
|
120
|
-
|
|
120
|
+
if suffixes[-2] in VALID_SIMPLE_SUFFIXES:
|
|
121
|
+
suffix = "".join(suffixes[-2:])
|
|
122
|
+
# if the suffix preceding the compression suffixes is a valid suffix,
|
|
123
|
+
# we account for it; otherwise we don't.
|
|
124
|
+
# i.e. we should have .h5ad.tar.gz or .csv.tar.gz, not just .tar.gz
|
|
125
|
+
if (
|
|
126
|
+
suffix == ".tar.gz"
|
|
127
|
+
and len(suffixes) > 2
|
|
128
|
+
and (suffix_3 := suffixes[-3]) in VALID_SIMPLE_SUFFIXES
|
|
129
|
+
):
|
|
130
|
+
suffix = suffix_3 + suffix
|
|
121
131
|
# do not print a warning for things like .tar.gz, .fastq.gz
|
|
122
|
-
if
|
|
132
|
+
if suffixes[-1] == ".gz":
|
|
123
133
|
print_hint = False
|
|
134
|
+
else:
|
|
135
|
+
msg += f"inferring: '{suffix}'"
|
|
124
136
|
else:
|
|
125
|
-
suffix =
|
|
137
|
+
suffix = suffixes[-1] # this is equivalent to path.suffix
|
|
126
138
|
msg += (
|
|
127
139
|
f"using only last suffix: '{suffix}' - if you want your composite"
|
|
128
140
|
" suffix to be recognized add it to"
|
|
@@ -993,13 +1005,22 @@ def check_storage_is_empty(
|
|
|
993
1005
|
objects = [o for o in objects if "/.lamindb/_exclusion/" not in o]
|
|
994
1006
|
n_files = len(objects)
|
|
995
1007
|
n_diff = n_files - n_offset_objects
|
|
996
|
-
ask_for_deletion = (
|
|
997
|
-
"delete them prior to deleting the storage location"
|
|
998
|
-
if raise_error
|
|
999
|
-
else "consider deleting them"
|
|
1000
|
-
)
|
|
1001
|
-
message = f"'{directory_string}' contains {n_diff} objects - {ask_for_deletion}"
|
|
1002
1008
|
if n_diff > 0:
|
|
1009
|
+
ask_for_deletion = (
|
|
1010
|
+
"delete them prior to deleting the storage location"
|
|
1011
|
+
if raise_error
|
|
1012
|
+
else "consider deleting them"
|
|
1013
|
+
)
|
|
1014
|
+
message = f"'{directory_string}' contains {n_diff} objects:\n"
|
|
1015
|
+
message += "\n".join(
|
|
1016
|
+
[
|
|
1017
|
+
o
|
|
1018
|
+
for o in objects
|
|
1019
|
+
if not o.endswith(".lamindb/storage_uid.txt")
|
|
1020
|
+
and not (account_for_sqlite_file and o.endswith(".lamindb/lamin.db"))
|
|
1021
|
+
]
|
|
1022
|
+
)
|
|
1023
|
+
message += f"\n{ask_for_deletion}"
|
|
1003
1024
|
if raise_error:
|
|
1004
1025
|
raise StorageNotEmpty(message) from None
|
|
1005
1026
|
else:
|
lamindb_setup/errors.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""Errors.
|
|
2
2
|
|
|
3
3
|
.. autoexception:: CurrentInstanceNotConfigured
|
|
4
|
-
.. autoexception:: InstanceNotSetupError
|
|
5
4
|
.. autoexception:: ModuleWasntConfigured
|
|
6
5
|
.. autoexception:: StorageAlreadyManaged
|
|
7
6
|
.. autoexception:: StorageNotEmpty
|
|
@@ -25,17 +24,6 @@ class DefaultMessageException(Exception):
|
|
|
25
24
|
super().__init__(message)
|
|
26
25
|
|
|
27
26
|
|
|
28
|
-
# TODO: remove this exception sooner or later because we don't have a need for it anymore
|
|
29
|
-
class InstanceNotSetupError(DefaultMessageException):
|
|
30
|
-
default_message = """\
|
|
31
|
-
To use lamindb, you need to connect to an instance.
|
|
32
|
-
|
|
33
|
-
Connect to an instance: `ln.connect()`. Init an instance: `ln.setup.init()`.
|
|
34
|
-
|
|
35
|
-
If you used the CLI to set up lamindb in a notebook, restart the Python session.
|
|
36
|
-
"""
|
|
37
|
-
|
|
38
|
-
|
|
39
27
|
class CurrentInstanceNotConfigured(DefaultMessageException):
|
|
40
28
|
default_message = """\
|
|
41
29
|
No instance is connected! Call
|
lamindb_setup/io.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import io
|
|
4
4
|
import json
|
|
5
5
|
import warnings
|
|
6
|
-
from concurrent.futures import
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
7
|
from importlib import import_module
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import TYPE_CHECKING
|
|
@@ -14,24 +14,16 @@ from django.db import models, transaction
|
|
|
14
14
|
from rich.progress import Progress
|
|
15
15
|
|
|
16
16
|
if TYPE_CHECKING:
|
|
17
|
-
from collections.abc import
|
|
17
|
+
from collections.abc import Iterable
|
|
18
18
|
from typing import Literal
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def _get_registries(module_name: str) -> list[str]:
|
|
22
22
|
"""Get registry class names from a module."""
|
|
23
23
|
schema_module = import_module(module_name)
|
|
24
|
-
exclude = {"SQLRecord", "BaseSQLRecord"}
|
|
25
24
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
f"{module_name}.models."
|
|
29
|
-
) and name in dir(schema_module)
|
|
30
|
-
else:
|
|
31
|
-
module_filter = (
|
|
32
|
-
lambda cls, name: cls.__module__ == f"{module_name}.models"
|
|
33
|
-
and name in dir(schema_module)
|
|
34
|
-
)
|
|
25
|
+
# Ensure that models are loaded; we've observed empty exports otherwise
|
|
26
|
+
from django.db import models
|
|
35
27
|
|
|
36
28
|
return [
|
|
37
29
|
name
|
|
@@ -40,8 +32,8 @@ def _get_registries(module_name: str) -> list[str]:
|
|
|
40
32
|
name[0].isupper()
|
|
41
33
|
and isinstance(cls := getattr(schema_module.models, name, None), type)
|
|
42
34
|
and issubclass(cls, models.Model)
|
|
43
|
-
|
|
44
|
-
and
|
|
35
|
+
# Models whose table name starts with `None_` are abstract base classes or Django mixins
|
|
36
|
+
and not cls._meta.db_table.startswith("None_") # type: ignore
|
|
45
37
|
)
|
|
46
38
|
]
|
|
47
39
|
|
|
@@ -59,7 +51,7 @@ def _export_full_table(
|
|
|
59
51
|
For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.
|
|
60
52
|
|
|
61
53
|
Args:
|
|
62
|
-
registry_info: Tuple of (module_name, model_name, field_name) where field_name
|
|
54
|
+
registry_info: Tuple of (module_name, model_name, field_name) where `field_name`
|
|
63
55
|
is None for regular tables or the field name for M2M link tables.
|
|
64
56
|
directory: Output directory for parquet files.
|
|
65
57
|
chunk_size: Maximum rows per chunk for SQLite large tables.
|
|
@@ -73,7 +65,7 @@ def _export_full_table(
|
|
|
73
65
|
|
|
74
66
|
module_name, model_name, field_name = registry_info
|
|
75
67
|
schema_module = import_module(module_name)
|
|
76
|
-
registry = getattr(schema_module, model_name)
|
|
68
|
+
registry = getattr(schema_module.models, model_name)
|
|
77
69
|
|
|
78
70
|
if field_name:
|
|
79
71
|
registry = getattr(registry, field_name).through
|
|
@@ -84,12 +76,19 @@ def _export_full_table(
|
|
|
84
76
|
if ln_setup.settings.instance.dialect == "postgresql":
|
|
85
77
|
buffer = io.StringIO()
|
|
86
78
|
with connection.cursor() as cursor:
|
|
79
|
+
cursor.execute("SET statement_timeout = 0")
|
|
87
80
|
cursor.copy_expert(
|
|
88
81
|
f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
|
|
89
82
|
buffer,
|
|
90
83
|
)
|
|
91
84
|
buffer.seek(0)
|
|
92
|
-
|
|
85
|
+
# Prevent pandas from converting empty strings to float NaN (which PyArrow rejects)
|
|
86
|
+
df = pd.read_csv(buffer, keep_default_na=False)
|
|
87
|
+
# Convert object columns to string to handle mixed types from data corruption,
|
|
88
|
+
# schema migrations, or manual SQL inserts. PyArrow rejects mixed-type objects.
|
|
89
|
+
df = df.astype(
|
|
90
|
+
{col: str for col in df.columns if df[col].dtype == "object"}
|
|
91
|
+
)
|
|
93
92
|
df.to_parquet(directory / f"{table_name}.parquet", compression=None)
|
|
94
93
|
return (
|
|
95
94
|
f"{module_name}.{model_name}.{field_name}"
|
|
@@ -118,11 +117,21 @@ def _export_full_table(
|
|
|
118
117
|
chunk_file = (
|
|
119
118
|
directory / f"{table_name}_chunk_{chunk_id}.parquet"
|
|
120
119
|
)
|
|
120
|
+
df = df.astype(
|
|
121
|
+
{
|
|
122
|
+
col: str
|
|
123
|
+
for col in df.columns
|
|
124
|
+
if df[col].dtype == "object"
|
|
125
|
+
}
|
|
126
|
+
)
|
|
121
127
|
df.to_parquet(chunk_file, compression=None)
|
|
122
128
|
chunk_files.append((table_name, chunk_file))
|
|
123
129
|
return chunk_files
|
|
124
130
|
else:
|
|
125
131
|
df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
|
|
132
|
+
df = df.astype(
|
|
133
|
+
{col: str for col in df.columns if df[col].dtype == "object"}
|
|
134
|
+
)
|
|
126
135
|
df.to_parquet(directory / f"{table_name}.parquet", compression=None)
|
|
127
136
|
return (
|
|
128
137
|
f"{module_name}.{model_name}.{field_name}"
|
|
@@ -136,9 +145,9 @@ def _export_full_table(
|
|
|
136
145
|
|
|
137
146
|
|
|
138
147
|
def export_db(
|
|
139
|
-
module_names:
|
|
148
|
+
module_names: Iterable[str] | None = None,
|
|
140
149
|
*,
|
|
141
|
-
output_dir: str | Path =
|
|
150
|
+
output_dir: str | Path | None = None,
|
|
142
151
|
max_workers: int = 8,
|
|
143
152
|
chunk_size: int = 500_000,
|
|
144
153
|
) -> None:
|
|
@@ -153,6 +162,11 @@ def export_db(
|
|
|
153
162
|
max_workers: Number of parallel worker threads.
|
|
154
163
|
chunk_size: Number of rows per chunk for large tables.
|
|
155
164
|
"""
|
|
165
|
+
import lamindb_setup as ln_setup
|
|
166
|
+
|
|
167
|
+
if output_dir is None:
|
|
168
|
+
output_dir = f"./{ln_setup.settings.instance.name}_export/"
|
|
169
|
+
|
|
156
170
|
directory = Path(output_dir)
|
|
157
171
|
directory.mkdir(parents=True, exist_ok=True)
|
|
158
172
|
|
|
@@ -163,7 +177,7 @@ def export_db(
|
|
|
163
177
|
for module_name, model_names in modules.items():
|
|
164
178
|
schema_module = import_module(module_name)
|
|
165
179
|
for model_name in model_names:
|
|
166
|
-
registry = getattr(schema_module, model_name)
|
|
180
|
+
registry = getattr(schema_module.models, model_name)
|
|
167
181
|
tasks.append((module_name, model_name, None))
|
|
168
182
|
for field in registry._meta.many_to_many:
|
|
169
183
|
tasks.append((module_name, model_name, field.name))
|
|
@@ -173,13 +187,8 @@ def export_db(
|
|
|
173
187
|
with Progress() as progress:
|
|
174
188
|
task_id = progress.add_task("Exporting", total=len(tasks))
|
|
175
189
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
mp_context = multiprocessing.get_context("spawn")
|
|
179
|
-
|
|
180
|
-
with ProcessPoolExecutor(
|
|
181
|
-
max_workers=max_workers, mp_context=mp_context
|
|
182
|
-
) as executor:
|
|
190
|
+
# This must be a ThreadPoolExecutor and not a ProcessPoolExecutor to inherit JWTs
|
|
191
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
183
192
|
futures = {
|
|
184
193
|
executor.submit(_export_full_table, task, directory, chunk_size): task
|
|
185
194
|
for task in tasks
|
|
@@ -229,7 +238,6 @@ def _import_registry(
|
|
|
229
238
|
parquet_file = directory / f"{table_name}.parquet"
|
|
230
239
|
|
|
231
240
|
if not parquet_file.exists():
|
|
232
|
-
print(f"Skipped {table_name} (file not found)")
|
|
233
241
|
return
|
|
234
242
|
|
|
235
243
|
df = pd.read_parquet(parquet_file)
|
|
@@ -244,12 +252,37 @@ def _import_registry(
|
|
|
244
252
|
if mask.any():
|
|
245
253
|
df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)
|
|
246
254
|
|
|
255
|
+
for field in registry._meta.fields:
|
|
256
|
+
# Convert PostgreSQL boolean string literals ('t'/'f') to Python booleans for SQLite compatibility
|
|
257
|
+
if field.get_internal_type() == "BooleanField" and field.column in df.columns:
|
|
258
|
+
df[field.column] = df[field.column].map(
|
|
259
|
+
{"t": True, "f": False, True: True, False: False, None: None}
|
|
260
|
+
)
|
|
261
|
+
|
|
262
|
+
# PostgreSQL CSV export writes NULL as empty string; convert back to None for nullable fields
|
|
263
|
+
if field.null and field.column in df.columns:
|
|
264
|
+
df[field.column] = df[field.column].replace("", None)
|
|
265
|
+
|
|
266
|
+
# Convert numeric fields from strings to proper types for SQLite
|
|
267
|
+
if (
|
|
268
|
+
field.get_internal_type()
|
|
269
|
+
in (
|
|
270
|
+
"IntegerField",
|
|
271
|
+
"BigIntegerField",
|
|
272
|
+
"PositiveIntegerField",
|
|
273
|
+
"FloatField",
|
|
274
|
+
"DecimalField",
|
|
275
|
+
)
|
|
276
|
+
and field.column in df.columns
|
|
277
|
+
):
|
|
278
|
+
df[field.column] = pd.to_numeric(df[field.column], errors="coerce")
|
|
279
|
+
|
|
247
280
|
if if_exists == "append":
|
|
248
281
|
# Fill NULL values in NOT NULL columns to handle schema mismatches between postgres source and SQLite target
|
|
249
282
|
# This allows importing data where fields were nullable
|
|
250
283
|
for field in registry._meta.fields:
|
|
251
284
|
if field.column in df.columns and not field.null:
|
|
252
|
-
df[field.column] = df[field.column].fillna("")
|
|
285
|
+
df[field.column] = df[field.column].fillna("").infer_objects(copy=False)
|
|
253
286
|
|
|
254
287
|
if df.empty:
|
|
255
288
|
return
|
|
@@ -297,7 +330,7 @@ def _import_registry(
|
|
|
297
330
|
|
|
298
331
|
|
|
299
332
|
def import_db(
|
|
300
|
-
module_names:
|
|
333
|
+
module_names: Iterable[str] | None = None,
|
|
301
334
|
*,
|
|
302
335
|
input_dir: str | Path = "./lamindb_export/",
|
|
303
336
|
if_exists: Literal["fail", "replace", "append"] = "replace",
|
|
@@ -307,12 +340,15 @@ def import_db(
|
|
|
307
340
|
Temporarily disables FK constraints to allow insertion in arbitrary order.
|
|
308
341
|
Requires superuser/RDS admin privileges for postgres databases.
|
|
309
342
|
|
|
343
|
+
Note: When running in a subprocess, add a short delay or explicit connection close after `import_db()`
|
|
344
|
+
to ensure all SQLite writes are flushed to disk before process termination.
|
|
345
|
+
|
|
310
346
|
Args:
|
|
311
347
|
input_dir: Directory containing parquet files to import.
|
|
312
348
|
module_names: Module names to import (e.g., ["lamindb", "bionty", "wetlab"]).
|
|
313
349
|
if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
|
|
314
|
-
If set to 'replace', existing data is deleted and new data is imported. PKs and indices are not guaranteed to be preserved which can lead to write errors.
|
|
315
|
-
If set to 'append', new data is added to existing data without clearing the table. PKs and indices are preserved but database size will greatly increase.
|
|
350
|
+
If set to 'replace', existing data is deleted and new data is imported. Not all PKs and indices are guaranteed to be preserved, which can lead to write errors.
|
|
351
|
+
If set to 'append', new data is added to existing data without clearing the table. All PKs and indices are preserved, allowing write operations, but the database size will greatly increase.
|
|
316
352
|
If set to 'fail', raises an error if the table contains any data.
|
|
317
353
|
"""
|
|
318
354
|
from django.db import connection
|
|
@@ -362,7 +398,7 @@ def import_db(
|
|
|
362
398
|
progress.update(
|
|
363
399
|
task, description=f"[cyan]{module_name}.{model_name}"
|
|
364
400
|
)
|
|
365
|
-
registry = getattr(schema_module, model_name)
|
|
401
|
+
registry = getattr(schema_module.models, model_name)
|
|
366
402
|
_import_registry(registry, directory, if_exists=if_exists)
|
|
367
403
|
for field in registry._meta.many_to_many:
|
|
368
404
|
link_orm = getattr(registry, field.name).through
|