lamindb_setup 1.15.2__py3-none-any.whl → 1.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -46,6 +46,12 @@ def _process_cache_path(cache_path: UPathStr | None) -> UPath | None:
     return cache_dir
 
 
+# returned by settings.branch for none/none instance
+class MainBranchMock:
+    id = 1
+    name = "main"
+
+
 class SetupSettings:
     """Setup settings."""
 
@@ -140,6 +146,10 @@ class SetupSettings:
     # and we never need a DB request
     def branch(self) -> Branch:
         """Default branch."""
+        # this is needed for .filter() with non-default connections
+        if not self._instance_exists:
+            return MainBranchMock()
+
         if self._branch is None:
            from lamindb import Branch
 
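Net effect: with no instance connected, `settings.branch` returns the lightweight mock instead of querying the database. A minimal sketch of the intended behavior (not part of the diff itself):

    import lamindb_setup as ln_setup

    # with the "none/none" placeholder instance, no DB request is made
    branch = ln_setup.settings.branch
    assert branch.id == 1 and branch.name == "main"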
@@ -222,10 +232,9 @@ class SetupSettings:
         If `True`, the current instance is connected, meaning that the db and other settings
         are properly configured for use.
         """
-        if self._instance_exists:
-            return self.instance.slug != "none/none"
-        else:
-            return False
+        from . import django
+
+        return self._instance_exists and django.IS_SETUP
 
     @property
     def private_django_api(self) -> bool:
@@ -284,12 +293,7 @@ class SetupSettings:
 
     @property
     def _instance_exists(self):
-        try:
-            self.instance  # noqa
-            return True
-        # this is implicit logic that catches if no instance is loaded
-        except CurrentInstanceNotConfigured:
-            return False
+        return self.instance.slug != "none/none"
 
     @property
     def cache_dir(self) -> UPath:
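Taken together, the two properties now compose instead of duplicating logic: `_instance_exists` means the loaded settings are not the "none/none" placeholder, and `is_connected` additionally requires that Django has been set up. An illustrative check (not from the diff):

    import lamindb_setup as ln_setup

    if not ln_setup.settings.is_connected:
        print("no instance connected; queries on the default connection will raise")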
@@ -125,10 +125,11 @@ class InstanceSettings:
         if self._local_storage is not None:
             value_local = self.local_storage
             representation += f"\n - local storage: {value_local.root_as_str} ({value_local.region})"
-            representation += (
-                f"\n - cloud storage: {value.root_as_str} ({value.region})"
-            )
-        else:
+            if value is not None:
+                representation += (
+                    f"\n - cloud storage: {value.root_as_str} ({value.region})"
+                )
+        elif value is not None:
             representation += (
                 f"\n - storage: {value.root_as_str} ({value.region})"
             )
@@ -513,17 +514,36 @@ class InstanceSettings:
 
     @property
     def dialect(self) -> Literal["sqlite", "postgresql"]:
-        """SQL dialect."""
+        """SQL dialect.
+
+        Equivalent to :attr:`vendor`.
+
+        "vendor" is the Django terminology for the type of database. "dialect" is the SQLAlchemy terminology.
+        """
         if self._db is None or self._db.startswith("sqlite://"):
             return "sqlite"
         else:
             assert self._db.startswith("postgresql"), f"Unexpected DB value: {self._db}"
             return "postgresql"
 
+    @property
+    def vendor(self) -> Literal["sqlite", "postgresql"]:
+        """Database vendor.
+
+        Equivalent to :attr:`dialect`.
+
+        "vendor" is the Django terminology for the type of database. "dialect" is the SQLAlchemy terminology.
+        """
+        return self.dialect
+
     @property
     def _is_cloud_sqlite(self) -> bool:
         """Is this a cloud instance with sqlite db."""
-        return self.dialect == "sqlite" and self.storage.type_is_cloud
+        return (
+            self.dialect == "sqlite"
+            and self.storage is not None
+            and self.storage.type_is_cloud
+        )
 
     @property
     def _cloud_sqlite_locker(self):
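The new alias is purely cosmetic; both properties return the same value, e.g.:

    import lamindb_setup as ln_setup

    instance = ln_setup.settings.instance
    assert instance.vendor == instance.dialect  # "sqlite" or "postgresql"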
@@ -543,6 +563,8 @@ class InstanceSettings:
     @property
     def is_remote(self) -> bool:
         """Boolean indicating if an instance has no local component."""
+        if self.storage is None and self.db == "sqlite:///:memory:":
+            return False
         return check_is_instance_remote(self.storage.root_as_str, self.db)
 
     @property
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+from importlib.util import find_spec
 from pathlib import Path
 from typing import TYPE_CHECKING
 from uuid import UUID, uuid4
@@ -46,19 +47,36 @@ def load_cache_path_from_settings(storage_settings: Path | None = None) -> Path
     return None
 
 
+def find_module_candidates():
+    """Find all local packages that depend on lamindb."""
+    candidates = ["bionty", "wetlab"]
+    return [c for c in candidates if find_spec(c) is not None]
+
+
 def load_instance_settings(instance_settings_file: Path | None = None):
     if instance_settings_file is None:
-        instance_settings_file = current_instance_settings_file()
-    if not instance_settings_file.exists():
+        isettings_file = current_instance_settings_file()
+        if not isettings_file.exists():
+            isettings = InstanceSettings(
+                id=UUID("00000000-0000-0000-0000-000000000000"),
+                owner="none",
+                name="none",
+                storage=None,
+                modules=",".join(find_module_candidates()),
+            )
+            return isettings
+    else:
+        isettings_file = instance_settings_file
+
+    if not isettings_file.exists():
+        # this errors only if the file was explicitly provided
         raise CurrentInstanceNotConfigured
     try:
-        settings_store = InstanceSettingsStore(_env_file=instance_settings_file)
+        settings_store = InstanceSettingsStore(_env_file=isettings_file)
     except (ValidationError, TypeError) as error:
-        with open(instance_settings_file) as f:
-            content = f.read()
         raise SettingsEnvFileOutdated(
-            f"\n\n{error}\n\nYour instance settings file with\n\n{content}\nis invalid"
-            f" (likely outdated), see validation error. Please delete {instance_settings_file} &"
+            f"\n\n{error}\n\nYour instance settings file with\n\n{isettings_file.read_text()}\nis invalid"
+            f" (likely outdated), see validation error. Please delete {isettings_file} &"
             " reload (remote) or re-initialize (local) the instance with the same name & storage location."
         ) from error
     isettings = setup_instance_from_store(settings_store)
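A missing settings file thus no longer raises when the path wasn't passed explicitly; the caller gets a "none/none" placeholder instance with any locally installed modules (bionty, wetlab) pre-listed. A sketch of the two paths (assumed behavior):

    from pathlib import Path

    # implicit: no file on disk -> placeholder InstanceSettings for "none/none"
    isettings = load_instance_settings()

    # explicit: a missing file is still an error
    load_instance_settings(Path("/no/such/file.env"))  # raises CurrentInstanceNotConfigured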
@@ -122,6 +122,7 @@ def init_storage(
     access_token: str | None = None,
     region: str | None = None,
     space_uuid: UUID | None = None,
+    skip_mark_storage_root: bool = False,
 ) -> tuple[
     StorageSettings,
     Literal["hub-record-not-created", "hub-record-retrieved", "hub-record-created"],
@@ -181,7 +182,8 @@
         space_id=space_uuid,
     )
     # we check the write access here if the storage record has not been retrieved from the hub
-    if hub_record_status != "hub-record-retrieved":
+    # Sergei: should it in fact still go through if hub_record_status == "hub-record-not-created"?
+    if hub_record_status != "hub-record-retrieved" and not skip_mark_storage_root:
         try:
             # (federated) credentials for AWS access are provisioned under-the-hood
             # discussion: https://laminlabs.slack.com/archives/C04FPE8V01W/p1719260587167489
@@ -5,13 +5,15 @@ import builtins
 import os
 import sys
 import importlib as il
+import gzip
 import jwt
 import time
 import threading
 from pathlib import Path
+import shutil
 from packaging import version
 from ._settings_instance import InstanceSettings, is_local_db_url
-
+from ..errors import CurrentInstanceNotConfigured
 from lamin_utils import logger
 
 
@@ -21,6 +23,24 @@ IS_MIGRATING = False
 CONN_MAX_AGE = 299
 
 
+def get_connection(connection_name: str):
+    from django.db import connections
+
+    return connections[connection_name]
+
+
+def error_no_instance_wrapper(execute, sql, params, many, context):
+    connection = context["connection"]
+
+    if (
+        connection.vendor == "sqlite"
+        and connection.settings_dict.get("NAME") == ":memory:"
+    ):
+        raise CurrentInstanceNotConfigured
+
+    return execute(sql, params, many, context)
+
+
 # db token that refreshes on access if needed
 class DBToken:
     def __init__(
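For context, `execute_wrappers` is Django's hook for intercepting every query on a connection: a wrapper receives `(execute, sql, params, many, context)` and must call `execute` to let the query proceed. A standalone illustration (not part of the diff):

    from django.db import connection

    def log_sql(execute, sql, params, many, context):
        print(sql)  # runs around every query on this connection
        return execute(sql, params, many, context)

    with connection.execute_wrapper(log_sql):
        ...  # any ORM query here is logged first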
@@ -64,11 +84,6 @@ class DBTokenManager:
 
         self.tokens: dict[str, DBToken] = {}
 
-    def get_connection(self, connection_name: str):
-        from django.db import connections
-
-        return connections[connection_name]
-
     def set(self, token: DBToken, connection_name: str = "default"):
         if connection_name in self.tokens:
             return
@@ -77,11 +92,7 @@
         from django.db.backends.signals import connection_created
 
         def set_token_wrapper(execute, sql, params, many, context):
-            not_in_atomic_block = (
-                context is None
-                or "connection" not in context
-                or not context["connection"].in_atomic_block
-            )
+            not_in_atomic_block = not context["connection"].in_atomic_block
             # ignore atomic blocks
             if not_in_atomic_block:
                 sql = token.token_query + sql
@@ -98,7 +109,7 @@
                 result.nextset()
                 return result
 
-        self.get_connection(connection_name).execute_wrappers.append(set_token_wrapper)
+        get_connection(connection_name).execute_wrappers.append(set_token_wrapper)
 
         def connection_callback(sender, connection, **kwargs):
             if (
@@ -124,7 +135,7 @@
         if connection_name in self.tokens:
             # here we don't use the connection from the closure
             # because Atomic is a single class to manage transactions for all connections
-            connection = self.get_connection(connection_name)
+            connection = get_connection(connection_name)
             if len(connection.atomic_blocks) == 1:
                 token = self.tokens[connection_name]
                 # use raw psycopg2 connection here
@@ -142,7 +153,7 @@
 
         from django.db.backends.signals import connection_created
 
-        connection = self.get_connection(connection_name)
+        connection = get_connection(connection_name)
 
         connection.execute_wrappers = [
             w
@@ -238,6 +249,8 @@ def setup_django(
     if view_schema:
         installed_apps = installed_apps[::-1]  # to fix how apps appear
         installed_apps += ["schema_graph", "django.contrib.staticfiles"]
+    if isettings.dialect == "postgresql":
+        installed_apps.insert(0, "pgtrigger")
 
     kwargs = dict(
         INSTALLED_APPS=installed_apps,
@@ -289,6 +302,9 @@ def setup_django(
         django.db.connections._connections = threading.local()
         logger.debug("django.db.connections._connections has been patched")
 
+        # error if trying to query with the default connection without setting up an instance
+        get_connection("default").execute_wrappers.insert(0, error_no_instance_wrapper)
+
         if isettings._fine_grained_access and isettings._db_permissions == "jwt":
             db_token = DBToken(isettings)
             db_token_manager.set(db_token)  # sets for the default connection
@@ -309,10 +325,24 @@ def setup_django(
             call_command("migrate", app_name, app_number, verbosity=2)
             isettings._update_cloud_sqlite_file(unlock_cloud_sqlite=False)
         elif init:
-            global IS_MIGRATING
-            IS_MIGRATING = True
-            call_command("migrate", verbosity=0)
-            IS_MIGRATING = False
+            modules_beyond_bionty = isettings.modules.copy()
+            compressed_sqlite_path = Path(__file__).parent / "lamin.db.gz"
+            if "bionty" in modules_beyond_bionty:
+                modules_beyond_bionty.remove("bionty")
+            if (
+                isettings.dialect == "postgresql"
+                or os.getenv("LAMINDB_INIT_FROM_SCRATCH", "false") == "true"
+                or len(modules_beyond_bionty) > 0
+                or not compressed_sqlite_path.exists()
+            ):
+                global IS_MIGRATING
+                IS_MIGRATING = True
+                call_command("migrate", verbosity=0)
+                IS_MIGRATING = False
+            else:
+                with gzip.open(compressed_sqlite_path, "rb") as f_in:
+                    with open(isettings._sqlite_file_local, "wb") as f_out:
+                        shutil.copyfileobj(f_in, f_out)
 
     global IS_SETUP
     IS_SETUP = True
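The packaged `lamin.db.gz` evidently serves as a pre-migrated SQLite template: for a plain (at most bionty) SQLite instance, decompressing it replaces replaying the full migration history. Setting `LAMINDB_INIT_FROM_SCRATCH=true` opts back into migrations, e.g.:

    import os

    # force migration-based init instead of the bundled template
    os.environ["LAMINDB_INIT_FROM_SCRATCH"] = "true"

    import lamindb_setup as ln_setup

    ln_setup.init(storage="./mydata")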
Binary file (contents not shown)
@@ -93,10 +93,12 @@ def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
     else:
         return suffix
 
-    if len(path.suffixes) <= 1:
+    suffixes = path.suffixes
+
+    if len(suffixes) <= 1:
         return process_digits(path.suffix)
 
-    total_suffix = "".join(path.suffixes)
+    total_suffix = "".join(suffixes)
     if total_suffix in VALID_SIMPLE_SUFFIXES:
         return total_suffix
     elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
@@ -115,14 +117,24 @@ def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
         # in COMPRESSION_SUFFIXES to detect something like .random.gz and then
         # add ".random.gz" but concluded it's too dangerous it's safer to just
         # use ".gz" in such a case
-        if path.suffixes[-2] in VALID_SIMPLE_SUFFIXES:
-            suffix = "".join(path.suffixes[-2:])
-            msg += f"inferring: '{suffix}'"
+        if suffixes[-2] in VALID_SIMPLE_SUFFIXES:
+            suffix = "".join(suffixes[-2:])
+            # if the suffix preceding the compression suffixes is a valid suffix,
+            # we account for it; otherwise we don't.
+            # i.e. we should have .h5ad.tar.gz or .csv.tar.gz, not just .tar.gz
+            if (
+                suffix == ".tar.gz"
+                and len(suffixes) > 2
+                and (suffix_3 := suffixes[-3]) in VALID_SIMPLE_SUFFIXES
+            ):
+                suffix = suffix_3 + suffix
             # do not print a warning for things like .tar.gz, .fastq.gz
-            if path.suffixes[-1] == ".gz":
+            if suffixes[-1] == ".gz":
                 print_hint = False
+            else:
+                msg += f"inferring: '{suffix}'"
         else:
-            suffix = path.suffixes[-1]  # this is equivalent to path.suffix
+            suffix = suffixes[-1]  # this is equivalent to path.suffix
             msg += (
                 f"using only last suffix: '{suffix}' - if you want your composite"
                 " suffix to be recognized add it to"
@@ -993,13 +1005,22 @@ def check_storage_is_empty(
     objects = [o for o in objects if "/.lamindb/_exclusion/" not in o]
     n_files = len(objects)
     n_diff = n_files - n_offset_objects
-    ask_for_deletion = (
-        "delete them prior to deleting the storage location"
-        if raise_error
-        else "consider deleting them"
-    )
-    message = f"'{directory_string}' contains {n_diff} objects - {ask_for_deletion}"
     if n_diff > 0:
+        ask_for_deletion = (
+            "delete them prior to deleting the storage location"
+            if raise_error
+            else "consider deleting them"
+        )
+        message = f"'{directory_string}' contains {n_diff} objects:\n"
+        message += "\n".join(
+            [
+                o
+                for o in objects
+                if not o.endswith(".lamindb/storage_uid.txt")
+                and not (account_for_sqlite_file and o.endswith(".lamindb/lamin.db"))
+            ]
+        )
+        message += f"\n{ask_for_deletion}"
         if raise_error:
             raise StorageNotEmpty(message) from None
         else:
lamindb_setup/errors.py CHANGED
@@ -1,7 +1,6 @@
 """Errors.
 
 .. autoexception:: CurrentInstanceNotConfigured
-.. autoexception:: InstanceNotSetupError
 .. autoexception:: ModuleWasntConfigured
 .. autoexception:: StorageAlreadyManaged
 .. autoexception:: StorageNotEmpty
@@ -25,17 +24,6 @@ class DefaultMessageException(Exception):
         super().__init__(message)
 
 
-# TODO: remove this exception sooner or later because we don't have a need for it anymore
-class InstanceNotSetupError(DefaultMessageException):
-    default_message = """\
-To use lamindb, you need to connect to an instance.
-
-Connect to an instance: `ln.connect()`. Init an instance: `ln.setup.init()`.
-
-If you used the CLI to set up lamindb in a notebook, restart the Python session.
-"""
-
-
 class CurrentInstanceNotConfigured(DefaultMessageException):
     default_message = """\
 No instance is connected! Call
lamindb_setup/io.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import io
 import json
 import warnings
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from importlib import import_module
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -14,24 +14,16 @@ from django.db import models, transaction
 from rich.progress import Progress
 
 if TYPE_CHECKING:
-    from collections.abc import Sequence
+    from collections.abc import Iterable
     from typing import Literal
 
 
 def _get_registries(module_name: str) -> list[str]:
     """Get registry class names from a module."""
     schema_module = import_module(module_name)
-    exclude = {"SQLRecord", "BaseSQLRecord"}
 
-    if module_name == "lamindb":
-        module_filter = lambda cls, name: cls.__module__.startswith(
-            f"{module_name}.models."
-        ) and name in dir(schema_module)
-    else:
-        module_filter = (
-            lambda cls, name: cls.__module__ == f"{module_name}.models"
-            and name in dir(schema_module)
-        )
+    # Ensure that models are loaded; we've observed empty exports otherwise
+    from django.db import models
 
     return [
         name
@@ -40,8 +32,8 @@ def _get_registries(module_name: str) -> list[str]:
             name[0].isupper()
             and isinstance(cls := getattr(schema_module.models, name, None), type)
             and issubclass(cls, models.Model)
-            and module_filter(cls, name)
-            and name not in exclude
+            # Table names starting with `None_` are abstract base classes or Django mixins
+            and not cls._meta.db_table.startswith("None_")  # type: ignore
         )
     ]
 
@@ -59,7 +51,7 @@ def _export_full_table(
     For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.
 
     Args:
-        registry_info: Tuple of (module_name, model_name, field_name) where field_name
+        registry_info: Tuple of (module_name, model_name, field_name) where `field_name`
             is None for regular tables or the field name for M2M link tables.
         directory: Output directory for parquet files.
         chunk_size: Maximum rows per chunk for SQLite large tables.
@@ -73,7 +65,7 @@ def _export_full_table(
 
     module_name, model_name, field_name = registry_info
     schema_module = import_module(module_name)
-    registry = getattr(schema_module, model_name)
+    registry = getattr(schema_module.models, model_name)
 
     if field_name:
         registry = getattr(registry, field_name).through
@@ -84,12 +76,19 @@ def _export_full_table(
     if ln_setup.settings.instance.dialect == "postgresql":
         buffer = io.StringIO()
         with connection.cursor() as cursor:
+            cursor.execute("SET statement_timeout = 0")
             cursor.copy_expert(
                 f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
                 buffer,
             )
         buffer.seek(0)
-        df = pd.read_csv(buffer)
+        # Prevent pandas from converting empty strings to float NaN (which PyArrow rejects)
+        df = pd.read_csv(buffer, keep_default_na=False)
+        # Convert object columns to string to handle mixed types from data corruption,
+        # schema migrations, or manual SQL inserts. PyArrow rejects mixed-type objects.
+        df = df.astype(
+            {col: str for col in df.columns if df[col].dtype == "object"}
+        )
         df.to_parquet(directory / f"{table_name}.parquet", compression=None)
         return (
             f"{module_name}.{model_name}.{field_name}"
@@ -118,11 +117,21 @@ def _export_full_table(
                     chunk_file = (
                         directory / f"{table_name}_chunk_{chunk_id}.parquet"
                     )
+                    df = df.astype(
+                        {
+                            col: str
+                            for col in df.columns
+                            if df[col].dtype == "object"
+                        }
+                    )
                     df.to_parquet(chunk_file, compression=None)
                     chunk_files.append((table_name, chunk_file))
                 return chunk_files
         else:
             df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
+            df = df.astype(
+                {col: str for col in df.columns if df[col].dtype == "object"}
+            )
             df.to_parquet(directory / f"{table_name}.parquet", compression=None)
             return (
                 f"{module_name}.{model_name}.{field_name}"
@@ -136,9 +145,9 @@
 
 
 def export_db(
-    module_names: Sequence[str] | None = None,
+    module_names: Iterable[str] | None = None,
     *,
-    output_dir: str | Path = "./lamindb_export/",
+    output_dir: str | Path | None = None,
     max_workers: int = 8,
     chunk_size: int = 500_000,
 ) -> None:
@@ -153,6 +162,11 @@ def export_db(
         max_workers: Number of parallel processes.
         chunk_size: Number of rows per chunk for large tables.
     """
+    import lamindb_setup as ln_setup
+
+    if output_dir is None:
+        output_dir = f"./{ln_setup.settings.instance.name}_export/"
+
     directory = Path(output_dir)
     directory.mkdir(parents=True, exist_ok=True)
 
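Usage is unchanged apart from the derived default directory - for an instance named `myinstance` (hypothetical), roughly:

    from lamindb_setup.io import export_db, import_db

    export_db(["lamindb", "bionty"])  # now writes to ./myinstance_export/
    import_db(["lamindb", "bionty"], input_dir="./myinstance_export/", if_exists="replace")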
@@ -163,7 +177,7 @@ def export_db(
     for module_name, model_names in modules.items():
         schema_module = import_module(module_name)
         for model_name in model_names:
-            registry = getattr(schema_module, model_name)
+            registry = getattr(schema_module.models, model_name)
             tasks.append((module_name, model_name, None))
             for field in registry._meta.many_to_many:
                 tasks.append((module_name, model_name, field.name))
@@ -173,13 +187,8 @@ def export_db(
     with Progress() as progress:
         task_id = progress.add_task("Exporting", total=len(tasks))
 
-        import multiprocessing
-
-        mp_context = multiprocessing.get_context("spawn")
-
-        with ProcessPoolExecutor(
-            max_workers=max_workers, mp_context=mp_context
-        ) as executor:
+        # This must be a ThreadPoolExecutor and not a ProcessPoolExecutor to inherit JWTs
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
             futures = {
                 executor.submit(_export_full_table, task, directory, chunk_size): task
                 for task in tasks
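The rationale: threads share the parent's in-process state - including the DB tokens the manager attaches to each Django connection - whereas spawned worker processes would start cold and need to re-authenticate. A reduced sketch of the pattern:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    # workers see the same module globals (e.g. cached JWTs) as the main thread
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(job) for job in jobs]
        for future in as_completed(futures):
            future.result()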
@@ -229,7 +238,6 @@ def _import_registry(
     parquet_file = directory / f"{table_name}.parquet"
 
     if not parquet_file.exists():
-        print(f"Skipped {table_name} (file not found)")
         return
 
     df = pd.read_parquet(parquet_file)
@@ -244,12 +252,37 @@ def _import_registry(
         if mask.any():
             df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)
 
+    for field in registry._meta.fields:
+        # Convert PostgreSQL boolean string literals ('t'/'f') to Python booleans for SQLite compatibility
+        if field.get_internal_type() == "BooleanField" and field.column in df.columns:
+            df[field.column] = df[field.column].map(
+                {"t": True, "f": False, True: True, False: False, None: None}
+            )
+
+        # PostgreSQL CSV export writes NULL as empty string; convert back to None for nullable fields
+        if field.null and field.column in df.columns:
+            df[field.column] = df[field.column].replace("", None)
+
+        # Convert numeric fields from strings to proper types for SQLite
+        if (
+            field.get_internal_type()
+            in (
+                "IntegerField",
+                "BigIntegerField",
+                "PositiveIntegerField",
+                "FloatField",
+                "DecimalField",
+            )
+            and field.column in df.columns
+        ):
+            df[field.column] = pd.to_numeric(df[field.column], errors="coerce")
+
     if if_exists == "append":
         # Fill NULL values in NOT NULL columns to handle schema mismatches between postgres source and SQLite target
         # This allows importing data where fields were nullable
         for field in registry._meta.fields:
             if field.column in df.columns and not field.null:
-                df[field.column] = df[field.column].fillna("")
+                df[field.column] = df[field.column].fillna("").infer_objects(copy=False)
 
     if df.empty:
         return
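These transforms undo artifacts of the CSV-based postgres export; a self-contained pandas illustration with hypothetical columns:

    import pandas as pd

    df = pd.DataFrame({"is_latest": ["t", "f", ""], "size": ["10", "", "42"]})
    df["is_latest"] = df["is_latest"].map({"t": True, "f": False})  # postgres bool literals
    df["size"] = pd.to_numeric(df["size"].replace("", None), errors="coerce")  # "" -> NULL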
@@ -297,7 +330,7 @@
 
 
 def import_db(
-    module_names: Sequence[str] | None = None,
+    module_names: Iterable[str] | None = None,
     *,
     input_dir: str | Path = "./lamindb_export/",
     if_exists: Literal["fail", "replace", "append"] = "replace",
@@ -307,12 +340,15 @@ def import_db(
     Temporarily disables FK constraints to allow insertion in arbitrary order.
     Requires superuser/RDS admin privileges for postgres databases.
 
+    Note: When running in a subprocess, add a short delay or explicit connection close after `import_db()`
+    to ensure all SQLite writes are flushed to disk before process termination.
+
     Args:
         input_dir: Directory containing parquet files to import.
         module_names: Module names to import (e.g., ["lamindb", "bionty", "wetlab"]).
         if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
-            If set to 'replace', existing data is deleted and new data is imported. PKs and indices are not guaranteed to be preserved which can lead to write errors.
-            If set to 'append', new data is added to existing data without clearing the table. PKs and indices are preserved but database size will greatly increase.
+            If set to 'replace', existing data is deleted and new data is imported. All PKs and indices are not guaranteed to be preserved which can lead to write errors.
+            If set to 'append', new data is added to existing data without clearing the table. All PKs and indices are preserved allowing write operations but database size will greatly increase.
             If set to 'fail', raises an error if the table contains any data.
     """
     from django.db import connection
@@ -362,7 +398,7 @@ def import_db(
                 progress.update(
                     task, description=f"[cyan]{module_name}.{model_name}"
                 )
-                registry = getattr(schema_module, model_name)
+                registry = getattr(schema_module.models, model_name)
                 _import_registry(registry, directory, if_exists=if_exists)
                 for field in registry._meta.many_to_many:
                     link_orm = getattr(registry, field.name).through