lamindb_setup 1.18.2__py3-none-any.whl → 1.19.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. lamindb_setup/__init__.py +4 -19
  2. lamindb_setup/_cache.py +87 -87
  3. lamindb_setup/_check.py +7 -7
  4. lamindb_setup/_check_setup.py +131 -131
  5. lamindb_setup/_connect_instance.py +443 -438
  6. lamindb_setup/_delete.py +155 -151
  7. lamindb_setup/_disconnect.py +38 -38
  8. lamindb_setup/_django.py +39 -39
  9. lamindb_setup/_entry_points.py +19 -19
  10. lamindb_setup/_init_instance.py +423 -429
  11. lamindb_setup/_migrate.py +331 -327
  12. lamindb_setup/_register_instance.py +32 -32
  13. lamindb_setup/_schema.py +27 -27
  14. lamindb_setup/_schema_metadata.py +451 -451
  15. lamindb_setup/_set_managed_storage.py +81 -80
  16. lamindb_setup/_setup_user.py +198 -198
  17. lamindb_setup/_silence_loggers.py +46 -46
  18. lamindb_setup/core/__init__.py +25 -34
  19. lamindb_setup/core/_aws_options.py +276 -266
  20. lamindb_setup/core/_aws_storage.py +57 -55
  21. lamindb_setup/core/_clone.py +50 -50
  22. lamindb_setup/core/_deprecated.py +62 -62
  23. lamindb_setup/core/_docs.py +14 -14
  24. lamindb_setup/core/_hub_client.py +288 -294
  25. lamindb_setup/core/_hub_core.py +0 -2
  26. lamindb_setup/core/_hub_crud.py +247 -247
  27. lamindb_setup/core/_hub_utils.py +100 -100
  28. lamindb_setup/core/_private_django_api.py +80 -80
  29. lamindb_setup/core/_settings.py +440 -434
  30. lamindb_setup/core/_settings_instance.py +32 -7
  31. lamindb_setup/core/_settings_load.py +162 -159
  32. lamindb_setup/core/_settings_save.py +108 -96
  33. lamindb_setup/core/_settings_storage.py +433 -433
  34. lamindb_setup/core/_settings_store.py +162 -92
  35. lamindb_setup/core/_settings_user.py +55 -55
  36. lamindb_setup/core/_setup_bionty_sources.py +44 -44
  37. lamindb_setup/core/cloud_sqlite_locker.py +240 -240
  38. lamindb_setup/core/django.py +414 -413
  39. lamindb_setup/core/exceptions.py +1 -1
  40. lamindb_setup/core/hashing.py +134 -134
  41. lamindb_setup/core/types.py +1 -1
  42. lamindb_setup/core/upath.py +1031 -1028
  43. lamindb_setup/errors.py +72 -70
  44. lamindb_setup/io.py +423 -416
  45. lamindb_setup/types.py +17 -17
  46. {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info}/METADATA +4 -2
  47. lamindb_setup-1.19.1.dist-info/RECORD +51 -0
  48. {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info}/WHEEL +1 -1
  49. {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info/licenses}/LICENSE +201 -201
  50. lamindb_setup-1.18.2.dist-info/RECORD +0 -51
lamindb_setup/io.py CHANGED
@@ -1,416 +1,423 @@
-from __future__ import annotations
-
-import io
-import json
-import warnings
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from importlib import import_module
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-import numpy as np
-import pandas as pd
-from django.db import models, transaction
-from rich.progress import Progress
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable
-    from typing import Literal
-
-
-def _get_registries(module_name: str) -> list[str]:
-    """Get registry class names from a module."""
-    schema_module = import_module(module_name)
-
-    # Ensure that models are loaded; we've observed empty exports otherwise
-    from django.db import models
-
-    return [
-        name
-        for name in dir(schema_module.models)
-        if (
-            name[0].isupper()
-            and isinstance(cls := getattr(schema_module.models, name, None), type)
-            and issubclass(cls, models.Model)
-            # Table names starting with `None_` are abstract base classes or Django mixins
-            and not cls._meta.db_table.startswith("None_")  # type: ignore
-        )
-    ]
-
-
-def _export_full_table(
-    registry_info: tuple[str, str, str | None],
-    directory: Path,
-    chunk_size: int,
-) -> list[tuple[str, Path]] | str:
-    """Export a registry table to parquet.
-
-    For PostgreSQL, uses COPY TO which streams the table directly to CSV format,
-    bypassing query planner overhead and row-by-row conversion (10-50x faster than SELECT).
-
-    For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.
-
-    Args:
-        registry_info: Tuple of (module_name, model_name, field_name) where `field_name`
-            is None for regular tables or the field name for M2M link tables.
-        directory: Output directory for parquet files.
-        chunk_size: Maximum rows per chunk for SQLite large tables.
-
-    Returns:
-        String identifier for single-file exports, or list of (table_name, chunk_path) tuples for chunked exports that need merging.
-    """
-    from django.db import connection
-
-    import lamindb_setup as ln_setup
-
-    module_name, model_name, field_name = registry_info
-    schema_module = import_module(module_name)
-    registry = getattr(schema_module.models, model_name)
-
-    if field_name:
-        registry = getattr(registry, field_name).through
-
-    table_name = registry._meta.db_table
-
-    try:
-        if ln_setup.settings.instance.dialect == "postgresql":
-            buffer = io.StringIO()
-            with connection.cursor() as cursor:
-                cursor.execute("SET statement_timeout = 0")
-                cursor.copy_expert(
-                    f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
-                    buffer,
-                )
-            buffer.seek(0)
-            # Prevent pandas from converting empty strings to float NaN (which PyArrow rejects)
-            df = pd.read_csv(buffer, keep_default_na=False)
-            # Convert object columns to string to handle mixed types from data corruption,
-            # schema migrations, or manual SQL inserts. PyArrow rejects mixed-type objects.
-            df = df.astype(
-                {col: str for col in df.columns if df[col].dtype == "object"}
-            )
-            df.to_parquet(directory / f"{table_name}.parquet", compression=None)
-            return (
-                f"{module_name}.{model_name}.{field_name}"
-                if field_name
-                else f"{module_name}.{model_name}"
-            )
-        else:
-            with warnings.catch_warnings():
-                warnings.filterwarnings(
-                    "ignore", message="Skipped unsupported reflection"
-                )
-                row_count = pd.read_sql(
-                    f"SELECT COUNT(*) as count FROM {table_name}",
-                    ln_setup.settings.instance.db,
-                ).iloc[0]["count"]
-
-                if row_count > chunk_size:
-                    chunk_files = []
-                    num_chunks = (row_count + chunk_size - 1) // chunk_size
-                    for chunk_id in range(num_chunks):
-                        offset = chunk_id * chunk_size
-                        df = pd.read_sql(
-                            f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}",
-                            ln_setup.settings.instance.db,
-                        )
-                        chunk_file = (
-                            directory / f"{table_name}_chunk_{chunk_id}.parquet"
-                        )
-                        df = df.astype(
-                            {
-                                col: str
-                                for col in df.columns
-                                if df[col].dtype == "object"
-                            }
-                        )
-                        df.to_parquet(chunk_file, compression=None)
-                        chunk_files.append((table_name, chunk_file))
-                    return chunk_files
-                else:
-                    df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
-                    df = df.astype(
-                        {col: str for col in df.columns if df[col].dtype == "object"}
-                    )
-                    df.to_parquet(directory / f"{table_name}.parquet", compression=None)
-                    return (
-                        f"{module_name}.{model_name}.{field_name}"
-                        if field_name
-                        else f"{module_name}.{model_name}"
-                    )
-    except (ValueError, pd.errors.DatabaseError):
-        raise ValueError(
-            f"Table '{table_name}' was not found. The instance might need to be migrated."
-        ) from None
-
-
-def export_db(
-    module_names: Iterable[str] | None = None,
-    *,
-    output_dir: str | Path | None = None,
-    max_workers: int = 8,
-    chunk_size: int = 500_000,
-) -> None:
-    """Export registry tables and many-to-many link tables to parquet files.
-
-    Ensure that you connect to postgres instances using `use_root_db_user=True`.
-
-    Args:
-        module_names: Module names to export (e.g., ["lamindb", "bionty", "pertdb"]).
-            Defaults to "lamindb" if not provided.
-        output_dir: Directory path for exported parquet files.
-        max_workers: Number of parallel processes.
-        chunk_size: Number of rows per chunk for large tables.
-    """
-    import lamindb_setup as ln_setup
-
-    if output_dir is None:
-        output_dir = f"./{ln_setup.settings.instance.name}_export/"
-
-    directory = Path(output_dir)
-    directory.mkdir(parents=True, exist_ok=True)
-
-    module_names = module_names or ["lamindb"]
-    modules = {name: _get_registries(name) for name in module_names}
-
-    tasks = []
-    for module_name, model_names in modules.items():
-        schema_module = import_module(module_name)
-        for model_name in model_names:
-            registry = getattr(schema_module.models, model_name)
-            tasks.append((module_name, model_name, None))
-            for field in registry._meta.many_to_many:
-                tasks.append((module_name, model_name, field.name))
-
-    chunk_files_by_table: dict[str, list[Path]] = {}
-
-    with Progress() as progress:
-        task_id = progress.add_task("Exporting", total=len(tasks))
-
-        # This must be a ThreadPoolExecutor and not a ProcessPoolExecutor to inherit JWTs
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            futures = {
-                executor.submit(_export_full_table, task, directory, chunk_size): task
-                for task in tasks
-            }
-
-            for future in as_completed(futures):
-                result = future.result()
-                if isinstance(result, list):
-                    for table_name, chunk_file in result:
-                        chunk_files_by_table.setdefault(table_name, []).append(
-                            chunk_file
-                        )
-                progress.advance(task_id)
-
-    for table_name, chunk_files in chunk_files_by_table.items():
-        merged_df = pd.concat([pd.read_parquet(f) for f in sorted(chunk_files)])
-        merged_df.to_parquet(directory / f"{table_name}.parquet", compression=None)
-        for chunk_file in chunk_files:
-            chunk_file.unlink()
-
-
-def _serialize_value(val):
-    """Convert value to JSON string if it's a dict, list, or numpy array, otherwise return as-is."""
-    if isinstance(val, (dict, list, np.ndarray)):
-        return json.dumps(
-            val, default=lambda o: o.tolist() if isinstance(o, np.ndarray) else None
-        )
-    return val
-
-
-def _import_registry(
-    registry: type[models.Model],
-    directory: Path,
-    if_exists: Literal["fail", "replace", "append"] = "replace",
-) -> None:
-    """Import a single registry table from parquet.
-
-    For PostgreSQL, uses COPY FROM which bypasses SQL parsing and writes directly to
-    table pages (20-50x faster than multi-row INSERTs).
-
-    For SQLite, uses multi-row INSERTs with dynamic chunking to stay under the 999
-    variable limit (2-5x faster than single-row INSERTs).
-    """
-    from django.db import connection
-
-    table_name = registry._meta.db_table
-    parquet_file = directory / f"{table_name}.parquet"
-
-    if not parquet_file.exists():
-        return
-
-    df = pd.read_parquet(parquet_file)
-
-    old_foreign_key_columns = [col for col in df.columns if col.endswith("_old")]
-    if old_foreign_key_columns:
-        df = df.drop(columns=old_foreign_key_columns)
-
-    for col in df.columns:
-        if df[col].dtype == "object":
-            mask = df[col].apply(lambda x: isinstance(x, (dict, list, np.ndarray)))
-            if mask.any():
-                df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)
-
-    for field in registry._meta.fields:
-        # Convert PostgreSQL boolean string literals ('t'/'f') to Python booleans for SQLite compatibility
-        if field.get_internal_type() == "BooleanField" and field.column in df.columns:
-            df[field.column] = df[field.column].map(
-                {"t": True, "f": False, True: True, False: False, None: None}
-            )
-
-        # PostgreSQL CSV export writes NULL as empty string; convert back to None for nullable fields
-        if field.null and field.column in df.columns:
-            df[field.column] = df[field.column].replace("", None)
-
-        # Convert numeric fields from strings to proper types for SQLite
-        if (
-            field.get_internal_type()
-            in (
-                "IntegerField",
-                "BigIntegerField",
-                "PositiveIntegerField",
-                "FloatField",
-                "DecimalField",
-            )
-            and field.column in df.columns
-        ):
-            df[field.column] = pd.to_numeric(df[field.column], errors="coerce")
-
-    if if_exists == "append":
-        # Fill NULL values in NOT NULL columns to handle schema mismatches between postgres source and SQLite target
-        # This allows importing data where fields were nullable
-        for field in registry._meta.fields:
-            if field.column in df.columns and not field.null:
-                df[field.column] = df[field.column].fillna("").infer_objects(copy=False)
-
-    if df.empty:
-        return
-
-    if if_exists == "append":
-        # Clear existing data before import
-        # When appending we would run into duplicate errors because of existing values like branches etc
-        with connection.cursor() as cursor:
-            cursor.execute(f'DELETE FROM "{table_name}"')
-
-    if connection.vendor == "postgresql":
-        columns = df.columns.tolist()
-        column_names = ", ".join(f'"{col}"' for col in columns)
-
-        buffer = io.StringIO()
-        df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
-        buffer.seek(0)
-
-        with connection.cursor() as cursor:
-            if if_exists == "replace":
-                cursor.execute(f'DELETE FROM "{table_name}"')
-            elif if_exists == "fail":
-                cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"')
-                if cursor.fetchone()[0] > 0:
-                    raise ValueError(f"Table {table_name} already contains data")
-
-            cursor.copy_expert(
-                f"COPY \"{table_name}\" ({column_names}) FROM STDIN WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
-                buffer,
-            )
-    else:
-        num_cols = len(df.columns)
-        max_vars = 900  # SQLite has a limit of 999 variables per statement
-        chunksize = max(1, max_vars // num_cols)
-
-        # Always use append mode since we set up the tables from a fresh instance
-        df.to_sql(
-            table_name,
-            connection.connection,
-            if_exists=if_exists,
-            index=False,
-            method="multi",
-            chunksize=chunksize,
-        )
-
-
-def import_db(
-    module_names: Iterable[str] | None = None,
-    *,
-    input_dir: str | Path = "./lamindb_export/",
-    if_exists: Literal["fail", "replace", "append"] = "replace",
-) -> None:
-    """Import registry and link tables from parquet files.
-
-    Temporarily disables FK constraints to allow insertion in arbitrary order.
-    Requires superuser/RDS admin privileges for postgres databases.
-
-    Note: When running in a subprocess, add a short delay or explicit connection close after `import_db()`
-    to ensure all SQLite writes are flushed to disk before process termination.
-
-    Args:
-        input_dir: Directory containing parquet files to import.
-        module_names: Module names to import (e.g., ["lamindb", "bionty", "pertdb"]).
-        if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
-            If set to 'replace', existing data is deleted and new data is imported. All PKs and indices are not guaranteed to be preserved which can lead to write errors.
-            If set to 'append', new data is added to existing data without clearing the table. All PKs and indices are preserved allowing write operations but database size will greatly increase.
-            If set to 'fail', raises an error if the table contains any data.
-    """
-    from django.db import connection
-
-    import lamindb_setup as ln_setup
-
-    directory = Path(input_dir)
-
-    if not directory.exists():
-        raise ValueError(f"Directory does not exist: {directory}")
-
-    if module_names is None:
-        parquet_files = list(directory.glob("*.parquet"))
-        detected_modules = {
-            f.name.split("_")[0] for f in parquet_files if "_" in f.name
-        }
-        module_names = sorted(detected_modules)
-
-    modules = {name: _get_registries(name) for name in module_names}
-    total_models = sum(len(models) for models in modules.values())
-
-    is_sqlite = ln_setup.settings.instance.dialect == "sqlite"
-
-    try:
-        with connection.cursor() as cursor:
-            if ln_setup.settings.instance.dialect == "postgresql":
-                cursor.execute("SET session_replication_role = 'replica'")
-            elif is_sqlite:
-                cursor.execute("PRAGMA foreign_keys = OFF")
-                # Disables fsync - OS buffers writes to disk, 10-50x faster but can corrupt DB on crash
-                cursor.execute("PRAGMA synchronous = OFF")
-                # Keeps rollback journal in RAM - 2-5x faster but cannot rollback on crash
-                cursor.execute("PRAGMA journal_mode = MEMORY")
-                # 64MB page cache for better performance on large imports
-                cursor.execute("PRAGMA cache_size = -64000")
-
-        with transaction.atomic():
-            if ln_setup.settings.instance.dialect == "postgresql":
-                with connection.cursor() as cursor:
-                    cursor.execute("SET CONSTRAINTS ALL DEFERRED")
-
-            with Progress() as progress:
-                task = progress.add_task("Importing", total=total_models)
-                for module_name, model_names in modules.items():
-                    schema_module = import_module(module_name)
-                    for model_name in model_names:
-                        progress.update(
-                            task, description=f"[cyan]{module_name}.{model_name}"
-                        )
-                        registry = getattr(schema_module.models, model_name)
-                        _import_registry(registry, directory, if_exists=if_exists)
-                        for field in registry._meta.many_to_many:
-                            link_orm = getattr(registry, field.name).through
-                            _import_registry(link_orm, directory, if_exists=if_exists)
-                        progress.advance(task)
-    finally:
-        with connection.cursor() as cursor:
-            if ln_setup.settings.instance.dialect == "postgresql":
-                cursor.execute("SET session_replication_role = 'origin'")
-            elif is_sqlite:
-                cursor.execute("PRAGMA synchronous = FULL")
-                cursor.execute("PRAGMA journal_mode = DELETE")
-                cursor.execute("PRAGMA foreign_keys = ON")
-                # Reclaim space from DELETEs
-                cursor.execute("VACUUM")
+from __future__ import annotations
+
+import io
+import json
+import warnings
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from importlib import import_module
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from django.db import models, transaction
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+    from typing import Literal
+
+
+def _get_registries(module_name: str) -> list[str]:
+    """Get registry class names from a module."""
+    schema_module = import_module(module_name)
+
+    # Ensure that models are loaded; we've observed empty exports otherwise
+    from django.db import models
+
+    return [
+        name
+        for name in dir(schema_module.models)
+        if (
+            name[0].isupper()
+            and isinstance(cls := getattr(schema_module.models, name, None), type)
+            and issubclass(cls, models.Model)
+            # Table names starting with `None_` are abstract base classes or Django mixins
+            and not cls._meta.db_table.startswith("None_")  # type: ignore
+        )
+    ]
+
+
+def _export_full_table(
+    registry_info: tuple[str, str, str | None],
+    directory: Path,
+    chunk_size: int,
+) -> list[tuple[str, Path]] | str:
+    """Export a registry table to parquet.
+
+    For PostgreSQL, uses COPY TO which streams the table directly to CSV format,
+    bypassing query planner overhead and row-by-row conversion (10-50x faster than SELECT).
+
+    For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.
+
+    Args:
+        registry_info: Tuple of (module_name, model_name, field_name) where `field_name`
+            is None for regular tables or the field name for M2M link tables.
+        directory: Output directory for parquet files.
+        chunk_size: Maximum rows per chunk for SQLite large tables.
+
+    Returns:
+        String identifier for single-file exports, or list of (table_name, chunk_path) tuples for chunked exports that need merging.
+    """
+    import pandas as pd
+    from django.db import connection
+
+    import lamindb_setup as ln_setup
+
+    module_name, model_name, field_name = registry_info
+    schema_module = import_module(module_name)
+    registry = getattr(schema_module.models, model_name)
+
+    if field_name:
+        registry = getattr(registry, field_name).through
+
+    table_name = registry._meta.db_table
+
+    try:
+        if ln_setup.settings.instance.dialect == "postgresql":
+            buffer = io.StringIO()
+            with connection.cursor() as cursor:
+                cursor.execute("SET statement_timeout = 0")
+                cursor.copy_expert(
+                    f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
+                    buffer,
+                )
+            buffer.seek(0)
+            # Prevent pandas from converting empty strings to float NaN (which PyArrow rejects)
+            df = pd.read_csv(buffer, keep_default_na=False)
+            # Convert object columns to string to handle mixed types from data corruption,
+            # schema migrations, or manual SQL inserts. PyArrow rejects mixed-type objects.
+            df = df.astype(
+                {col: str for col in df.columns if df[col].dtype == "object"}
+            )
+            df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+            return (
+                f"{module_name}.{model_name}.{field_name}"
+                if field_name
+                else f"{module_name}.{model_name}"
+            )
+        else:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore", message="Skipped unsupported reflection"
+                )
+                row_count = pd.read_sql(
+                    f"SELECT COUNT(*) as count FROM {table_name}",
+                    ln_setup.settings.instance.db,
+                ).iloc[0]["count"]
+
+                if row_count > chunk_size:
+                    chunk_files = []
+                    num_chunks = (row_count + chunk_size - 1) // chunk_size
+                    for chunk_id in range(num_chunks):
+                        offset = chunk_id * chunk_size
+                        df = pd.read_sql(
+                            f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}",
+                            ln_setup.settings.instance.db,
+                        )
+                        chunk_file = (
+                            directory / f"{table_name}_chunk_{chunk_id}.parquet"
+                        )
+                        df = df.astype(
+                            {
+                                col: str
+                                for col in df.columns
+                                if df[col].dtype == "object"
+                            }
+                        )
+                        df.to_parquet(chunk_file, compression=None)
+                        chunk_files.append((table_name, chunk_file))
+                    return chunk_files
+                else:
+                    df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
+                    df = df.astype(
+                        {col: str for col in df.columns if df[col].dtype == "object"}
+                    )
+                    df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+                    return (
+                        f"{module_name}.{model_name}.{field_name}"
+                        if field_name
+                        else f"{module_name}.{model_name}"
+                    )
+    except (ValueError, pd.errors.DatabaseError):
+        raise ValueError(
+            f"Table '{table_name}' was not found. The instance might need to be migrated."
+        ) from None
+
+
+def export_db(
+    module_names: Iterable[str] | None = None,
+    *,
+    output_dir: str | Path | None = None,
+    max_workers: int = 8,
+    chunk_size: int = 500_000,
+) -> None:
+    """Export registry tables and many-to-many link tables to parquet files.
+
+    Ensure that you connect to postgres instances using `use_root_db_user=True`.
+
+    Args:
+        module_names: Module names to export (e.g., ["lamindb", "bionty", "pertdb"]).
+            Defaults to "lamindb" if not provided.
+        output_dir: Directory path for exported parquet files.
+        max_workers: Number of parallel processes.
+        chunk_size: Number of rows per chunk for large tables.
+    """
+    import pandas as pd
+    from rich.progress import Progress
+
+    import lamindb_setup as ln_setup
+
+    if output_dir is None:
+        output_dir = f"./{ln_setup.settings.instance.name}_export/"
+
+    directory = Path(output_dir)
+    directory.mkdir(parents=True, exist_ok=True)
+
+    module_names = module_names or ["lamindb"]
+    modules = {name: _get_registries(name) for name in module_names}
+
+    tasks = []
+    for module_name, model_names in modules.items():
+        schema_module = import_module(module_name)
+        for model_name in model_names:
+            registry = getattr(schema_module.models, model_name)
+            tasks.append((module_name, model_name, None))
+            for field in registry._meta.many_to_many:
+                tasks.append((module_name, model_name, field.name))
+
+    chunk_files_by_table: dict[str, list[Path]] = {}
+
+    with Progress() as progress:
+        task_id = progress.add_task("Exporting", total=len(tasks))
+
+        # This must be a ThreadPoolExecutor and not a ProcessPoolExecutor to inherit JWTs
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {
+                executor.submit(_export_full_table, task, directory, chunk_size): task
+                for task in tasks
+            }
+
+            for future in as_completed(futures):
+                result = future.result()
+                if isinstance(result, list):
+                    for table_name, chunk_file in result:
+                        chunk_files_by_table.setdefault(table_name, []).append(
+                            chunk_file
+                        )
+                progress.advance(task_id)
+
+    for table_name, chunk_files in chunk_files_by_table.items():
+        merged_df = pd.concat([pd.read_parquet(f) for f in sorted(chunk_files)])
+        merged_df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+        for chunk_file in chunk_files:
+            chunk_file.unlink()
+
+
+def _serialize_value(val):
+    """Convert value to JSON string if it's a dict, list, or numpy array, otherwise return as-is."""
+    # keep dynamic import to minimize import time
+    import numpy as np
+
+    if isinstance(val, (dict, list, np.ndarray)):
+        return json.dumps(
+            val, default=lambda o: o.tolist() if isinstance(o, np.ndarray) else None
+        )
+    return val
+
+
+def _import_registry(
+    registry: type[models.Model],
+    directory: Path,
+    if_exists: Literal["fail", "replace", "append"] = "replace",
+) -> None:
+    """Import a single registry table from parquet.
+
+    For PostgreSQL, uses COPY FROM which bypasses SQL parsing and writes directly to
+    table pages (20-50x faster than multi-row INSERTs).
+
+    For SQLite, uses multi-row INSERTs with dynamic chunking to stay under the 999
+    variable limit (2-5x faster than single-row INSERTs).
+    """
+    import numpy as np
+    import pandas as pd
+    from django.db import connection
+
+    table_name = registry._meta.db_table
+    parquet_file = directory / f"{table_name}.parquet"
+
+    if not parquet_file.exists():
+        return
+
+    df = pd.read_parquet(parquet_file)
+
+    old_foreign_key_columns = [col for col in df.columns if col.endswith("_old")]
+    if old_foreign_key_columns:
+        df = df.drop(columns=old_foreign_key_columns)
+
+    for col in df.columns:
+        if df[col].dtype == "object":
+            mask = df[col].apply(lambda x: isinstance(x, (dict, list, np.ndarray)))
+            if mask.any():
+                df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)
+
+    for field in registry._meta.fields:
+        # Convert PostgreSQL boolean string literals ('t'/'f') to Python booleans for SQLite compatibility
+        if field.get_internal_type() == "BooleanField" and field.column in df.columns:
+            df[field.column] = df[field.column].map(
+                {"t": True, "f": False, True: True, False: False, None: None}
+            )
+
+        # PostgreSQL CSV export writes NULL as empty string; convert back to None for nullable fields
+        if field.null and field.column in df.columns:
+            df[field.column] = df[field.column].replace("", None)
+
+        # Convert numeric fields from strings to proper types for SQLite
+        if (
+            field.get_internal_type()
+            in (
+                "IntegerField",
+                "BigIntegerField",
+                "PositiveIntegerField",
+                "FloatField",
+                "DecimalField",
+            )
+            and field.column in df.columns
+        ):
+            df[field.column] = pd.to_numeric(df[field.column], errors="coerce")
+
+    if if_exists == "append":
+        # Fill NULL values in NOT NULL columns to handle schema mismatches between postgres source and SQLite target
+        # This allows importing data where fields were nullable
+        for field in registry._meta.fields:
+            if field.column in df.columns and not field.null:
+                df[field.column] = df[field.column].fillna("").infer_objects(copy=False)
+
+    if df.empty:
+        return
+
+    if if_exists == "append":
+        # Clear existing data before import
+        # When appending we would run into duplicate errors because of existing values like branches etc
+        with connection.cursor() as cursor:
+            cursor.execute(f'DELETE FROM "{table_name}"')
+
+    if connection.vendor == "postgresql":
+        columns = df.columns.tolist()
+        column_names = ", ".join(f'"{col}"' for col in columns)
+
+        buffer = io.StringIO()
+        df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
+        buffer.seek(0)
+
+        with connection.cursor() as cursor:
+            if if_exists == "replace":
+                cursor.execute(f'DELETE FROM "{table_name}"')
+            elif if_exists == "fail":
+                cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"')
+                if cursor.fetchone()[0] > 0:
+                    raise ValueError(f"Table {table_name} already contains data")
+
+            cursor.copy_expert(
+                f"COPY \"{table_name}\" ({column_names}) FROM STDIN WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
+                buffer,
+            )
+    else:
+        num_cols = len(df.columns)
+        max_vars = 900  # SQLite has a limit of 999 variables per statement
+        chunksize = max(1, max_vars // num_cols)
+
+        # Always use append mode since we set up the tables from a fresh instance
+        df.to_sql(
+            table_name,
+            connection.connection,
+            if_exists=if_exists,
+            index=False,
+            method="multi",
+            chunksize=chunksize,
+        )
+
+
+def import_db(
+    module_names: Iterable[str] | None = None,
+    *,
+    input_dir: str | Path = "./lamindb_export/",
+    if_exists: Literal["fail", "replace", "append"] = "replace",
+) -> None:
+    """Import registry and link tables from parquet files.
+
+    Temporarily disables FK constraints to allow insertion in arbitrary order.
+    Requires superuser/RDS admin privileges for postgres databases.
+
+    Note: When running in a subprocess, add a short delay or explicit connection close after `import_db()`
+    to ensure all SQLite writes are flushed to disk before process termination.
+
+    Args:
+        input_dir: Directory containing parquet files to import.
+        module_names: Module names to import (e.g., ["lamindb", "bionty", "pertdb"]).
+        if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
+            If set to 'replace', existing data is deleted and new data is imported. All PKs and indices are not guaranteed to be preserved which can lead to write errors.
+            If set to 'append', new data is added to existing data without clearing the table. All PKs and indices are preserved allowing write operations but database size will greatly increase.
+            If set to 'fail', raises an error if the table contains any data.
+    """
+    from django.db import connection
+    from rich.progress import Progress
+
+    import lamindb_setup as ln_setup
+
+    directory = Path(input_dir)
+
+    if not directory.exists():
+        raise ValueError(f"Directory does not exist: {directory}")
+
+    if module_names is None:
+        parquet_files = list(directory.glob("*.parquet"))
+        detected_modules = {
+            f.name.split("_")[0] for f in parquet_files if "_" in f.name
+        }
+        module_names = sorted(detected_modules)
+
+    modules = {name: _get_registries(name) for name in module_names}
+    total_models = sum(len(models) for models in modules.values())
+
+    is_sqlite = ln_setup.settings.instance.dialect == "sqlite"
+
+    try:
+        with connection.cursor() as cursor:
+            if ln_setup.settings.instance.dialect == "postgresql":
+                cursor.execute("SET session_replication_role = 'replica'")
+            elif is_sqlite:
+                cursor.execute("PRAGMA foreign_keys = OFF")
+                # Disables fsync - OS buffers writes to disk, 10-50x faster but can corrupt DB on crash
+                cursor.execute("PRAGMA synchronous = OFF")
+                # Keeps rollback journal in RAM - 2-5x faster but cannot rollback on crash
+                cursor.execute("PRAGMA journal_mode = MEMORY")
+                # 64MB page cache for better performance on large imports
+                cursor.execute("PRAGMA cache_size = -64000")
+
+        with transaction.atomic():
+            if ln_setup.settings.instance.dialect == "postgresql":
+                with connection.cursor() as cursor:
+                    cursor.execute("SET CONSTRAINTS ALL DEFERRED")
+
+            with Progress() as progress:
+                task = progress.add_task("Importing", total=total_models)
+                for module_name, model_names in modules.items():
+                    schema_module = import_module(module_name)
+                    for model_name in model_names:
+                        progress.update(
+                            task, description=f"[cyan]{module_name}.{model_name}"
+                        )
+                        registry = getattr(schema_module.models, model_name)
+                        _import_registry(registry, directory, if_exists=if_exists)
+                        for field in registry._meta.many_to_many:
+                            link_orm = getattr(registry, field.name).through
+                            _import_registry(link_orm, directory, if_exists=if_exists)
+                        progress.advance(task)
+    finally:
+        with connection.cursor() as cursor:
+            if ln_setup.settings.instance.dialect == "postgresql":
+                cursor.execute("SET session_replication_role = 'origin'")
+            elif is_sqlite:
+                cursor.execute("PRAGMA synchronous = FULL")
+                cursor.execute("PRAGMA journal_mode = DELETE")
+                cursor.execute("PRAGMA foreign_keys = ON")
+                # Reclaim space from DELETEs
+                cursor.execute("VACUUM")
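
The only substantive change to io.py between 1.18.2 and 1.19.1 is that numpy, pandas, and rich are no longer imported at module level; each function now imports them on first use (note the `# keep dynamic import to minimize import time` comment in _serialize_value), so importing the module no longer pays their startup cost up front. A minimal sketch of that deferred-import pattern, with an illustrative function name that is not part of the package:

def row_count(parquet_path: str) -> int:
    # Deferred import: pandas is loaded on the first call, not when this module is imported.
    import pandas as pd

    return len(pd.read_parquet(parquet_path))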
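
A minimal usage sketch for the export_db/import_db pair above, assuming both helpers are imported from lamindb_setup.io, that connect accepts the use_root_db_user flag referenced in the export_db docstring, and that the instance slugs and directory name are placeholders:

import lamindb_setup as ln_setup
from lamindb_setup.io import export_db, import_db

# Export from the postgres source; a root DB user is needed for COPY TO.
ln_setup.connect("my-org/source-instance", use_root_db_user=True)
export_db(["lamindb", "bionty"], output_dir="./source-instance_export/")

# Import into a freshly initialized target instance (e.g. SQLite).
ln_setup.connect("my-org/target-instance")
import_db(["lamindb", "bionty"], input_dir="./source-instance_export/", if_exists="replace")

Because import_db temporarily disables FK checks and, on postgres, needs superuser/RDS admin privileges, it is best run only against a scratch or freshly initialized instance.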
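
For the SQLite path in _import_registry, the chunk size is chosen so each multi-row INSERT stays below SQLite's 999 bound-parameter ceiling; a quick worked example with an illustrative column count:

max_vars = 900                            # headroom under SQLite's 999-variable limit
num_cols = 37                             # hypothetical width of a registry table
chunksize = max(1, max_vars // num_cols)  # 900 // 37 = 24 rows per INSERT
assert chunksize * num_cols <= 999        # 24 * 37 = 888 bound parameters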