lamindb_setup 1.19.0__py3-none-any.whl → 1.19.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. lamindb_setup/__init__.py +1 -1
  2. lamindb_setup/_cache.py +87 -87
  3. lamindb_setup/_check.py +7 -7
  4. lamindb_setup/_check_setup.py +131 -131
  5. lamindb_setup/_connect_instance.py +443 -441
  6. lamindb_setup/_delete.py +155 -155
  7. lamindb_setup/_disconnect.py +38 -38
  8. lamindb_setup/_django.py +39 -39
  9. lamindb_setup/_entry_points.py +19 -19
  10. lamindb_setup/_init_instance.py +423 -423
  11. lamindb_setup/_migrate.py +331 -331
  12. lamindb_setup/_register_instance.py +32 -32
  13. lamindb_setup/_schema.py +27 -27
  14. lamindb_setup/_schema_metadata.py +451 -451
  15. lamindb_setup/_set_managed_storage.py +81 -81
  16. lamindb_setup/_setup_user.py +198 -198
  17. lamindb_setup/_silence_loggers.py +46 -46
  18. lamindb_setup/core/__init__.py +25 -34
  19. lamindb_setup/core/_aws_options.py +276 -276
  20. lamindb_setup/core/_aws_storage.py +57 -57
  21. lamindb_setup/core/_clone.py +50 -50
  22. lamindb_setup/core/_deprecated.py +62 -62
  23. lamindb_setup/core/_docs.py +14 -14
  24. lamindb_setup/core/_hub_client.py +288 -288
  25. lamindb_setup/core/_hub_crud.py +247 -247
  26. lamindb_setup/core/_hub_utils.py +100 -100
  27. lamindb_setup/core/_private_django_api.py +80 -80
  28. lamindb_setup/core/_settings.py +440 -434
  29. lamindb_setup/core/_settings_instance.py +22 -1
  30. lamindb_setup/core/_settings_load.py +162 -162
  31. lamindb_setup/core/_settings_save.py +108 -108
  32. lamindb_setup/core/_settings_storage.py +433 -433
  33. lamindb_setup/core/_settings_store.py +162 -162
  34. lamindb_setup/core/_settings_user.py +55 -55
  35. lamindb_setup/core/_setup_bionty_sources.py +44 -44
  36. lamindb_setup/core/cloud_sqlite_locker.py +240 -240
  37. lamindb_setup/core/django.py +414 -413
  38. lamindb_setup/core/exceptions.py +1 -1
  39. lamindb_setup/core/hashing.py +134 -134
  40. lamindb_setup/core/types.py +1 -1
  41. lamindb_setup/core/upath.py +1031 -1028
  42. lamindb_setup/errors.py +72 -72
  43. lamindb_setup/io.py +423 -423
  44. lamindb_setup/types.py +17 -17
  45. {lamindb_setup-1.19.0.dist-info → lamindb_setup-1.19.1.dist-info}/METADATA +3 -2
  46. lamindb_setup-1.19.1.dist-info/RECORD +51 -0
  47. {lamindb_setup-1.19.0.dist-info → lamindb_setup-1.19.1.dist-info}/WHEEL +1 -1
  48. {lamindb_setup-1.19.0.dist-info → lamindb_setup-1.19.1.dist-info/licenses}/LICENSE +201 -201
  49. lamindb_setup-1.19.0.dist-info/RECORD +0 -51
lamindb_setup/io.py CHANGED
@@ -1,423 +1,423 @@
The 423 removed lines and the 423 added lines of this hunk are identical in content; the file is listed once below.

from __future__ import annotations

import io
import json
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING

from django.db import models, transaction

if TYPE_CHECKING:
    from collections.abc import Iterable
    from typing import Literal


def _get_registries(module_name: str) -> list[str]:
    """Get registry class names from a module."""
    schema_module = import_module(module_name)

    # Ensure that models are loaded; we've observed empty exports otherwise
    from django.db import models

    return [
        name
        for name in dir(schema_module.models)
        if (
            name[0].isupper()
            and isinstance(cls := getattr(schema_module.models, name, None), type)
            and issubclass(cls, models.Model)
            # Table names starting with `None_` are abstract base classes or Django mixins
            and not cls._meta.db_table.startswith("None_")  # type: ignore
        )
    ]


def _export_full_table(
    registry_info: tuple[str, str, str | None],
    directory: Path,
    chunk_size: int,
) -> list[tuple[str, Path]] | str:
    """Export a registry table to parquet.

    For PostgreSQL, uses COPY TO which streams the table directly to CSV format,
    bypassing query planner overhead and row-by-row conversion (10-50x faster than SELECT).

    For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.

    Args:
        registry_info: Tuple of (module_name, model_name, field_name) where `field_name`
            is None for regular tables or the field name for M2M link tables.
        directory: Output directory for parquet files.
        chunk_size: Maximum rows per chunk for SQLite large tables.

    Returns:
        String identifier for single-file exports, or list of (table_name, chunk_path) tuples for chunked exports that need merging.
    """
    import pandas as pd
    from django.db import connection

    import lamindb_setup as ln_setup

    module_name, model_name, field_name = registry_info
    schema_module = import_module(module_name)
    registry = getattr(schema_module.models, model_name)

    if field_name:
        registry = getattr(registry, field_name).through

    table_name = registry._meta.db_table

    try:
        if ln_setup.settings.instance.dialect == "postgresql":
            buffer = io.StringIO()
            with connection.cursor() as cursor:
                cursor.execute("SET statement_timeout = 0")
                cursor.copy_expert(
                    f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
                    buffer,
                )
            buffer.seek(0)
            # Prevent pandas from converting empty strings to float NaN (which PyArrow rejects)
            df = pd.read_csv(buffer, keep_default_na=False)
            # Convert object columns to string to handle mixed types from data corruption,
            # schema migrations, or manual SQL inserts. PyArrow rejects mixed-type objects.
            df = df.astype(
                {col: str for col in df.columns if df[col].dtype == "object"}
            )
            df.to_parquet(directory / f"{table_name}.parquet", compression=None)
            return (
                f"{module_name}.{model_name}.{field_name}"
                if field_name
                else f"{module_name}.{model_name}"
            )
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", message="Skipped unsupported reflection"
                )
                row_count = pd.read_sql(
                    f"SELECT COUNT(*) as count FROM {table_name}",
                    ln_setup.settings.instance.db,
                ).iloc[0]["count"]

                if row_count > chunk_size:
                    chunk_files = []
                    num_chunks = (row_count + chunk_size - 1) // chunk_size
                    for chunk_id in range(num_chunks):
                        offset = chunk_id * chunk_size
                        df = pd.read_sql(
                            f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}",
                            ln_setup.settings.instance.db,
                        )
                        chunk_file = (
                            directory / f"{table_name}_chunk_{chunk_id}.parquet"
                        )
                        df = df.astype(
                            {
                                col: str
                                for col in df.columns
                                if df[col].dtype == "object"
                            }
                        )
                        df.to_parquet(chunk_file, compression=None)
                        chunk_files.append((table_name, chunk_file))
                    return chunk_files
                else:
                    df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
                    df = df.astype(
                        {col: str for col in df.columns if df[col].dtype == "object"}
                    )
                    df.to_parquet(directory / f"{table_name}.parquet", compression=None)
                    return (
                        f"{module_name}.{model_name}.{field_name}"
                        if field_name
                        else f"{module_name}.{model_name}"
                    )
    except (ValueError, pd.errors.DatabaseError):
        raise ValueError(
            f"Table '{table_name}' was not found. The instance might need to be migrated."
        ) from None


def export_db(
    module_names: Iterable[str] | None = None,
    *,
    output_dir: str | Path | None = None,
    max_workers: int = 8,
    chunk_size: int = 500_000,
) -> None:
    """Export registry tables and many-to-many link tables to parquet files.

    Ensure that you connect to postgres instances using `use_root_db_user=True`.

    Args:
        module_names: Module names to export (e.g., ["lamindb", "bionty", "pertdb"]).
            Defaults to "lamindb" if not provided.
        output_dir: Directory path for exported parquet files.
        max_workers: Number of parallel processes.
        chunk_size: Number of rows per chunk for large tables.
    """
    import pandas as pd
    from rich.progress import Progress

    import lamindb_setup as ln_setup

    if output_dir is None:
        output_dir = f"./{ln_setup.settings.instance.name}_export/"

    directory = Path(output_dir)
    directory.mkdir(parents=True, exist_ok=True)

    module_names = module_names or ["lamindb"]
    modules = {name: _get_registries(name) for name in module_names}

    tasks = []
    for module_name, model_names in modules.items():
        schema_module = import_module(module_name)
        for model_name in model_names:
            registry = getattr(schema_module.models, model_name)
            tasks.append((module_name, model_name, None))
            for field in registry._meta.many_to_many:
                tasks.append((module_name, model_name, field.name))

    chunk_files_by_table: dict[str, list[Path]] = {}

    with Progress() as progress:
        task_id = progress.add_task("Exporting", total=len(tasks))

        # This must be a ThreadPoolExecutor and not a ProcessPoolExecutor to inherit JWTs
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(_export_full_table, task, directory, chunk_size): task
                for task in tasks
            }

            for future in as_completed(futures):
                result = future.result()
                if isinstance(result, list):
                    for table_name, chunk_file in result:
                        chunk_files_by_table.setdefault(table_name, []).append(
                            chunk_file
                        )
                progress.advance(task_id)

    for table_name, chunk_files in chunk_files_by_table.items():
        merged_df = pd.concat([pd.read_parquet(f) for f in sorted(chunk_files)])
        merged_df.to_parquet(directory / f"{table_name}.parquet", compression=None)
        for chunk_file in chunk_files:
            chunk_file.unlink()


def _serialize_value(val):
    """Convert value to JSON string if it's a dict, list, or numpy array, otherwise return as-is."""
    # keep dynamic import to minimize import time
    import numpy as np

    if isinstance(val, (dict, list, np.ndarray)):
        return json.dumps(
            val, default=lambda o: o.tolist() if isinstance(o, np.ndarray) else None
        )
    return val


def _import_registry(
    registry: type[models.Model],
    directory: Path,
    if_exists: Literal["fail", "replace", "append"] = "replace",
) -> None:
    """Import a single registry table from parquet.

    For PostgreSQL, uses COPY FROM which bypasses SQL parsing and writes directly to
    table pages (20-50x faster than multi-row INSERTs).

    For SQLite, uses multi-row INSERTs with dynamic chunking to stay under the 999
    variable limit (2-5x faster than single-row INSERTs).
    """
    import numpy as np
    import pandas as pd
    from django.db import connection

    table_name = registry._meta.db_table
    parquet_file = directory / f"{table_name}.parquet"

    if not parquet_file.exists():
        return

    df = pd.read_parquet(parquet_file)

    old_foreign_key_columns = [col for col in df.columns if col.endswith("_old")]
    if old_foreign_key_columns:
        df = df.drop(columns=old_foreign_key_columns)

    for col in df.columns:
        if df[col].dtype == "object":
            mask = df[col].apply(lambda x: isinstance(x, (dict, list, np.ndarray)))
            if mask.any():
                df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)

    for field in registry._meta.fields:
        # Convert PostgreSQL boolean string literals ('t'/'f') to Python booleans for SQLite compatibility
        if field.get_internal_type() == "BooleanField" and field.column in df.columns:
            df[field.column] = df[field.column].map(
                {"t": True, "f": False, True: True, False: False, None: None}
            )

        # PostgreSQL CSV export writes NULL as empty string; convert back to None for nullable fields
        if field.null and field.column in df.columns:
            df[field.column] = df[field.column].replace("", None)

        # Convert numeric fields from strings to proper types for SQLite
        if (
            field.get_internal_type()
            in (
                "IntegerField",
                "BigIntegerField",
                "PositiveIntegerField",
                "FloatField",
                "DecimalField",
            )
            and field.column in df.columns
        ):
            df[field.column] = pd.to_numeric(df[field.column], errors="coerce")

    if if_exists == "append":
        # Fill NULL values in NOT NULL columns to handle schema mismatches between postgres source and SQLite target
        # This allows importing data where fields were nullable
        for field in registry._meta.fields:
            if field.column in df.columns and not field.null:
                df[field.column] = df[field.column].fillna("").infer_objects(copy=False)

    if df.empty:
        return

    if if_exists == "append":
        # Clear existing data before import
        # When appending we would run into duplicate errors because of existing values like branches etc
        with connection.cursor() as cursor:
            cursor.execute(f'DELETE FROM "{table_name}"')

    if connection.vendor == "postgresql":
        columns = df.columns.tolist()
        column_names = ", ".join(f'"{col}"' for col in columns)

        buffer = io.StringIO()
        df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
        buffer.seek(0)

        with connection.cursor() as cursor:
            if if_exists == "replace":
                cursor.execute(f'DELETE FROM "{table_name}"')
            elif if_exists == "fail":
                cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"')
                if cursor.fetchone()[0] > 0:
                    raise ValueError(f"Table {table_name} already contains data")

            cursor.copy_expert(
                f"COPY \"{table_name}\" ({column_names}) FROM STDIN WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
                buffer,
            )
    else:
        num_cols = len(df.columns)
        max_vars = 900  # SQLite has a limit of 999 variables per statement
        chunksize = max(1, max_vars // num_cols)

        # Always use append mode since we set up the tables from a fresh instance
        df.to_sql(
            table_name,
            connection.connection,
            if_exists=if_exists,
            index=False,
            method="multi",
            chunksize=chunksize,
        )


def import_db(
    module_names: Iterable[str] | None = None,
    *,
    input_dir: str | Path = "./lamindb_export/",
    if_exists: Literal["fail", "replace", "append"] = "replace",
) -> None:
    """Import registry and link tables from parquet files.

    Temporarily disables FK constraints to allow insertion in arbitrary order.
    Requires superuser/RDS admin privileges for postgres databases.

    Note: When running in a subprocess, add a short delay or explicit connection close after `import_db()`
    to ensure all SQLite writes are flushed to disk before process termination.

    Args:
        input_dir: Directory containing parquet files to import.
        module_names: Module names to import (e.g., ["lamindb", "bionty", "pertdb"]).
        if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
            If set to 'replace', existing data is deleted and new data is imported. All PKs and indices are not guaranteed to be preserved which can lead to write errors.
            If set to 'append', new data is added to existing data without clearing the table. All PKs and indices are preserved allowing write operations but database size will greatly increase.
            If set to 'fail', raises an error if the table contains any data.
    """
    from django.db import connection
    from rich.progress import Progress

    import lamindb_setup as ln_setup

    directory = Path(input_dir)

    if not directory.exists():
        raise ValueError(f"Directory does not exist: {directory}")

    if module_names is None:
        parquet_files = list(directory.glob("*.parquet"))
        detected_modules = {
            f.name.split("_")[0] for f in parquet_files if "_" in f.name
        }
        module_names = sorted(detected_modules)

    modules = {name: _get_registries(name) for name in module_names}
    total_models = sum(len(models) for models in modules.values())

    is_sqlite = ln_setup.settings.instance.dialect == "sqlite"

    try:
        with connection.cursor() as cursor:
            if ln_setup.settings.instance.dialect == "postgresql":
                cursor.execute("SET session_replication_role = 'replica'")
            elif is_sqlite:
                cursor.execute("PRAGMA foreign_keys = OFF")
                # Disables fsync - OS buffers writes to disk, 10-50x faster but can corrupt DB on crash
                cursor.execute("PRAGMA synchronous = OFF")
                # Keeps rollback journal in RAM - 2-5x faster but cannot rollback on crash
                cursor.execute("PRAGMA journal_mode = MEMORY")
                # 64MB page cache for better performance on large imports
                cursor.execute("PRAGMA cache_size = -64000")

        with transaction.atomic():
            if ln_setup.settings.instance.dialect == "postgresql":
                with connection.cursor() as cursor:
                    cursor.execute("SET CONSTRAINTS ALL DEFERRED")

            with Progress() as progress:
                task = progress.add_task("Importing", total=total_models)
                for module_name, model_names in modules.items():
                    schema_module = import_module(module_name)
                    for model_name in model_names:
                        progress.update(
                            task, description=f"[cyan]{module_name}.{model_name}"
                        )
                        registry = getattr(schema_module.models, model_name)
                        _import_registry(registry, directory, if_exists=if_exists)
                        for field in registry._meta.many_to_many:
                            link_orm = getattr(registry, field.name).through
                            _import_registry(link_orm, directory, if_exists=if_exists)
                        progress.advance(task)
    finally:
        with connection.cursor() as cursor:
            if ln_setup.settings.instance.dialect == "postgresql":
                cursor.execute("SET session_replication_role = 'origin'")
            elif is_sqlite:
                cursor.execute("PRAGMA synchronous = FULL")
                cursor.execute("PRAGMA journal_mode = DELETE")
                cursor.execute("PRAGMA foreign_keys = ON")
                # Reclaim space from DELETEs
                cursor.execute("VACUUM")
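For orientation, a minimal usage sketch of the two public functions defined in this file. It assumes an instance is already connected through lamindb_setup (for Postgres, connected with `use_root_db_user=True` as the `export_db` docstring requires) and that the functions are imported from `lamindb_setup.io`; the export directory name is illustrative, not part of the package.

# Minimal usage sketch (assumption: a lamindb instance is already connected via lamindb_setup;
# "./my_instance_export/" is an illustrative directory name).
from lamindb_setup.io import export_db, import_db

# Export the core lamindb registries and their link tables of the connected instance to parquet files.
export_db(["lamindb"], output_dir="./my_instance_export/", max_workers=8)

# With a freshly initialized, migrated target instance connected, load the parquet files back;
# "replace" clears each table before inserting the exported rows.
import_db(["lamindb"], input_dir="./my_instance_export/", if_exists="replace")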