lamindb_setup 1.15.0-py3-none-any.whl → 1.15.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb_setup/__init__.py CHANGED
@@ -35,7 +35,7 @@ Migration management
 
 """
 
-__version__ = "1.15.0"  # denote a release candidate for 0.1.0 with 0.1rc1
+__version__ = "1.15.1"  # denote a release candidate for 0.1.0 with 0.1rc1
 
 import os
 import warnings
@@ -20,6 +20,7 @@ lamin_env = os.getenv("LAMIN_ENV")
 if lamin_env is None or lamin_env == "prod":
     HOSTED_BUCKETS = tuple([f"s3://lamin-{region}" for region in HOSTED_REGIONS])
 else:
+    logger.warning("loaded LAMIN_ENV: staging")
     HOSTED_BUCKETS = ("s3://lamin-hosted-test",)  # type: ignore
 
 
lamindb_setup/io.py CHANGED
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
+import io
 import json
 import warnings
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from importlib import import_module
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+import numpy as np
 import pandas as pd
 from django.db import models, transaction
 from rich.progress import Progress
@@ -43,21 +46,101 @@ def _get_registries(module_name: str) -> list[str]:
     ]
 
 
-def _export_registry_to_parquet(registry: type[models.Model], directory: Path) -> None:
-    """Export a single registry table to parquet."""
+def _export_full_table(
+    registry_info: tuple[str, str, str | None],
+    directory: Path,
+    chunk_size: int,
+) -> list[tuple[str, Path]] | str:
+    """Export a registry table to parquet.
+
+    For PostgreSQL, uses COPY TO which streams the table directly to CSV format,
+    bypassing query planner overhead and row-by-row conversion (10-50x faster than SELECT).
+
+    For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.
+
+    Args:
+        registry_info: Tuple of (module_name, model_name, field_name) where field_name
+            is None for regular tables or the field name for M2M link tables.
+        directory: Output directory for parquet files.
+        chunk_size: Maximum rows per chunk for SQLite large tables.
+
+    Returns:
+        String identifier for single-file exports, or list of (table_name, chunk_path) tuples for chunked exports that need merging.
+    """
+    from django.db import connection
+
     import lamindb_setup as ln_setup
 
+    module_name, model_name, field_name = registry_info
+    schema_module = import_module(module_name)
+    registry = getattr(schema_module, model_name)
+
+    if field_name:
+        registry = getattr(registry, field_name).through
+
     table_name = registry._meta.db_table
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", message="Skipped unsupported reflection")
-        df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
-    df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+
+    try:
+        if ln_setup.settings.instance.dialect == "postgresql":
+            buffer = io.StringIO()
+            with connection.cursor() as cursor:
+                cursor.copy_expert(
+                    f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
+                    buffer,
+                )
+            buffer.seek(0)
+            df = pd.read_csv(buffer)
+            df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+            return (
+                f"{module_name}.{model_name}.{field_name}"
+                if field_name
+                else f"{module_name}.{model_name}"
+            )
+        else:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore", message="Skipped unsupported reflection"
+                )
+                row_count = pd.read_sql(
+                    f"SELECT COUNT(*) as count FROM {table_name}",
+                    ln_setup.settings.instance.db,
+                ).iloc[0]["count"]
+
+                if row_count > chunk_size:
+                    chunk_files = []
+                    num_chunks = (row_count + chunk_size - 1) // chunk_size
+                    for chunk_id in range(num_chunks):
+                        offset = chunk_id * chunk_size
+                        df = pd.read_sql(
+                            f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}",
+                            ln_setup.settings.instance.db,
+                        )
+                        chunk_file = (
+                            directory / f"{table_name}_chunk_{chunk_id}.parquet"
+                        )
+                        df.to_parquet(chunk_file, compression=None)
+                        chunk_files.append((table_name, chunk_file))
+                    return chunk_files
+                else:
+                    df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
+                    df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+                    return (
+                        f"{module_name}.{model_name}.{field_name}"
+                        if field_name
+                        else f"{module_name}.{model_name}"
+                    )
+    except (ValueError, pd.errors.DatabaseError):
+        raise ValueError(
+            f"Table '{table_name}' was not found. The instance might need to be migrated."
+        ) from None
 
 
 def export_db(
     module_names: Sequence[str] | None = None,
     *,
     output_dir: str | Path = "./lamindb_export/",
+    max_workers: int = 8,
+    chunk_size: int = 500_000,
 ) -> None:
     """Export registry tables and many-to-many link tables to parquet files.
 
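The docstring above describes the PostgreSQL fast path: COPY TO streams the whole table server-side as CSV instead of going through a SELECT. A standalone sketch of that path, assuming a psycopg2 connection (Django's cursor exposes the same `copy_expert`); the DSN and table name are hypothetical placeholders:

```python
# Standalone sketch of the COPY TO fast path; DSN and table name are
# hypothetical, and lamindb_setup drives this through Django's cursor instead.
import io

import pandas as pd
import psycopg2

conn = psycopg2.connect("postgresql://user:pass@localhost/lamin")  # hypothetical DSN
buffer = io.StringIO()
with conn.cursor() as cursor:
    # The server streams the table as CSV: no query planning, no per-row conversion.
    cursor.copy_expert(
        'COPY "lamindb_artifact" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
        buffer,
    )
buffer.seek(0)
pd.read_csv(buffer).to_parquet("lamindb_artifact.parquet", compression=None)
conn.close()
```

On SQLite the function instead pages through the table with LIMIT/OFFSET in `chunk_size` batches, so tables larger than available RAM still export, at the cost of a merge pass afterwards.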
@@ -67,26 +150,57 @@ def export_db(
         module_names: Module names to export (e.g., ["lamindb", "bionty", "wetlab"]).
             Defaults to "lamindb" if not provided.
         output_dir: Directory path for exported parquet files.
+        max_workers: Number of parallel processes.
+        chunk_size: Number of rows per chunk for large tables.
     """
     directory = Path(output_dir)
     directory.mkdir(parents=True, exist_ok=True)
 
     module_names = module_names or ["lamindb"]
     modules = {name: _get_registries(name) for name in module_names}
-    total_models = sum(len(models) for models in modules.values())
+
+    tasks = []
+    for module_name, model_names in modules.items():
+        schema_module = import_module(module_name)
+        for model_name in model_names:
+            registry = getattr(schema_module, model_name)
+            tasks.append((module_name, model_name, None))
+            for field in registry._meta.many_to_many:
+                tasks.append((module_name, model_name, field.name))
+
+    chunk_files_by_table: dict[str, list[Path]] = {}
 
     with Progress() as progress:
-        task = progress.add_task("Exporting", total=total_models)
-        for module_name, model_names in modules.items():
-            schema_module = import_module(module_name)
-            for model_name in model_names:
-                progress.update(task, description=f"[cyan]{module_name}.{model_name}")
-                registry = getattr(schema_module, model_name)
-                _export_registry_to_parquet(registry, directory)
-                for field in registry._meta.many_to_many:
-                    link_orm = getattr(registry, field.name).through
-                    _export_registry_to_parquet(link_orm, directory)
-                progress.advance(task)
+        task_id = progress.add_task("Exporting", total=len(tasks))
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            futures = {
+                executor.submit(_export_full_table, task, directory, chunk_size): task
+                for task in tasks
+            }
+
+            for future in as_completed(futures):
+                result = future.result()
+                if isinstance(result, list):
+                    for table_name, chunk_file in result:
+                        chunk_files_by_table.setdefault(table_name, []).append(
+                            chunk_file
+                        )
+                progress.advance(task_id)
+
+    for table_name, chunk_files in chunk_files_by_table.items():
+        merged_df = pd.concat([pd.read_parquet(f) for f in sorted(chunk_files)])
+        merged_df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+        for chunk_file in chunk_files:
+            chunk_file.unlink()
+
+
+def _serialize_value(val):
+    """Convert value to JSON string if it's a dict, list, or numpy array, otherwise return as-is."""
+    if isinstance(val, (dict, list, np.ndarray)):
+        return json.dumps(
+            val, default=lambda o: o.tolist() if isinstance(o, np.ndarray) else None
+        )
+    return val
 
 
 def _import_registry(
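Note that each submitted task is a plain `(module_name, model_name, field_name)` tuple of strings rather than a model class: tuples pickle cleanly across the `ProcessPoolExecutor` process boundary, and the worker re-resolves the model via `import_module`/`getattr`. A usage sketch with non-default knobs (values illustrative):

```python
# Usage sketch of the new signature; module list and values are illustrative.
from lamindb_setup.io import export_db

export_db(
    ["lamindb", "bionty"],           # modules to export
    output_dir="./lamindb_export/",
    max_workers=4,                   # parallel worker processes
    chunk_size=250_000,              # rows per parquet chunk for large SQLite tables
)
```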
@@ -96,8 +210,14 @@ def _import_registry(
 ) -> None:
     """Import a single registry table from parquet.
 
-    Uses raw SQL export instead of django to later circumvent FK constraints.
+    For PostgreSQL, uses COPY FROM which bypasses SQL parsing and writes directly to
+    table pages (20-50x faster than multi-row INSERTs).
+
+    For SQLite, uses multi-row INSERTs with dynamic chunking to stay under the 999
+    variable limit (2-5x faster than single-row INSERTs).
     """
+    from django.db import connection
+
     table_name = registry._meta.db_table
     parquet_file = directory / f"{table_name}.parquet"
 
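A standalone sketch of the COPY FROM path the new docstring describes, again assuming psycopg2 with a hypothetical DSN and table: the frame is serialized to an in-memory CSV with tab delimiters and `\N` marking NULLs, then streamed into the table in a single statement:

```python
# Hedged sketch of COPY FROM in isolation; DSN and table name are hypothetical.
import io

import pandas as pd
import psycopg2

conn = psycopg2.connect("postgresql://user:pass@localhost/lamin")  # hypothetical DSN
df = pd.read_parquet("lamindb_artifact.parquet")
cols = ", ".join(f'"{c}"' for c in df.columns)

buffer = io.StringIO()
# \N marks NULLs so missing values stay distinguishable from empty strings.
df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
buffer.seek(0)

with conn.cursor() as cursor:
    # One streamed statement instead of parsing thousands of INSERTs.
    cursor.copy_expert(
        f"COPY \"lamindb_artifact\" ({cols}) FROM STDIN "
        "WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
        buffer,
    )
conn.commit()
conn.close()
```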
@@ -113,13 +233,46 @@ def _import_registry(
 
     for col in df.columns:
         if df[col].dtype == "object":
-            df[col] = df[col].apply(
-                lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x
-            )
+            mask = df[col].apply(lambda x: isinstance(x, (dict, list, np.ndarray)))
+            if mask.any():
+                df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)
 
-    from django.db import connection
+    if df.empty:
+        return
 
-    df.to_sql(table_name, connection.connection, if_exists=if_exists, index=False)
+    if connection.vendor == "postgresql":
+        columns = df.columns.tolist()
+        column_names = ", ".join(f'"{col}"' for col in columns)
+
+        buffer = io.StringIO()
+        df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
+        buffer.seek(0)
+
+        with connection.cursor() as cursor:
+            if if_exists == "replace":
+                cursor.execute(f'DELETE FROM "{table_name}"')
+            elif if_exists == "fail":
+                cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"')
+                if cursor.fetchone()[0] > 0:
+                    raise ValueError(f"Table {table_name} already contains data")
+
+            cursor.copy_expert(
+                f"COPY \"{table_name}\" ({column_names}) FROM STDIN WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
+                buffer,
+            )
+    else:
+        num_cols = len(df.columns)
+        max_vars = 900  # SQLite has a limit of 999 variables per statement
+        chunksize = max(1, max_vars // num_cols)
+
+        df.to_sql(
+            table_name,
+            connection.connection,
+            if_exists=if_exists,
+            index=False,
+            method="multi",
+            chunksize=chunksize,
+        )
 
 
 def import_db(
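The SQLite branch above sizes its multi-row INSERTs from the column count: each row binds one variable per column, so rows per statement must stay under the 999-variable cap. The arithmetic, worked for an illustrative 23-column table:

```python
# Worked example of the chunking arithmetic: a multi-row INSERT binds one
# variable per column per row, so rows-per-statement is bounded by
# max_vars // num_cols. 900 leaves headroom under SQLite's 999 limit.
max_vars = 900
num_cols = 23                             # illustrative wide registry table
chunksize = max(1, max_vars // num_cols)  # 900 // 23 == 39 rows per INSERT
assert chunksize * num_cols <= 999        # 39 * 23 == 897 bound variables
```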
@@ -157,38 +310,45 @@ def import_db(
     modules = {name: _get_registries(name) for name in module_names}
     total_models = sum(len(models) for models in modules.values())
 
-    # Disable FK constraints to allow insertion in arbitrary order
-    if ln_setup.settings.instance.dialect == "sqlite":
+    is_sqlite = ln_setup.settings.instance.dialect == "sqlite"
+
+    try:
         with connection.cursor() as cursor:
             if ln_setup.settings.instance.dialect == "postgresql":
                 cursor.execute("SET session_replication_role = 'replica'")
-            elif ln_setup.settings.instance.dialect == "sqlite":
+            elif is_sqlite:
                 cursor.execute("PRAGMA foreign_keys = OFF")
-
-    with transaction.atomic():
-        if ln_setup.settings.instance.dialect == "postgresql":
-            with connection.cursor() as cursor:
-                cursor.execute("SET CONSTRAINTS ALL DEFERRED")
-
-        with Progress() as progress:
-            task = progress.add_task("Importing", total=total_models)
-            for module_name, model_names in modules.items():
-                schema_module = import_module(module_name)
-                for model_name in model_names:
-                    progress.update(
-                        task, description=f"[cyan]{module_name}.{model_name}"
-                    )
-                    registry = getattr(schema_module, model_name)
-                    _import_registry(registry, directory, if_exists=if_exists)
-                    for field in registry._meta.many_to_many:
-                        link_orm = getattr(registry, field.name).through
-                        _import_registry(link_orm, directory, if_exists=if_exists)
-                    progress.advance(task)
-
-    # Re-enable FK constraints again
-    if ln_setup.settings.instance.dialect == "sqlite":
+                # Disables fsync - OS buffers writes to disk, 10-50x faster but can corrupt DB on crash
+                cursor.execute("PRAGMA synchronous = OFF")
+                # Keeps rollback journal in RAM - 2-5x faster but cannot rollback on crash
+                cursor.execute("PRAGMA journal_mode = MEMORY")
+                # 64MB page cache for better performance on large imports
+                cursor.execute("PRAGMA cache_size = -64000")
+
+        with transaction.atomic():
+            if ln_setup.settings.instance.dialect == "postgresql":
+                with connection.cursor() as cursor:
+                    cursor.execute("SET CONSTRAINTS ALL DEFERRED")
+
+            with Progress() as progress:
+                task = progress.add_task("Importing", total=total_models)
+                for module_name, model_names in modules.items():
+                    schema_module = import_module(module_name)
+                    for model_name in model_names:
+                        progress.update(
+                            task, description=f"[cyan]{module_name}.{model_name}"
+                        )
+                        registry = getattr(schema_module, model_name)
+                        _import_registry(registry, directory, if_exists=if_exists)
+                        for field in registry._meta.many_to_many:
+                            link_orm = getattr(registry, field.name).through
+                            _import_registry(link_orm, directory, if_exists=if_exists)
+                        progress.advance(task)
+    finally:
         with connection.cursor() as cursor:
             if ln_setup.settings.instance.dialect == "postgresql":
                 cursor.execute("SET session_replication_role = 'origin'")
-            elif ln_setup.settings.instance.dialect == "sqlite":
+            elif is_sqlite:
+                cursor.execute("PRAGMA synchronous = FULL")
+                cursor.execute("PRAGMA journal_mode = DELETE")
             cursor.execute("PRAGMA foreign_keys = ON")
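The `try`/`finally` restructuring guarantees the relaxed durability settings are restored even if an import fails midway. The same PRAGMA trade-off sketched in plain `sqlite3` terms (database path illustrative):

```python
# Sketch of the bulk-load PRAGMA pattern in plain sqlite3; the path is
# illustrative. Fast-but-unsafe settings hold only for the load and are
# restored in finally, mirroring import_db above.
import sqlite3

con = sqlite3.connect("instance.lndb")
try:
    con.execute("PRAGMA foreign_keys = OFF")     # allow arbitrary insert order
    con.execute("PRAGMA synchronous = OFF")      # skip fsync: fast, crash may corrupt
    con.execute("PRAGMA journal_mode = MEMORY")  # journal in RAM: no rollback after crash
    con.execute("PRAGMA cache_size = -64000")    # ~64 MB page cache
    # ... bulk INSERTs go here ...
finally:
    con.execute("PRAGMA synchronous = FULL")
    con.execute("PRAGMA journal_mode = DELETE")
    con.execute("PRAGMA foreign_keys = ON")
    con.close()
```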
@@ -1,10 +1,11 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: lamindb_setup
-Version: 1.15.0
+Version: 1.15.1
 Summary: Setup & configure LaminDB.
 Author-email: Lamin Labs <open-source@lamin.ai>
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
+License-File: LICENSE
 Requires-Dist: lamin_utils>=0.3.3
 Requires-Dist: django>=5.2,<5.3
 Requires-Dist: dj_database_url>=1.3.0,<3.0.0
@@ -1,4 +1,4 @@
-lamindb_setup/__init__.py,sha256=jFfiLt0yIvOHoum_g8DmraECRdU-EOd-iSmAKTZyG3c,3215
+lamindb_setup/__init__.py,sha256=3Oaw6Mj_4lWEvxbDmE5Y6Q-hyP20r-8yeSKOpJXvjso,3215
 lamindb_setup/_cache.py,sha256=pGvDNVHGx4HWr_6w5ajqEJOdysmaGc6F221qFnXkT-k,2747
 lamindb_setup/_check.py,sha256=28PcG8Kp6OpjSLSi1r2boL2Ryeh6xkaCL87HFbjs6GA,129
 lamindb_setup/_check_setup.py,sha256=ToKMxsUq8dQBQh8baOrNVlSb1iC8h4zTg5dV8wMu0W4,6760
@@ -16,11 +16,11 @@ lamindb_setup/_set_managed_storage.py,sha256=y5YQASsWNYVWUYeLgh3N2YBETYP7mBtbpxe
 lamindb_setup/_setup_user.py,sha256=ojq7UP2Aia8GTCr6m8fylFx9VSuvGu0HmvIJ8RzymE0,6108
 lamindb_setup/_silence_loggers.py,sha256=AKF_YcHvX32eGXdsYK8MJlxEaZ-Uo2f6QDRzjKFCtws,1568
 lamindb_setup/errors.py,sha256=lccF3X3M2mcbHVG_0HxfuJRFFpUE-42paccIxFOfefQ,1958
-lamindb_setup/io.py,sha256=7wU3g2AH1h18wQerltA7p8VP0n47ZOsy32XlRcBbBAc,7321
+lamindb_setup/io.py,sha256=9LstFkIaki_m_oE7hFSPN8j1eXLjfEyarf13v2wcGso,13702
 lamindb_setup/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lamindb_setup/types.py,sha256=fuQxZJnrGYe7a_Ju9n1RqO-HhkOAr1l1xjpAg9dmBu8,605
 lamindb_setup/core/__init__.py,sha256=gMqbkPeD-T6o_KykonvNiUJJQYz5SCk2yGgFNWFUBNc,603
-lamindb_setup/core/_aws_options.py,sha256=6SAs3dhKxNadQqt7Ce7uHSus0DsU_LcNZoCQWvhgBWA,9626
+lamindb_setup/core/_aws_options.py,sha256=9kQ5BB-cuJQrlJRGNqMRe1m48dP67xMbefOJP2c9OQw,9674
 lamindb_setup/core/_aws_storage.py,sha256=QEtV-riQrwfivcwqHnXBbkJ-9YyNEXL4fLoCmOHZ1BI,2003
 lamindb_setup/core/_clone.py,sha256=2NlXV04yykqg_k7z59C_kD1F1Hi4H-55H-JtNjhenQ0,3691
 lamindb_setup/core/_deprecated.py,sha256=M3vpM4fZPOncxY2qsXQAPeaEph28xWdv7tYaueaUyAA,2554
@@ -44,7 +44,7 @@ lamindb_setup/core/exceptions.py,sha256=qjMzqy_uzPA7mCOdnoWnS_fdA6OWbdZGftz-YYpl
 lamindb_setup/core/hashing.py,sha256=Y8Uc5uSGTfU6L2R_gb5w8DdHhGRog7RnkK-e9FEMjPY,3680
 lamindb_setup/core/types.py,sha256=T7NwspfRHgIIpYsXDcApks8jkOlGeGRW-YbVLB7jNIo,67
 lamindb_setup/core/upath.py,sha256=bi3k8AYeiGB_NtVTO9e9gHsfs2AFB4fXiVHcbNpnlpI,35780
-lamindb_setup-1.15.0.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
-lamindb_setup-1.15.0.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
-lamindb_setup-1.15.0.dist-info/METADATA,sha256=aVm2bsYK6bWwBDEcp1IJR3kp8xrRnJb5CYK_KT09HCA,1798
-lamindb_setup-1.15.0.dist-info/RECORD,,
+lamindb_setup-1.15.1.dist-info/licenses/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
+lamindb_setup-1.15.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+lamindb_setup-1.15.1.dist-info/METADATA,sha256=z9yk-pHFnYB7gv59trxqyhbbnpxzRC2XPuhMoTuujDE,1820
+lamindb_setup-1.15.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: flit 3.10.1
+Generator: flit 3.12.0
 Root-Is-Purelib: true
 Tag: py3-none-any