lamindb_setup 1.15.0__py3-none-any.whl → 1.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +1 -1
- lamindb_setup/core/_aws_options.py +1 -0
- lamindb_setup/io.py +211 -51
- {lamindb_setup-1.15.0.dist-info → lamindb_setup-1.15.1.dist-info}/METADATA +3 -2
- {lamindb_setup-1.15.0.dist-info → lamindb_setup-1.15.1.dist-info}/RECORD +7 -7
- {lamindb_setup-1.15.0.dist-info → lamindb_setup-1.15.1.dist-info}/WHEEL +1 -1
- {lamindb_setup-1.15.0.dist-info → lamindb_setup-1.15.1.dist-info/licenses}/LICENSE +0 -0
lamindb_setup/__init__.py
CHANGED
|
@@ -20,6 +20,7 @@ lamin_env = os.getenv("LAMIN_ENV")
|
|
|
20
20
|
if lamin_env is None or lamin_env == "prod":
|
|
21
21
|
HOSTED_BUCKETS = tuple([f"s3://lamin-{region}" for region in HOSTED_REGIONS])
|
|
22
22
|
else:
|
|
23
|
+
logger.warning("loaded LAMIN_ENV: staging")
|
|
23
24
|
HOSTED_BUCKETS = ("s3://lamin-hosted-test",) # type: ignore
|
|
24
25
|
|
|
25
26
|
|
lamindb_setup/io.py
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import io
|
|
3
4
|
import json
|
|
4
5
|
import warnings
|
|
6
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
5
7
|
from importlib import import_module
|
|
6
8
|
from pathlib import Path
|
|
7
9
|
from typing import TYPE_CHECKING
|
|
8
10
|
|
|
11
|
+
import numpy as np
|
|
9
12
|
import pandas as pd
|
|
10
13
|
from django.db import models, transaction
|
|
11
14
|
from rich.progress import Progress
|
|
@@ -43,21 +46,101 @@ def _get_registries(module_name: str) -> list[str]:
|
|
|
43
46
|
]
|
|
44
47
|
|
|
45
48
|
|
|
46
|
-
def
|
|
47
|
-
|
|
49
|
+
def _export_full_table(
    registry_info: tuple[str, str, str | None],
    directory: Path,
    chunk_size: int,
) -> list[tuple[str, Path]] | str:
    """Export one registry table (or one M2M link table) to parquet.

    On PostgreSQL the table is streamed with ``COPY ... TO STDOUT`` into an
    in-memory CSV buffer, parsed with pandas and written as a single parquet
    file.

    On any other dialect (SQLite in practice -- NOTE(review): confirm no other
    dialect reaches this branch) the row count is checked first. Tables with
    more than ``chunk_size`` rows are read page by page and written as
    ``<table>_chunk_<i>.parquet`` files so the whole table never has to be in
    memory at once; smaller tables are read and written in one go.

    Args:
        registry_info: Tuple of ``(module_name, model_name, field_name)``.
            ``field_name`` is ``None`` for a regular registry table, or the
            name of a many-to-many field whose ``through`` table is exported.
        directory: Output directory for the parquet files.
        chunk_size: Maximum number of rows per chunk file on the non-PostgreSQL
            path.

    Returns:
        For a single-file export, the string identifier
        ``"module.Model"`` or ``"module.Model.field"``.
        For a chunked export, a list of ``(table_name, chunk_path)`` tuples in
        chunk order; the caller is responsible for merging them.

    Raises:
        ValueError: If reading the table raised ``ValueError`` or
            ``pandas.errors.DatabaseError``; reported as the table not being
            found (the instance might need to be migrated).
    """
    from django.db import connection

    import lamindb_setup as ln_setup

    module_name, model_name, field_name = registry_info
    schema_module = import_module(module_name)
    registry = getattr(schema_module, model_name)

    if field_name:
        # For M2M fields the rows live in the auto/explicit "through" model.
        registry = getattr(registry, field_name).through

    table_name = registry._meta.db_table
    # Identifier returned for single-file exports; computed once for both paths.
    export_id = (
        f"{module_name}.{model_name}.{field_name}"
        if field_name
        else f"{module_name}.{model_name}"
    )

    try:
        if ln_setup.settings.instance.dialect == "postgresql":
            buffer = io.StringIO()
            with connection.cursor() as cursor:
                cursor.copy_expert(
                    f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
                    buffer,
                )
            buffer.seek(0)
            df = pd.read_csv(buffer)
            df.to_parquet(directory / f"{table_name}.parquet", compression=None)
            return export_id
        else:
            db_url = ln_setup.settings.instance.db
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", message="Skipped unsupported reflection"
                )
                row_count = int(
                    pd.read_sql(
                        f'SELECT COUNT(*) as count FROM "{table_name}"',
                        db_url,
                    ).iloc[0]["count"]
                )

                if row_count > chunk_size:
                    # LIMIT/OFFSET paging is only well-defined with a stable
                    # ordering: without ORDER BY the database may return rows
                    # in a different order for each page, which can duplicate
                    # or drop rows across chunks. Order by the primary key.
                    pk_column = registry._meta.pk.column
                    chunk_files = []
                    num_chunks = (row_count + chunk_size - 1) // chunk_size
                    for chunk_id in range(num_chunks):
                        offset = chunk_id * chunk_size
                        df = pd.read_sql(
                            f'SELECT * FROM "{table_name}" ORDER BY "{pk_column}" '
                            f"LIMIT {chunk_size} OFFSET {offset}",
                            db_url,
                        )
                        chunk_file = (
                            directory / f"{table_name}_chunk_{chunk_id}.parquet"
                        )
                        df.to_parquet(chunk_file, compression=None)
                        chunk_files.append((table_name, chunk_file))
                    return chunk_files
                else:
                    df = pd.read_sql_table(table_name, db_url)
                    df.to_parquet(
                        directory / f"{table_name}.parquet", compression=None
                    )
                    return export_id
    except (ValueError, pd.errors.DatabaseError):
        raise ValueError(
            f"Table '{table_name}' was not found. The instance might need to be migrated."
        ) from None
|
|
55
136
|
|
|
56
137
|
|
|
57
138
|
def export_db(
|
|
58
139
|
module_names: Sequence[str] | None = None,
|
|
59
140
|
*,
|
|
60
141
|
output_dir: str | Path = "./lamindb_export/",
|
|
142
|
+
max_workers: int = 8,
|
|
143
|
+
chunk_size: int = 500_000,
|
|
61
144
|
) -> None:
|
|
62
145
|
"""Export registry tables and many-to-many link tables to parquet files.
|
|
63
146
|
|
|
@@ -67,26 +150,57 @@ def export_db(
|
|
|
67
150
|
module_names: Module names to export (e.g., ["lamindb", "bionty", "wetlab"]).
|
|
68
151
|
Defaults to "lamindb" if not provided.
|
|
69
152
|
output_dir: Directory path for exported parquet files.
|
|
153
|
+
max_workers: Number of parallel processes.
|
|
154
|
+
chunk_size: Number of rows per chunk for large tables.
|
|
70
155
|
"""
|
|
71
156
|
directory = Path(output_dir)
|
|
72
157
|
directory.mkdir(parents=True, exist_ok=True)
|
|
73
158
|
|
|
74
159
|
module_names = module_names or ["lamindb"]
|
|
75
160
|
modules = {name: _get_registries(name) for name in module_names}
|
|
76
|
-
|
|
161
|
+
|
|
162
|
+
tasks = []
|
|
163
|
+
for module_name, model_names in modules.items():
|
|
164
|
+
schema_module = import_module(module_name)
|
|
165
|
+
for model_name in model_names:
|
|
166
|
+
registry = getattr(schema_module, model_name)
|
|
167
|
+
tasks.append((module_name, model_name, None))
|
|
168
|
+
for field in registry._meta.many_to_many:
|
|
169
|
+
tasks.append((module_name, model_name, field.name))
|
|
170
|
+
|
|
171
|
+
chunk_files_by_table: dict[str, list[Path]] = {}
|
|
77
172
|
|
|
78
173
|
with Progress() as progress:
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
174
|
+
task_id = progress.add_task("Exporting", total=len(tasks))
|
|
175
|
+
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
|
176
|
+
futures = {
|
|
177
|
+
executor.submit(_export_full_table, task, directory, chunk_size): task
|
|
178
|
+
for task in tasks
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
for future in as_completed(futures):
|
|
182
|
+
result = future.result()
|
|
183
|
+
if isinstance(result, list):
|
|
184
|
+
for table_name, chunk_file in result:
|
|
185
|
+
chunk_files_by_table.setdefault(table_name, []).append(
|
|
186
|
+
chunk_file
|
|
187
|
+
)
|
|
188
|
+
progress.advance(task_id)
|
|
189
|
+
|
|
190
|
+
for table_name, chunk_files in chunk_files_by_table.items():
|
|
191
|
+
merged_df = pd.concat([pd.read_parquet(f) for f in sorted(chunk_files)])
|
|
192
|
+
merged_df.to_parquet(directory / f"{table_name}.parquet", compression=None)
|
|
193
|
+
for chunk_file in chunk_files:
|
|
194
|
+
chunk_file.unlink()
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _serialize_value(val):
|
|
198
|
+
"""Convert value to JSON string if it's a dict, list, or numpy array, otherwise return as-is."""
|
|
199
|
+
if isinstance(val, (dict, list, np.ndarray)):
|
|
200
|
+
return json.dumps(
|
|
201
|
+
val, default=lambda o: o.tolist() if isinstance(o, np.ndarray) else None
|
|
202
|
+
)
|
|
203
|
+
return val
|
|
90
204
|
|
|
91
205
|
|
|
92
206
|
def _import_registry(
|
|
@@ -96,8 +210,14 @@ def _import_registry(
|
|
|
96
210
|
) -> None:
|
|
97
211
|
"""Import a single registry table from parquet.
|
|
98
212
|
|
|
99
|
-
|
|
213
|
+
For PostgreSQL, uses COPY FROM which bypasses SQL parsing and writes directly to
|
|
214
|
+
table pages (20-50x faster than multi-row INSERTs).
|
|
215
|
+
|
|
216
|
+
For SQLite, uses multi-row INSERTs with dynamic chunking to stay under the 999
|
|
217
|
+
variable limit (2-5x faster than single-row INSERTs).
|
|
100
218
|
"""
|
|
219
|
+
from django.db import connection
|
|
220
|
+
|
|
101
221
|
table_name = registry._meta.db_table
|
|
102
222
|
parquet_file = directory / f"{table_name}.parquet"
|
|
103
223
|
|
|
@@ -113,13 +233,46 @@ def _import_registry(
|
|
|
113
233
|
|
|
114
234
|
for col in df.columns:
|
|
115
235
|
if df[col].dtype == "object":
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
236
|
+
mask = df[col].apply(lambda x: isinstance(x, (dict, list, np.ndarray)))
|
|
237
|
+
if mask.any():
|
|
238
|
+
df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)
|
|
119
239
|
|
|
120
|
-
|
|
240
|
+
if df.empty:
|
|
241
|
+
return
|
|
121
242
|
|
|
122
|
-
|
|
243
|
+
if connection.vendor == "postgresql":
|
|
244
|
+
columns = df.columns.tolist()
|
|
245
|
+
column_names = ", ".join(f'"{col}"' for col in columns)
|
|
246
|
+
|
|
247
|
+
buffer = io.StringIO()
|
|
248
|
+
df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
|
|
249
|
+
buffer.seek(0)
|
|
250
|
+
|
|
251
|
+
with connection.cursor() as cursor:
|
|
252
|
+
if if_exists == "replace":
|
|
253
|
+
cursor.execute(f'DELETE FROM "{table_name}"')
|
|
254
|
+
elif if_exists == "fail":
|
|
255
|
+
cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"')
|
|
256
|
+
if cursor.fetchone()[0] > 0:
|
|
257
|
+
raise ValueError(f"Table {table_name} already contains data")
|
|
258
|
+
|
|
259
|
+
cursor.copy_expert(
|
|
260
|
+
f"COPY \"{table_name}\" ({column_names}) FROM STDIN WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
|
|
261
|
+
buffer,
|
|
262
|
+
)
|
|
263
|
+
else:
|
|
264
|
+
num_cols = len(df.columns)
|
|
265
|
+
max_vars = 900 # SQLite has a limit of 999 variables per statement
|
|
266
|
+
chunksize = max(1, max_vars // num_cols)
|
|
267
|
+
|
|
268
|
+
df.to_sql(
|
|
269
|
+
table_name,
|
|
270
|
+
connection.connection,
|
|
271
|
+
if_exists=if_exists,
|
|
272
|
+
index=False,
|
|
273
|
+
method="multi",
|
|
274
|
+
chunksize=chunksize,
|
|
275
|
+
)
|
|
123
276
|
|
|
124
277
|
|
|
125
278
|
def import_db(
|
|
@@ -157,38 +310,45 @@ def import_db(
|
|
|
157
310
|
modules = {name: _get_registries(name) for name in module_names}
|
|
158
311
|
total_models = sum(len(models) for models in modules.values())
|
|
159
312
|
|
|
160
|
-
|
|
161
|
-
|
|
313
|
+
is_sqlite = ln_setup.settings.instance.dialect == "sqlite"
|
|
314
|
+
|
|
315
|
+
try:
|
|
162
316
|
with connection.cursor() as cursor:
|
|
163
317
|
if ln_setup.settings.instance.dialect == "postgresql":
|
|
164
318
|
cursor.execute("SET session_replication_role = 'replica'")
|
|
165
|
-
elif
|
|
319
|
+
elif is_sqlite:
|
|
166
320
|
cursor.execute("PRAGMA foreign_keys = OFF")
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
for
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
321
|
+
# Disables fsync - OS buffers writes to disk, 10-50x faster but can corrupt DB on crash
|
|
322
|
+
cursor.execute("PRAGMA synchronous = OFF")
|
|
323
|
+
# Keeps rollback journal in RAM - 2-5x faster but cannot rollback on crash
|
|
324
|
+
cursor.execute("PRAGMA journal_mode = MEMORY")
|
|
325
|
+
# 64MB page cache for better performance on large imports
|
|
326
|
+
cursor.execute("PRAGMA cache_size = -64000")
|
|
327
|
+
|
|
328
|
+
with transaction.atomic():
|
|
329
|
+
if ln_setup.settings.instance.dialect == "postgresql":
|
|
330
|
+
with connection.cursor() as cursor:
|
|
331
|
+
cursor.execute("SET CONSTRAINTS ALL DEFERRED")
|
|
332
|
+
|
|
333
|
+
with Progress() as progress:
|
|
334
|
+
task = progress.add_task("Importing", total=total_models)
|
|
335
|
+
for module_name, model_names in modules.items():
|
|
336
|
+
schema_module = import_module(module_name)
|
|
337
|
+
for model_name in model_names:
|
|
338
|
+
progress.update(
|
|
339
|
+
task, description=f"[cyan]{module_name}.{model_name}"
|
|
340
|
+
)
|
|
341
|
+
registry = getattr(schema_module, model_name)
|
|
342
|
+
_import_registry(registry, directory, if_exists=if_exists)
|
|
343
|
+
for field in registry._meta.many_to_many:
|
|
344
|
+
link_orm = getattr(registry, field.name).through
|
|
345
|
+
_import_registry(link_orm, directory, if_exists=if_exists)
|
|
346
|
+
progress.advance(task)
|
|
347
|
+
finally:
|
|
190
348
|
with connection.cursor() as cursor:
|
|
191
349
|
if ln_setup.settings.instance.dialect == "postgresql":
|
|
192
350
|
cursor.execute("SET session_replication_role = 'origin'")
|
|
193
|
-
elif
|
|
351
|
+
elif is_sqlite:
|
|
352
|
+
cursor.execute("PRAGMA synchronous = FULL")
|
|
353
|
+
cursor.execute("PRAGMA journal_mode = DELETE")
|
|
194
354
|
cursor.execute("PRAGMA foreign_keys = ON")
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: lamindb_setup
|
|
3
|
-
Version: 1.15.0
|
|
3
|
+
Version: 1.15.1
|
|
4
4
|
Summary: Setup & configure LaminDB.
|
|
5
5
|
Author-email: Lamin Labs <open-source@lamin.ai>
|
|
6
6
|
Requires-Python: >=3.10
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
8
9
|
Requires-Dist: lamin_utils>=0.3.3
|
|
9
10
|
Requires-Dist: django>=5.2,<5.3
|
|
10
11
|
Requires-Dist: dj_database_url>=1.3.0,<3.0.0
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
lamindb_setup/__init__.py,sha256=
|
|
1
|
+
lamindb_setup/__init__.py,sha256=3Oaw6Mj_4lWEvxbDmE5Y6Q-hyP20r-8yeSKOpJXvjso,3215
|
|
2
2
|
lamindb_setup/_cache.py,sha256=pGvDNVHGx4HWr_6w5ajqEJOdysmaGc6F221qFnXkT-k,2747
|
|
3
3
|
lamindb_setup/_check.py,sha256=28PcG8Kp6OpjSLSi1r2boL2Ryeh6xkaCL87HFbjs6GA,129
|
|
4
4
|
lamindb_setup/_check_setup.py,sha256=ToKMxsUq8dQBQh8baOrNVlSb1iC8h4zTg5dV8wMu0W4,6760
|
|
@@ -16,11 +16,11 @@ lamindb_setup/_set_managed_storage.py,sha256=y5YQASsWNYVWUYeLgh3N2YBETYP7mBtbpxe
|
|
|
16
16
|
lamindb_setup/_setup_user.py,sha256=ojq7UP2Aia8GTCr6m8fylFx9VSuvGu0HmvIJ8RzymE0,6108
|
|
17
17
|
lamindb_setup/_silence_loggers.py,sha256=AKF_YcHvX32eGXdsYK8MJlxEaZ-Uo2f6QDRzjKFCtws,1568
|
|
18
18
|
lamindb_setup/errors.py,sha256=lccF3X3M2mcbHVG_0HxfuJRFFpUE-42paccIxFOfefQ,1958
|
|
19
|
-
lamindb_setup/io.py,sha256=
|
|
19
|
+
lamindb_setup/io.py,sha256=9LstFkIaki_m_oE7hFSPN8j1eXLjfEyarf13v2wcGso,13702
|
|
20
20
|
lamindb_setup/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
21
|
lamindb_setup/types.py,sha256=fuQxZJnrGYe7a_Ju9n1RqO-HhkOAr1l1xjpAg9dmBu8,605
|
|
22
22
|
lamindb_setup/core/__init__.py,sha256=gMqbkPeD-T6o_KykonvNiUJJQYz5SCk2yGgFNWFUBNc,603
|
|
23
|
-
lamindb_setup/core/_aws_options.py,sha256=
|
|
23
|
+
lamindb_setup/core/_aws_options.py,sha256=9kQ5BB-cuJQrlJRGNqMRe1m48dP67xMbefOJP2c9OQw,9674
|
|
24
24
|
lamindb_setup/core/_aws_storage.py,sha256=QEtV-riQrwfivcwqHnXBbkJ-9YyNEXL4fLoCmOHZ1BI,2003
|
|
25
25
|
lamindb_setup/core/_clone.py,sha256=2NlXV04yykqg_k7z59C_kD1F1Hi4H-55H-JtNjhenQ0,3691
|
|
26
26
|
lamindb_setup/core/_deprecated.py,sha256=M3vpM4fZPOncxY2qsXQAPeaEph28xWdv7tYaueaUyAA,2554
|
|
@@ -44,7 +44,7 @@ lamindb_setup/core/exceptions.py,sha256=qjMzqy_uzPA7mCOdnoWnS_fdA6OWbdZGftz-YYpl
|
|
|
44
44
|
lamindb_setup/core/hashing.py,sha256=Y8Uc5uSGTfU6L2R_gb5w8DdHhGRog7RnkK-e9FEMjPY,3680
|
|
45
45
|
lamindb_setup/core/types.py,sha256=T7NwspfRHgIIpYsXDcApks8jkOlGeGRW-YbVLB7jNIo,67
|
|
46
46
|
lamindb_setup/core/upath.py,sha256=bi3k8AYeiGB_NtVTO9e9gHsfs2AFB4fXiVHcbNpnlpI,35780
|
|
47
|
-
lamindb_setup-1.15.
|
|
48
|
-
lamindb_setup-1.15.
|
|
49
|
-
lamindb_setup-1.15.
|
|
50
|
-
lamindb_setup-1.15.
|
|
47
|
+
lamindb_setup-1.15.1.dist-info/licenses/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
|
|
48
|
+
lamindb_setup-1.15.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
|
49
|
+
lamindb_setup-1.15.1.dist-info/METADATA,sha256=z9yk-pHFnYB7gv59trxqyhbbnpxzRC2XPuhMoTuujDE,1820
|
|
50
|
+
lamindb_setup-1.15.1.dist-info/RECORD,,
|
|
File without changes
|