lamindb_setup 1.18.2__py3-none-any.whl → 1.19.1__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- lamindb_setup/__init__.py +4 -19
- lamindb_setup/_cache.py +87 -87
- lamindb_setup/_check.py +7 -7
- lamindb_setup/_check_setup.py +131 -131
- lamindb_setup/_connect_instance.py +443 -438
- lamindb_setup/_delete.py +155 -151
- lamindb_setup/_disconnect.py +38 -38
- lamindb_setup/_django.py +39 -39
- lamindb_setup/_entry_points.py +19 -19
- lamindb_setup/_init_instance.py +423 -429
- lamindb_setup/_migrate.py +331 -327
- lamindb_setup/_register_instance.py +32 -32
- lamindb_setup/_schema.py +27 -27
- lamindb_setup/_schema_metadata.py +451 -451
- lamindb_setup/_set_managed_storage.py +81 -80
- lamindb_setup/_setup_user.py +198 -198
- lamindb_setup/_silence_loggers.py +46 -46
- lamindb_setup/core/__init__.py +25 -34
- lamindb_setup/core/_aws_options.py +276 -266
- lamindb_setup/core/_aws_storage.py +57 -55
- lamindb_setup/core/_clone.py +50 -50
- lamindb_setup/core/_deprecated.py +62 -62
- lamindb_setup/core/_docs.py +14 -14
- lamindb_setup/core/_hub_client.py +288 -294
- lamindb_setup/core/_hub_core.py +0 -2
- lamindb_setup/core/_hub_crud.py +247 -247
- lamindb_setup/core/_hub_utils.py +100 -100
- lamindb_setup/core/_private_django_api.py +80 -80
- lamindb_setup/core/_settings.py +440 -434
- lamindb_setup/core/_settings_instance.py +32 -7
- lamindb_setup/core/_settings_load.py +162 -159
- lamindb_setup/core/_settings_save.py +108 -96
- lamindb_setup/core/_settings_storage.py +433 -433
- lamindb_setup/core/_settings_store.py +162 -92
- lamindb_setup/core/_settings_user.py +55 -55
- lamindb_setup/core/_setup_bionty_sources.py +44 -44
- lamindb_setup/core/cloud_sqlite_locker.py +240 -240
- lamindb_setup/core/django.py +414 -413
- lamindb_setup/core/exceptions.py +1 -1
- lamindb_setup/core/hashing.py +134 -134
- lamindb_setup/core/types.py +1 -1
- lamindb_setup/core/upath.py +1031 -1028
- lamindb_setup/errors.py +72 -70
- lamindb_setup/io.py +423 -416
- lamindb_setup/types.py +17 -17
- {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info}/METADATA +4 -2
- lamindb_setup-1.19.1.dist-info/RECORD +51 -0
- {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info}/WHEEL +1 -1
- {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info/licenses}/LICENSE +201 -201
- lamindb_setup-1.18.2.dist-info/RECORD +0 -51
lamindb_setup/io.py
CHANGED
@@ -1,416 +1,423 @@
(previous contents, 416 lines, not shown)
New contents (423 lines):

from __future__ import annotations

import io
import json
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING

from django.db import models, transaction

if TYPE_CHECKING:
    from collections.abc import Iterable
    from typing import Literal


def _get_registries(module_name: str) -> list[str]:
    """Get registry class names from a module."""
    schema_module = import_module(module_name)

    # Ensure that models are loaded; we've observed empty exports otherwise
    from django.db import models

    return [
        name
        for name in dir(schema_module.models)
        if (
            name[0].isupper()
            and isinstance(cls := getattr(schema_module.models, name, None), type)
            and issubclass(cls, models.Model)
            # Table names starting with `None_` are abstract base classes or Django mixins
            and not cls._meta.db_table.startswith("None_")  # type: ignore
        )
    ]


def _export_full_table(
    registry_info: tuple[str, str, str | None],
    directory: Path,
    chunk_size: int,
) -> list[tuple[str, Path]] | str:
    """Export a registry table to parquet.

    For PostgreSQL, uses COPY TO which streams the table directly to CSV format,
    bypassing query planner overhead and row-by-row conversion (10-50x faster than SELECT).

    For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.

    Args:
        registry_info: Tuple of (module_name, model_name, field_name) where `field_name`
            is None for regular tables or the field name for M2M link tables.
        directory: Output directory for parquet files.
        chunk_size: Maximum rows per chunk for SQLite large tables.

    Returns:
        String identifier for single-file exports, or list of (table_name, chunk_path) tuples for chunked exports that need merging.
    """
    import pandas as pd
    from django.db import connection

    import lamindb_setup as ln_setup

    module_name, model_name, field_name = registry_info
    schema_module = import_module(module_name)
    registry = getattr(schema_module.models, model_name)

    if field_name:
        registry = getattr(registry, field_name).through

    table_name = registry._meta.db_table

    try:
        if ln_setup.settings.instance.dialect == "postgresql":
            buffer = io.StringIO()
            with connection.cursor() as cursor:
                cursor.execute("SET statement_timeout = 0")
                cursor.copy_expert(
                    f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
                    buffer,
                )
            buffer.seek(0)
            # Prevent pandas from converting empty strings to float NaN (which PyArrow rejects)
            df = pd.read_csv(buffer, keep_default_na=False)
            # Convert object columns to string to handle mixed types from data corruption,
            # schema migrations, or manual SQL inserts. PyArrow rejects mixed-type objects.
            df = df.astype(
                {col: str for col in df.columns if df[col].dtype == "object"}
            )
            df.to_parquet(directory / f"{table_name}.parquet", compression=None)
            return (
                f"{module_name}.{model_name}.{field_name}"
                if field_name
                else f"{module_name}.{model_name}"
            )
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", message="Skipped unsupported reflection"
                )
                row_count = pd.read_sql(
                    f"SELECT COUNT(*) as count FROM {table_name}",
                    ln_setup.settings.instance.db,
                ).iloc[0]["count"]

                if row_count > chunk_size:
                    chunk_files = []
                    num_chunks = (row_count + chunk_size - 1) // chunk_size
                    for chunk_id in range(num_chunks):
                        offset = chunk_id * chunk_size
                        df = pd.read_sql(
                            f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}",
                            ln_setup.settings.instance.db,
                        )
                        chunk_file = (
                            directory / f"{table_name}_chunk_{chunk_id}.parquet"
                        )
                        df = df.astype(
                            {
                                col: str
                                for col in df.columns
                                if df[col].dtype == "object"
                            }
                        )
                        df.to_parquet(chunk_file, compression=None)
                        chunk_files.append((table_name, chunk_file))
                    return chunk_files
                else:
                    df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
                    df = df.astype(
                        {col: str for col in df.columns if df[col].dtype == "object"}
                    )
                    df.to_parquet(directory / f"{table_name}.parquet", compression=None)
                    return (
                        f"{module_name}.{model_name}.{field_name}"
                        if field_name
                        else f"{module_name}.{model_name}"
                    )
    except (ValueError, pd.errors.DatabaseError):
        raise ValueError(
            f"Table '{table_name}' was not found. The instance might need to be migrated."
        ) from None


def export_db(
    module_names: Iterable[str] | None = None,
    *,
    output_dir: str | Path | None = None,
    max_workers: int = 8,
    chunk_size: int = 500_000,
) -> None:
    """Export registry tables and many-to-many link tables to parquet files.

    Ensure that you connect to postgres instances using `use_root_db_user=True`.

    Args:
        module_names: Module names to export (e.g., ["lamindb", "bionty", "pertdb"]).
            Defaults to "lamindb" if not provided.
        output_dir: Directory path for exported parquet files.
        max_workers: Number of parallel processes.
        chunk_size: Number of rows per chunk for large tables.
    """
    import pandas as pd
    from rich.progress import Progress

    import lamindb_setup as ln_setup

    if output_dir is None:
        output_dir = f"./{ln_setup.settings.instance.name}_export/"

    directory = Path(output_dir)
    directory.mkdir(parents=True, exist_ok=True)

    module_names = module_names or ["lamindb"]
    modules = {name: _get_registries(name) for name in module_names}

    tasks = []
    for module_name, model_names in modules.items():
        schema_module = import_module(module_name)
        for model_name in model_names:
            registry = getattr(schema_module.models, model_name)
            tasks.append((module_name, model_name, None))
            for field in registry._meta.many_to_many:
                tasks.append((module_name, model_name, field.name))

    chunk_files_by_table: dict[str, list[Path]] = {}

    with Progress() as progress:
        task_id = progress.add_task("Exporting", total=len(tasks))

        # This must be a ThreadPoolExecutor and not a ProcessPoolExecutor to inherit JWTs
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(_export_full_table, task, directory, chunk_size): task
                for task in tasks
            }

            for future in as_completed(futures):
                result = future.result()
                if isinstance(result, list):
                    for table_name, chunk_file in result:
                        chunk_files_by_table.setdefault(table_name, []).append(
                            chunk_file
                        )
                progress.advance(task_id)

    for table_name, chunk_files in chunk_files_by_table.items():
        merged_df = pd.concat([pd.read_parquet(f) for f in sorted(chunk_files)])
        merged_df.to_parquet(directory / f"{table_name}.parquet", compression=None)
        for chunk_file in chunk_files:
            chunk_file.unlink()


def _serialize_value(val):
    """Convert value to JSON string if it's a dict, list, or numpy array, otherwise return as-is."""
    # keep dynamic import to minimize import time
    import numpy as np

    if isinstance(val, (dict, list, np.ndarray)):
        return json.dumps(
            val, default=lambda o: o.tolist() if isinstance(o, np.ndarray) else None
        )
    return val


def _import_registry(
    registry: type[models.Model],
    directory: Path,
    if_exists: Literal["fail", "replace", "append"] = "replace",
) -> None:
    """Import a single registry table from parquet.

    For PostgreSQL, uses COPY FROM which bypasses SQL parsing and writes directly to
    table pages (20-50x faster than multi-row INSERTs).

    For SQLite, uses multi-row INSERTs with dynamic chunking to stay under the 999
    variable limit (2-5x faster than single-row INSERTs).
    """
    import numpy as np
    import pandas as pd
    from django.db import connection

    table_name = registry._meta.db_table
    parquet_file = directory / f"{table_name}.parquet"

    if not parquet_file.exists():
        return

    df = pd.read_parquet(parquet_file)

    old_foreign_key_columns = [col for col in df.columns if col.endswith("_old")]
    if old_foreign_key_columns:
        df = df.drop(columns=old_foreign_key_columns)

    for col in df.columns:
        if df[col].dtype == "object":
            mask = df[col].apply(lambda x: isinstance(x, (dict, list, np.ndarray)))
            if mask.any():
                df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)

    for field in registry._meta.fields:
        # Convert PostgreSQL boolean string literals ('t'/'f') to Python booleans for SQLite compatibility
        if field.get_internal_type() == "BooleanField" and field.column in df.columns:
            df[field.column] = df[field.column].map(
                {"t": True, "f": False, True: True, False: False, None: None}
            )

        # PostgreSQL CSV export writes NULL as empty string; convert back to None for nullable fields
        if field.null and field.column in df.columns:
            df[field.column] = df[field.column].replace("", None)

        # Convert numeric fields from strings to proper types for SQLite
        if (
            field.get_internal_type()
            in (
                "IntegerField",
                "BigIntegerField",
                "PositiveIntegerField",
                "FloatField",
                "DecimalField",
            )
            and field.column in df.columns
        ):
            df[field.column] = pd.to_numeric(df[field.column], errors="coerce")

    if if_exists == "append":
        # Fill NULL values in NOT NULL columns to handle schema mismatches between postgres source and SQLite target
        # This allows importing data where fields were nullable
        for field in registry._meta.fields:
            if field.column in df.columns and not field.null:
                df[field.column] = df[field.column].fillna("").infer_objects(copy=False)

    if df.empty:
        return

    if if_exists == "append":
        # Clear existing data before import
        # When appending we would run into duplicate errors because of existing values like branches etc
        with connection.cursor() as cursor:
            cursor.execute(f'DELETE FROM "{table_name}"')

    if connection.vendor == "postgresql":
        columns = df.columns.tolist()
        column_names = ", ".join(f'"{col}"' for col in columns)

        buffer = io.StringIO()
        df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
        buffer.seek(0)

        with connection.cursor() as cursor:
            if if_exists == "replace":
                cursor.execute(f'DELETE FROM "{table_name}"')
            elif if_exists == "fail":
                cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"')
                if cursor.fetchone()[0] > 0:
                    raise ValueError(f"Table {table_name} already contains data")

            cursor.copy_expert(
                f"COPY \"{table_name}\" ({column_names}) FROM STDIN WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
                buffer,
            )
    else:
        num_cols = len(df.columns)
        max_vars = 900  # SQLite has a limit of 999 variables per statement
        chunksize = max(1, max_vars // num_cols)

        # Always use append mode since we set up the tables from a fresh instance
        df.to_sql(
            table_name,
            connection.connection,
            if_exists=if_exists,
            index=False,
            method="multi",
            chunksize=chunksize,
        )


def import_db(
    module_names: Iterable[str] | None = None,
    *,
    input_dir: str | Path = "./lamindb_export/",
    if_exists: Literal["fail", "replace", "append"] = "replace",
) -> None:
    """Import registry and link tables from parquet files.

    Temporarily disables FK constraints to allow insertion in arbitrary order.
    Requires superuser/RDS admin privileges for postgres databases.

    Note: When running in a subprocess, add a short delay or explicit connection close after `import_db()`
        to ensure all SQLite writes are flushed to disk before process termination.

    Args:
        input_dir: Directory containing parquet files to import.
        module_names: Module names to import (e.g., ["lamindb", "bionty", "pertdb"]).
        if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
            If set to 'replace', existing data is deleted and new data is imported. All PKs and indices are not guaranteed to be preserved which can lead to write errors.
            If set to 'append', new data is added to existing data without clearing the table. All PKs and indices are preserved allowing write operations but database size will greatly increase.
            If set to 'fail', raises an error if the table contains any data.
    """
    from django.db import connection
    from rich.progress import Progress

    import lamindb_setup as ln_setup

    directory = Path(input_dir)

    if not directory.exists():
        raise ValueError(f"Directory does not exist: {directory}")

    if module_names is None:
        parquet_files = list(directory.glob("*.parquet"))
        detected_modules = {
            f.name.split("_")[0] for f in parquet_files if "_" in f.name
        }
        module_names = sorted(detected_modules)

    modules = {name: _get_registries(name) for name in module_names}
    total_models = sum(len(models) for models in modules.values())

    is_sqlite = ln_setup.settings.instance.dialect == "sqlite"

    try:
        with connection.cursor() as cursor:
            if ln_setup.settings.instance.dialect == "postgresql":
                cursor.execute("SET session_replication_role = 'replica'")
            elif is_sqlite:
                cursor.execute("PRAGMA foreign_keys = OFF")
                # Disables fsync - OS buffers writes to disk, 10-50x faster but can corrupt DB on crash
                cursor.execute("PRAGMA synchronous = OFF")
                # Keeps rollback journal in RAM - 2-5x faster but cannot rollback on crash
                cursor.execute("PRAGMA journal_mode = MEMORY")
                # 64MB page cache for better performance on large imports
                cursor.execute("PRAGMA cache_size = -64000")

        with transaction.atomic():
            if ln_setup.settings.instance.dialect == "postgresql":
                with connection.cursor() as cursor:
                    cursor.execute("SET CONSTRAINTS ALL DEFERRED")

            with Progress() as progress:
                task = progress.add_task("Importing", total=total_models)
                for module_name, model_names in modules.items():
                    schema_module = import_module(module_name)
                    for model_name in model_names:
                        progress.update(
                            task, description=f"[cyan]{module_name}.{model_name}"
                        )
                        registry = getattr(schema_module.models, model_name)
                        _import_registry(registry, directory, if_exists=if_exists)
                        for field in registry._meta.many_to_many:
                            link_orm = getattr(registry, field.name).through
                            _import_registry(link_orm, directory, if_exists=if_exists)
                        progress.advance(task)
    finally:
        with connection.cursor() as cursor:
            if ln_setup.settings.instance.dialect == "postgresql":
                cursor.execute("SET session_replication_role = 'origin'")
            elif is_sqlite:
                cursor.execute("PRAGMA synchronous = FULL")
                cursor.execute("PRAGMA journal_mode = DELETE")
                cursor.execute("PRAGMA foreign_keys = ON")
                # Reclaim space from DELETEs
                cursor.execute("VACUUM")
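For orientation, a minimal sketch of how the new `export_db`/`import_db` pair could be used for a parquet round-trip; the module list and export directory below are placeholders, and the sketch assumes an instance is already connected via lamindb_setup (with `use_root_db_user=True` for postgres sources, as the `export_db` docstring requires).

# Hypothetical usage sketch; module names and the export directory are illustrative only.
# Assumes a lamindb instance is already connected before either call is made.
from lamindb_setup.io import export_db, import_db

# Dump every registry table and many-to-many link table of the listed modules to parquet.
export_db(["lamindb", "bionty"], output_dir="./my_instance_export/", chunk_size=500_000)

# Against the target instance, load the same files back; "replace" clears each table
# before COPY/INSERT, while "append" preserves primary keys and indices.
import_db(["lamindb", "bionty"], input_dir="./my_instance_export/", if_exists="replace")

Per the `import_db` docstring, postgres targets additionally need superuser/RDS admin privileges, since foreign-key enforcement is switched to replica mode for the duration of the import.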