lamindb_setup 1.15.0.tar.gz → 1.15.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/.gitignore +1 -1
  2. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/PKG-INFO +3 -2
  3. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/__init__.py +1 -1
  4. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_aws_options.py +1 -0
  5. lamindb_setup-1.15.1/lamindb_setup/io.py +354 -0
  6. lamindb_setup-1.15.0/lamindb_setup/io.py +0 -194
  7. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/.github/workflows/build.yml +0 -0
  8. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/.github/workflows/doc-changes.yml +0 -0
  9. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/.pre-commit-config.yaml +0 -0
  10. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/LICENSE +0 -0
  11. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/README.md +0 -0
  12. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/changelog.md +0 -0
  13. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/01-init-local-instance.ipynb +0 -0
  14. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/02-connect-local-instance.ipynb +0 -0
  15. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/03-add-managed-storage.ipynb +0 -0
  16. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/04-test-bionty.ipynb +0 -0
  17. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/05-init-hosted-instance.ipynb +0 -0
  18. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/06-connect-hosted-instance.ipynb +0 -0
  19. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/07-keep-artifacts-local.ipynb +0 -0
  20. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/08-test-multi-session.ipynb +0 -0
  21. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/09-test-migrate.ipynb +0 -0
  22. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-cloud/test_notebooks.py +0 -0
  23. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-cache-management.ipynb +0 -0
  24. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-cloud-sync.ipynb +0 -0
  25. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-connect-anonymously.ipynb +0 -0
  26. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-empty-init.ipynb +0 -0
  27. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-import-schema.ipynb +0 -0
  28. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-init-load-local-anonymously.ipynb +0 -0
  29. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-insufficient-user-info.ipynb +0 -0
  30. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-invalid-schema.ipynb +0 -0
  31. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test-sqlite-lock.ipynb +0 -0
  32. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/hub-prod/test_notebooks2.py +0 -0
  33. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/index.md +0 -0
  34. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/notebooks.md +0 -0
  35. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/docs/reference.md +0 -0
  36. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_cache.py +0 -0
  37. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_check.py +0 -0
  38. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_check_setup.py +0 -0
  39. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_connect_instance.py +0 -0
  40. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_delete.py +0 -0
  41. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_disconnect.py +0 -0
  42. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_django.py +0 -0
  43. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_entry_points.py +0 -0
  44. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_init_instance.py +0 -0
  45. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_migrate.py +0 -0
  46. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_register_instance.py +0 -0
  47. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_schema.py +0 -0
  48. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_schema_metadata.py +0 -0
  49. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_set_managed_storage.py +0 -0
  50. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_setup_user.py +0 -0
  51. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/_silence_loggers.py +0 -0
  52. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/__init__.py +0 -0
  53. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_aws_storage.py +0 -0
  54. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_clone.py +0 -0
  55. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_deprecated.py +0 -0
  56. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_docs.py +0 -0
  57. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_hub_client.py +0 -0
  58. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_hub_core.py +0 -0
  59. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_hub_crud.py +0 -0
  60. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_hub_utils.py +0 -0
  61. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_private_django_api.py +0 -0
  62. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_settings.py +0 -0
  63. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_settings_instance.py +0 -0
  64. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_settings_load.py +0 -0
  65. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_settings_save.py +0 -0
  66. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_settings_storage.py +0 -0
  67. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_settings_store.py +0 -0
  68. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_settings_user.py +0 -0
  69. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_setup_bionty_sources.py +0 -0
  70. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/cloud_sqlite_locker.py +0 -0
  71. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/django.py +0 -0
  72. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/exceptions.py +0 -0
  73. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/hashing.py +0 -0
  74. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/types.py +0 -0
  75. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/upath.py +0 -0
  76. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/errors.py +0 -0
  77. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/py.typed +0 -0
  78. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/types.py +0 -0
  79. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/noxfile.py +0 -0
  80. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/pyproject.toml +0 -0
  81. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/scripts/script-init-pass-user-no-writes.py +0 -0
  82. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/scripts/script-to-fail-managed-storage.py +0 -0
  83. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_clone_instance.py +0 -0
  84. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_connect_instance.py +0 -0
  85. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_delete_instance.py +0 -0
  86. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_edge_request.py +0 -0
  87. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_fail_managed_storage.py +0 -0
  88. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_init_instance.py +0 -0
  89. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_init_pass_user_no_writes.py +0 -0
  90. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_login.py +0 -0
  91. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-cloud/test_set_storage.py +0 -0
  92. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-local/README.md +0 -0
  93. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-local/conftest.py +0 -0
  94. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-local/scripts/script-connect-fine-grained-access.py +0 -0
  95. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-local/test_all.py +0 -0
  96. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-local/test_update_schema_in_hub.py +0 -0
  97. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-prod/conftest.py +0 -0
  98. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-prod/test_aws_options_manager.py +0 -0
  99. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-prod/test_django.py +0 -0
  100. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-prod/test_global_settings.py +0 -0
  101. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-prod/test_migrate.py +0 -0
  102. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-prod/test_switch_and_fallback_env.py +0 -0
  103. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/hub-prod/test_upath.py +0 -0
  104. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/conftest.py +0 -0
  105. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_db_import_export.py +0 -0
  106. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_entry_point.py +0 -0
  107. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_hashing.py +0 -0
  108. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_storage_access.py +0 -0
  109. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_storage_basis.py +0 -0
  110. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_storage_settings.py +0 -0
  111. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_storage_stats.py +0 -0
  112. {lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/tests/storage/test_to_url.py +0 -0
{lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/.gitignore
@@ -114,4 +114,4 @@ _docs_tmp*
  storage_uid.txt
  test.ipynb
  test2.ipynb
- lamindb_export
+ *_export
{lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/PKG-INFO
@@ -1,10 +1,11 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: lamindb_setup
- Version: 1.15.0
+ Version: 1.15.1
  Summary: Setup & configure LaminDB.
  Author-email: Lamin Labs <open-source@lamin.ai>
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
+ License-File: LICENSE
  Requires-Dist: lamin_utils>=0.3.3
  Requires-Dist: django>=5.2,<5.3
  Requires-Dist: dj_database_url>=1.3.0,<3.0.0
{lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/__init__.py
@@ -35,7 +35,7 @@ Migration management

  """

- __version__ = "1.15.0"  # denote a release candidate for 0.1.0 with 0.1rc1
+ __version__ = "1.15.1"  # denote a release candidate for 0.1.0 with 0.1rc1

  import os
  import warnings
{lamindb_setup-1.15.0 → lamindb_setup-1.15.1}/lamindb_setup/core/_aws_options.py
@@ -20,6 +20,7 @@ lamin_env = os.getenv("LAMIN_ENV")
  if lamin_env is None or lamin_env == "prod":
      HOSTED_BUCKETS = tuple([f"s3://lamin-{region}" for region in HOSTED_REGIONS])
  else:
+     logger.warning("loaded LAMIN_ENV: staging")
      HOSTED_BUCKETS = ("s3://lamin-hosted-test",)  # type: ignore

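For context, a minimal sketch of how the staging branch above could be triggered; it assumes the environment variable is set before `lamindb_setup.core._aws_options` is first imported, and the direct module import is an illustration rather than documented API:

    import os

    os.environ["LAMIN_ENV"] = "staging"  # any value other than unset or "prod"

    # importing the module evaluates the branch above, logs
    # "loaded LAMIN_ENV: staging", and selects the test bucket
    from lamindb_setup.core import _aws_options

    print(_aws_options.HOSTED_BUCKETS)  # ('s3://lamin-hosted-test',)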
 
lamindb_setup-1.15.1/lamindb_setup/io.py (new file)
@@ -0,0 +1,354 @@
+ from __future__ import annotations
+
+ import io
+ import json
+ import warnings
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from importlib import import_module
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ import pandas as pd
+ from django.db import models, transaction
+ from rich.progress import Progress
+
+ if TYPE_CHECKING:
+     from collections.abc import Sequence
+     from typing import Literal
+
+
+ def _get_registries(module_name: str) -> list[str]:
+     """Get registry class names from a module."""
+     schema_module = import_module(module_name)
+     exclude = {"SQLRecord", "BaseSQLRecord"}
+
+     if module_name == "lamindb":
+         module_filter = lambda cls, name: cls.__module__.startswith(
+             f"{module_name}.models."
+         ) and name in dir(schema_module)
+     else:
+         module_filter = (
+             lambda cls, name: cls.__module__ == f"{module_name}.models"
+             and name in dir(schema_module)
+         )
+
+     return [
+         name
+         for name in dir(schema_module.models)
+         if (
+             name[0].isupper()
+             and isinstance(cls := getattr(schema_module.models, name, None), type)
+             and issubclass(cls, models.Model)
+             and module_filter(cls, name)
+             and name not in exclude
+         )
+     ]
+
+
+ def _export_full_table(
+     registry_info: tuple[str, str, str | None],
+     directory: Path,
+     chunk_size: int,
+ ) -> list[tuple[str, Path]] | str:
+     """Export a registry table to parquet.
+
+     For PostgreSQL, uses COPY TO which streams the table directly to CSV format,
+     bypassing query planner overhead and row-by-row conversion (10-50x faster than SELECT).
+
+     For SQLite with large tables, reads in chunks to avoid memory issues when tables exceed available RAM.
+
+     Args:
+         registry_info: Tuple of (module_name, model_name, field_name) where field_name
+             is None for regular tables or the field name for M2M link tables.
+         directory: Output directory for parquet files.
+         chunk_size: Maximum rows per chunk for SQLite large tables.
+
+     Returns:
+         String identifier for single-file exports, or list of (table_name, chunk_path) tuples for chunked exports that need merging.
+     """
+     from django.db import connection
+
+     import lamindb_setup as ln_setup
+
+     module_name, model_name, field_name = registry_info
+     schema_module = import_module(module_name)
+     registry = getattr(schema_module, model_name)
+
+     if field_name:
+         registry = getattr(registry, field_name).through
+
+     table_name = registry._meta.db_table
+
+     try:
+         if ln_setup.settings.instance.dialect == "postgresql":
+             buffer = io.StringIO()
+             with connection.cursor() as cursor:
+                 cursor.copy_expert(
+                     f'COPY "{table_name}" TO STDOUT WITH (FORMAT CSV, HEADER TRUE)',
+                     buffer,
+                 )
+             buffer.seek(0)
+             df = pd.read_csv(buffer)
+             df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+             return (
+                 f"{module_name}.{model_name}.{field_name}"
+                 if field_name
+                 else f"{module_name}.{model_name}"
+             )
+         else:
+             with warnings.catch_warnings():
+                 warnings.filterwarnings(
+                     "ignore", message="Skipped unsupported reflection"
+                 )
+                 row_count = pd.read_sql(
+                     f"SELECT COUNT(*) as count FROM {table_name}",
+                     ln_setup.settings.instance.db,
+                 ).iloc[0]["count"]
+
+                 if row_count > chunk_size:
+                     chunk_files = []
+                     num_chunks = (row_count + chunk_size - 1) // chunk_size
+                     for chunk_id in range(num_chunks):
+                         offset = chunk_id * chunk_size
+                         df = pd.read_sql(
+                             f"SELECT * FROM {table_name} LIMIT {chunk_size} OFFSET {offset}",
+                             ln_setup.settings.instance.db,
+                         )
+                         chunk_file = (
+                             directory / f"{table_name}_chunk_{chunk_id}.parquet"
+                         )
+                         df.to_parquet(chunk_file, compression=None)
+                         chunk_files.append((table_name, chunk_file))
+                     return chunk_files
+                 else:
+                     df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
+                     df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+                     return (
+                         f"{module_name}.{model_name}.{field_name}"
+                         if field_name
+                         else f"{module_name}.{model_name}"
+                     )
+     except (ValueError, pd.errors.DatabaseError):
+         raise ValueError(
+             f"Table '{table_name}' was not found. The instance might need to be migrated."
+         ) from None
+
+
+ def export_db(
+     module_names: Sequence[str] | None = None,
+     *,
+     output_dir: str | Path = "./lamindb_export/",
+     max_workers: int = 8,
+     chunk_size: int = 500_000,
+ ) -> None:
+     """Export registry tables and many-to-many link tables to parquet files.
+
+     Ensure that you connect to postgres instances using `use_root_db_user=True`.
+
+     Args:
+         module_names: Module names to export (e.g., ["lamindb", "bionty", "wetlab"]).
+             Defaults to "lamindb" if not provided.
+         output_dir: Directory path for exported parquet files.
+         max_workers: Number of parallel processes.
+         chunk_size: Number of rows per chunk for large tables.
+     """
+     directory = Path(output_dir)
+     directory.mkdir(parents=True, exist_ok=True)
+
+     module_names = module_names or ["lamindb"]
+     modules = {name: _get_registries(name) for name in module_names}
+
+     tasks = []
+     for module_name, model_names in modules.items():
+         schema_module = import_module(module_name)
+         for model_name in model_names:
+             registry = getattr(schema_module, model_name)
+             tasks.append((module_name, model_name, None))
+             for field in registry._meta.many_to_many:
+                 tasks.append((module_name, model_name, field.name))
+
+     chunk_files_by_table: dict[str, list[Path]] = {}
+
+     with Progress() as progress:
+         task_id = progress.add_task("Exporting", total=len(tasks))
+         with ProcessPoolExecutor(max_workers=max_workers) as executor:
+             futures = {
+                 executor.submit(_export_full_table, task, directory, chunk_size): task
+                 for task in tasks
+             }
+
+             for future in as_completed(futures):
+                 result = future.result()
+                 if isinstance(result, list):
+                     for table_name, chunk_file in result:
+                         chunk_files_by_table.setdefault(table_name, []).append(
+                             chunk_file
+                         )
+                 progress.advance(task_id)
+
+     for table_name, chunk_files in chunk_files_by_table.items():
+         merged_df = pd.concat([pd.read_parquet(f) for f in sorted(chunk_files)])
+         merged_df.to_parquet(directory / f"{table_name}.parquet", compression=None)
+         for chunk_file in chunk_files:
+             chunk_file.unlink()
+
+
+ def _serialize_value(val):
+     """Convert value to JSON string if it's a dict, list, or numpy array, otherwise return as-is."""
+     if isinstance(val, (dict, list, np.ndarray)):
+         return json.dumps(
+             val, default=lambda o: o.tolist() if isinstance(o, np.ndarray) else None
+         )
+     return val
+
+
+ def _import_registry(
+     registry: type[models.Model],
+     directory: Path,
+     if_exists: Literal["fail", "replace", "append"] = "replace",
+ ) -> None:
+     """Import a single registry table from parquet.
+
+     For PostgreSQL, uses COPY FROM which bypasses SQL parsing and writes directly to
+     table pages (20-50x faster than multi-row INSERTs).
+
+     For SQLite, uses multi-row INSERTs with dynamic chunking to stay under the 999
+     variable limit (2-5x faster than single-row INSERTs).
+     """
+     from django.db import connection
+
+     table_name = registry._meta.db_table
+     parquet_file = directory / f"{table_name}.parquet"
+
+     if not parquet_file.exists():
+         print(f"Skipped {table_name} (file not found)")
+         return
+
+     df = pd.read_parquet(parquet_file)
+
+     old_foreign_key_columns = [col for col in df.columns if col.endswith("_old")]
+     if old_foreign_key_columns:
+         df = df.drop(columns=old_foreign_key_columns)
+
+     for col in df.columns:
+         if df[col].dtype == "object":
+             mask = df[col].apply(lambda x: isinstance(x, (dict, list, np.ndarray)))
+             if mask.any():
+                 df.loc[mask, col] = df.loc[mask, col].map(_serialize_value)
+
+     if df.empty:
+         return
+
+     if connection.vendor == "postgresql":
+         columns = df.columns.tolist()
+         column_names = ", ".join(f'"{col}"' for col in columns)
+
+         buffer = io.StringIO()
+         df.to_csv(buffer, index=False, header=False, sep="\t", na_rep="\\N")
+         buffer.seek(0)
+
+         with connection.cursor() as cursor:
+             if if_exists == "replace":
+                 cursor.execute(f'DELETE FROM "{table_name}"')
+             elif if_exists == "fail":
+                 cursor.execute(f'SELECT COUNT(*) FROM "{table_name}"')
+                 if cursor.fetchone()[0] > 0:
+                     raise ValueError(f"Table {table_name} already contains data")
+
+             cursor.copy_expert(
+                 f"COPY \"{table_name}\" ({column_names}) FROM STDIN WITH (FORMAT CSV, DELIMITER E'\\t', NULL '\\N')",
+                 buffer,
+             )
+     else:
+         num_cols = len(df.columns)
+         max_vars = 900  # SQLite has a limit of 999 variables per statement
+         chunksize = max(1, max_vars // num_cols)
+
+         df.to_sql(
+             table_name,
+             connection.connection,
+             if_exists=if_exists,
+             index=False,
+             method="multi",
+             chunksize=chunksize,
+         )
+
+
+ def import_db(
+     module_names: Sequence[str] | None = None,
+     *,
+     input_dir: str | Path = "./lamindb_export/",
+     if_exists: Literal["fail", "replace", "append"] = "replace",
+ ) -> None:
+     """Import registry and link tables from parquet files.
+
+     Temporarily disables FK constraints to allow insertion in arbitrary order.
+     Requires superuser/RDS admin privileges for postgres databases.
+
+     Args:
+         input_dir: Directory containing parquet files to import.
+         module_names: Module names to import (e.g., ["lamindb", "bionty", "wetlab"]).
+         if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
+     """
+     from django.db import connection
+
+     import lamindb_setup as ln_setup
+
+     directory = Path(input_dir)
+
+     if not directory.exists():
+         raise ValueError(f"Directory does not exist: {directory}")
+
+     if module_names is None:
+         parquet_files = list(directory.glob("*.parquet"))
+         detected_modules = {
+             f.name.split("_")[0] for f in parquet_files if "_" in f.name
+         }
+         module_names = sorted(detected_modules)
+
+     modules = {name: _get_registries(name) for name in module_names}
+     total_models = sum(len(models) for models in modules.values())
+
+     is_sqlite = ln_setup.settings.instance.dialect == "sqlite"
+
+     try:
+         with connection.cursor() as cursor:
+             if ln_setup.settings.instance.dialect == "postgresql":
+                 cursor.execute("SET session_replication_role = 'replica'")
+             elif is_sqlite:
+                 cursor.execute("PRAGMA foreign_keys = OFF")
+                 # Disables fsync - OS buffers writes to disk, 10-50x faster but can corrupt DB on crash
+                 cursor.execute("PRAGMA synchronous = OFF")
+                 # Keeps rollback journal in RAM - 2-5x faster but cannot rollback on crash
+                 cursor.execute("PRAGMA journal_mode = MEMORY")
+                 # 64MB page cache for better performance on large imports
+                 cursor.execute("PRAGMA cache_size = -64000")
+
+         with transaction.atomic():
+             if ln_setup.settings.instance.dialect == "postgresql":
+                 with connection.cursor() as cursor:
+                     cursor.execute("SET CONSTRAINTS ALL DEFERRED")
+
+             with Progress() as progress:
+                 task = progress.add_task("Importing", total=total_models)
+                 for module_name, model_names in modules.items():
+                     schema_module = import_module(module_name)
+                     for model_name in model_names:
+                         progress.update(
+                             task, description=f"[cyan]{module_name}.{model_name}"
+                         )
+                         registry = getattr(schema_module, model_name)
+                         _import_registry(registry, directory, if_exists=if_exists)
+                         for field in registry._meta.many_to_many:
+                             link_orm = getattr(registry, field.name).through
+                             _import_registry(link_orm, directory, if_exists=if_exists)
+                         progress.advance(task)
+     finally:
+         with connection.cursor() as cursor:
+             if ln_setup.settings.instance.dialect == "postgresql":
+                 cursor.execute("SET session_replication_role = 'origin'")
+             elif is_sqlite:
+                 cursor.execute("PRAGMA synchronous = FULL")
+                 cursor.execute("PRAGMA journal_mode = DELETE")
+                 cursor.execute("PRAGMA foreign_keys = ON")
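A hedged usage sketch based on the signatures above: the instance slugs are hypothetical, `use_root_db_user=True` follows the export_db docstring, and export and import would typically run in separate sessions connected to the source and target instance respectively.

    import lamindb_setup as ln_setup
    from lamindb_setup.io import export_db, import_db

    # export registries plus many-to-many link tables to parquet
    ln_setup.connect("my-org/source-instance", use_root_db_user=True)  # hypothetical slug
    export_db(["lamindb", "bionty"], output_dir="./my_export/", max_workers=8)

    # in another session, import into a freshly migrated target instance
    ln_setup.connect("my-org/target-instance", use_root_db_user=True)  # hypothetical slug
    import_db(["lamindb", "bionty"], input_dir="./my_export/", if_exists="replace")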
lamindb_setup-1.15.0/lamindb_setup/io.py (removed)
@@ -1,194 +0,0 @@
- from __future__ import annotations
-
- import json
- import warnings
- from importlib import import_module
- from pathlib import Path
- from typing import TYPE_CHECKING
-
- import pandas as pd
- from django.db import models, transaction
- from rich.progress import Progress
-
- if TYPE_CHECKING:
-     from collections.abc import Sequence
-     from typing import Literal
-
-
- def _get_registries(module_name: str) -> list[str]:
-     """Get registry class names from a module."""
-     schema_module = import_module(module_name)
-     exclude = {"SQLRecord", "BaseSQLRecord"}
-
-     if module_name == "lamindb":
-         module_filter = lambda cls, name: cls.__module__.startswith(
-             f"{module_name}.models."
-         ) and name in dir(schema_module)
-     else:
-         module_filter = (
-             lambda cls, name: cls.__module__ == f"{module_name}.models"
-             and name in dir(schema_module)
-         )
-
-     return [
-         name
-         for name in dir(schema_module.models)
-         if (
-             name[0].isupper()
-             and isinstance(cls := getattr(schema_module.models, name, None), type)
-             and issubclass(cls, models.Model)
-             and module_filter(cls, name)
-             and name not in exclude
-         )
-     ]
-
-
- def _export_registry_to_parquet(registry: type[models.Model], directory: Path) -> None:
-     """Export a single registry table to parquet."""
-     import lamindb_setup as ln_setup
-
-     table_name = registry._meta.db_table
-     with warnings.catch_warnings():
-         warnings.filterwarnings("ignore", message="Skipped unsupported reflection")
-         df = pd.read_sql_table(table_name, ln_setup.settings.instance.db)
-     df.to_parquet(directory / f"{table_name}.parquet", compression=None)
-
-
- def export_db(
-     module_names: Sequence[str] | None = None,
-     *,
-     output_dir: str | Path = "./lamindb_export/",
- ) -> None:
-     """Export registry tables and many-to-many link tables to parquet files.
-
-     Ensure that you connect to postgres instances using `use_root_db_user=True`.
-
-     Args:
-         module_names: Module names to export (e.g., ["lamindb", "bionty", "wetlab"]).
-             Defaults to "lamindb" if not provided.
-         output_dir: Directory path for exported parquet files.
-     """
-     directory = Path(output_dir)
-     directory.mkdir(parents=True, exist_ok=True)
-
-     module_names = module_names or ["lamindb"]
-     modules = {name: _get_registries(name) for name in module_names}
-     total_models = sum(len(models) for models in modules.values())
-
-     with Progress() as progress:
-         task = progress.add_task("Exporting", total=total_models)
-         for module_name, model_names in modules.items():
-             schema_module = import_module(module_name)
-             for model_name in model_names:
-                 progress.update(task, description=f"[cyan]{module_name}.{model_name}")
-                 registry = getattr(schema_module, model_name)
-                 _export_registry_to_parquet(registry, directory)
-                 for field in registry._meta.many_to_many:
-                     link_orm = getattr(registry, field.name).through
-                     _export_registry_to_parquet(link_orm, directory)
-                 progress.advance(task)
-
-
- def _import_registry(
-     registry: type[models.Model],
-     directory: Path,
-     if_exists: Literal["fail", "replace", "append"] = "replace",
- ) -> None:
-     """Import a single registry table from parquet.
-
-     Uses raw SQL export instead of django to later circumvent FK constraints.
-     """
-     table_name = registry._meta.db_table
-     parquet_file = directory / f"{table_name}.parquet"
-
-     if not parquet_file.exists():
-         print(f"Skipped {table_name} (file not found)")
-         return
-
-     df = pd.read_parquet(parquet_file)
-
-     old_foreign_key_columns = [col for col in df.columns if col.endswith("_old")]
-     if old_foreign_key_columns:
-         df = df.drop(columns=old_foreign_key_columns)
-
-     for col in df.columns:
-         if df[col].dtype == "object":
-             df[col] = df[col].apply(
-                 lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x
-             )
-
-     from django.db import connection
-
-     df.to_sql(table_name, connection.connection, if_exists=if_exists, index=False)
-
-
- def import_db(
-     module_names: Sequence[str] | None = None,
-     *,
-     input_dir: str | Path = "./lamindb_export/",
-     if_exists: Literal["fail", "replace", "append"] = "replace",
- ) -> None:
-     """Import registry and link tables from parquet files.
-
-     Temporarily disables FK constraints to allow insertion in arbitrary order.
-     Requires superuser/RDS admin privileges for postgres databases.
-
-     Args:
-         input_dir: Directory containing parquet files to import.
-         module_names: Module names to import (e.g., ["lamindb", "bionty", "wetlab"]).
-         if_exists: How to behave if table exists: 'fail', 'replace', or 'append'.
-     """
-     from django.db import connection
-
-     import lamindb_setup as ln_setup
-
-     directory = Path(input_dir)
-
-     if not directory.exists():
-         raise ValueError(f"Directory does not exist: {directory}")
-
-     if module_names is None:
-         parquet_files = list(directory.glob("*.parquet"))
-         detected_modules = {
-             f.name.split("_")[0] for f in parquet_files if "_" in f.name
-         }
-         module_names = sorted(detected_modules)
-
-     modules = {name: _get_registries(name) for name in module_names}
-     total_models = sum(len(models) for models in modules.values())
-
-     # Disable FK constraints to allow insertion in arbitrary order
-     if ln_setup.settings.instance.dialect == "sqlite":
-         with connection.cursor() as cursor:
-             if ln_setup.settings.instance.dialect == "postgresql":
-                 cursor.execute("SET session_replication_role = 'replica'")
-             elif ln_setup.settings.instance.dialect == "sqlite":
-                 cursor.execute("PRAGMA foreign_keys = OFF")
-
-     with transaction.atomic():
-         if ln_setup.settings.instance.dialect == "postgresql":
-             with connection.cursor() as cursor:
-                 cursor.execute("SET CONSTRAINTS ALL DEFERRED")
-
-         with Progress() as progress:
-             task = progress.add_task("Importing", total=total_models)
-             for module_name, model_names in modules.items():
-                 schema_module = import_module(module_name)
-                 for model_name in model_names:
-                     progress.update(
-                         task, description=f"[cyan]{module_name}.{model_name}"
-                     )
-                     registry = getattr(schema_module, model_name)
-                     _import_registry(registry, directory, if_exists=if_exists)
-                     for field in registry._meta.many_to_many:
-                         link_orm = getattr(registry, field.name).through
-                         _import_registry(link_orm, directory, if_exists=if_exists)
-                     progress.advance(task)
-
-     # Re-enable FK constraints again
-     if ln_setup.settings.instance.dialect == "sqlite":
-         with connection.cursor() as cursor:
-             if ln_setup.settings.instance.dialect == "postgresql":
-                 cursor.execute("SET session_replication_role = 'origin'")
-             elif ln_setup.settings.instance.dialect == "sqlite":
-                 cursor.execute("PRAGMA foreign_keys = ON")
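Compared with the removed 1.15.0 module, which read each table with a single `pd.read_sql_table` call and wrote it back with a plain `df.to_sql`, the 1.15.1 version above parallelizes exports and chunks large tables. A small illustration of that chunking arithmetic, with made-up numbers that are not part of the package:

    # ceiling-division chunking as in the new _export_full_table; values are hypothetical
    row_count, chunk_size = 1_200_000, 500_000
    num_chunks = (row_count + chunk_size - 1) // chunk_size  # -> 3
    offsets = [i * chunk_size for i in range(num_chunks)]    # [0, 500000, 1000000]

    # SQLite insert chunking as in the new _import_registry: stay under 999 bind variables
    num_cols = 12                                            # hypothetical column count
    rows_per_insert = max(1, 900 // num_cols)                # -> 75
    print(num_chunks, offsets, rows_per_insert)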