lamindb_setup 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb_setup/__init__.py CHANGED
@@ -33,10 +33,12 @@ Modules & settings:
33
33
 
34
34
  """
35
35
 
36
- __version__ = "1.0.2" # denote a release candidate for 0.1.0 with 0.1rc1
36
+ __version__ = "1.1.0" # denote a release candidate for 0.1.0 with 0.1rc1
37
37
 
38
38
  import os
39
39
 
40
+ from packaging import version as packaging_version
41
+
40
42
  from . import core
41
43
  from ._check_setup import _check_instance_setup
42
44
  from ._close import close
@@ -50,6 +52,20 @@ from ._register_instance import register
50
52
  from ._setup_user import login, logout
51
53
  from .core._settings import settings
52
54
 
55
+ # check that the version of s3fs is higher than the lower bound
56
+ # needed because spatialdata installs old versions of s3fs
57
+ try:
58
+ from s3fs import __version__ as s3fs_version
59
+
60
+ if packaging_version.parse(s3fs_version) < packaging_version.parse("2023.12.2"):
61
+ raise RuntimeError(
62
+ f"The version of s3fs you have ({s3fs_version}) is impompatible "
63
+ "with lamindb, please upgrade it: pip install s3fs>=2023.12.2"
64
+ )
65
+ except ImportError:
66
+ # s3fs might not be installed
67
+ pass
68
+
53
69
 
54
70
  def _is_CI_environment() -> bool:
55
71
  ci_env_vars = [
@@ -2,13 +2,14 @@ from __future__ import annotations
2
2
 
3
3
  import functools
4
4
  import importlib as il
5
+ import inspect
5
6
  import os
6
7
  from typing import TYPE_CHECKING
7
8
 
8
9
  from lamin_utils import logger
9
10
 
10
11
  from ._silence_loggers import silence_loggers
11
- from .core import django
12
+ from .core import django as django_lamin
12
13
  from .core._settings import settings
13
14
  from .core._settings_store import current_instance_settings_file
14
15
  from .core.exceptions import DefaultMessageException
@@ -33,8 +34,12 @@ CURRENT_ISETTINGS: InstanceSettings | None = None
33
34
  IS_LOADING: bool = False
34
35
 
35
36
 
37
+ class ModuleWasntConfigured(SystemExit):
38
+ pass
39
+
40
+
36
41
  # decorator to disable auto-connect when importing a module such as lamindb
37
- def _loading(func: Callable):
42
+ def disable_auto_connect(func: Callable):
38
43
  @functools.wraps(func)
39
44
  def wrapper(*args, **kwargs):
40
45
  global IS_LOADING
@@ -70,14 +75,66 @@ def _get_current_instance_settings() -> InstanceSettings | None:
70
75
  return None
71
76
 
72
77
 
78
+ def _normalize_module_name(module_name: str) -> str:
79
+ return module_name.replace("lnschema_", "").replace("_", "-")
80
+
81
+
82
+ # checks that the provided module is in the modules of the provided instance
83
+ # or in the apps set up by django
84
+ def _check_module_in_instance_modules(
85
+ module: str, isettings: InstanceSettings | None = None
86
+ ) -> None:
87
+ not_in_instance_msg = (
88
+ f"'{module}' is missing from this instance. "
89
+ "Please go to your instance settings page and add it under 'schema modules'."
90
+ )
91
+
92
+ if isettings is not None:
93
+ modules_raw = isettings.modules
94
+ modules = set(modules_raw).union(
95
+ _normalize_module_name(module) for module in modules_raw
96
+ )
97
+ if _normalize_module_name(module) not in modules and module not in modules:
98
+ raise ModuleWasntConfigured(not_in_instance_msg)
99
+ else:
100
+ return
101
+
102
+ from django.apps import apps
103
+
104
+ for app in apps.get_app_configs():
105
+ # app.name is always unnormalized module (python package) name
106
+ if module == app.name or module == _normalize_module_name(app.name):
107
+ return
108
+ raise ModuleWasntConfigured(not_in_instance_msg)
109
+
110
+
111
+ # infer the name of the module that calls this function
112
+ def _infer_callers_module_name() -> str | None:
113
+ stack = inspect.stack()
114
+ if len(stack) < 3:
115
+ return None
116
+ module = inspect.getmodule(stack[2][0])
117
+ return module.__name__.partition(".")[0] if module is not None else None
118
+
119
+
73
120
  # we make this a private function because in all the places it's used,
74
121
  # users should not see it
75
122
  def _check_instance_setup(from_module: str | None = None) -> bool:
76
- if django.IS_SETUP:
123
+ if django_lamin.IS_SETUP:
77
124
  # reload logic here because module might not yet have been imported
78
125
  # upon first setup
79
- if from_module is not None and from_module != "lamindb":
80
- il.reload(il.import_module(from_module))
126
+ if from_module is not None:
127
+ if from_module != "lamindb":
128
+ _check_module_in_instance_modules(from_module)
129
+ il.reload(il.import_module(from_module))
130
+ else:
131
+ infer_module = _infer_callers_module_name()
132
+ if infer_module is not None and infer_module not in {
133
+ "lamindb",
134
+ "lamindb_setup",
135
+ "lamin_cli",
136
+ }:
137
+ _check_module_in_instance_modules(infer_module)
81
138
  return True
82
139
  silence_loggers()
83
140
  if os.environ.get("LAMINDB_MULTI_INSTANCE") == "true":
@@ -91,17 +148,19 @@ def _check_instance_setup(from_module: str | None = None) -> bool:
91
148
  if (
92
149
  from_module is not None
93
150
  and settings.auto_connect
94
- and not django.IS_SETUP
151
+ and not django_lamin.IS_SETUP
95
152
  and not IS_LOADING
96
153
  ):
97
- if not from_module == "lamindb":
154
+ if from_module != "lamindb":
155
+ _check_module_in_instance_modules(from_module, isettings)
156
+
98
157
  import lamindb
99
158
 
100
159
  il.reload(il.import_module(from_module))
101
160
  else:
102
- django.setup_django(isettings)
161
+ django_lamin.setup_django(isettings)
103
162
  logger.important(f"connected lamindb: {isettings.slug}")
104
- return django.IS_SETUP
163
+ return django_lamin.IS_SETUP
105
164
  else:
106
165
  if from_module is not None and settings.auto_connect:
107
166
  logger.warning(InstanceNotSetupError.default_message)
@@ -271,6 +271,7 @@ def connect(instance: str | None = None, **kwargs) -> str | tuple | None:
271
271
  settings_dir / f"no_lnschema_core-{isettings.slug.replace('/', '--')}"
272
272
  )
273
273
  if not no_lnschema_core_file.exists():
274
+ # sqlite file for cloud sqlite instances is already updated here
274
275
  migrate_lnschema_core(
275
276
  isettings, no_lnschema_core_file, write_file=_write_settings
276
277
  )
@@ -311,7 +312,8 @@ def connect(instance: str | None = None, **kwargs) -> str | tuple | None:
311
312
  load_from_isettings(isettings, user=_user, write_settings=_write_settings)
312
313
  if _reload_lamindb:
313
314
  importlib.reload(importlib.import_module("lamindb"))
314
- logger.important(f"connected lamindb: {isettings.slug}")
315
+ else:
316
+ logger.important(f"connected lamindb: {isettings.slug}")
315
317
  except Exception as e:
316
318
  if isettings is not None:
317
319
  if _write_settings:
@@ -339,24 +341,11 @@ def migrate_lnschema_core(
339
341
  """Migrate lnschema_core tables to lamindb tables."""
340
342
  from urllib.parse import urlparse
341
343
 
342
- if isettings.is_on_hub:
343
- from lamindb_setup.core._hub_client import call_with_fallback_auth
344
- from lamindb_setup.core._hub_crud import (
345
- select_collaborator,
346
- )
347
-
348
- # double check that user is an admin, otherwise will fail below
349
- # due to insufficient SQL permissions with cryptic error
350
- collaborator = call_with_fallback_auth(
351
- select_collaborator,
352
- instance_id=settings.instance._id,
353
- account_id=settings.user._uuid,
354
- )
355
- if collaborator is None or collaborator["role"] != "admin":
356
- raise SystemExit(
357
- "❌ Only admins can deploy migrations, please ensure that you're an"
358
- f" admin: https://lamin.ai/{settings.instance.slug}/settings"
359
- )
344
+ # we need to do this because the sqlite file should be already synced
345
+ # has no effect if not cloud sqlite
346
+ # errors if the sqlite file is not in the cloud and doesn't exist locally
347
+ # isettings.db syncs but doesn't error in this case due to error_no_origin=False
348
+ isettings._update_local_sqlite_file()
360
349
 
361
350
  parsed_uri = urlparse(isettings.db)
362
351
  db_type = parsed_uri.scheme
@@ -413,6 +402,24 @@ def migrate_lnschema_core(
413
402
  if response != "y":
414
403
  print("Aborted.")
415
404
  quit()
405
+ if isettings.is_on_hub:
406
+ from lamindb_setup.core._hub_client import call_with_fallback_auth
407
+ from lamindb_setup.core._hub_crud import (
408
+ select_collaborator,
409
+ )
410
+
411
+ # double check that user is an admin, otherwise will fail below
412
+ # due to insufficient SQL permissions with cryptic error
413
+ collaborator = call_with_fallback_auth(
414
+ select_collaborator,
415
+ instance_id=settings.instance._id,
416
+ account_id=settings.user._uuid,
417
+ )
418
+ if collaborator is None or collaborator["role"] != "admin":
419
+ raise SystemExit(
420
+ "❌ Only admins can deploy migrations, please ensure that you're an"
421
+ f" admin: https://lamin.ai/{settings.instance.slug}/settings"
422
+ )
416
423
  for table in tables_to_rename:
417
424
  if db_type == "sqlite":
418
425
  cur.execute(
lamindb_setup/_delete.py CHANGED
@@ -7,7 +7,7 @@ from uuid import UUID
7
7
  from lamin_utils import logger
8
8
 
9
9
  from ._connect_instance import _connect_instance, get_owner_name_from_identifier
10
- from .core._aws_credentials import HOSTED_BUCKETS
10
+ from .core._aws_options import HOSTED_BUCKETS
11
11
  from .core._hub_core import delete_instance as delete_instance_on_hub
12
12
  from .core._hub_core import get_storage_records_for_instance
13
13
  from .core._settings import settings
@@ -396,10 +396,5 @@ def infer_instance_name(
396
396
  if storage == "create-s3":
397
397
  raise ValueError("pass name to init if storage = 'create-s3'")
398
398
  storage_path = UPath(storage).resolve()
399
- # not sure if name is ever ""
400
- if storage_path.name != "":
401
- name = storage_path.name
402
- else:
403
- # dedicated treatment of bucket names
404
- name = storage_path.drive
399
+ name = storage_path.path.rstrip("/").split("/")[-1]
405
400
  return name.lower()
lamindb_setup/_migrate.py CHANGED
@@ -5,7 +5,7 @@ from django.db.migrations.loader import MigrationLoader
5
5
  from lamin_utils import logger
6
6
  from packaging import version
7
7
 
8
- from ._check_setup import _check_instance_setup, _loading
8
+ from ._check_setup import _check_instance_setup, disable_auto_connect
9
9
  from .core._settings import settings
10
10
  from .core.django import setup_django
11
11
 
@@ -62,7 +62,7 @@ class migrate:
62
62
  """
63
63
 
64
64
  @classmethod
65
- @_loading
65
+ @disable_auto_connect
66
66
  def create(cls) -> None:
67
67
  """Create a migration."""
68
68
  if _check_instance_setup():
@@ -70,7 +70,7 @@ class migrate:
70
70
  setup_django(settings.instance, create_migrations=True)
71
71
 
72
72
  @classmethod
73
- @_loading
73
+ @disable_auto_connect
74
74
  def deploy(cls) -> None:
75
75
  """Deploy a migration."""
76
76
  from ._schema_metadata import update_schema_in_hub
@@ -115,7 +115,7 @@ class migrate:
115
115
  )
116
116
 
117
117
  @classmethod
118
- @_loading
118
+ @disable_auto_connect
119
119
  def check(cls) -> bool:
120
120
  """Check whether Registry definitions are in sync with migrations."""
121
121
  from django.core.management import call_command
@@ -132,7 +132,7 @@ class migrate:
132
132
  return True
133
133
 
134
134
  @classmethod
135
- @_loading
135
+ @disable_auto_connect
136
136
  def squash(
137
137
  cls, package_name, migration_nr, start_migration_nr: str | None = None
138
138
  ) -> None:
@@ -148,7 +148,7 @@ class migrate:
148
148
  call_command("squashmigrations", package_name, migration_nr)
149
149
 
150
150
  @classmethod
151
- @_loading
151
+ @disable_auto_connect
152
152
  def show(cls) -> None:
153
153
  """Show migrations."""
154
154
  from django.core.management import call_command
@@ -4,7 +4,7 @@ import os
4
4
  import time
5
5
 
6
6
  from lamin_utils import logger
7
- from upath.implementations.cloud import S3Path
7
+ from upath import UPath
8
8
 
9
9
  HOSTED_REGIONS = [
10
10
  "eu-central-1",
@@ -25,22 +25,29 @@ def _keep_trailing_slash(path_str: str):
25
25
  return path_str if path_str[-1] == "/" else path_str + "/"
26
26
 
27
27
 
28
- AWS_CREDENTIALS_EXPIRATION = 11 * 60 * 60 # refresh credentials after 11 hours
28
+ AWS_CREDENTIALS_EXPIRATION: int = 11 * 60 * 60 # refresh credentials after 11 hours
29
29
 
30
30
 
31
31
  # set anon=True for these buckets if credentials fail for a public bucket
32
32
  # to be expanded
33
- PUBLIC_BUCKETS = ("cellxgene-data-public",)
33
+ PUBLIC_BUCKETS: tuple[str] = ("cellxgene-data-public",)
34
34
 
35
35
 
36
- class AWSCredentialsManager:
36
+ # s3-compatible endpoints managed by lamin
37
+ # None means the standard aws s3 endpoint
38
+ LAMIN_ENDPOINTS: tuple[str | None] = (None,)
39
+
40
+
41
+ class AWSOptionsManager:
37
42
  def __init__(self):
38
43
  self._credentials_cache = {}
39
44
 
40
45
  from s3fs import S3FileSystem
41
46
 
42
47
  # this is cached so will be reused with the connection initialized
43
- fs = S3FileSystem(cache_regions=True)
48
+ fs = S3FileSystem(
49
+ cache_regions=True, use_listings_cache=True, version_aware=False
50
+ )
44
51
  try:
45
52
  fs.connect()
46
53
  self.anon: bool = fs.session._credentials is None
@@ -83,7 +90,7 @@ class AWSCredentialsManager:
83
90
  def _get_cached_credentials(self, root: str) -> dict:
84
91
  return self._credentials_cache[root]["credentials"]
85
92
 
86
- def _path_inject_options(self, path: S3Path, credentials: dict) -> S3Path:
93
+ def _path_inject_options(self, path: UPath, credentials: dict) -> UPath:
87
94
  if credentials == {}:
88
95
  # credentials were specified manually for the path
89
96
  if "anon" in path.storage_options:
@@ -99,13 +106,27 @@ class AWSCredentialsManager:
99
106
  connection_options = credentials
100
107
 
101
108
  if "cache_regions" in path.storage_options:
102
- cache_regions = path.storage_options["cache_regions"]
109
+ connection_options["cache_regions"] = path.storage_options["cache_regions"]
103
110
  else:
104
- cache_regions = True
105
-
106
- return S3Path(path, cache_regions=cache_regions, **connection_options)
107
-
108
- def enrich_path(self, path: S3Path, access_token: str | None = None) -> S3Path:
111
+ connection_options["cache_regions"] = (
112
+ path.storage_options.get("endpoint_url", None) is None
113
+ )
114
+ # we use cache to avoid some unneeded downloads or credential problems
115
+ # see in upload_from
116
+ connection_options["use_listings_cache"] = path.storage_options.get(
117
+ "use_listings_cache", True
118
+ )
119
+ # normally we want to ignore object versions in a versioned bucket
120
+ connection_options["version_aware"] = path.storage_options.get(
121
+ "version_aware", False
122
+ )
123
+
124
+ return UPath(path, **connection_options)
125
+
126
+ def enrich_path(self, path: UPath, access_token: str | None = None) -> UPath:
127
+ # ignore paths with non-lamin-managed endpoints
128
+ if path.storage_options.get("endpoint_url", None) not in LAMIN_ENDPOINTS:
129
+ return path
109
130
  # trailing slash is needed to avoid returning incorrect results
110
131
  # with .startswith
111
132
  # for example s3://lamindata-eu should not receive cache for s3://lamindata
@@ -160,13 +181,13 @@ class AWSCredentialsManager:
160
181
  return self._path_inject_options(path, credentials)
161
182
 
162
183
 
163
- _aws_credentials_manager: AWSCredentialsManager | None = None
184
+ _aws_options_manager: AWSOptionsManager | None = None
164
185
 
165
186
 
166
- def get_aws_credentials_manager() -> AWSCredentialsManager:
167
- global _aws_credentials_manager
187
+ def get_aws_options_manager() -> AWSOptionsManager:
188
+ global _aws_options_manager
168
189
 
169
- if _aws_credentials_manager is None:
170
- _aws_credentials_manager = AWSCredentialsManager()
190
+ if _aws_options_manager is None:
191
+ _aws_options_manager = AWSOptionsManager()
171
192
 
172
- return _aws_credentials_manager
193
+ return _aws_options_manager
@@ -66,7 +66,11 @@ def connect_hub(
66
66
  ) -> Client:
67
67
  env = Environment(fallback=fallback_env)
68
68
  if client_options is None:
69
- client_options = ClientOptions(auto_refresh_token=False)
69
+ # function_client_timeout=5 by default
70
+ # increase to avoid rare timeouts for edge functions
71
+ client_options = ClientOptions(
72
+ auto_refresh_token=False, function_client_timeout=10
73
+ )
70
74
  return create_client(env.supabase_api_url, env.supabase_anon_key, client_options)
71
75
 
72
76
 
@@ -311,6 +311,7 @@ def _init_instance(
311
311
  "db_database": db_dsn.db.database,
312
312
  }
313
313
  fields.update(db_fields)
314
+ slug = isettings.slug
314
315
  # I'd like the following to be an upsert, but this seems to violate RLS
315
316
  # Similarly, if we don't specify `returning="minimal"`, we'll violate RLS
316
317
  # we could make this idempotent by catching an error, but this seems dangerous
@@ -318,14 +319,13 @@ def _init_instance(
318
319
  try:
319
320
  client.table("instance").insert(fields, returning="minimal").execute()
320
321
  except APIError:
321
- logger.warning(
322
- f"instance already existed at: https://lamin.ai/{isettings.owner}/{isettings.name}"
323
- )
322
+ logger.warning(f"instance already existed at: https://lamin.ai/{slug}")
324
323
  return None
325
324
  client.table("storage").update(
326
325
  {"instance_id": isettings._id.hex, "is_default": True}
327
326
  ).eq("id", isettings.storage._uuid.hex).execute() # type: ignore
328
- logger.important(f"go to: https://lamin.ai/{isettings.owner}/{isettings.name}")
327
+ if isettings.dialect != "sqlite" and isettings.is_remote:
328
+ logger.important(f"go to: https://lamin.ai/{slug}")
329
329
 
330
330
 
331
331
  def _connect_instance_hub(
@@ -171,7 +171,8 @@ class InstanceSettings:
171
171
  return StorageSettings(record.root)
172
172
  elif not mute_warning:
173
173
  logger.warning(
174
- f"none of the registered local storage locations were found in your environment: {local_records}"
174
+ "none of the registered local storage locations were found:\n "
175
+ + "\n ".join(r.root for r in all_local_records)
175
176
  )
176
177
  logger.important(
177
178
  "please register a new local storage location via `ln.settings.storage_local = local_root_path` and re-load/connect the instance"
@@ -466,13 +467,11 @@ class InstanceSettings:
466
467
  settings._instance_settings = self
467
468
 
468
469
  def _init_db(self):
469
- from lamindb_setup import _check_setup
470
+ from lamindb_setup._check_setup import disable_auto_connect
470
471
 
471
472
  from .django import setup_django
472
473
 
473
- _check_setup.IS_LOADING = True
474
- setup_django(self, init=True)
475
- _check_setup.IS_LOADING = False
474
+ disable_auto_connect(setup_django)(self, init=True)
476
475
 
477
476
  from lamindb.models import Space
478
477
 
@@ -482,8 +481,6 @@ class InstanceSettings:
482
481
  )
483
482
 
484
483
  def _load_db(self) -> tuple[bool, str]:
485
- from lamindb_setup import _check_setup
486
-
487
484
  # Is the database available and initialized as LaminDB?
488
485
  # returns a tuple of status code and message
489
486
  if self.dialect == "sqlite" and not self._sqlite_file.exists():
@@ -494,15 +491,15 @@ class InstanceSettings:
494
491
  f" {legacy_file} to {self._sqlite_file}"
495
492
  )
496
493
  return False, f"SQLite file {self._sqlite_file} does not exist"
497
-
498
- from .django import setup_django
499
-
500
494
  # we need the local sqlite to setup django
501
- self._update_local_sqlite_file(lock_cloud_sqlite=self._is_cloud_sqlite)
495
+ self._update_local_sqlite_file()
502
496
  # setting up django also performs a check for migrations & prints them
503
497
  # as warnings
504
498
  # this should fail, e.g., if the db is not reachable
505
- _check_setup.IS_LOADING = True
506
- setup_django(self)
507
- _check_setup.IS_LOADING = False
499
+ from lamindb_setup._check_setup import disable_auto_connect
500
+
501
+ from .django import setup_django
502
+
503
+ disable_auto_connect(setup_django)(self)
504
+
508
505
  return True, ""
@@ -10,13 +10,13 @@ from typing import TYPE_CHECKING, Any, Literal
10
10
  import fsspec
11
11
  from lamin_utils import logger
12
12
 
13
- from ._aws_credentials import HOSTED_REGIONS, get_aws_credentials_manager
14
- from ._aws_storage import find_closest_aws_region
15
- from .upath import (
16
- LocalPathClasses,
17
- UPath,
18
- create_path,
13
+ from ._aws_options import (
14
+ HOSTED_REGIONS,
15
+ LAMIN_ENDPOINTS,
16
+ get_aws_options_manager,
19
17
  )
18
+ from ._aws_storage import find_closest_aws_region
19
+ from .upath import LocalPathClasses, UPath, _split_path_query, create_path
20
20
 
21
21
  if TYPE_CHECKING:
22
22
  from uuid import UUID
@@ -44,15 +44,27 @@ def get_storage_region(path: UPathStr) -> str | None:
44
44
  from botocore.config import Config
45
45
  from botocore.exceptions import ClientError
46
46
 
47
- # strip the prefix and any suffixes of the bucket name
48
- bucket = path_str.replace("s3://", "").split("/")[0]
47
+ # check for endpoint_url in storage options if upath
48
+ if isinstance(path, UPath):
49
+ endpoint_url = path.storage_options.get("endpoint_url", None)
50
+ else:
51
+ endpoint_url = None
52
+ path_part = path_str.replace("s3://", "")
53
+ # check for endpoint_url in the path string
54
+ if "?" in path_part:
55
+ assert endpoint_url is None
56
+ path_part, query = _split_path_query(path_part)
57
+ endpoint_url = query.get("endpoint_url", [None])[0]
58
+ bucket = path_part.split("/")[0]
49
59
  session = botocore.session.get_session()
50
60
  credentials = session.get_credentials()
51
61
  if credentials is None or credentials.access_key is None:
52
62
  config = Config(signature_version=botocore.session.UNSIGNED)
53
63
  else:
54
64
  config = None
55
- s3_client = session.create_client("s3", config=config)
65
+ s3_client = session.create_client(
66
+ "s3", endpoint_url=endpoint_url, config=config
67
+ )
56
68
  try:
57
69
  response = s3_client.head_bucket(Bucket=bucket)
58
70
  except ClientError as exc:
@@ -62,7 +74,7 @@ def get_storage_region(path: UPathStr) -> str | None:
62
74
  region = (
63
75
  response.get("ResponseMetadata", {})
64
76
  .get("HTTPHeaders", {})
65
- .get("x-amz-bucket-region")
77
+ .get("x-amz-bucket-region", None)
66
78
  )
67
79
  else:
68
80
  region = None
@@ -116,9 +128,9 @@ def init_storage(
116
128
  if region not in HOSTED_REGIONS:
117
129
  raise ValueError(f"region has to be one of {HOSTED_REGIONS}")
118
130
  if lamin_env is None or lamin_env == "prod":
119
- root_str = f"s3://lamin-{region}/{uid}"
131
+ root = f"s3://lamin-{region}/{uid}"
120
132
  else:
121
- root_str = f"s3://lamin-hosted-test/{uid}"
133
+ root = f"s3://lamin-hosted-test/{uid}"
122
134
  elif (input_protocol := fsspec.utils.get_protocol(root_str)) not in VALID_PROTOCOLS:
123
135
  valid_protocols = ("local",) + VALID_PROTOCOLS[1:] # show local instead of file
124
136
  raise ValueError(
@@ -126,7 +138,7 @@ def init_storage(
126
138
  )
127
139
  ssettings = StorageSettings(
128
140
  uid=uid,
129
- root=root_str,
141
+ root=root,
130
142
  region=region,
131
143
  instance_id=instance_id,
132
144
  access_token=access_token,
@@ -155,7 +167,7 @@ def init_storage(
155
167
  mark_storage_root(ssettings.root, ssettings.uid) # type: ignore
156
168
  except Exception:
157
169
  logger.important(
158
- f"due to lack of write access, LaminDB won't manage storage location: {ssettings.root}"
170
+ f"due to lack of write access, LaminDB won't manage storage location: {ssettings.root_as_str}"
159
171
  )
160
172
  # we have to check hub_record_status here because
161
173
  # _select_storage inside init_storage_hub also populates ssettings._uuid
@@ -190,7 +202,6 @@ class StorageSettings:
190
202
  self._root_init = self._root_init.resolve()
191
203
  except Exception:
192
204
  logger.warning(f"unable to create .lamindb folder in {self._root_init}")
193
- pass
194
205
  self._root = None
195
206
  self._instance_id = instance_id
196
207
  # we don't yet infer region here to make init fast
@@ -255,7 +266,7 @@ class StorageSettings:
255
266
  elif getattr(self._root, "protocol", "") == "s3":
256
267
  # this is needed to be sure that the root always has nonexpired credentials
257
268
  # this just checks for time of the cached credentials in most cases
258
- return get_aws_credentials_manager().enrich_path(
269
+ return get_aws_options_manager().enrich_path(
259
270
  self._root, access_token=self.access_token
260
271
  )
261
272
  return self._root
@@ -275,6 +286,12 @@ class StorageSettings:
275
286
  @property
276
287
  def root_as_str(self) -> str:
277
288
  """Formatted root string."""
289
+ # embed endpoint_url into path string for storing and displaying
290
+ if self._root_init.protocol == "s3":
291
+ endpoint_url = self._root_init.storage_options.get("endpoint_url", None)
292
+ # LAMIN_ENDPOINTS include None
293
+ if endpoint_url not in LAMIN_ENDPOINTS:
294
+ return f"s3://{self._root_init.path.rstrip('/')}?endpoint_url={endpoint_url}"
278
295
  return self._root_init.as_posix().rstrip("/")
279
296
 
280
297
  @property
@@ -41,11 +41,6 @@ def b16_to_b64(s: str):
41
41
  return to_b64_str(base64.b16decode(s.strip('"'), casefold=True))
42
42
 
43
43
 
44
- def hash_string(string: str) -> str:
45
- # as we're truncating (not here) at 22 b64, we choose md5 over sha512
46
- return to_b64_str(hashlib.md5(string.encode("utf-8")).digest())
47
-
48
-
49
44
  # a lot to read about this: lamin-notes/2022/hashing
50
45
  def hash_set(s: set[str]) -> str:
51
46
  join_s = ":".join(sorted(s))
@@ -68,7 +63,9 @@ def hash_from_hashes_list(hashes: Iterable[str]) -> str:
68
63
  return to_b64_str(digest)[:HASH_LENGTH]
69
64
 
70
65
 
71
- def hash_code(file_path: UPathStr):
66
+ # below is only used when comparing with git's sha1 hashes
67
+ # we don't use it for our own hashes
68
+ def hash_code(file_path: UPathStr) -> hashlib._Hash:
72
69
  with open(file_path, "rb") as fp:
73
70
  data = fp.read()
74
71
  data_size = len(data)
@@ -77,6 +74,16 @@ def hash_code(file_path: UPathStr):
77
74
  return hashlib.sha1(blob)
78
75
 
79
76
 
77
+ def hash_small_bytes(data: bytes) -> str:
78
+ return to_b64_str(hashlib.md5(data).digest())
79
+
80
+
81
+ # this is equivalent to hash_file for small files
82
+ def hash_string(string: str) -> str:
83
+ # as we're truncating (not here) at 22 b64, we choose md5 over sha512
84
+ return to_b64_str(hashlib.md5(string.encode("utf-8")).digest())[:HASH_LENGTH]
85
+
86
+
80
87
  def hash_file(
81
88
  file_path: Path,
82
89
  file_size: int | None = None,
@@ -11,15 +11,17 @@ from functools import partial
11
11
  from itertools import islice
12
12
  from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
13
13
  from typing import TYPE_CHECKING, Any, Literal
14
+ from urllib.parse import parse_qs, urlsplit
14
15
 
15
16
  import click
16
17
  import fsspec
17
18
  from lamin_utils import logger
18
19
  from upath import UPath
19
- from upath.implementations.cloud import CloudPath # keep CloudPath!
20
+ from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
20
21
  from upath.implementations.local import LocalPath
22
+ from upath.registry import register_implementation
21
23
 
22
- from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
24
+ from ._aws_options import HOSTED_BUCKETS, get_aws_options_manager
23
25
  from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list, hash_string
24
26
 
25
27
  if TYPE_CHECKING:
@@ -360,7 +362,8 @@ def synchronize(
360
362
  print_progress: bool = False,
361
363
  callback: fsspec.callbacks.Callback | None = None,
362
364
  timestamp: float | None = None,
363
- ):
365
+ just_check: bool = False,
366
+ ) -> bool:
364
367
  """Sync to a local destination path."""
365
368
  protocol = self.protocol
366
369
  # optimize the number of network requests
@@ -394,7 +397,7 @@ def synchronize(
394
397
  elif error_no_origin:
395
398
  warn_or_error += "\nIt is not possible to synchronize."
396
399
  raise FileNotFoundError(warn_or_error)
397
- return None
400
+ return False
398
401
 
399
402
  # synchronization logic for directories
400
403
  # to synchronize directories, it should be possible to get modification times
@@ -427,6 +430,9 @@ def synchronize(
427
430
  else:
428
431
  destination_exists = False
429
432
  need_synchronize = True
433
+ # just check if synchronization is needed
434
+ if just_check:
435
+ return need_synchronize
430
436
  if need_synchronize:
431
437
  callback = ProgressCallback.requires_progress(
432
438
  callback, print_progress, objectpath.name, "synchronizing"
@@ -456,13 +462,14 @@ def synchronize(
456
462
  parent = file.parent
457
463
  if next(parent.iterdir(), None) is None:
458
464
  parent.rmdir()
459
- return None
465
+ return need_synchronize
460
466
 
461
467
  # synchronization logic for files
462
468
  callback = ProgressCallback.requires_progress(
463
469
  callback, print_progress, objectpath.name, "synchronizing"
464
470
  )
465
- if objectpath.exists():
471
+ objectpath_exists = objectpath.exists()
472
+ if objectpath_exists:
466
473
  if cloud_mts != 0:
467
474
  local_mts_obj = objectpath.stat().st_mtime
468
475
  need_synchronize = cloud_mts > local_mts_obj
@@ -474,9 +481,17 @@ def synchronize(
474
481
  local_size_obj = objectpath.stat().st_size
475
482
  need_synchronize = cloud_size != local_size_obj
476
483
  else:
477
- objectpath.parent.mkdir(parents=True, exist_ok=True)
484
+ if not just_check:
485
+ objectpath.parent.mkdir(parents=True, exist_ok=True)
478
486
  need_synchronize = True
487
+ # just check if synchronization is needed
488
+ if just_check:
489
+ return need_synchronize
479
490
  if need_synchronize:
491
+ # just to be sure that overwriting an existing file doesn't corrupt it
492
+ # we saw some frequent corruption on some systems for unclear reasons
493
+ if objectpath_exists:
494
+ objectpath.unlink()
480
495
  # hf has sync filesystem
481
496
  # on sync filesystems ChildProgressCallback.branched()
482
497
  # returns the default callback
@@ -490,6 +505,7 @@ def synchronize(
490
505
  # nothing happens if parent_update is not defined
491
506
  # because of Callback.no_op
492
507
  callback.parent_update()
508
+ return need_synchronize
493
509
 
494
510
 
495
511
  def modified(self) -> datetime | None:
@@ -739,16 +755,52 @@ warnings.filterwarnings(
739
755
  )
740
756
 
741
757
 
758
+ # split query params from path string
759
+ def _split_path_query(url: str) -> tuple[str, dict]:
760
+ split_result = urlsplit(url)
761
+ query = parse_qs(split_result.query)
762
+ path = split_result._replace(query="").geturl()
763
+ return path, query
764
+
765
+
766
+ class S3QueryPath(S3Path):
767
+ @classmethod
768
+ def _transform_init_args(cls, args, protocol, storage_options):
769
+ args, protocol, storage_options = super()._transform_init_args(
770
+ args, protocol, storage_options
771
+ )
772
+ arg0 = args[0]
773
+ path, query = _split_path_query(str(arg0))
774
+ for param, param_values in query.items():
775
+ if len(param_values) > 1:
776
+ raise ValueError(f"Multiple values for {param} query parameter")
777
+ else:
778
+ param_value = param_values[0]
779
+ if param in storage_options and param_value != storage_options[param]:
780
+ raise ValueError(
781
+ f"Incompatible {param} in query and storage_options"
782
+ )
783
+ storage_options.setdefault(param, param_value)
784
+ if hasattr(arg0, "storage_options"):
785
+ storage_options = {**arg0.storage_options, **storage_options}
786
+
787
+ return (path, *args[1:]), protocol, storage_options
788
+
789
+
790
+ register_implementation("s3", S3QueryPath, clobber=True)
791
+
792
+
742
793
  def create_path(path: UPathStr, access_token: str | None = None) -> UPath:
743
794
  upath = UPath(path)
744
795
 
745
796
  if upath.protocol == "s3":
746
797
  # add managed credentials and other options for AWS s3 paths
747
- return get_aws_credentials_manager().enrich_path(upath, access_token)
798
+ return get_aws_options_manager().enrich_path(upath, access_token)
748
799
 
749
800
  if upath.protocol in {"http", "https"}:
750
801
  # this is needed because by default aiohttp drops a connection after 5 min
751
802
  # so it is impossible to download large files
803
+ storage_options = {}
752
804
  client_kwargs = upath.storage_options.get("client_kwargs", {})
753
805
  if "timeout" not in client_kwargs:
754
806
  from aiohttp import ClientTimeout
@@ -757,7 +809,12 @@ def create_path(path: UPathStr, access_token: str | None = None) -> UPath:
757
809
  **client_kwargs,
758
810
  "timeout": ClientTimeout(sock_connect=30, sock_read=30),
759
811
  }
760
- return UPath(upath, client_kwargs=client_kwargs)
812
+ storage_options["client_kwargs"] = client_kwargs
813
+ # see download_to for the reason
814
+ if "use_listings_cache" not in upath.storage_options:
815
+ storage_options["use_listings_cache"] = True
816
+ if len(storage_options) > 0:
817
+ return UPath(upath, **storage_options)
761
818
  return upath
762
819
 
763
820
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: lamindb_setup
3
- Version: 1.0.2
3
+ Version: 1.1.0
4
4
  Summary: Setup & configure LaminDB.
5
5
  Author-email: Lamin Labs <open-source@lamin.ai>
6
6
  Requires-Python: >=3.10
@@ -11,13 +11,15 @@ Requires-Dist: dj_database_url>=1.3.0,<3.0.0
11
11
  Requires-Dist: pydantic-settings
12
12
  Requires-Dist: appdirs<2.0.0
13
13
  Requires-Dist: requests
14
- Requires-Dist: universal_pathlib==0.2.5
14
+ Requires-Dist: universal_pathlib==0.2.6
15
15
  Requires-Dist: botocore<2.0.0
16
16
  Requires-Dist: supabase>=2.8.1,<=2.11.0
17
+ Requires-Dist: storage3!=0.11.2; python_version < '3.11'
17
18
  Requires-Dist: psutil
19
+ Requires-Dist: packaging
18
20
  Requires-Dist: urllib3<2 ; extra == "aws"
19
21
  Requires-Dist: aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == "aws"
20
- Requires-Dist: s3fs>=2023.12.2,<=2024.10.0,!=2024.10.0 ; extra == "aws"
22
+ Requires-Dist: s3fs>=2023.12.2,<=2025.2.0,!=2024.10.0 ; extra == "aws"
21
23
  Requires-Dist: line_profiler ; extra == "dev"
22
24
  Requires-Dist: pyjwt<3.0.0 ; extra == "dev"
23
25
  Requires-Dist: psycopg2-binary ; extra == "dev"
@@ -29,7 +31,7 @@ Requires-Dist: pytest-xdist ; extra == "dev"
29
31
  Requires-Dist: nbproject-test>=0.4.3 ; extra == "dev"
30
32
  Requires-Dist: pandas ; extra == "dev"
31
33
  Requires-Dist: django-schema-graph ; extra == "erdiagram"
32
- Requires-Dist: gcsfs>=2023.12.2,<=2024.10.0 ; extra == "gcp"
34
+ Requires-Dist: gcsfs>=2023.12.2,<=2025.2.0 ; extra == "gcp"
33
35
  Project-URL: Home, https://github.com/laminlabs/lamindb-setup
34
36
  Provides-Extra: aws
35
37
  Provides-Extra: dev
@@ -1,16 +1,16 @@
1
- lamindb_setup/__init__.py,sha256=C1Or-KZftjcYsNBbezefoC84rq0GVDE2IJEt4kNeVec,2127
1
+ lamindb_setup/__init__.py,sha256=nG1aMZpbOPII-GtdqxGgMgeR-Lk_ltuPr_yAMiMd44o,2692
2
2
  lamindb_setup/_cache.py,sha256=aszT-zk3S5dTLKp5g1W-S_FPh2E5YVCALwWSGPJLWBM,1493
3
3
  lamindb_setup/_check.py,sha256=28PcG8Kp6OpjSLSi1r2boL2Ryeh6xkaCL87HFbjs6GA,129
4
- lamindb_setup/_check_setup.py,sha256=DJ4lZp4N9Dcri8H7PEOl-YDoi4qqfQnXvwBuL4venAw,3425
4
+ lamindb_setup/_check_setup.py,sha256=d1GS1Csy2G9AysfjoyVcNVY0lhHiWSwSLpfgdIQf35s,5477
5
5
  lamindb_setup/_close.py,sha256=pf8PHrtRBC6TycBtVAXzD9EZSGufoyp5M82o6zLHmt4,1240
6
- lamindb_setup/_connect_instance.py,sha256=1sA8i-vxZJhwOzobgqmzgsUE9BF6XaH593RDPoy6G0g,17572
7
- lamindb_setup/_delete.py,sha256=Mip5M9tCxyfsjzdcPCl6x9CQ0TkYTqKNNWDIcJ-KVMo,5677
6
+ lamindb_setup/_connect_instance.py,sha256=eMiAjuMDuZ_x8N5LLHwz5O4VXv0nH6AzYW0tnnjUSug,18190
7
+ lamindb_setup/_delete.py,sha256=4NqJkEA812VWRxTBW1o1h1YX0fF7sQwn08hj1BmvCPs,5673
8
8
  lamindb_setup/_django.py,sha256=uIQflpkp8l3axyPaKURlk3kacgpElVP5KOKmFxYSMGk,1454
9
9
  lamindb_setup/_entry_points.py,sha256=sKwXPX9xjOotoAjvgkU5LBwjjHLWVkh0ZGdiSsrch9k,522
10
10
  lamindb_setup/_exportdb.py,sha256=QLjoH4dEwqa01A12naKaDPglCCzl2_VLKWFfJRE_uSg,2113
11
11
  lamindb_setup/_importdb.py,sha256=fKv9ev5OOj_-bmzC8XZ1GxOcjIjI486yrHSHDWQrJeI,1874
12
- lamindb_setup/_init_instance.py,sha256=JV34P4ShinIdeUmHeoOvNE3WtzLbk37HcsWxfaEC_ks,13944
13
- lamindb_setup/_migrate.py,sha256=86ShCTLGnBT3HreYhua4B8YQTy6tyIOgpa0qlsdob1k,9093
12
+ lamindb_setup/_init_instance.py,sha256=nZkqGqJQ_sF1-oQmnbpIjP1ExWUow5I5N7lsyX9m-io,13811
13
+ lamindb_setup/_migrate.py,sha256=drf7Lsa8oDXBPBWMWMge0n5VyNPvsrCFAhcXlN91OxM,9165
14
14
  lamindb_setup/_register_instance.py,sha256=alQuYp2f8Ct8xvRC1gt8p_HZ0tqCd3gZD3kiPBLPpsI,1269
15
15
  lamindb_setup/_schema.py,sha256=b3uzhhWpV5mQtDwhMINc2MabGCnGLESy51ito3yl6Wc,679
16
16
  lamindb_setup/_schema_metadata.py,sha256=7ITlzIK32GHdhMq9e0GtPM3QbzJWhUvzPutiHrjjPk0,13986
@@ -18,30 +18,30 @@ lamindb_setup/_set_managed_storage.py,sha256=4tDxXQMt8Gw028uY3vIQxZQ7qBNXhQMc8sa
18
18
  lamindb_setup/_setup_user.py,sha256=-g7Xj6510BDyM8kuqAsVBZFwehlhBa_uWBSV1rPeuM8,4586
19
19
  lamindb_setup/_silence_loggers.py,sha256=AKF_YcHvX32eGXdsYK8MJlxEaZ-Uo2f6QDRzjKFCtws,1568
20
20
  lamindb_setup/core/__init__.py,sha256=BxIVMX5HQq8oZ1OuY_saUEJz5Tdd7gaCPngxVu5iou4,417
21
- lamindb_setup/core/_aws_credentials.py,sha256=_wBWC10MGx3PW9UXGhsVNlq7YvCER3RhfRgAdlxEjNM,6120
21
+ lamindb_setup/core/_aws_options.py,sha256=Evu8W9Xcr1_vYKo3Vfsv03dmD4x3itaPyWY8fBOrYcE,6954
22
22
  lamindb_setup/core/_aws_storage.py,sha256=nEjeUv4xUVpoV0Lx-zjjmyb9w804bDyaeiM-OqbfwM0,1799
23
23
  lamindb_setup/core/_deprecated.py,sha256=HN7iUBdEgahw5e4NHCd1VJooUfieNb6GRzS5x8jU-q8,2549
24
24
  lamindb_setup/core/_docs.py,sha256=3k-YY-oVaJd_9UIY-LfBg_u8raKOCNfkZQPA73KsUhs,276
25
- lamindb_setup/core/_hub_client.py,sha256=cN19XbZmvLCxL_GKdOcKbedNRL7kR47vmLmA--NMv-U,6306
26
- lamindb_setup/core/_hub_core.py,sha256=qVGGsWVfP6GK9UzmEz1kuR_B8wFkgTstMJJoMHeUF0c,20007
25
+ lamindb_setup/core/_hub_client.py,sha256=ywJ_HpGqwui6t_vsj-my80gYgSSgAiSFyZoRj03sBWk,6464
26
+ lamindb_setup/core/_hub_core.py,sha256=OPByE76h6MQaCToT7AzbeH9Tr5-BbpIAh__oIiEl1VE,20021
27
27
  lamindb_setup/core/_hub_crud.py,sha256=IAuPZes1am8OFwtcf5jSRQPGG1eKwVTEsp9Li-uq0cQ,5377
28
28
  lamindb_setup/core/_hub_utils.py,sha256=6dyDGyzYFgVfR_lE3VN3CP1jGp98gxPtr-T91PAP05U,2687
29
29
  lamindb_setup/core/_private_django_api.py,sha256=KIn43HOhiRjkbTbddyJqv-WNTTa1bAizbM1tWXoXPBg,2869
30
30
  lamindb_setup/core/_settings.py,sha256=eslFO84vb5uRRfJ3r_uu4O8677l8lU5BbpZJMSAYw6A,8244
31
- lamindb_setup/core/_settings_instance.py,sha256=agzlSvKhvLyyNPf-Gw-M3FYn2ARpR1QEoCexhIHX5IU,19437
31
+ lamindb_setup/core/_settings_instance.py,sha256=prEAS0MrSzPdxQTSjFoTzvGz6915GIL9v9RYhzPeoQ4,19362
32
32
  lamindb_setup/core/_settings_load.py,sha256=boeNntqIZ_DjelRBUAGp0ujc5akmbrrsk-LY28exa7E,4099
33
33
  lamindb_setup/core/_settings_save.py,sha256=rxGxgaK5i9exKqSJERQQyY1WZio20meoQJoYXlVW-1w,3138
34
- lamindb_setup/core/_settings_storage.py,sha256=dPIvbA6PkdjM8gsX6zxtH7VNMc4vhkuEO4luVMZY7RQ,12243
34
+ lamindb_setup/core/_settings_storage.py,sha256=a3rRqntAX-dKZn6-TkY9pTPT8UPNUlwdni1tDNTJW84,13117
35
35
  lamindb_setup/core/_settings_store.py,sha256=WcsgOmgnu9gztcrhp-N4OONNZyxICHV8M0HdJllTaEo,2219
36
36
  lamindb_setup/core/_settings_user.py,sha256=lWqV3HmZCsEq2UsU_iVNW0p9ddsNg7-B6xOaMNH1aw0,1475
37
37
  lamindb_setup/core/_setup_bionty_sources.py,sha256=qTPMV5TEbNiTB81QqG2rSs6W8j8kQ7kVQMLOXRzAxBI,4004
38
38
  lamindb_setup/core/cloud_sqlite_locker.py,sha256=i6TrT7HG0lqliPvZTlsZ_uplPaqhPBbabyfeR32SkA8,7107
39
39
  lamindb_setup/core/django.py,sha256=jSbckc_R39BhnANGhO5YFzKA8BHNANJDDgS5rwkDWmU,3828
40
40
  lamindb_setup/core/exceptions.py,sha256=4NpLUNUIfXYVTFX2FvLZF8RW34exk2Vn2X3G4YhnTRg,276
41
- lamindb_setup/core/hashing.py,sha256=EO7NSXMdKZLe9cHxCNHiqd5-6pVhv97uafqpoWV9FKc,3394
41
+ lamindb_setup/core/hashing.py,sha256=M3Q1-ywnqh4Uy5zojbQfLju19HU0ySp8Oi7FGIJXfFI,3667
42
42
  lamindb_setup/core/types.py,sha256=zJii2le38BJUmsNVvzDrbzGYr0yaeb-9Rw9IKmsBr3k,523
43
- lamindb_setup/core/upath.py,sha256=iZIGnkvG6uZhjiq2qJ8v4pgABxw2rBwm_XSnZACzHVo,31075
44
- lamindb_setup-1.0.2.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
45
- lamindb_setup-1.0.2.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
46
- lamindb_setup-1.0.2.dist-info/METADATA,sha256=mUoczZRjbbv0JbQCETPFlJxYB-EW5h34XgFEWDYmL40,1690
47
- lamindb_setup-1.0.2.dist-info/RECORD,,
43
+ lamindb_setup/core/upath.py,sha256=FUCBicCTAUPq-3lFwcDPngRLrNGfqQEn7l9ZCLjkz3Y,33335
44
+ lamindb_setup-1.1.0.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
45
+ lamindb_setup-1.1.0.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
46
+ lamindb_setup-1.1.0.dist-info/METADATA,sha256=h799g4G2jS30UqZwG69XIIB_cWf46cPvnhisMsTOdSk,1770
47
+ lamindb_setup-1.1.0.dist-info/RECORD,,