lamindb_setup 0.81.3__py3-none-any.whl → 1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +31 -2
- lamindb_setup/_check_setup.py +15 -16
- lamindb_setup/_connect_instance.py +35 -33
- lamindb_setup/_delete.py +2 -2
- lamindb_setup/_django.py +6 -6
- lamindb_setup/_exportdb.py +1 -1
- lamindb_setup/_init_instance.py +13 -39
- lamindb_setup/_migrate.py +5 -3
- lamindb_setup/_schema_metadata.py +10 -6
- lamindb_setup/core/_aws_credentials.py +10 -2
- lamindb_setup/core/_hub_core.py +6 -3
- lamindb_setup/core/_hub_crud.py +32 -30
- lamindb_setup/core/_settings.py +10 -5
- lamindb_setup/core/_settings_instance.py +17 -3
- lamindb_setup/core/_settings_storage.py +16 -15
- lamindb_setup/core/_settings_user.py +1 -1
- lamindb_setup/core/django.py +1 -3
- lamindb_setup/core/hashing.py +16 -5
- lamindb_setup/core/upath.py +108 -64
- {lamindb_setup-0.81.3.dist-info → lamindb_setup-1.0a1.dist-info}/METADATA +4 -5
- {lamindb_setup-0.81.3.dist-info → lamindb_setup-1.0a1.dist-info}/RECORD +23 -23
- {lamindb_setup-0.81.3.dist-info → lamindb_setup-1.0a1.dist-info}/WHEEL +1 -1
- {lamindb_setup-0.81.3.dist-info → lamindb_setup-1.0a1.dist-info}/LICENSE +0 -0
lamindb_setup/core/_hub_crud.py
CHANGED
```diff
@@ -12,25 +12,23 @@ def select_instance_by_owner_name(
     name: str,
     client: Client,
 ) -> dict | None:
-
-
-
-
-
-
-        )
-        .eq("name", name)
-        .eq("account.handle", owner)
-        .eq("storage.is_default", True)
-        .execute()
-        .data
+    # this won't find an instance without the default storage
+    data = (
+        client.table("instance")
+        .select(
+            "*, account!inner!instance_account_id_28936e8f_fk_account_id(*),"
+            " storage!inner!storage_instance_id_359fca71_fk_instance_id(*)"
         )
-
-
+        .eq("name", name)
+        .eq("account.handle", owner)
+        .eq("storage.is_default", True)
+        .execute()
+        .data
+    )
     if len(data) == 0:
         return None
     result = data[0]
-    # this is
+    # this is a list
     # assume only one default storage
     result["storage"] = result["storage"][0]
     return result
@@ -89,15 +87,22 @@ def select_instance_by_id_with_storage(
     instance_id: str,
     client: Client,
 ):
-
+    # this won't find an instance without the default storage
+    data = (
         client.table("instance")
-        .select("*, storage!
+        .select("*, storage!inner!storage_instance_id_359fca71_fk_instance_id(*)")
         .eq("id", instance_id)
+        .eq("storage.is_default", True)
         .execute()
+        .data
     )
-    if len(
+    if len(data) == 0:
         return None
-
+    result = data[0]
+    # this is a list
+    # assume only one default storage
+    result["storage"] = result["storage"][0]
+    return result
 
 
 def update_instance(instance_id: str, instance_fields: dict, client: Client):
@@ -139,17 +144,14 @@ def select_collaborator(
 def select_default_storage_by_instance_id(
     instance_id: str, client: Client
 ) -> dict | None:
-
-
-
-
-
-
-
-
-    )
-    except Exception:
-        return None
+    data = (
+        client.table("storage")
+        .select("*")
+        .eq("instance_id", instance_id)
+        .eq("is_default", True)
+        .execute()
+        .data
+    )
     if len(data) == 0:
         return None
     return data[0]
```
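These queries lean on PostgREST's embedded-resource filtering as exposed by supabase-py. A minimal sketch of the semantics, assuming `client` is an authenticated supabase `Client` and `name`/`owner` are given; the short `storage!inner(*)` alias is illustrative, the real queries spell out the foreign-key names:

```python
# Hedged sketch, not the library's code: without `!inner`, PostgREST still
# returns the instance row when no storage row matches (just with an empty
# embedded `storage` list); with `!inner`, the filter on the embedded resource
# also drops the parent row -- which is why the new comment says these queries
# "won't find an instance without the default storage".
data = (
    client.table("instance")
    .select("*, storage!inner(*)")  # illustrative short alias
    .eq("name", name)
    .eq("account.handle", owner)
    .eq("storage.is_default", True)
    .execute()
    .data
)
```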
lamindb_setup/core/_settings.py
CHANGED
```diff
@@ -164,7 +164,7 @@ class SetupSettings:
 
     @property
     def paths(self) -> type[SetupPaths]:
-        """Convert cloud paths to
+        """Convert cloud paths to lamindb local paths.
 
         Use `settings.paths.cloud_to_local_no_update`
         or `settings.paths.cloud_to_local`.
@@ -179,7 +179,7 @@ class SetupSettings:
         repr = self.user.__repr__()
         repr += f"\nAuto-connect in Python: {self.auto_connect}\n"
         repr += f"Private Django API: {self.private_django_api}\n"
-        repr += f"Cache directory: {self.cache_dir}\n"
+        repr += f"Cache directory: {self.cache_dir.as_posix()}\n"
         if self._instance_exists:
             repr += self.instance.__repr__()
         else:
@@ -200,9 +200,14 @@ class SetupPaths:
         # cache_key is ignored if filepath is a local path
         if not isinstance(filepath, LocalPathClasses):
             # settings is defined further in this file
-
-            filepath.path
-
+            if cache_key is None:
+                local_key = filepath.path  # type: ignore
+                protocol = filepath.protocol  # type: ignore
+                if protocol in {"http", "https"}:
+                    local_key = local_key.removeprefix(protocol + "://")
+            else:
+                local_key = cache_key
+            local_filepath = settings.cache_dir / local_key
         else:
             local_filepath = filepath
         return local_filepath
```
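The new branch maps a cloud path into the cache directory. A minimal re-implementation for illustration (`cache_location` and its arguments are hypothetical stand-ins; it assumes, as the diff implies, that `.path` keeps the scheme for http(s) URLs but not for object-store paths):

```python
from pathlib import Path

def cache_location(path: str, protocol: str, cache_dir: Path) -> Path:
    # for http(s) the scheme prefix is stripped before joining, mirroring the
    # diff; s3/gs paths are assumed to arrive already as "bucket/key"
    if protocol in {"http", "https"}:
        path = path.removeprefix(protocol + "://")
    return cache_dir / path

print(cache_location("https://example.org/data/file.h5ad", "https", Path("/cache")))
# -> /cache/example.org/data/file.h5ad
```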
lamindb_setup/core/_settings_instance.py
CHANGED

```diff
@@ -121,7 +121,7 @@ class InstanceSettings:
     def _search_local_root(
         self, local_root: str | None = None, mute_warning: bool = False
     ) -> StorageSettings | None:
-        from
+        from lamindb.models import Storage
 
         if local_root is not None:
             local_records = Storage.objects.filter(root=local_root)
@@ -358,7 +358,7 @@ class InstanceSettings:
             sqlite_filepath = self.storage.cloud_to_local(
                 self._sqlite_file, error_no_origin=False
             )
-            return f"sqlite:///{sqlite_filepath}"
+            return f"sqlite:///{sqlite_filepath.as_posix()}"
         else:
             return self._db
 
@@ -457,11 +457,24 @@ class InstanceSettings:
         settings._instance_settings = self
 
     def _init_db(self):
+        from lamindb_setup import _check_setup
+
         from .django import setup_django
 
+        _check_setup.IS_LOADING = True
         setup_django(self, init=True)
+        _check_setup.IS_LOADING = False
+
+        from lamindb.models import Space
+
+        Space.objects.get_or_create(
+            name="All",
+            description="Every team & user with access to the instance has access.",
+        )
 
     def _load_db(self) -> tuple[bool, str]:
+        from lamindb_setup import _check_setup
+
         # Is the database available and initialized as LaminDB?
         # returns a tuple of status code and message
         if self.dialect == "sqlite" and not self._sqlite_file.exists():
@@ -472,7 +485,6 @@ class InstanceSettings:
                 f" {legacy_file} to {self._sqlite_file}"
            )
             return False, f"SQLite file {self._sqlite_file} does not exist"
-        from lamindb_setup import settings  # to check user
 
         from .django import setup_django
 
@@ -481,5 +493,7 @@ class InstanceSettings:
         # setting up django also performs a check for migrations & prints them
         # as warnings
         # this should fail, e.g., if the db is not reachable
+        _check_setup.IS_LOADING = True
         setup_django(self)
+        _check_setup.IS_LOADING = False
         return True, ""
```
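Both `_init_db` and `_load_db` now bracket `setup_django` with a module-level `IS_LOADING` flag so that `_check_setup` can skip its instance check mid-initialization. The flag is flipped inline, so an exception in `setup_django` would leave it set; a context manager (purely illustrative, not in the diff) would make the reset exception-safe:

```python
import contextlib

@contextlib.contextmanager
def loading():
    # hypothetical helper: same effect as the inline flag flips in the diff,
    # but try/finally guarantees IS_LOADING is reset even if setup fails
    from lamindb_setup import _check_setup

    _check_setup.IS_LOADING = True
    try:
        yield
    finally:
        _check_setup.IS_LOADING = False
```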
lamindb_setup/core/_settings_storage.py
CHANGED

```diff
@@ -7,6 +7,7 @@ import string
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal
 
+import fsspec
 from lamin_utils import logger
 
 from ._aws_credentials import HOSTED_REGIONS, get_aws_credentials_manager
@@ -24,6 +25,10 @@ if TYPE_CHECKING:
 
 IS_INITIALIZED_KEY = ".lamindb/_is_initialized"
 
+# a list of supported fsspec protocols
+# rename file to local before showing to a user
+VALID_PROTOCOLS = ("file", "gs", "s3", "hf", "http", "https")
+
 
 def base62(n_char: int) -> str:
     """Like nanoid without hyphen and underscore."""
@@ -114,16 +119,11 @@ def init_storage(
         root_str = f"s3://lamin-{region}/{uid}"
     else:
         root_str = f"s3://lamin-hosted-test/{uid}"
-    elif
-
-
-
-
-    except Exception as e:
-        logger.error(
-            "`storage` is not a valid local, GCP storage, AWS S3 path or Hugging Face path"
-        )
-        raise e
+    elif (input_protocol := fsspec.utils.get_protocol(root_str)) not in VALID_PROTOCOLS:
+        valid_protocols = ("local",) + VALID_PROTOCOLS[1:]  # show local instead of file
+        raise ValueError(
+            f"Protocol {input_protocol} is not supported, valid protocols are {', '.join(valid_protocols)}"
+        )
     ssettings = StorageSettings(
         uid=uid,
         root=root_str,
@@ -227,10 +227,10 @@ class StorageSettings:
 
     @property
     def record(self) -> Any:
-        """Storage record in current instance."""
+        """Storage record in the current instance."""
         if self._record is None:
             # dynamic import because of import order
-            from
+            from lamindb.models import Storage
 
             from ._settings import settings
 
@@ -299,14 +299,15 @@ class StorageSettings:
         return self._region
 
     @property
-    def type(self) -> Literal["local", "s3", "gs"]:
+    def type(self) -> Literal["local", "s3", "gs", "hf", "http", "https"]:
         """AWS S3 vs. Google Cloud vs. local.
 
-        Returns the protocol as a string: "local", "s3", "gs".
+        Returns the protocol as a string: "local", "s3", "gs", "http", "https".
         """
         import fsspec
 
         convert = {"file": "local"}
+        # init_storage checks that the root protocol belongs to VALID_PROTOCOLS
         protocol = fsspec.utils.get_protocol(self.root_as_str)
         return convert.get(protocol, protocol)  # type: ignore
 
@@ -345,5 +346,5 @@ class StorageSettings:
         return self.root / filekey
 
     def local_filepath(self, filekey: UPathStr) -> UPath:
-        """Local (cache) filepath from filekey
+        """Local (cache) filepath from filekey."""
         return self.cloud_to_local(self.key_to_filepath(filekey))
```
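The new validation resolves the protocol with `fsspec.utils.get_protocol` and rejects anything outside `VALID_PROTOCOLS`. A quick, runnable illustration (paths are made up):

```python
import fsspec

VALID_PROTOCOLS = ("file", "gs", "s3", "hf", "http", "https")  # from the diff

for root in ("./local-folder", "s3://my-bucket/key", "ftp://host/key"):
    protocol = fsspec.utils.get_protocol(root)
    print(f"{root!r}: protocol={protocol!r}, supported={protocol in VALID_PROTOCOLS}")
# './local-folder': protocol='file', supported=True
# 's3://my-bucket/key': protocol='s3', supported=True
# 'ftp://host/key': protocol='ftp', supported=False
```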
lamindb_setup/core/_settings_user.py
CHANGED

```diff
@@ -48,7 +48,7 @@ class UserSettings:
     @property
     def id(self):
         """Integer id valid in current intance."""
-        from
+        from lamindb.base.users import current_user_id
 
         # there is no cache needed here because current_user_id()
         # has its own cache
```
lamindb_setup/core/django.py
CHANGED
```diff
@@ -5,10 +5,8 @@ import builtins
 import os
 from pathlib import Path
 import time
-from lamin_utils import logger
-from ._settings_store import current_instance_settings_file
 from ._settings_instance import InstanceSettings
-
+
 
 IS_RUN_FROM_IPYTHON = getattr(builtins, "__IPYTHON__", False)
 IS_SETUP = False
```
lamindb_setup/core/hashing.py
CHANGED
```diff
@@ -12,6 +12,7 @@ from __future__ import annotations
 
 import base64
 import hashlib
+import json
 from concurrent.futures import ThreadPoolExecutor
 from typing import TYPE_CHECKING
 
@@ -40,11 +41,21 @@ def b16_to_b64(s: str):
     return to_b64_str(base64.b16decode(s.strip('"'), casefold=True))
 
 
+def hash_string(string: str) -> str:
+    # as we're truncating (not here) at 22 b64, we choose md5 over sha512
+    return to_b64_str(hashlib.md5(string.encode("utf-8")).digest())
+
+
 # a lot to read about this: lamin-notes/2022/hashing
 def hash_set(s: set[str]) -> str:
-
-
-
+    join_s = ":".join(sorted(s))
+    return hash_string(join_s)[:HASH_LENGTH]
+
+
+def hash_dict(d: dict) -> str:
+    return to_b64_str(hashlib.md5(json.dumps(d, sort_keys=True).encode()).digest())[
+        :HASH_LENGTH
+    ]
 
 
 def hash_from_hashes_list(hashes: Iterable[str]) -> str:
@@ -111,6 +122,6 @@ def hash_dir(path: Path):
     hashes, sizes = zip(*hashes_sizes)
 
     hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
-
+    n_files = len(hashes)
     size = sum(sizes)
-    return size, hash, hash_type,
+    return size, hash, hash_type, n_files
```
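The new helpers make set and dict hashes order-independent by sorting before hashing. A self-contained sketch with simplified stand-ins (`to_b64_str`'s exact padding handling isn't shown in this hunk, so plain base64 is assumed; `HASH_LENGTH` is taken to be 22 per the truncation comment):

```python
import base64
import hashlib
import json

def to_b64_str(bstr: bytes) -> str:
    # simplified stand-in for the module's helper
    return base64.b64encode(bstr).decode()

def hash_string(string: str) -> str:
    return to_b64_str(hashlib.md5(string.encode("utf-8")).digest())

def hash_set(s: set) -> str:
    return hash_string(":".join(sorted(s)))[:22]

def hash_dict(d: dict) -> str:
    return to_b64_str(hashlib.md5(json.dumps(d, sort_keys=True).encode()).digest())[:22]

# sorting makes both hashes independent of iteration/key order
assert hash_set({"a", "b"}) == hash_set({"b", "a"})
assert hash_dict({"x": 1, "y": 2}) == hash_dict({"y": 2, "x": 1})
```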
lamindb_setup/core/upath.py
CHANGED
```diff
@@ -12,14 +12,15 @@ from itertools import islice
 from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
 from typing import TYPE_CHECKING, Any, Literal
 
+import click
 import fsspec
 from lamin_utils import logger
 from upath import UPath
-from upath.implementations.cloud import CloudPath
+from upath.implementations.cloud import CloudPath  # keep CloudPath!
 from upath.implementations.local import LocalPath
 
 from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
-from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list
+from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list, hash_string
 
 if TYPE_CHECKING:
     from .types import UPathStr
@@ -190,8 +191,17 @@ class ProgressCallback(fsspec.callbacks.Callback):
         pass
 
     def update_relative_value(self, inc=1):
-
-
+        if inc != 0:
+            self.value += inc
+            self.call()
+        else:
+            # this is specific to http filesystem
+            # for some reason the last update is 0 always
+            # sometimes the reported result is less that 100%
+            # here 100% is forced manually in this case
+            if self.value < 1.0 and self.value >= 0.999:
+                self.value = self.size
+                self.call()
 
     def branch(self, path_1, path_2, kwargs):
         if self.adjust_size:
```
```diff
@@ -258,7 +268,17 @@ def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
     )
     kwargs["callback"] = callback
 
-
+    cloud_path_str = str(self)
+    local_path_str = str(local_path)
+    # needed due to https://github.com/fsspec/filesystem_spec/issues/1766
+    # otherwise fsspec calls fs._ls_real where it reads the body and parses links
+    # so the file is downloaded 2 times
+    # upath doesn't call fs.ls to infer type, so it is safe to call
+    if self.protocol in {"http", "https"} and self.stat().as_info()["type"] == "file":
+        self.fs.use_listings_cache = True
+        self.fs.dircache[cloud_path_str] = []
+
+    self.fs.download(cloud_path_str, local_path_str, **kwargs)
 
 
 def upload_from(
```
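The listings-cache seeding above works around fsspec issue #1766 for http(s) URLs that are known to be plain files. The same calls, mirrored outside lamindb (URL illustrative):

```python
import fsspec

# priming fs.dircache with an empty listing makes fsspec treat the URL as a
# plain file and skip _ls_real, which would otherwise fetch the body once just
# to parse links -- doubling the download
fs = fsspec.filesystem("https")
fs.use_listings_cache = True
fs.dircache["https://example.org/large-file.bin"] = []
# fs.download("https://example.org/large-file.bin", "/tmp/large-file.bin")
```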
```diff
@@ -306,8 +326,7 @@ def upload_from(
     destination = self.as_posix()
 
     # the below lines are to avoid s3fs triggering create_bucket in upload if
-    # dirs are present it allows to avoid permission error
-    # would be easier to just
+    # dirs are present, it allows to avoid the permission error
     if self.protocol == "s3" and local_path_is_dir and create_folder:
         bucket = self.drive
         if bucket not in self.fs.dircache:
@@ -350,27 +369,19 @@ def synchronize(
         exists = True
         cloud_mts = timestamp
     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-            cloud_mts = self.modified.timestamp()
-            is_dir = False
-            exists = True
-        except FileNotFoundError:
-            exists = False
-        except IsADirectoryError:
-            is_dir = True
-            exists = True
+        try:
+            cloud_stat = self.stat()
+            cloud_info = cloud_stat.as_info()
+            exists = True
+            is_dir = cloud_info["type"] == "directory"
+            if not is_dir:
+                # hf requires special treatment
+                if protocol == "hf":
+                    cloud_mts = cloud_info["last_commit"].date.timestamp()
+                else:
+                    cloud_mts = cloud_stat.st_mtime
+        except FileNotFoundError:
+            exists = False
 
     if not exists:
         warn_or_error = f"The original path {self} does not exist anymore."
@@ -386,6 +397,7 @@ def synchronize(
         return None
 
     # synchronization logic for directories
+    # to synchronize directories, it should be possible to get modification times
     if is_dir:
         files = self.fs.find(str(self), detail=True)
         if protocol == "s3":
```
```diff
@@ -451,8 +463,16 @@ def synchronize(
             callback, print_progress, objectpath.name, "synchronizing"
         )
         if objectpath.exists():
-
-
+            if cloud_mts != 0:
+                local_mts_obj = objectpath.stat().st_mtime
+                need_synchronize = cloud_mts > local_mts_obj
+            else:
+                # this is true for http for example
+                # where size is present but st_mtime is not
+                # we assume that any change without the change in size is unlikely
+                cloud_size = cloud_stat.st_size
+                local_size_obj = objectpath.stat().st_size
+                need_synchronize = cloud_size != local_size_obj
         else:
             objectpath.parent.mkdir(parents=True, exist_ok=True)
             need_synchronize = True
```
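Condensed into a hypothetical helper, the decision above is: prefer modification times when the remote filesystem reports them, otherwise fall back to sizes:

```python
def needs_sync(cloud_mts: float, cloud_size: int, local_mts: float, local_size: int) -> bool:
    # mirrors the branch above: compare mtimes when the remote reports them;
    # for filesystems like http (size present, st_mtime not, so cloud_mts
    # stays 0), assume a content change without a size change is unlikely
    if cloud_mts != 0:
        return cloud_mts > local_mts
    return cloud_size != local_size
```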
```diff
@@ -464,7 +484,8 @@ def synchronize(
         self.download_to(
             objectpath, recursive=False, print_progress=False, callback=callback
         )
-
+        if cloud_mts != 0:
+            os.utime(objectpath, times=(cloud_mts, cloud_mts))
     else:
         # nothing happens if parent_update is not defined
         # because of Callback.no_op
@@ -497,7 +518,7 @@ def compute_file_tree(
         skip_suffixes_tuple = ()
     else:
         skip_suffixes_tuple = tuple(skip_suffixes)  # type: ignore
-
+    n_files = 0
     n_directories = 0
 
     # by default only including registered files
@@ -510,7 +531,7 @@ def compute_file_tree(
     include_paths = set()
 
     def inner(dir_path: Path, prefix: str = "", level: int = -1):
-        nonlocal
+        nonlocal n_files, n_directories, suffixes
         if level == 0:
             return
         stripped_dir_path = dir_path.as_posix().rstrip("/")
@@ -543,7 +564,7 @@ def compute_file_tree(
             suffix = extract_suffix_from_path(child_path)
             suffixes.add(suffix)
             n_files_per_dir_and_type[suffix] += 1
-
+            n_files += 1
             if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
                 yield prefix + "..."
             elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
@@ -556,15 +577,15 @@ def compute_file_tree(
     for line in islice(iterator, n_max_files):
         folder_tree += f"\n{line}"
     if next(iterator, None):
-        folder_tree += f"\n... only showing {n_max_files} out of {
+        folder_tree += f"\n... only showing {n_max_files} out of {n_files} files"
     directory_info = "directory" if n_directories == 1 else "directories"
     display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
-    suffix_message = f" with suffixes {display_suffixes}" if
+    suffix_message = f" with suffixes {display_suffixes}" if n_files > 0 else ""
     message = (
         f"{n_directories} sub-{directory_info} &"
-        f" {
+        f" {n_files} files{suffix_message}\n{path.resolve()}{folder_tree}"
     )
-    return message,
+    return message, n_files
```
```diff
@@ -718,12 +739,26 @@ warnings.filterwarnings(
 )
 
 
-def create_path(path:
-
-
-    if
-
-
+def create_path(path: UPathStr, access_token: str | None = None) -> UPath:
+    upath = UPath(path)
+
+    if upath.protocol == "s3":
+        # add managed credentials and other options for AWS s3 paths
+        return get_aws_credentials_manager().enrich_path(upath, access_token)
+
+    if upath.protocol in {"http", "https"}:
+        # this is needed because by default aiohttp drops a connection after 5 min
+        # so it is impossible to download large files
+        client_kwargs = upath.storage_options.get("client_kwargs", {})
+        if "timeout" not in client_kwargs:
+            from aiohttp import ClientTimeout
+
+            client_kwargs = {
+                **client_kwargs,
+                "timeout": ClientTimeout(sock_connect=30, sock_read=30),
+            }
+        return UPath(upath, client_kwargs=client_kwargs)
+    return upath
 
 
 def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
```
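For http(s) inputs, `create_path` swaps aiohttp's default 5-minute total timeout for explicit socket timeouts, which matters for large downloads. What the resulting path amounts to (URL illustrative):

```python
from aiohttp import ClientTimeout
from upath import UPath

# an explicit socket timeout replaces aiohttp's default total timeout,
# which would otherwise abort long-running downloads of large files
path = UPath(
    "https://example.org/large-file.zarr",
    client_kwargs={"timeout": ClientTimeout(sock_connect=30, sock_read=30)},
)
```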
```diff
@@ -739,20 +774,28 @@ def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
         hash = b16_to_b64(stat["blob_id"])
         hash_type = "sha1"
     # s3
+    # StorageClass is checked to be sure that it is indeed s3
+    # because http also has ETag
     elif "ETag" in stat:
         etag = stat["ETag"]
-
-
-
-
-        # we can add more logic later down-the-road
-        hash = b16_to_b64(etag)
-        hash_type = "md5"
+        if "mimetype" in stat:
+            # http
+            hash = hash_string(etag.strip('"'))
+            hash_type = "md5-etag"
         else:
-
-
-
+            # s3
+            # small files
+            if "-" not in etag:
+                # only store hash for non-multipart uploads
+                # we can't rapidly validate multi-part uploaded files client-side
+                # we can add more logic later down-the-road
+                hash = b16_to_b64(etag)
+                hash_type = "md5"
+            else:
+                stripped_etag, suffix = etag.split("-")
+                suffix = suffix.strip('"')
+                hash = b16_to_b64(stripped_etag)
+                hash_type = f"md5-{suffix}"  # this is the S3 chunk-hashing strategy
     if hash is not None:
         hash = hash[:HASH_LENGTH]
     return size, hash, hash_type
```
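The S3 branch distinguishes single-part etags (a plain md5 of the object) from multipart etags of the form `<md5-of-part-md5s>-<n_parts>`. A tiny illustration with made-up etag values (`etag_to_hash_type` is hypothetical):

```python
def etag_to_hash_type(etag: str) -> str:
    # a plain etag is the object's md5; a "-<n>" suffix marks a multipart
    # upload, where the etag is the md5 of the concatenated part-md5s
    if "-" not in etag:
        return "md5"
    suffix = etag.split("-")[1].strip('"')
    return f"md5-{suffix}"

print(etag_to_hash_type('"9e107d9d372bb6826bd81d3542a419d6"'))      # md5
print(etag_to_hash_type('"a54361f3303f43b167fe1d92a02d0d1c-12"'))   # md5-12
```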
```diff
@@ -777,17 +820,18 @@ def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
         if compute_list_hash:
             hashes.append(object[accessor].strip('"='))
     size = sum(sizes)
-
+    n_files = len(sizes)
     if compute_list_hash:
         hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
-    return size, hash, hash_type,
+    return size, hash, hash_type, n_files
 
 
-class InstanceNotEmpty(
-
+class InstanceNotEmpty(click.ClickException):
+    def show(self, file=None):
+        pass
 
 
-# is as fast as boto3: https://lamin.ai/laminlabs/
+# is as fast as boto3: https://lamin.ai/laminlabs/lamin-site-assets/transform/krGp3hT1f78N5zKv
 def check_storage_is_empty(
     root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
 ) -> int:
```
```diff
@@ -810,20 +854,20 @@ def check_storage_is_empty(
         root_string += "/"
     directory_string = root_string + ".lamindb"
     objects = root_upath.fs.find(directory_string)
-
-    n_diff =
+    n_files = len(objects)
+    n_diff = n_files - n_offset_objects
     ask_for_deletion = (
         "delete them prior to deleting the instance"
         if raise_error
         else "consider deleting them"
     )
     message = (
-        f"Storage '{directory_string}' contains {
+        f"Storage '{directory_string}' contains {n_files - n_offset_objects} objects"
         f" - {ask_for_deletion}"
     )
     if n_diff > 0:
         if raise_error:
-            raise InstanceNotEmpty(message)
+            raise InstanceNotEmpty(message) from None
         else:
             logger.warning(message)
     return n_diff
```
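`raise InstanceNotEmpty(message) from None` suppresses exception chaining, and the no-op `show()` override keeps click from printing the message a second time on CLI exit. The chaining part in isolation:

```python
# `from None` sets __suppress_context__, so the original KeyError is kept on
# the exception object but omitted from the printed traceback
try:
    try:
        {}["missing"]
    except KeyError:
        raise ValueError("clean message, no chained KeyError") from None
except ValueError as e:
    print(e.__suppress_context__, type(e.__context__).__name__)  # True KeyError
```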
{lamindb_setup-0.81.3.dist-info → lamindb_setup-1.0a1.dist-info}/METADATA
CHANGED

```diff
@@ -1,20 +1,19 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: lamindb_setup
-Version: 0.81.3
+Version: 1.0a1
 Summary: Setup & configure LaminDB.
 Author-email: Lamin Labs <open-source@lamin.ai>
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: lnschema_core>=0.51.0
 Requires-Dist: lamin_utils>=0.3.3
-Requires-Dist: django
+Requires-Dist: django>=5,<5.2
 Requires-Dist: dj_database_url>=1.3.0,<3.0.0
 Requires-Dist: pydantic-settings
 Requires-Dist: appdirs<2.0.0
 Requires-Dist: requests
 Requires-Dist: universal_pathlib==0.2.5
 Requires-Dist: botocore<2.0.0
-Requires-Dist: supabase>=2.8.1,<=2.
+Requires-Dist: supabase>=2.8.1,<=2.11.0
 Requires-Dist: psutil
 Requires-Dist: urllib3<2 ; extra == "aws"
 Requires-Dist: aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == "aws"
```