lamindb_setup 0.80.1__py3-none-any.whl → 0.81.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +1 -1
- lamindb_setup/core/_settings_storage.py +5 -2
- lamindb_setup/core/hashing.py +3 -3
- lamindb_setup/core/upath.py +87 -39
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/METADATA +3 -3
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/RECORD +8 -8
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/LICENSE +0 -0
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/WHEEL +0 -0
lamindb_setup/__init__.py
CHANGED
lamindb_setup/core/_settings_storage.py
CHANGED
|
@@ -114,13 +114,15 @@ def init_storage(
|
|
|
114
114
|
root_str = f"s3://lamin-{region}/{uid}"
|
|
115
115
|
else:
|
|
116
116
|
root_str = f"s3://lamin-hosted-test/{uid}"
|
|
117
|
-
elif root_str.startswith(("gs://", "s3://")):
|
|
117
|
+
elif root_str.startswith(("gs://", "s3://", "hf://")):
|
|
118
118
|
pass
|
|
119
119
|
else: # local path
|
|
120
120
|
try:
|
|
121
121
|
_ = Path(root_str)
|
|
122
122
|
except Exception as e:
|
|
123
|
-
logger.error(
|
|
123
|
+
logger.error(
|
|
124
|
+
"`storage` is not a valid local, GCP storage, AWS S3 path or Hugging Face path"
|
|
125
|
+
)
|
|
124
126
|
raise e
|
|
125
127
|
ssettings = StorageSettings(
|
|
126
128
|
uid=uid,
|
|
@@ -161,6 +163,7 @@ def init_storage(
|
|
|
161
163
|
# only newly created
|
|
162
164
|
if hub_record_status == "hub-record-created" and ssettings._uuid is not None:
|
|
163
165
|
delete_storage_record(ssettings._uuid, access_token=access_token) # type: ignore
|
|
166
|
+
hub_record_status = "hub-record-not-created"
|
|
164
167
|
ssettings._instance_id = None
|
|
165
168
|
return ssettings, hub_record_status
|
|
166
169
|
|
lamindb_setup/core/hashing.py
CHANGED
|
@@ -47,14 +47,14 @@ def hash_set(s: set[str]) -> str:
|
|
|
47
47
|
return to_b64_str(hashlib.md5(bstr).digest())[:HASH_LENGTH]
|
|
48
48
|
|
|
49
49
|
|
|
50
|
-
def
|
|
50
|
+
def hash_from_hashes_list(hashes: Iterable[str]) -> str:
|
|
51
51
|
# need to sort below because we don't want the order of parsing the dir to
|
|
52
52
|
# affect the hash
|
|
53
53
|
digests = b"".join(
|
|
54
54
|
hashlib.md5(hash.encode("utf-8")).digest() for hash in sorted(hashes)
|
|
55
55
|
)
|
|
56
56
|
digest = hashlib.md5(digests).digest()
|
|
57
|
-
return to_b64_str(digest)[:HASH_LENGTH]
|
|
57
|
+
return to_b64_str(digest)[:HASH_LENGTH]
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
def hash_code(file_path: UPathStr):
|
|
@@ -110,7 +110,7 @@ def hash_dir(path: Path):
|
|
|
110
110
|
hashes_sizes = map(hash_size, files)
|
|
111
111
|
hashes, sizes = zip(*hashes_sizes)
|
|
112
112
|
|
|
113
|
-
hash, hash_type =
|
|
113
|
+
hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
|
|
114
114
|
n_objects = len(hashes)
|
|
115
115
|
size = sum(sizes)
|
|
116
116
|
return size, hash, hash_type, n_objects
|
lamindb_setup/core/upath.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import os
|
|
7
|
+
import warnings
|
|
7
8
|
from collections import defaultdict
|
|
8
9
|
from datetime import datetime, timezone
|
|
9
10
|
from functools import partial
|
|
@@ -18,7 +19,7 @@ from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
|
|
|
18
19
|
from upath.implementations.local import LocalPath
|
|
19
20
|
|
|
20
21
|
from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
|
|
21
|
-
from .hashing import HASH_LENGTH, b16_to_b64,
|
|
22
|
+
from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list
|
|
22
23
|
|
|
23
24
|
if TYPE_CHECKING:
|
|
24
25
|
from .types import UPathStr
|
|
@@ -340,22 +341,34 @@ def synchronize(
|
|
|
340
341
|
timestamp: float | None = None,
|
|
341
342
|
):
|
|
342
343
|
"""Sync to a local destination path."""
|
|
344
|
+
protocol = self.protocol
|
|
343
345
|
# optimize the number of network requests
|
|
344
346
|
if timestamp is not None:
|
|
345
347
|
is_dir = False
|
|
346
348
|
exists = True
|
|
347
349
|
cloud_mts = timestamp
|
|
348
350
|
else:
|
|
349
|
-
#
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
351
|
+
# hf requires special treatment
|
|
352
|
+
if protocol == "hf":
|
|
353
|
+
try:
|
|
354
|
+
stat_hf = self.stat().as_info()
|
|
355
|
+
is_dir = stat_hf["type"] == "directory"
|
|
356
|
+
exists = True
|
|
357
|
+
if not is_dir:
|
|
358
|
+
cloud_mts = stat_hf["last_commit"].date.timestamp()
|
|
359
|
+
except FileNotFoundError:
|
|
360
|
+
exists = False
|
|
361
|
+
else:
|
|
362
|
+
# perform only one network request to check existence, type and timestamp
|
|
363
|
+
try:
|
|
364
|
+
cloud_mts = self.modified.timestamp()
|
|
365
|
+
is_dir = False
|
|
366
|
+
exists = True
|
|
367
|
+
except FileNotFoundError:
|
|
368
|
+
exists = False
|
|
369
|
+
except IsADirectoryError:
|
|
370
|
+
is_dir = True
|
|
371
|
+
exists = True
|
|
359
372
|
|
|
360
373
|
if not exists:
|
|
361
374
|
warn_or_error = f"The original path {self} does not exist anymore."
|
|
@@ -373,14 +386,18 @@ def synchronize(
|
|
|
373
386
|
# synchronization logic for directories
|
|
374
387
|
if is_dir:
|
|
375
388
|
files = self.fs.find(str(self), detail=True)
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
389
|
+
if protocol == "s3":
|
|
390
|
+
get_modified = lambda file_stat: file_stat["LastModified"]
|
|
391
|
+
elif protocol == "gs":
|
|
392
|
+
get_modified = lambda file_stat: file_stat["mtime"]
|
|
393
|
+
elif protocol == "hf":
|
|
394
|
+
get_modified = lambda file_stat: file_stat["last_commit"].date
|
|
395
|
+
else:
|
|
396
|
+
raise ValueError(f"Can't synchronize a directory for {protocol}.")
|
|
380
397
|
if objectpath.exists():
|
|
381
398
|
destination_exists = True
|
|
382
399
|
cloud_mts_max = max(
|
|
383
|
-
file
|
|
400
|
+
get_modified(file) for file in files.values()
|
|
384
401
|
).timestamp()
|
|
385
402
|
local_mts = [
|
|
386
403
|
file.stat().st_mtime for file in objectpath.rglob("*") if file.is_file()
|
|
@@ -405,9 +422,8 @@ def synchronize(
|
|
|
405
422
|
for file, stat in callback.wrap(files.items()):
|
|
406
423
|
file_key = PurePosixPath(file).relative_to(self.path).as_posix()
|
|
407
424
|
origin_file_keys.append(file_key)
|
|
408
|
-
timestamp = stat
|
|
409
|
-
|
|
410
|
-
origin = f"{self.protocol}://{file}"
|
|
425
|
+
timestamp = get_modified(stat).timestamp()
|
|
426
|
+
origin = f"{protocol}://{file}"
|
|
411
427
|
destination = objectpath / file_key
|
|
412
428
|
child = callback.branched(origin, destination.as_posix())
|
|
413
429
|
UPath(origin, **self.storage_options).synchronize(
|
|
@@ -439,6 +455,10 @@ def synchronize(
|
|
|
439
455
|
objectpath.parent.mkdir(parents=True, exist_ok=True)
|
|
440
456
|
need_synchronize = True
|
|
441
457
|
if need_synchronize:
|
|
458
|
+
# hf has sync filesystem
|
|
459
|
+
# on sync filesystems ChildProgressCallback.branched()
|
|
460
|
+
# returns the default callback
|
|
461
|
+
# this is why a difference between s3 and hf in progress bars
|
|
442
462
|
self.download_to(
|
|
443
463
|
objectpath, recursive=False, print_progress=False, callback=callback
|
|
444
464
|
)
|
|
@@ -689,6 +709,12 @@ Args:
|
|
|
689
709
|
pathlike: A string or Path to a local/cloud file/directory/folder.
|
|
690
710
|
"""
|
|
691
711
|
|
|
712
|
+
# suppress the warning from upath about hf (huggingface) filesystem
|
|
713
|
+
# not being explicitly implemented in upath
|
|
714
|
+
warnings.filterwarnings(
|
|
715
|
+
"ignore", module="upath", message=".*'hf' filesystem not explicitly implemented.*"
|
|
716
|
+
)
|
|
717
|
+
|
|
692
718
|
|
|
693
719
|
def create_path(path: UPath, access_token: str | None = None) -> UPath:
|
|
694
720
|
path = UPath(path)
|
|
@@ -698,38 +724,60 @@ def create_path(path: UPath, access_token: str | None = None) -> UPath:
|
|
|
698
724
|
return get_aws_credentials_manager().enrich_path(path, access_token)
|
|
699
725
|
|
|
700
726
|
|
|
701
|
-
def get_stat_file_cloud(stat: dict) -> tuple[int, str, str]:
|
|
727
|
+
def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
|
|
702
728
|
size = stat["size"]
|
|
703
|
-
|
|
704
|
-
#
|
|
705
|
-
if "
|
|
706
|
-
#
|
|
707
|
-
|
|
708
|
-
# we can add more logic later down-the-road
|
|
709
|
-
hash = b16_to_b64(etag)
|
|
729
|
+
hash, hash_type = None, None
|
|
730
|
+
# gs, use md5Hash instead of etag for now
|
|
731
|
+
if "md5Hash" in stat:
|
|
732
|
+
# gs hash is already in base64
|
|
733
|
+
hash = stat["md5Hash"].strip('"=')
|
|
710
734
|
hash_type = "md5"
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
735
|
+
# hf
|
|
736
|
+
elif "blob_id" in stat:
|
|
737
|
+
hash = b16_to_b64(stat["blob_id"])
|
|
738
|
+
hash_type = "sha1"
|
|
739
|
+
# s3
|
|
740
|
+
elif "ETag" in stat:
|
|
741
|
+
etag = stat["ETag"]
|
|
742
|
+
# small files
|
|
743
|
+
if "-" not in etag:
|
|
744
|
+
# only store hash for non-multipart uploads
|
|
745
|
+
# we can't rapidly validate multi-part uploaded files client-side
|
|
746
|
+
# we can add more logic later down-the-road
|
|
747
|
+
hash = b16_to_b64(etag)
|
|
748
|
+
hash_type = "md5"
|
|
749
|
+
else:
|
|
750
|
+
stripped_etag, suffix = etag.split("-")
|
|
751
|
+
suffix = suffix.strip('"')
|
|
752
|
+
hash = b16_to_b64(stripped_etag)
|
|
753
|
+
hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
|
|
754
|
+
if hash is not None:
|
|
755
|
+
hash = hash[:HASH_LENGTH]
|
|
756
|
+
return size, hash, hash_type
|
|
717
757
|
|
|
718
758
|
|
|
719
|
-
def get_stat_dir_cloud(path: UPath) -> tuple[int, str, str, int]:
|
|
720
|
-
sizes = []
|
|
721
|
-
md5s = []
|
|
759
|
+
def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
|
|
722
760
|
objects = path.fs.find(path.as_posix(), detail=True)
|
|
761
|
+
hash, hash_type = None, None
|
|
762
|
+
compute_list_hash = True
|
|
723
763
|
if path.protocol == "s3":
|
|
724
764
|
accessor = "ETag"
|
|
725
765
|
elif path.protocol == "gs":
|
|
726
766
|
accessor = "md5Hash"
|
|
767
|
+
elif path.protocol == "hf":
|
|
768
|
+
accessor = "blob_id"
|
|
769
|
+
else:
|
|
770
|
+
compute_list_hash = False
|
|
771
|
+
sizes = []
|
|
772
|
+
hashes = []
|
|
727
773
|
for object in objects.values():
|
|
728
774
|
sizes.append(object["size"])
|
|
729
|
-
|
|
775
|
+
if compute_list_hash:
|
|
776
|
+
hashes.append(object[accessor].strip('"='))
|
|
730
777
|
size = sum(sizes)
|
|
731
|
-
|
|
732
|
-
|
|
778
|
+
n_objects = len(sizes)
|
|
779
|
+
if compute_list_hash:
|
|
780
|
+
hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
|
|
733
781
|
return size, hash, hash_type, n_objects
|
|
734
782
|
|
|
735
783
|
|
|
{lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: lamindb_setup
|
|
3
|
-
Version: 0.80.1
|
|
3
|
+
Version: 0.81.1
|
|
4
4
|
Summary: Setup & configure LaminDB.
|
|
5
5
|
Author-email: Lamin Labs <open-source@lamin.ai>
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -19,7 +19,7 @@ Requires-Dist: supabase==2.2.1
|
|
|
19
19
|
Requires-Dist: psutil
|
|
20
20
|
Requires-Dist: urllib3<2 ; extra == "aws"
|
|
21
21
|
Requires-Dist: aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == "aws"
|
|
22
|
-
Requires-Dist: s3fs>=2023.12.2,<=2024.
|
|
22
|
+
Requires-Dist: s3fs>=2023.12.2,<=2024.10.0 ; extra == "aws"
|
|
23
23
|
Requires-Dist: line_profiler ; extra == "dev"
|
|
24
24
|
Requires-Dist: pyjwt<3.0.0 ; extra == "dev"
|
|
25
25
|
Requires-Dist: psycopg2-binary ; extra == "dev"
|
|
@@ -31,7 +31,7 @@ Requires-Dist: pytest-xdist ; extra == "dev"
|
|
|
31
31
|
Requires-Dist: nbproject-test>=0.4.3 ; extra == "dev"
|
|
32
32
|
Requires-Dist: pandas ; extra == "dev"
|
|
33
33
|
Requires-Dist: django-schema-graph ; extra == "erdiagram"
|
|
34
|
-
Requires-Dist: gcsfs>=2023.12.2,<=2024.
|
|
34
|
+
Requires-Dist: gcsfs>=2023.12.2,<=2024.10.0 ; extra == "gcp"
|
|
35
35
|
Project-URL: Home, https://github.com/laminlabs/lamindb-setup
|
|
36
36
|
Provides-Extra: aws
|
|
37
37
|
Provides-Extra: dev
|
|
{lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/RECORD
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
lamindb_setup/__init__.py,sha256=
|
|
1
|
+
lamindb_setup/__init__.py,sha256=dFAjI0r6-lM4WoJ3n0uo6iCdKHvnV-20y76RufgIFCQ,1714
|
|
2
2
|
lamindb_setup/_cache.py,sha256=1XnM-V_KprbjpgPY7Bg3FYn53Iz_2_fEgcMOaSdKKbg,1332
|
|
3
3
|
lamindb_setup/_check.py,sha256=28PcG8Kp6OpjSLSi1r2boL2Ryeh6xkaCL87HFbjs6GA,129
|
|
4
4
|
lamindb_setup/_check_setup.py,sha256=6cSfpmVOSgU7YiVHfJpBTGTQ7rrnwunt1pJT_jkgNM8,3196
|
|
@@ -31,17 +31,17 @@ lamindb_setup/core/_settings.py,sha256=mpGsSb98UsBedLsW2RuowZ17EP2tI2XRGPztqrJtr
|
|
|
31
31
|
lamindb_setup/core/_settings_instance.py,sha256=ajcq9zRNE598tTqyMkMqaEOubVfFeE998DPtbgyzK3A,18801
|
|
32
32
|
lamindb_setup/core/_settings_load.py,sha256=5OpghcbkrK9KBM_0Iu-61FTI76UbOpPkkJpUittXS-w,4098
|
|
33
33
|
lamindb_setup/core/_settings_save.py,sha256=rxGxgaK5i9exKqSJERQQyY1WZio20meoQJoYXlVW-1w,3138
|
|
34
|
-
lamindb_setup/core/_settings_storage.py,sha256=
|
|
34
|
+
lamindb_setup/core/_settings_storage.py,sha256=CYwGZm0fKYN7eLLsU-sOtOKG7HzswQVjTWb0ooHKcNg,11990
|
|
35
35
|
lamindb_setup/core/_settings_store.py,sha256=WcsgOmgnu9gztcrhp-N4OONNZyxICHV8M0HdJllTaEo,2219
|
|
36
36
|
lamindb_setup/core/_settings_user.py,sha256=iz0MqFLKXqm8LYx_CHmr02_oNvYWFLIxKkJLdpS5W08,1476
|
|
37
37
|
lamindb_setup/core/_setup_bionty_sources.py,sha256=o2L5Ww8TKgSqJtL4cGUcpJwLNYxA9BZgddhCMCu_E2g,3428
|
|
38
38
|
lamindb_setup/core/cloud_sqlite_locker.py,sha256=i6TrT7HG0lqliPvZTlsZ_uplPaqhPBbabyfeR32SkA8,7107
|
|
39
39
|
lamindb_setup/core/django.py,sha256=E4U9nUlV2kHd-G5v6iSdFGAAWixlQDxOFwMwOMG9xfw,3864
|
|
40
40
|
lamindb_setup/core/exceptions.py,sha256=4NpLUNUIfXYVTFX2FvLZF8RW34exk2Vn2X3G4YhnTRg,276
|
|
41
|
-
lamindb_setup/core/hashing.py,sha256=
|
|
41
|
+
lamindb_setup/core/hashing.py,sha256=26dtak7XgmrWa_D1zuDyxObRQcriMtnc1yEigkKASmM,3142
|
|
42
42
|
lamindb_setup/core/types.py,sha256=zJii2le38BJUmsNVvzDrbzGYr0yaeb-9Rw9IKmsBr3k,523
|
|
43
|
-
lamindb_setup/core/upath.py,sha256=
|
|
44
|
-
lamindb_setup-0.
|
|
45
|
-
lamindb_setup-0.
|
|
46
|
-
lamindb_setup-0.
|
|
47
|
-
lamindb_setup-0.
|
|
43
|
+
lamindb_setup/core/upath.py,sha256=4rCC0EOe-UaCexRDReQ5D_zs13Zi626pqn1slYNvISE,28945
|
|
44
|
+
lamindb_setup-0.81.1.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
|
|
45
|
+
lamindb_setup-0.81.1.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
|
|
46
|
+
lamindb_setup-0.81.1.dist-info/METADATA,sha256=yvD9AXAL_lFbagwrcvhId-DMCbgFHtWxZ8fVxobspDM,1745
|
|
47
|
+
lamindb_setup-0.81.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|