lamindb_setup 0.80.1__py3-none-any.whl → 0.81.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +1 -1
- lamindb_setup/core/_settings_storage.py +5 -2
- lamindb_setup/core/hashing.py +3 -3
- lamindb_setup/core/upath.py +80 -39
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.0.dist-info}/METADATA +3 -3
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.0.dist-info}/RECORD +8 -8
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.0.dist-info}/LICENSE +0 -0
- {lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.0.dist-info}/WHEEL +0 -0
lamindb_setup/__init__.py
CHANGED
|
@@ -114,13 +114,15 @@ def init_storage(
|
|
|
114
114
|
root_str = f"s3://lamin-{region}/{uid}"
|
|
115
115
|
else:
|
|
116
116
|
root_str = f"s3://lamin-hosted-test/{uid}"
|
|
117
|
-
elif root_str.startswith(("gs://", "s3://")):
|
|
117
|
+
elif root_str.startswith(("gs://", "s3://", "hf://")):
|
|
118
118
|
pass
|
|
119
119
|
else: # local path
|
|
120
120
|
try:
|
|
121
121
|
_ = Path(root_str)
|
|
122
122
|
except Exception as e:
|
|
123
|
-
logger.error(
|
|
123
|
+
logger.error(
|
|
124
|
+
"`storage` is not a valid local, GCP storage, AWS S3 path or Hugging Face path"
|
|
125
|
+
)
|
|
124
126
|
raise e
|
|
125
127
|
ssettings = StorageSettings(
|
|
126
128
|
uid=uid,
|
|
@@ -161,6 +163,7 @@ def init_storage(
|
|
|
161
163
|
# only newly created
|
|
162
164
|
if hub_record_status == "hub-record-created" and ssettings._uuid is not None:
|
|
163
165
|
delete_storage_record(ssettings._uuid, access_token=access_token) # type: ignore
|
|
166
|
+
hub_record_status = "hub-record-not-created"
|
|
164
167
|
ssettings._instance_id = None
|
|
165
168
|
return ssettings, hub_record_status
|
|
166
169
|
|
lamindb_setup/core/hashing.py
CHANGED
|
@@ -47,14 +47,14 @@ def hash_set(s: set[str]) -> str:
|
|
|
47
47
|
return to_b64_str(hashlib.md5(bstr).digest())[:HASH_LENGTH]
|
|
48
48
|
|
|
49
49
|
|
|
50
|
-
def
|
|
50
|
+
def hash_from_hashes_list(hashes: Iterable[str]) -> str:
|
|
51
51
|
# need to sort below because we don't want the order of parsing the dir to
|
|
52
52
|
# affect the hash
|
|
53
53
|
digests = b"".join(
|
|
54
54
|
hashlib.md5(hash.encode("utf-8")).digest() for hash in sorted(hashes)
|
|
55
55
|
)
|
|
56
56
|
digest = hashlib.md5(digests).digest()
|
|
57
|
-
return to_b64_str(digest)[:HASH_LENGTH]
|
|
57
|
+
return to_b64_str(digest)[:HASH_LENGTH]
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
def hash_code(file_path: UPathStr):
|
|
@@ -110,7 +110,7 @@ def hash_dir(path: Path):
|
|
|
110
110
|
hashes_sizes = map(hash_size, files)
|
|
111
111
|
hashes, sizes = zip(*hashes_sizes)
|
|
112
112
|
|
|
113
|
-
hash, hash_type =
|
|
113
|
+
hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
|
|
114
114
|
n_objects = len(hashes)
|
|
115
115
|
size = sum(sizes)
|
|
116
116
|
return size, hash, hash_type, n_objects
|
lamindb_setup/core/upath.py
CHANGED
|
@@ -18,7 +18,7 @@ from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
|
|
|
18
18
|
from upath.implementations.local import LocalPath
|
|
19
19
|
|
|
20
20
|
from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
|
|
21
|
-
from .hashing import HASH_LENGTH, b16_to_b64,
|
|
21
|
+
from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list
|
|
22
22
|
|
|
23
23
|
if TYPE_CHECKING:
|
|
24
24
|
from .types import UPathStr
|
|
@@ -340,22 +340,34 @@ def synchronize(
|
|
|
340
340
|
timestamp: float | None = None,
|
|
341
341
|
):
|
|
342
342
|
"""Sync to a local destination path."""
|
|
343
|
+
protocol = self.protocol
|
|
343
344
|
# optimize the number of network requests
|
|
344
345
|
if timestamp is not None:
|
|
345
346
|
is_dir = False
|
|
346
347
|
exists = True
|
|
347
348
|
cloud_mts = timestamp
|
|
348
349
|
else:
|
|
349
|
-
#
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
350
|
+
# hf requires special treatment
|
|
351
|
+
if protocol == "hf":
|
|
352
|
+
try:
|
|
353
|
+
stat_hf = self.stat().as_info()
|
|
354
|
+
is_dir = stat_hf["type"] == "directory"
|
|
355
|
+
exists = True
|
|
356
|
+
if not is_dir:
|
|
357
|
+
cloud_mts = stat_hf["last_commit"].date.timestamp()
|
|
358
|
+
except FileNotFoundError:
|
|
359
|
+
exists = False
|
|
360
|
+
else:
|
|
361
|
+
# perform only one network request to check existence, type and timestamp
|
|
362
|
+
try:
|
|
363
|
+
cloud_mts = self.modified.timestamp()
|
|
364
|
+
is_dir = False
|
|
365
|
+
exists = True
|
|
366
|
+
except FileNotFoundError:
|
|
367
|
+
exists = False
|
|
368
|
+
except IsADirectoryError:
|
|
369
|
+
is_dir = True
|
|
370
|
+
exists = True
|
|
359
371
|
|
|
360
372
|
if not exists:
|
|
361
373
|
warn_or_error = f"The original path {self} does not exist anymore."
|
|
@@ -373,14 +385,18 @@ def synchronize(
|
|
|
373
385
|
# synchronization logic for directories
|
|
374
386
|
if is_dir:
|
|
375
387
|
files = self.fs.find(str(self), detail=True)
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
388
|
+
if protocol == "s3":
|
|
389
|
+
get_modified = lambda file_stat: file_stat["LastModified"]
|
|
390
|
+
elif protocol == "gs":
|
|
391
|
+
get_modified = lambda file_stat: file_stat["mtime"]
|
|
392
|
+
elif protocol == "hf":
|
|
393
|
+
get_modified = lambda file_stat: file_stat["last_commit"].date
|
|
394
|
+
else:
|
|
395
|
+
raise ValueError(f"Can't synchronize a directory for {protocol}.")
|
|
380
396
|
if objectpath.exists():
|
|
381
397
|
destination_exists = True
|
|
382
398
|
cloud_mts_max = max(
|
|
383
|
-
file
|
|
399
|
+
get_modified(file) for file in files.values()
|
|
384
400
|
).timestamp()
|
|
385
401
|
local_mts = [
|
|
386
402
|
file.stat().st_mtime for file in objectpath.rglob("*") if file.is_file()
|
|
@@ -405,9 +421,8 @@ def synchronize(
|
|
|
405
421
|
for file, stat in callback.wrap(files.items()):
|
|
406
422
|
file_key = PurePosixPath(file).relative_to(self.path).as_posix()
|
|
407
423
|
origin_file_keys.append(file_key)
|
|
408
|
-
timestamp = stat
|
|
409
|
-
|
|
410
|
-
origin = f"{self.protocol}://{file}"
|
|
424
|
+
timestamp = get_modified(stat).timestamp()
|
|
425
|
+
origin = f"{protocol}://{file}"
|
|
411
426
|
destination = objectpath / file_key
|
|
412
427
|
child = callback.branched(origin, destination.as_posix())
|
|
413
428
|
UPath(origin, **self.storage_options).synchronize(
|
|
@@ -439,6 +454,10 @@ def synchronize(
|
|
|
439
454
|
objectpath.parent.mkdir(parents=True, exist_ok=True)
|
|
440
455
|
need_synchronize = True
|
|
441
456
|
if need_synchronize:
|
|
457
|
+
# hf has sync filesystem
|
|
458
|
+
# on sync filesystems ChildProgressCallback.branched()
|
|
459
|
+
# returns the default callback
|
|
460
|
+
# this is why a difference between s3 and hf in progress bars
|
|
442
461
|
self.download_to(
|
|
443
462
|
objectpath, recursive=False, print_progress=False, callback=callback
|
|
444
463
|
)
|
|
@@ -698,38 +717,60 @@ def create_path(path: UPath, access_token: str | None = None) -> UPath:
|
|
|
698
717
|
return get_aws_credentials_manager().enrich_path(path, access_token)
|
|
699
718
|
|
|
700
719
|
|
|
701
|
-
def get_stat_file_cloud(stat: dict) -> tuple[int, str, str]:
|
|
720
|
+
def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
|
|
702
721
|
size = stat["size"]
|
|
703
|
-
|
|
704
|
-
#
|
|
705
|
-
if "
|
|
706
|
-
#
|
|
707
|
-
|
|
708
|
-
# we can add more logic later down-the-road
|
|
709
|
-
hash = b16_to_b64(etag)
|
|
722
|
+
hash, hash_type = None, None
|
|
723
|
+
# gs, use md5Hash instead of etag for now
|
|
724
|
+
if "md5Hash" in stat:
|
|
725
|
+
# gs hash is already in base64
|
|
726
|
+
hash = stat["md5Hash"].strip('"=')
|
|
710
727
|
hash_type = "md5"
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
728
|
+
# hf
|
|
729
|
+
elif "blob_id" in stat:
|
|
730
|
+
hash = b16_to_b64(stat["blob_id"])
|
|
731
|
+
hash_type = "sha1"
|
|
732
|
+
# s3
|
|
733
|
+
elif "ETag" in stat:
|
|
734
|
+
etag = stat["ETag"]
|
|
735
|
+
# small files
|
|
736
|
+
if "-" not in etag:
|
|
737
|
+
# only store hash for non-multipart uploads
|
|
738
|
+
# we can't rapidly validate multi-part uploaded files client-side
|
|
739
|
+
# we can add more logic later down-the-road
|
|
740
|
+
hash = b16_to_b64(etag)
|
|
741
|
+
hash_type = "md5"
|
|
742
|
+
else:
|
|
743
|
+
stripped_etag, suffix = etag.split("-")
|
|
744
|
+
suffix = suffix.strip('"')
|
|
745
|
+
hash = b16_to_b64(stripped_etag)
|
|
746
|
+
hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
|
|
747
|
+
if hash is not None:
|
|
748
|
+
hash = hash[:HASH_LENGTH]
|
|
749
|
+
return size, hash, hash_type
|
|
717
750
|
|
|
718
751
|
|
|
719
|
-
def get_stat_dir_cloud(path: UPath) -> tuple[int, str, str, int]:
|
|
720
|
-
sizes = []
|
|
721
|
-
md5s = []
|
|
752
|
+
def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
|
|
722
753
|
objects = path.fs.find(path.as_posix(), detail=True)
|
|
754
|
+
hash, hash_type = None, None
|
|
755
|
+
compute_list_hash = True
|
|
723
756
|
if path.protocol == "s3":
|
|
724
757
|
accessor = "ETag"
|
|
725
758
|
elif path.protocol == "gs":
|
|
726
759
|
accessor = "md5Hash"
|
|
760
|
+
elif path.protocol == "hf":
|
|
761
|
+
accessor = "blob_id"
|
|
762
|
+
else:
|
|
763
|
+
compute_list_hash = False
|
|
764
|
+
sizes = []
|
|
765
|
+
hashes = []
|
|
727
766
|
for object in objects.values():
|
|
728
767
|
sizes.append(object["size"])
|
|
729
|
-
|
|
768
|
+
if compute_list_hash:
|
|
769
|
+
hashes.append(object[accessor].strip('"='))
|
|
730
770
|
size = sum(sizes)
|
|
731
|
-
|
|
732
|
-
|
|
771
|
+
n_objects = len(sizes)
|
|
772
|
+
if compute_list_hash:
|
|
773
|
+
hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
|
|
733
774
|
return size, hash, hash_type, n_objects
|
|
734
775
|
|
|
735
776
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: lamindb_setup
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.81.0
|
|
4
4
|
Summary: Setup & configure LaminDB.
|
|
5
5
|
Author-email: Lamin Labs <open-source@lamin.ai>
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -19,7 +19,7 @@ Requires-Dist: supabase==2.2.1
|
|
|
19
19
|
Requires-Dist: psutil
|
|
20
20
|
Requires-Dist: urllib3<2 ; extra == "aws"
|
|
21
21
|
Requires-Dist: aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == "aws"
|
|
22
|
-
Requires-Dist: s3fs>=2023.12.2,<=2024.
|
|
22
|
+
Requires-Dist: s3fs>=2023.12.2,<=2024.10.0 ; extra == "aws"
|
|
23
23
|
Requires-Dist: line_profiler ; extra == "dev"
|
|
24
24
|
Requires-Dist: pyjwt<3.0.0 ; extra == "dev"
|
|
25
25
|
Requires-Dist: psycopg2-binary ; extra == "dev"
|
|
@@ -31,7 +31,7 @@ Requires-Dist: pytest-xdist ; extra == "dev"
|
|
|
31
31
|
Requires-Dist: nbproject-test>=0.4.3 ; extra == "dev"
|
|
32
32
|
Requires-Dist: pandas ; extra == "dev"
|
|
33
33
|
Requires-Dist: django-schema-graph ; extra == "erdiagram"
|
|
34
|
-
Requires-Dist: gcsfs>=2023.12.2,<=2024.
|
|
34
|
+
Requires-Dist: gcsfs>=2023.12.2,<=2024.10.0 ; extra == "gcp"
|
|
35
35
|
Project-URL: Home, https://github.com/laminlabs/lamindb-setup
|
|
36
36
|
Provides-Extra: aws
|
|
37
37
|
Provides-Extra: dev
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
lamindb_setup/__init__.py,sha256=
|
|
1
|
+
lamindb_setup/__init__.py,sha256=Xm__DIYPUbJSjykjcT4bC-mR75b1GFYVGZ0owhVfCyI,1714
|
|
2
2
|
lamindb_setup/_cache.py,sha256=1XnM-V_KprbjpgPY7Bg3FYn53Iz_2_fEgcMOaSdKKbg,1332
|
|
3
3
|
lamindb_setup/_check.py,sha256=28PcG8Kp6OpjSLSi1r2boL2Ryeh6xkaCL87HFbjs6GA,129
|
|
4
4
|
lamindb_setup/_check_setup.py,sha256=6cSfpmVOSgU7YiVHfJpBTGTQ7rrnwunt1pJT_jkgNM8,3196
|
|
@@ -31,17 +31,17 @@ lamindb_setup/core/_settings.py,sha256=mpGsSb98UsBedLsW2RuowZ17EP2tI2XRGPztqrJtr
|
|
|
31
31
|
lamindb_setup/core/_settings_instance.py,sha256=ajcq9zRNE598tTqyMkMqaEOubVfFeE998DPtbgyzK3A,18801
|
|
32
32
|
lamindb_setup/core/_settings_load.py,sha256=5OpghcbkrK9KBM_0Iu-61FTI76UbOpPkkJpUittXS-w,4098
|
|
33
33
|
lamindb_setup/core/_settings_save.py,sha256=rxGxgaK5i9exKqSJERQQyY1WZio20meoQJoYXlVW-1w,3138
|
|
34
|
-
lamindb_setup/core/_settings_storage.py,sha256=
|
|
34
|
+
lamindb_setup/core/_settings_storage.py,sha256=CYwGZm0fKYN7eLLsU-sOtOKG7HzswQVjTWb0ooHKcNg,11990
|
|
35
35
|
lamindb_setup/core/_settings_store.py,sha256=WcsgOmgnu9gztcrhp-N4OONNZyxICHV8M0HdJllTaEo,2219
|
|
36
36
|
lamindb_setup/core/_settings_user.py,sha256=iz0MqFLKXqm8LYx_CHmr02_oNvYWFLIxKkJLdpS5W08,1476
|
|
37
37
|
lamindb_setup/core/_setup_bionty_sources.py,sha256=o2L5Ww8TKgSqJtL4cGUcpJwLNYxA9BZgddhCMCu_E2g,3428
|
|
38
38
|
lamindb_setup/core/cloud_sqlite_locker.py,sha256=i6TrT7HG0lqliPvZTlsZ_uplPaqhPBbabyfeR32SkA8,7107
|
|
39
39
|
lamindb_setup/core/django.py,sha256=E4U9nUlV2kHd-G5v6iSdFGAAWixlQDxOFwMwOMG9xfw,3864
|
|
40
40
|
lamindb_setup/core/exceptions.py,sha256=4NpLUNUIfXYVTFX2FvLZF8RW34exk2Vn2X3G4YhnTRg,276
|
|
41
|
-
lamindb_setup/core/hashing.py,sha256=
|
|
41
|
+
lamindb_setup/core/hashing.py,sha256=26dtak7XgmrWa_D1zuDyxObRQcriMtnc1yEigkKASmM,3142
|
|
42
42
|
lamindb_setup/core/types.py,sha256=zJii2le38BJUmsNVvzDrbzGYr0yaeb-9Rw9IKmsBr3k,523
|
|
43
|
-
lamindb_setup/core/upath.py,sha256=
|
|
44
|
-
lamindb_setup-0.
|
|
45
|
-
lamindb_setup-0.
|
|
46
|
-
lamindb_setup-0.
|
|
47
|
-
lamindb_setup-0.
|
|
43
|
+
lamindb_setup/core/upath.py,sha256=GD-EW9QSqJH023ox53hPDvjE86hFjXVhb0MSEU02HeY,28702
|
|
44
|
+
lamindb_setup-0.81.0.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
|
|
45
|
+
lamindb_setup-0.81.0.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
|
|
46
|
+
lamindb_setup-0.81.0.dist-info/METADATA,sha256=mEUNPAf6Q4NfqeO2YuUIlADbKkLQLOFFzL-BNkvk3dY,1745
|
|
47
|
+
lamindb_setup-0.81.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|