lamindb_setup 1.18.2__py3-none-any.whl → 1.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +4 -19
- lamindb_setup/_cache.py +87 -87
- lamindb_setup/_check.py +7 -7
- lamindb_setup/_check_setup.py +131 -131
- lamindb_setup/_connect_instance.py +443 -438
- lamindb_setup/_delete.py +155 -151
- lamindb_setup/_disconnect.py +38 -38
- lamindb_setup/_django.py +39 -39
- lamindb_setup/_entry_points.py +19 -19
- lamindb_setup/_init_instance.py +423 -429
- lamindb_setup/_migrate.py +331 -327
- lamindb_setup/_register_instance.py +32 -32
- lamindb_setup/_schema.py +27 -27
- lamindb_setup/_schema_metadata.py +451 -451
- lamindb_setup/_set_managed_storage.py +81 -80
- lamindb_setup/_setup_user.py +198 -198
- lamindb_setup/_silence_loggers.py +46 -46
- lamindb_setup/core/__init__.py +25 -34
- lamindb_setup/core/_aws_options.py +276 -266
- lamindb_setup/core/_aws_storage.py +57 -55
- lamindb_setup/core/_clone.py +50 -50
- lamindb_setup/core/_deprecated.py +62 -62
- lamindb_setup/core/_docs.py +14 -14
- lamindb_setup/core/_hub_client.py +288 -294
- lamindb_setup/core/_hub_core.py +0 -2
- lamindb_setup/core/_hub_crud.py +247 -247
- lamindb_setup/core/_hub_utils.py +100 -100
- lamindb_setup/core/_private_django_api.py +80 -80
- lamindb_setup/core/_settings.py +440 -434
- lamindb_setup/core/_settings_instance.py +32 -7
- lamindb_setup/core/_settings_load.py +162 -159
- lamindb_setup/core/_settings_save.py +108 -96
- lamindb_setup/core/_settings_storage.py +433 -433
- lamindb_setup/core/_settings_store.py +162 -92
- lamindb_setup/core/_settings_user.py +55 -55
- lamindb_setup/core/_setup_bionty_sources.py +44 -44
- lamindb_setup/core/cloud_sqlite_locker.py +240 -240
- lamindb_setup/core/django.py +414 -413
- lamindb_setup/core/exceptions.py +1 -1
- lamindb_setup/core/hashing.py +134 -134
- lamindb_setup/core/types.py +1 -1
- lamindb_setup/core/upath.py +1031 -1028
- lamindb_setup/errors.py +72 -70
- lamindb_setup/io.py +423 -416
- lamindb_setup/types.py +17 -17
- {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info}/METADATA +4 -2
- lamindb_setup-1.19.1.dist-info/RECORD +51 -0
- {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info}/WHEEL +1 -1
- {lamindb_setup-1.18.2.dist-info → lamindb_setup-1.19.1.dist-info/licenses}/LICENSE +201 -201
- lamindb_setup-1.18.2.dist-info/RECORD +0 -51
lamindb_setup/core/exceptions.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from lamindb_setup.errors import DefaultMessageException # backwards compatibility
|
|
1
|
+
from lamindb_setup.errors import DefaultMessageException # backwards compatibility
|
lamindb_setup/core/hashing.py
CHANGED
|
@@ -1,134 +1,134 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
"""Hashing.
|
|
4
|
-
|
|
5
|
-
.. autosummary::
|
|
6
|
-
:toctree: .
|
|
7
|
-
|
|
8
|
-
hash_set
|
|
9
|
-
hash_file
|
|
10
|
-
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
import base64
|
|
14
|
-
import hashlib
|
|
15
|
-
import json
|
|
16
|
-
from concurrent.futures import ThreadPoolExecutor
|
|
17
|
-
from typing import TYPE_CHECKING
|
|
18
|
-
|
|
19
|
-
import psutil
|
|
20
|
-
|
|
21
|
-
HASH_LENGTH = 22
|
|
22
|
-
|
|
23
|
-
if TYPE_CHECKING:
|
|
24
|
-
from collections.abc import Iterable
|
|
25
|
-
|
|
26
|
-
from lamindb_setup.types import Path, UPathStr
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def hash_and_encode_as_b62(s: str) -> str:
|
|
30
|
-
from lamin_utils._base62 import encodebytes
|
|
31
|
-
|
|
32
|
-
return encodebytes(hashlib.md5(s.encode()).digest())
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def to_b64_str(bstr: bytes):
|
|
36
|
-
b64 = base64.urlsafe_b64encode(bstr).decode().strip("=")
|
|
37
|
-
return b64
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def b16_to_b64(s: str):
|
|
41
|
-
return to_b64_str(base64.b16decode(s.strip('"'), casefold=True))
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
# a lot to read about this: lamin-notes/2022/hashing
|
|
45
|
-
def hash_set(s: set[str]) -> str:
|
|
46
|
-
join_s = ":".join(sorted(s))
|
|
47
|
-
return hash_string(join_s)[:HASH_LENGTH]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def hash_dict(d: dict) -> str:
|
|
51
|
-
return to_b64_str(hashlib.md5(json.dumps(d, sort_keys=True).encode()).digest())[
|
|
52
|
-
:HASH_LENGTH
|
|
53
|
-
]
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def hash_from_hashes_list(hashes: Iterable[str]) -> str:
|
|
57
|
-
# need to sort below because we don't want the order of parsing the dir to
|
|
58
|
-
# affect the hash
|
|
59
|
-
digests = b"".join(
|
|
60
|
-
hashlib.md5(hash.encode("utf-8")).digest() for hash in sorted(hashes)
|
|
61
|
-
)
|
|
62
|
-
digest = hashlib.md5(digests).digest()
|
|
63
|
-
return to_b64_str(digest)[:HASH_LENGTH]
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
# below is only used when comparing with git's sha1 hashes
|
|
67
|
-
# we don't use it for our own hashes
|
|
68
|
-
def hash_code(file_path: UPathStr) -> hashlib._Hash:
|
|
69
|
-
with open(file_path, "rb") as fp:
|
|
70
|
-
data = fp.read()
|
|
71
|
-
data_size = len(data)
|
|
72
|
-
header = f"blob {data_size}\0".encode()
|
|
73
|
-
blob = header + data
|
|
74
|
-
return hashlib.sha1(blob)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def hash_small_bytes(data: bytes) -> str:
|
|
78
|
-
return to_b64_str(hashlib.md5(data).digest())
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
# this is equivalent with hash_file for small files
|
|
82
|
-
def hash_string(string: str) -> str:
|
|
83
|
-
# as we're truncating (not here) at 22 b64, we choose md5 over sha512
|
|
84
|
-
return to_b64_str(hashlib.md5(string.encode("utf-8")).digest())[:HASH_LENGTH]
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def hash_file(
|
|
88
|
-
file_path: Path,
|
|
89
|
-
file_size: int | None = None,
|
|
90
|
-
chunk_size: int | None = 50 * 1024 * 1024,
|
|
91
|
-
) -> tuple[int, str, str]:
|
|
92
|
-
with open(file_path, "rb") as fp:
|
|
93
|
-
if file_size is None:
|
|
94
|
-
fp.seek(0, 2)
|
|
95
|
-
file_size = fp.tell()
|
|
96
|
-
fp.seek(0, 0)
|
|
97
|
-
if chunk_size is None:
|
|
98
|
-
chunk_size = file_size
|
|
99
|
-
first_chunk = fp.read(chunk_size)
|
|
100
|
-
if file_size <= chunk_size:
|
|
101
|
-
digest = hashlib.md5(first_chunk).digest()
|
|
102
|
-
hash_type = "md5"
|
|
103
|
-
else:
|
|
104
|
-
fp.seek(-chunk_size, 2)
|
|
105
|
-
last_chunk = fp.read(chunk_size)
|
|
106
|
-
digest = hashlib.sha1(
|
|
107
|
-
hashlib.sha1(first_chunk).digest() + hashlib.sha1(last_chunk).digest()
|
|
108
|
-
).digest()
|
|
109
|
-
hash_type = "sha1-fl"
|
|
110
|
-
return file_size, to_b64_str(digest)[:HASH_LENGTH], hash_type
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def hash_dir(path: Path) -> tuple[int, str, str, int]:
|
|
114
|
-
files = (subpath for subpath in path.rglob("*") if subpath.is_file())
|
|
115
|
-
|
|
116
|
-
def hash_size(file):
|
|
117
|
-
size, hash, _ = hash_file(file)
|
|
118
|
-
return hash, size
|
|
119
|
-
|
|
120
|
-
try:
|
|
121
|
-
n_workers = len(psutil.Process().cpu_affinity())
|
|
122
|
-
except AttributeError:
|
|
123
|
-
n_workers = psutil.cpu_count()
|
|
124
|
-
if n_workers > 1:
|
|
125
|
-
with ThreadPoolExecutor(n_workers) as pool:
|
|
126
|
-
hashes_sizes = pool.map(hash_size, files)
|
|
127
|
-
else:
|
|
128
|
-
hashes_sizes = map(hash_size, files)
|
|
129
|
-
hashes, sizes = zip(*hashes_sizes, strict=False)
|
|
130
|
-
|
|
131
|
-
hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
|
|
132
|
-
n_files = len(hashes)
|
|
133
|
-
size = sum(sizes)
|
|
134
|
-
return size, hash, hash_type, n_files
|
|
1
|
+
"""Hashing.

.. autosummary::
   :toctree: .

   hash_set
   hash_file

"""

# NOTE: the docstring must come first — previously it followed the
# __future__ import and was therefore a plain string expression, leaving
# the module's ``__doc__`` empty. Future imports may follow a docstring.
from __future__ import annotations

import base64
import hashlib
import json
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING

import psutil

# number of base64 characters kept from each digest (~131 bits)
HASH_LENGTH = 22

if TYPE_CHECKING:
    from collections.abc import Iterable

    from lamindb_setup.types import Path, UPathStr
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def hash_and_encode_as_b62(s: str) -> str:
    """Return the md5 digest of ``s`` encoded as a base62 string."""
    # deferred import: lamin_utils is only needed by this helper
    from lamin_utils._base62 import encodebytes

    digest = hashlib.md5(s.encode()).digest()
    return encodebytes(digest)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def to_b64_str(bstr: bytes):
    """Return ``bstr`` as URL-safe base64 with the trailing ``=`` padding removed."""
    return base64.urlsafe_b64encode(bstr).decode().strip("=")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def b16_to_b64(s: str):
    """Re-encode a hex (base16) digest as unpadded URL-safe base64.

    Surrounding double quotes are stripped first; lowercase hex is accepted.
    """
    raw = base64.b16decode(s.strip('"'), casefold=True)
    return to_b64_str(raw)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# a lot to read about this: lamin-notes/2022/hashing
|
|
45
|
+
def hash_set(s: set[str]) -> str:
    """Order-independent hash of a set of strings."""
    members = sorted(s)  # canonical order so the hash is deterministic
    # hash_string already truncates to HASH_LENGTH; slicing again is a no-op
    return hash_string(":".join(members))[:HASH_LENGTH]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def hash_dict(d: dict) -> str:
    """Hash a JSON-serializable dict via its canonical JSON form."""
    # sort_keys makes the serialization independent of key insertion order
    canonical = json.dumps(d, sort_keys=True).encode()
    digest = hashlib.md5(canonical).digest()
    return to_b64_str(digest)[:HASH_LENGTH]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def hash_from_hashes_list(hashes: Iterable[str]) -> str:
    """Combine individual content hashes into a single truncated digest.

    The hashes are sorted first so the result does not depend on the order
    in which they were produced (e.g. directory traversal order).
    """
    # loop variable renamed `h` — the previous name `hash` shadowed the builtin
    digests = b"".join(
        hashlib.md5(h.encode("utf-8")).digest() for h in sorted(hashes)
    )
    digest = hashlib.md5(digests).digest()
    return to_b64_str(digest)[:HASH_LENGTH]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# below is only used when comparing with git's sha1 hashes
# we don't use it for our own hashes
def hash_code(file_path: UPathStr) -> hashlib._Hash:
    """Return the git blob sha1 object for the file at ``file_path``.

    Matches ``git hash-object``: sha1 over ``b"blob <size>\\0"`` + content.
    """
    with open(file_path, "rb") as fp:
        content = fp.read()
    sha = hashlib.sha1(f"blob {len(content)}\0".encode())
    sha.update(content)  # incremental update avoids concatenating a new blob
    return sha
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def hash_small_bytes(data: bytes) -> str:
    """md5-based hash of an in-memory bytes payload (not truncated)."""
    digest = hashlib.md5(data).digest()
    return to_b64_str(digest)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def hash_string(string: str) -> str:
    """Truncated md5 hash of a UTF-8 encoded string.

    Equivalent to :func:`hash_file` for contents that fit in a single chunk.
    """
    # md5 suffices: the digest is truncated to HASH_LENGTH b64 chars anyway,
    # so a longer digest (sha512) would buy nothing
    digest = hashlib.md5(string.encode("utf-8")).digest()
    return to_b64_str(digest)[:HASH_LENGTH]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def hash_file(
    file_path: Path,
    file_size: int | None = None,
    chunk_size: int | None = 50 * 1024 * 1024,
) -> tuple[int, str, str]:
    """Hash a file's content.

    Small files (at most one chunk) get a plain md5 hash; larger files get a
    sha1 over the sha1 digests of their first and last chunks ("sha1-fl").

    Returns:
        Tuple of (file size in bytes, truncated b64 hash, hash type).
    """
    with open(file_path, "rb") as stream:
        if file_size is None:
            stream.seek(0, 2)  # seek to end to measure the size
            file_size = stream.tell()
            stream.seek(0, 0)
        if chunk_size is None:
            chunk_size = file_size  # read the whole file in one go
        head = stream.read(chunk_size)
        if file_size <= chunk_size:
            # whole file fits in one chunk: plain md5
            hash_type = "md5"
            digest = hashlib.md5(head).digest()
        else:
            # large file: hash only the first and last chunks ("fl")
            stream.seek(-chunk_size, 2)
            tail = stream.read(chunk_size)
            hash_type = "sha1-fl"
            digest = hashlib.sha1(
                hashlib.sha1(head).digest() + hashlib.sha1(tail).digest()
            ).digest()
    return file_size, to_b64_str(digest)[:HASH_LENGTH], hash_type
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def hash_dir(path: Path) -> tuple[int, str, str, int]:
    """Hash a directory tree.

    Hashes every file under ``path`` (in parallel when more than one CPU is
    available) and combines the per-file hashes order-independently.

    Returns:
        Tuple of (total size in bytes, hash, hash type ``"md5-d"``, file count).
    """
    files = (subpath for subpath in path.rglob("*") if subpath.is_file())

    def hash_size(file):
        # per-file (hash, size) pair
        size, file_hash, _ = hash_file(file)
        return file_hash, size

    try:
        # restrict to the CPUs this process may actually run on
        n_workers = len(psutil.Process().cpu_affinity())
    except AttributeError:  # cpu_affinity is unavailable on e.g. macOS
        # cpu_count() may return None; fall back to sequential hashing then
        n_workers = psutil.cpu_count() or 1
    if n_workers > 1:
        with ThreadPoolExecutor(n_workers) as pool:
            hashes_sizes = list(pool.map(hash_size, files))
    else:
        hashes_sizes = list(map(hash_size, files))
    if not hashes_sizes:
        # empty directory: previously crashed with ValueError when unpacking
        # an empty zip; return the hash of an empty hash list instead
        return 0, hash_from_hashes_list([]), "md5-d", 0
    hashes, sizes = zip(*hashes_sizes, strict=False)

    digest, hash_type = hash_from_hashes_list(hashes), "md5-d"
    n_files = len(hashes)
    size = sum(sizes)
    return size, digest, hash_type, n_files
|
lamindb_setup/core/types.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from lamindb_setup.types import UPathStr # backward compatibility
|
|
1
|
+
from lamindb_setup.types import UPathStr # backward compatibility
|