lamindb_setup 0.78.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +74 -0
- lamindb_setup/_cache.py +48 -0
- lamindb_setup/_check.py +7 -0
- lamindb_setup/_check_setup.py +92 -0
- lamindb_setup/_close.py +35 -0
- lamindb_setup/_connect_instance.py +429 -0
- lamindb_setup/_delete.py +141 -0
- lamindb_setup/_django.py +39 -0
- lamindb_setup/_entry_points.py +22 -0
- lamindb_setup/_exportdb.py +68 -0
- lamindb_setup/_importdb.py +50 -0
- lamindb_setup/_init_instance.py +411 -0
- lamindb_setup/_migrate.py +239 -0
- lamindb_setup/_register_instance.py +36 -0
- lamindb_setup/_schema.py +27 -0
- lamindb_setup/_schema_metadata.py +411 -0
- lamindb_setup/_set_managed_storage.py +55 -0
- lamindb_setup/_setup_user.py +137 -0
- lamindb_setup/_silence_loggers.py +44 -0
- lamindb_setup/core/__init__.py +21 -0
- lamindb_setup/core/_aws_credentials.py +151 -0
- lamindb_setup/core/_aws_storage.py +48 -0
- lamindb_setup/core/_deprecated.py +55 -0
- lamindb_setup/core/_docs.py +14 -0
- lamindb_setup/core/_hub_client.py +173 -0
- lamindb_setup/core/_hub_core.py +554 -0
- lamindb_setup/core/_hub_crud.py +211 -0
- lamindb_setup/core/_hub_utils.py +109 -0
- lamindb_setup/core/_private_django_api.py +88 -0
- lamindb_setup/core/_settings.py +184 -0
- lamindb_setup/core/_settings_instance.py +485 -0
- lamindb_setup/core/_settings_load.py +117 -0
- lamindb_setup/core/_settings_save.py +92 -0
- lamindb_setup/core/_settings_storage.py +350 -0
- lamindb_setup/core/_settings_store.py +75 -0
- lamindb_setup/core/_settings_user.py +55 -0
- lamindb_setup/core/_setup_bionty_sources.py +101 -0
- lamindb_setup/core/cloud_sqlite_locker.py +237 -0
- lamindb_setup/core/django.py +115 -0
- lamindb_setup/core/exceptions.py +10 -0
- lamindb_setup/core/hashing.py +116 -0
- lamindb_setup/core/types.py +17 -0
- lamindb_setup/core/upath.py +779 -0
- lamindb_setup-0.78.0.dist-info/LICENSE +201 -0
- lamindb_setup-0.78.0.dist-info/METADATA +47 -0
- lamindb_setup-0.78.0.dist-info/RECORD +47 -0
- lamindb_setup-0.78.0.dist-info/WHEEL +4 -0

lamindb_setup/core/cloud_sqlite_locker.py
@@ -0,0 +1,237 @@
```python
from __future__ import annotations

from datetime import datetime, timezone
from functools import wraps
from typing import TYPE_CHECKING

from lamin_utils import logger

from .upath import UPath, create_mapper, infer_filesystem

if TYPE_CHECKING:
    from pathlib import Path
    from uuid import UUID

    from ._settings_instance import InstanceSettings
    from ._settings_user import UserSettings

EXPIRATION_TIME = 24 * 60 * 60 * 7  # 7 days

MAX_MSG_COUNTER = 100  # print the msg after this number of iterations


# raise if an instance is already locked
# ignored by unlock_cloud_sqlite_upon_exception
class InstanceLockedException(Exception):
    pass


class empty_locker:
    has_lock = True

    @classmethod
    def lock(cls):
        pass

    @classmethod
    def unlock(cls):
        pass


class Locker:
    def __init__(self, user_uid: str, storage_root: UPath | Path, instance_id: UUID):
        logger.debug(
            f"init cloud sqlite locker: {user_uid}, {storage_root}, {instance_id}."
        )

        self._counter = 0

        self.user = user_uid
        self.instance_id = instance_id

        self.root = storage_root
        self.fs, root_str = infer_filesystem(storage_root)

        exclusion_path = storage_root / f".lamindb/_exclusion/{instance_id.hex}"

        self.mapper = create_mapper(self.fs, str(exclusion_path), create=True)

        priorities_path = str(exclusion_path / "priorities")
        if self.fs.exists(priorities_path):
            self.users = self.mapper["priorities"].decode().split("*")

            if self.user not in self.users:
                self.priority = len(self.users)
                self.users.append(self.user)
                # potential problem here if 2 users join at the same time
                # can be avoided by using separate files for each user
                # and giving priority by timestamp
                # here writing the whole list back because gcs
                # does not support the append mode
                self.mapper["priorities"] = "*".join(self.users).encode()
            else:
                self.priority = self.users.index(self.user)
        else:
            self.mapper["priorities"] = self.user.encode()
            self.users = [self.user]
            self.priority = 0

        self.mapper[f"numbers/{self.user}"] = b"0"
        self.mapper[f"entering/{self.user}"] = b"0"

        # clean up failures
        for user in self.users:
            for endpoint in ("numbers", "entering"):
                user_endpoint = f"{endpoint}/{user}"
                user_path = str(exclusion_path / user_endpoint)
                if not self.fs.exists(user_path):
                    continue
                if self.mapper[user_endpoint] == b"0":
                    continue
                period = (datetime.now() - self.modified(user_path)).total_seconds()
                if period > EXPIRATION_TIME:
                    logger.info(
                        f"the lock of the user {user} seems to be stale, clearing"
                        f" {endpoint}."
                    )
                    self.mapper[user_endpoint] = b"0"

        self._has_lock = None
        self._locked_by = None

    def modified(self, path):
        mtime = self.fs.modified(path)
        # always convert to the local timezone before returning
        # assume in utc if the time zone is not specified
        if mtime.tzinfo is None:
            mtime = mtime.replace(tzinfo=timezone.utc)
        return mtime.astimezone().replace(tzinfo=None)

    def _msg_on_counter(self, user):
        if self._counter == MAX_MSG_COUNTER:
            logger.warning(f"competing for the lock with the user {user}.")

        if self._counter <= MAX_MSG_COUNTER:
            self._counter += 1

    # Lamport's bakery algorithm
    def _lock_unsafe(self):
        if self._has_lock:
            return None

        self._has_lock = True
        self._locked_by = self.user

        self.users = self.mapper["priorities"].decode().split("*")

        self.mapper[f"entering/{self.user}"] = b"1"

        numbers = [int(self.mapper[f"numbers/{user}"]) for user in self.users]
        number = 1 + max(numbers)
        self.mapper[f"numbers/{self.user}"] = str(number).encode()

        self.mapper[f"entering/{self.user}"] = b"0"

        for i, user in enumerate(self.users):
            if i == self.priority:
                continue

            while self.mapper[f"entering/{user}"] == b"1":
                self._msg_on_counter(user)

            c_number = int(self.mapper[f"numbers/{user}"])

            if c_number == 0:
                continue

            if (number > c_number) or (number == c_number and self.priority > i):
                self._has_lock = False
                self._locked_by = user
                self.mapper[f"numbers/{self.user}"] = b"0"
                return None

    def lock(self):
        try:
            self._lock_unsafe()
        except BaseException as e:
            self.unlock()
            self._clear()
            raise e

    def unlock(self):
        self.mapper[f"numbers/{self.user}"] = b"0"
        self._has_lock = None
        self._locked_by = None
        self._counter = 0

    def _clear(self):
        self.mapper[f"entering/{self.user}"] = b"0"

    @property
    def has_lock(self):
        if self._has_lock is None:
            logger.info("the lock has not been initialized, trying to obtain the lock.")
            self.lock()

        return self._has_lock


_locker: Locker | None = None


def get_locker(
    isettings: InstanceSettings, usettings: UserSettings | None = None
) -> Locker:
    from ._settings import settings

    global _locker

    user_uid = settings.user.uid if usettings is None else usettings.uid
    storage_root = isettings.storage.root

    if (
        _locker is None
        or _locker.user != user_uid
        or _locker.root != storage_root
        or _locker.instance_id != isettings._id
    ):
        _locker = Locker(user_uid, storage_root, isettings._id)

    return _locker


def clear_locker():
    global _locker

    _locker = None


# decorator
def unlock_cloud_sqlite_upon_exception(ignore_prev_locker: bool = False):
    """Decorator to unlock a cloud sqlite instance upon an exception.

    Ignores `InstanceLockedException`.

    Args:
        ignore_prev_locker: `bool` - Do not unlock if locker hasn't changed.
    """

    def wrap_with_args(func):
        # https://stackoverflow.com/questions/1782843/python-decorator-handling-docstrings
        @wraps(func)
        def wrapper(*args, **kwargs):
            prev_locker = _locker
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                if isinstance(exc, InstanceLockedException):
                    raise exc
                if ignore_prev_locker and _locker is prev_locker:
                    raise exc
                if _locker is not None and _locker._has_lock:
                    _locker.unlock()
                raise exc

        return wrapper

    return wrap_with_args
```
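The locker implements Lamport's bakery algorithm over a cloud key-value mapper: each user takes a ticket number, and ties are broken by the priority index assigned when a user first joins. A minimal usage sketch follows; `isettings` is an assumption standing in for a connected instance's `InstanceSettings` and is not part of the diff above:

```python
# minimal sketch: guard a cloud SQLite write with the locker;
# `isettings` is assumed to be a connected instance's InstanceSettings
from lamindb_setup.core.cloud_sqlite_locker import (
    InstanceLockedException,
    get_locker,
)

locker = get_locker(isettings)  # cached at module level, rebuilt if user/root change
locker.lock()  # take a ticket; yields if another user holds a smaller one
if not locker.has_lock:
    raise InstanceLockedException(f"instance is locked by user {locker._locked_by}")
try:
    ...  # modify the local SQLite copy, then upload it to the storage root
finally:
    locker.unlock()  # reset our ticket to 0 so other users can enter
```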
lamindb_setup/core/django.py
@@ -0,0 +1,115 @@
```python
from __future__ import annotations

# flake8: noqa
import builtins
import os
from pathlib import Path
import time
from lamin_utils import logger
from ._settings_store import current_instance_settings_file
from ._settings_instance import InstanceSettings
import sys

IS_RUN_FROM_IPYTHON = getattr(builtins, "__IPYTHON__", False)
IS_SETUP = False
IS_MIGRATING = False
CONN_MAX_AGE = 299


def close_if_health_check_failed(self) -> None:
    if self.close_at is not None:
        if time.monotonic() >= self.close_at:
            self.close()
        self.close_at = time.monotonic() + CONN_MAX_AGE


# this bundles set up and migration management
def setup_django(
    isettings: InstanceSettings,
    deploy_migrations: bool = False,
    create_migrations: bool = False,
    configure_only: bool = False,
    init: bool = False,
    view_schema: bool = False,
):
    if IS_RUN_FROM_IPYTHON:
        os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

    import dj_database_url
    import django
    from django.conf import settings
    from django.core.management import call_command

    # configuration
    if not settings.configured:
        default_db = dj_database_url.config(
            env="LAMINDB_DJANGO_DATABASE_URL",
            default=isettings.db,
            # see comment next to patching BaseDatabaseWrapper below
            conn_max_age=CONN_MAX_AGE,
            conn_health_checks=True,
        )
        DATABASES = {
            "default": default_db,
        }
        from .._init_instance import get_schema_module_name

        schema_names = ["core"] + list(isettings.schema)
        installed_apps = [get_schema_module_name(n) for n in schema_names]
        if view_schema:
            installed_apps = installed_apps[::-1]  # to fix how apps appear
            installed_apps += ["schema_graph", "django.contrib.staticfiles"]

        kwargs = dict(
            INSTALLED_APPS=installed_apps,
            DATABASES=DATABASES,
            DEFAULT_AUTO_FIELD="django.db.models.BigAutoField",
            TIME_ZONE="UTC",
            USE_TZ=True,
        )
        if view_schema:
            kwargs.update(
                DEBUG=True,
                ROOT_URLCONF="lamindb_setup._schema",
                SECRET_KEY="dummy",
                TEMPLATES=[
                    {
                        "BACKEND": "django.template.backends.django.DjangoTemplates",
                        "APP_DIRS": True,
                    },
                ],
                STATIC_ROOT=f"{Path.home().as_posix()}/.lamin/",
                STATICFILES_FINDERS=[
                    "django.contrib.staticfiles.finders.AppDirectoriesFinder",
                ],
                STATIC_URL="static/",
            )
        settings.configure(**kwargs)
        django.setup(set_prefix=False)
        # https://laminlabs.slack.com/archives/C04FPE8V01W/p1698239551460289
        from django.db.backends.base.base import BaseDatabaseWrapper

        BaseDatabaseWrapper.close_if_health_check_failed = close_if_health_check_failed

    if configure_only:
        return None

    # migrations management
    if create_migrations:
        call_command("makemigrations")
        return None

    if deploy_migrations:
        call_command("migrate", verbosity=2)
        isettings._update_cloud_sqlite_file(unlock_cloud_sqlite=False)
    elif init:
        global IS_MIGRATING
        IS_MIGRATING = True
        call_command("migrate", verbosity=0)
        IS_MIGRATING = False

    global IS_SETUP
    IS_SETUP = True

    if isettings.keep_artifacts_local:
        isettings._search_local_root()
```
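`setup_django` bundles ORM configuration and migration management; the monkeypatched `close_if_health_check_failed` additionally re-arms `close_at` so long-lived connections get recycled after `CONN_MAX_AGE` seconds. A minimal sketch of the two common call patterns, again assuming `isettings` is a connected instance's `InstanceSettings`:

```python
# minimal sketch, assuming `isettings` is a connected instance's InstanceSettings
from lamindb_setup.core.django import setup_django

# configure Django against the instance's database without touching migrations,
# e.g. to issue ORM queries from a script
setup_django(isettings, configure_only=True)

# or: configure, deploy pending migrations, and update the cloud SQLite file
# setup_django(isettings, deploy_migrations=True)
```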
lamindb_setup/core/hashing.py
@@ -0,0 +1,116 @@
```python
from __future__ import annotations

"""Hashing.

.. autosummary::
   :toctree: .

   hash_set
   hash_file

"""

import base64
import hashlib
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING

import psutil

HASH_LENGTH = 22

if TYPE_CHECKING:
    from collections.abc import Iterable

    from .types import Path, UPathStr


def hash_and_encode_as_b62(s: str) -> str:
    from lamin_utils._base62 import encodebytes

    return encodebytes(hashlib.md5(s.encode()).digest())


def to_b64_str(bstr: bytes):
    b64 = base64.urlsafe_b64encode(bstr).decode().strip("=")
    return b64


def b16_to_b64(s: str):
    return to_b64_str(base64.b16decode(s.strip('"'), casefold=True))


# a lot to read about this: lamin-notes/2022/hashing
def hash_set(s: set[str]) -> str:
    bstr = ":".join(sorted(s)).encode("utf-8")
    # as we're truncating at 22 b64, we choose md5 over sha512
    return to_b64_str(hashlib.md5(bstr).digest())[:HASH_LENGTH]


def hash_md5s_from_dir(hashes: Iterable[str]) -> tuple[str, str]:
    # need to sort below because we don't want the order of parsing the dir to
    # affect the hash
    digests = b"".join(
        hashlib.md5(hash.encode("utf-8")).digest() for hash in sorted(hashes)
    )
    digest = hashlib.md5(digests).digest()
    return to_b64_str(digest)[:HASH_LENGTH], "md5-d"


def hash_code(file_path: UPathStr):
    with open(file_path, "rb") as fp:
        data = fp.read()
    data_size = len(data)
    header = f"blob {data_size}\0".encode()
    blob = header + data
    return hashlib.sha1(blob)


def hash_file(
    file_path: Path,
    file_size: int | None = None,
    chunk_size: int | None = 50 * 1024 * 1024,
) -> tuple[str, str]:
    with open(file_path, "rb") as fp:
        if file_size is None:
            fp.seek(0, 2)
            file_size = fp.tell()
            fp.seek(0, 0)
        if chunk_size is None:
            chunk_size = file_size
        first_chunk = fp.read(chunk_size)
        if file_size <= chunk_size:
            digest = hashlib.md5(first_chunk).digest()
            hash_type = "md5"
        else:
            fp.seek(-chunk_size, 2)
            last_chunk = fp.read(chunk_size)
            digest = hashlib.sha1(
                hashlib.sha1(first_chunk).digest() + hashlib.sha1(last_chunk).digest()
            ).digest()
            hash_type = "sha1-fl"
    return to_b64_str(digest)[:HASH_LENGTH], hash_type


def hash_dir(path: Path):
    files = (subpath for subpath in path.rglob("*") if subpath.is_file())

    def hash_size(file):
        file_size = file.stat().st_size
        return hash_file(file, file_size)[0], file_size

    try:
        n_workers = len(psutil.Process().cpu_affinity())
    except AttributeError:
        n_workers = psutil.cpu_count()
    if n_workers > 1:
        with ThreadPoolExecutor(n_workers) as pool:
            hashes_sizes = pool.map(hash_size, files)
    else:
        hashes_sizes = map(hash_size, files)
    hashes, sizes = zip(*hashes_sizes)

    hash, hash_type = hash_md5s_from_dir(hashes)
    n_objects = len(hashes)
    size = sum(sizes)
    return size, hash, hash_type, n_objects
```
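These helpers favor short identifiers over cryptographic strength: digests are truncated to 22 URL-safe base64 characters, files at or below the 50 MB chunk size get a plain `md5`, larger files a `sha1-fl` over their first and last chunks only, and directories an order-independent `md5-d` over the sorted per-file hashes. A runnable sketch:

```python
# runnable sketch of the hashing helpers on a throwaway file
import tempfile
from pathlib import Path

from lamindb_setup.core.hashing import hash_dir, hash_file, hash_set

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "data.bin"
    path.write_bytes(b"hello world")
    digest, hash_type = hash_file(path)
    print(digest, hash_type)  # 22-char digest, "md5" (file fits in one chunk)
    size, digest, hash_type, n_objects = hash_dir(Path(tmp))
    print(size, hash_type, n_objects)  # 11 "md5-d" 1

# set hashing is order-independent because elements are sorted before hashing
assert hash_set({"a", "b"}) == hash_set({"b", "a"})
```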
lamindb_setup/core/types.py
@@ -0,0 +1,17 @@
```python
from __future__ import annotations

"""Types.

.. autosummary::
   :toctree: .

   UPathStr
"""
# we need Union here because __future__ annotations doesn't work with TypeAlias
from pathlib import Path
from typing import Union

# UPath is subclass of Path, hence, it's not necessary to list UPath
# we keep it in the name of the TypeAlias to make it clear to users that
# cloud paths are allowed / PathStr is often associated with local paths
UPathStr = Union[str, Path]  # typing.TypeAlias, >3.10 on but already deprecated
```
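`UPathStr` is purely an annotation convenience: because `UPath` subclasses `Path`, the single alias covers plain strings, local paths, and cloud paths. A small illustration; the `describe` function is hypothetical:

```python
# hypothetical function annotated with the alias; all call styles are valid
from pathlib import Path

from lamindb_setup.core.types import UPathStr


def describe(path: UPathStr) -> str:
    # accepts "s3://bucket/key", "/tmp/file.csv", Path(...), or a UPath
    return f"path: {path}"


print(describe("s3://my-bucket/data.parquet"))
print(describe(Path.home()))
```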