lamindb_setup 0.81.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -232,21 +232,21 @@ class _ModelHandler:
  return related_fields

  def _get_field_metadata(self, model, field: Field):
- from lnschema_core.models import LinkORM
+ from lamindb.models import LinkORM

  internal_type = field.get_internal_type()
  model_name = field.model._meta.model_name
  relation_type = self._get_relation_type(model, field)
  if field.related_model is None:
- schema_name = field.model.__get_schema_name__()
+ schema_name = field.model.__get_module_name__()
  related_model_name = None
  related_schema_name = None
  related_field_name = None
  field_name = field.name
  else:
  related_model_name = field.related_model._meta.model_name
- related_schema_name = field.related_model.__get_schema_name__()
- schema_name = field.model.__get_schema_name__()
+ related_schema_name = field.related_model.__get_module_name__()
+ schema_name = field.model.__get_module_name__()
  related_field_name = field.remote_field.name
  field_name = field.name

@@ -273,14 +273,16 @@ class _ModelHandler:
  through = self._get_through(field)

  return FieldMetadata(
- schema_name=schema_name,
+ schema_name=schema_name if schema_name != "lamindb" else "core",
  model_name=model_name,
  field_name=field_name,
  type=internal_type,
  is_link_table=issubclass(field.model, LinkORM),
  column_name=column,
  relation_type=relation_type,
- related_schema_name=related_schema_name,
+ related_schema_name=related_schema_name
+ if related_schema_name != "lamindb"
+ else "core",
  related_model_name=related_model_name,
  related_field_name=related_field_name,
  through=through,
@@ -288,7 +290,7 @@ class _ModelHandler:

  @staticmethod
  def _get_through_many_to_many(field_or_rel: ManyToManyField | ManyToManyRel):
- from lnschema_core.models import Registry
+ from lamindb.models import Registry

  if isinstance(field_or_rel, ManyToManyField):
  if field_or_rel.model != Registry:
@@ -360,12 +362,12 @@ class _ModelHandler:

  class _SchemaHandler:
  def __init__(self) -> None:
- self.included_modules = ["core"] + list(settings.instance.schema)
+ self.included_modules = ["core"] + list(settings.instance.modules)
  self.modules = self._get_modules_metadata()

  def to_dict(self, include_django_objects: bool = True):
  return {
- module_name: {
+ module_name if module_name != "lamindb" else "core": {
  model_name: model.to_dict(include_django_objects)
  for model_name, model in module.items()
  }
@@ -376,7 +378,7 @@ class _SchemaHandler:
  return self.to_dict(include_django_objects=False)

  def _get_modules_metadata(self):
- from lnschema_core.models import Record, Registry
+ from lamindb.models import Record, Registry

  all_models = {
  module_name: {
@@ -389,7 +391,7 @@ class _SchemaHandler:
  if model.__class__ is Registry
  and model is not Record
  and not model._meta.abstract
- and model.__get_schema_name__() == module_name
+ and model.__get_module_name__() == module_name
  }
  for module_name in self.included_modules
  }
@@ -401,6 +403,8 @@ class _SchemaHandler:
  module_set_info = []
  for module_name in self.included_modules:
  module = self._get_schema_module(module_name)
+ if module_name == "lamindb":
+ module_name = "core"
  module_set_info.append(
  {"id": 0, "name": module_name, "version": module.__version__}
  )
@@ -3,6 +3,7 @@ from __future__ import annotations
  import os
  import time

+ from lamin_utils import logger
  from upath.implementations.cloud import S3Path

  HOSTED_REGIONS = [
@@ -40,8 +41,15 @@ class AWSCredentialsManager:

  # this is cached so will be resued with the connection initialized
  fs = S3FileSystem(cache_regions=True)
- fs.connect()
- self.anon: bool = fs.session._credentials is None
+ try:
+ fs.connect()
+ self.anon: bool = fs.session._credentials is None
+ except Exception as e:
+ logger.warning(
+ f"There is a problem with your default AWS Credentials: {e}\n"
+ "`anon` mode will be used for all non-managed buckets."
+ )
+ self.anon = True
  self.anon_public: bool | None = None
  if not self.anon:
  try:
@@ -30,23 +30,30 @@ from functools import wraps
  def deprecated(new_name: str):
  """Deprecated.

- This is a decorator which can be used to mark functions
+ This is a decorator which can be used to mark functions, methods and properties
  as deprecated. It will result in a warning being emitted
  when the function is used.
+
+ It will also hide the function from the docs.
+
+ Example::
+
+ @property
+ @deprecated("n_files")
+ def n_objects(self) -> int:
+ return self.n_files
+
  """

  def decorator(func):
  @wraps(func)
  def new_func(*args, **kwargs):
- # turn off filter
- warnings.simplefilter("always", DeprecationWarning)
  warnings.warn(
  f"Use {new_name} instead of {func.__name__}, "
  f"{func.__name__} will be removed in the future.",
- category=DeprecationWarning,
+ category=FutureWarning,
  stacklevel=2,
  )
- warnings.simplefilter("default", DeprecationWarning) # reset filter
  return func(*args, **kwargs)

  setattr(new_func, "__deprecated", True)
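
As an aside on the hunk above: switching from DeprecationWarning to FutureWarning means the warning is visible by default in user code (DeprecationWarning is hidden outside `__main__`), so the explicit `simplefilter` calls are no longer needed. A minimal, self-contained sketch of the resulting pattern, not copied from the package; the trailing `return new_func` / `return decorator` lines are assumed because the hunk truncates there:

    import warnings
    from functools import wraps

    def deprecated(new_name: str):
        """Emit a FutureWarning that points callers to `new_name`."""
        def decorator(func):
            @wraps(func)
            def new_func(*args, **kwargs):
                warnings.warn(
                    f"Use {new_name} instead of {func.__name__}, "
                    f"{func.__name__} will be removed in the future.",
                    category=FutureWarning,  # shown by default, unlike DeprecationWarning
                    stacklevel=2,
                )
                return func(*args, **kwargs)
            setattr(new_func, "__deprecated", True)  # marker presumably used to hide it from the docs
            return new_func  # assumed continuation, the hunk ends before this line
        return decorator

    @deprecated("n_files")
    def n_objects() -> int:
        return 42

    n_objects()  # FutureWarning: Use n_files instead of n_objects, ...
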
@@ -233,12 +233,16 @@ def _delete_instance(
  )
  if require_empty:
  for storage_record in storage_records:
+ root_string: str = storage_record["root"] # type: ignore
  account_for_sqlite_file = (
  instance_with_storage["db_scheme"] is None
- and instance_with_storage["storage"]["root"] == storage_record["root"]
+ and instance_with_storage["storage"]["root"] == root_string
  )
- root_string = storage_record["root"]
  # gate storage and instance deletion on empty storage location for
+ # normally auth.get_session() doesn't have access_token
+ # so this block is useless i think (Sergei)
+ # the token is received from user settings inside create_path
+ # might be needed in the hub though
  if client.auth.get_session() is not None:
  access_token = client.auth.get_session().access_token
  else:
@@ -251,7 +255,6 @@ def _delete_instance(
  check_storage_is_empty(
  root_path, account_for_sqlite_file=account_for_sqlite_file
  )
- _update_instance_record(instance_with_storage["id"], {"storage_id": None}, client)
  # first delete the storage records because we will turn instance_id on
  # storage into a FK soon
  for storage_record in storage_records:
@@ -12,25 +12,23 @@ def select_instance_by_owner_name(
  name: str,
  client: Client,
  ) -> dict | None:
- try:
- data = (
- client.table("instance")
- .select(
- "*, account!inner!instance_account_id_28936e8f_fk_account_id(*),"
- " storage!inner!storage_instance_id_359fca71_fk_instance_id(*)"
- )
- .eq("name", name)
- .eq("account.handle", owner)
- .eq("storage.is_default", True)
- .execute()
- .data
+ # this won't find an instance without the default storage
+ data = (
+ client.table("instance")
+ .select(
+ "*, account!inner!instance_account_id_28936e8f_fk_account_id(*),"
+ " storage!inner!storage_instance_id_359fca71_fk_instance_id(*)"
  )
- except Exception:
- return None
+ .eq("name", name)
+ .eq("account.handle", owner)
+ .eq("storage.is_default", True)
+ .execute()
+ .data
+ )
  if len(data) == 0:
  return None
  result = data[0]
- # this is now a list
+ # this is a list
  # assume only one default storage
  result["storage"] = result["storage"][0]
  return result
@@ -89,15 +87,22 @@ def select_instance_by_id_with_storage(
  instance_id: str,
  client: Client,
  ):
- response = (
+ # this won't find an instance without the default storage
+ data = (
  client.table("instance")
- .select("*, storage!instance_storage_id_87963cc8_fk_storage_id(*)")
+ .select("*, storage!inner!storage_instance_id_359fca71_fk_instance_id(*)")
  .eq("id", instance_id)
+ .eq("storage.is_default", True)
  .execute()
+ .data
  )
- if len(response.data) == 0:
+ if len(data) == 0:
  return None
- return response.data[0]
+ result = data[0]
+ # this is a list
+ # assume only one default storage
+ result["storage"] = result["storage"][0]
+ return result


  def update_instance(instance_id: str, instance_fields: dict, client: Client):
@@ -139,17 +144,14 @@ def select_collaborator(
  def select_default_storage_by_instance_id(
  instance_id: str, client: Client
  ) -> dict | None:
- try:
- data = (
- client.table("storage")
- .select("*")
- .eq("instance_id", instance_id)
- .eq("is_default", True)
- .execute()
- .data
- )
- except Exception:
- return None
+ data = (
+ client.table("storage")
+ .select("*")
+ .eq("instance_id", instance_id)
+ .eq("is_default", True)
+ .execute()
+ .data
+ )
  if len(data) == 0:
  return None
  return data[0]
@@ -7,15 +7,6 @@ from pydantic import BaseModel, Field, GetCoreSchemaHandler
  from pydantic_core import CoreSchema, core_schema


- def validate_schema_arg(schema: str | None = None) -> str:
- if schema is None or schema == "":
- return ""
- # currently no actual validation, can add back if we see a need
- # the following just strips white spaces
- to_be_validated = [s.strip() for s in schema.split(",")]
- return ",".join(to_be_validated)
-
-
  def validate_db_arg(db: str | None) -> None:
  if db is not None:
  LaminDsnModel(db=db)
@@ -164,7 +164,7 @@ class SetupSettings:

  @property
  def paths(self) -> type[SetupPaths]:
- """Convert cloud paths to lamidb local paths.
+ """Convert cloud paths to lamindb local paths.

  Use `settings.paths.cloud_to_local_no_update`
  or `settings.paths.cloud_to_local`.
@@ -179,7 +179,7 @@ class SetupSettings:
  repr = self.user.__repr__()
  repr += f"\nAuto-connect in Python: {self.auto_connect}\n"
  repr += f"Private Django API: {self.private_django_api}\n"
- repr += f"Cache directory: {self.cache_dir}\n"
+ repr += f"Cache directory: {self.cache_dir.as_posix()}\n"
  if self._instance_exists:
  repr += self.instance.__repr__()
  else:
@@ -200,9 +200,14 @@ class SetupPaths:
  # cache_key is ignored if filepath is a local path
  if not isinstance(filepath, LocalPathClasses):
  # settings is defined further in this file
- local_filepath = settings.cache_dir / (
- filepath.path if cache_key is None else cache_key # type: ignore
- )
+ if cache_key is None:
+ local_key = filepath.path # type: ignore
+ protocol = filepath.protocol # type: ignore
+ if protocol in {"http", "https"}:
+ local_key = local_key.removeprefix(protocol + "://")
+ else:
+ local_key = cache_key
+ local_filepath = settings.cache_dir / local_key
  else:
  local_filepath = filepath
  return local_filepath
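
For the hunk above: when no `cache_key` is passed and the path is an http(s) URL, the scheme prefix is stripped from `filepath.path` before it is joined onto the cache directory, so the cache key is a clean relative path. A rough sketch of that mapping; the cache directory and URL are made up, and it assumes `.path` on an http(s) UPath is the full URL, which the `removeprefix` call implies:

    from pathlib import Path

    cache_dir = Path("/tmp/lamindb-cache")  # hypothetical cache directory
    protocol = "https"
    url_path = "https://raw.example.org/data/table.parquet"  # hypothetical filepath.path

    if protocol in {"http", "https"}:
        local_key = url_path.removeprefix(protocol + "://")
    else:
        local_key = url_path
    print(cache_dir / local_key)
    # /tmp/lamindb-cache/raw.example.org/data/table.parquet
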
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Literal
  from django.db.utils import ProgrammingError
  from lamin_utils import logger

+ from ._deprecated import deprecated
  from ._hub_client import call_with_fallback
  from ._hub_crud import select_account_handle_name_by_lnid
  from ._hub_utils import LaminDsn, LaminDsnModel
@@ -53,7 +54,7 @@ class InstanceSettings:
  keep_artifacts_local: bool = False, # default to local storage
  uid: str | None = None, # instance uid/lnid
  db: str | None = None, # DB URI
- schema: str | None = None, # comma-separated string of schema names
+ modules: str | None = None, # comma-separated string of module names
  git_repo: str | None = None, # a git repo URL
  is_on_hub: bool | None = None, # initialized from hub
  api_url: str | None = None,
@@ -69,7 +70,7 @@ class InstanceSettings:
  self._storage: StorageSettings = storage
  validate_db_arg(db)
  self._db: str | None = db
- self._schema_str: str | None = schema
+ self._schema_str: str | None = modules
  self._git_repo = None if git_repo is None else sanitize_git_repo_url(git_repo)
  # local storage
  self._keep_artifacts_local = keep_artifacts_local
@@ -84,7 +85,7 @@ class InstanceSettings:
  def __repr__(self):
  """Rich string representation."""
  representation = f"Current instance: {self.slug}"
- attrs = ["owner", "name", "storage", "db", "schema", "git_repo"]
+ attrs = ["owner", "name", "storage", "db", "modules", "git_repo"]
  for attr in attrs:
  value = getattr(self, attr)
  if attr == "storage":
@@ -121,7 +122,7 @@ class InstanceSettings:
  def _search_local_root(
  self, local_root: str | None = None, mute_warning: bool = False
  ) -> StorageSettings | None:
- from lnschema_core.models import Storage
+ from lamindb.models import Storage

  if local_root is not None:
  local_records = Storage.objects.filter(root=local_root)
@@ -271,12 +272,20 @@ class InstanceSettings:
  return hash_and_encode_as_b62(self._id.hex)[:12]

  @property
- def schema(self) -> set[str]:
- """Schema modules in addition to core schema."""
+ def modules(self) -> set[str]:
+ """The set of modules that defines the database schema.
+
+ The core schema contained in lamindb is not included in this set.
+ """
  if self._schema_str is None:
  return {} # type: ignore
  else:
- return {schema for schema in self._schema_str.split(",") if schema != ""}
+ return {module for module in self._schema_str.split(",") if module != ""}
+
+ @property
+ @deprecated("modules")
+ def schema(self) -> set[str]:
+ return self.modules

  @property
  def _sqlite_file(self) -> UPath:
@@ -358,7 +367,7 @@ class InstanceSettings:
  sqlite_filepath = self.storage.cloud_to_local(
  self._sqlite_file, error_no_origin=False
  )
- return f"sqlite:///{sqlite_filepath}"
+ return f"sqlite:///{sqlite_filepath.as_posix()}"
  else:
  return self._db

@@ -457,11 +466,24 @@ class InstanceSettings:
  settings._instance_settings = self

  def _init_db(self):
+ from lamindb_setup import _check_setup
+
  from .django import setup_django

+ _check_setup.IS_LOADING = True
  setup_django(self, init=True)
+ _check_setup.IS_LOADING = False
+
+ from lamindb.models import Space
+
+ Space.objects.get_or_create(
+ name="All",
+ description="Every team & user with access to the instance has access.",
+ )

  def _load_db(self) -> tuple[bool, str]:
+ from lamindb_setup import _check_setup
+
  # Is the database available and initialized as LaminDB?
  # returns a tuple of status code and message
  if self.dialect == "sqlite" and not self._sqlite_file.exists():
@@ -472,7 +494,6 @@ class InstanceSettings:
  f" {legacy_file} to {self._sqlite_file}"
  )
  return False, f"SQLite file {self._sqlite_file} does not exist"
- from lamindb_setup import settings # to check user

  from .django import setup_django

@@ -481,5 +502,7 @@ class InstanceSettings:
  # setting up django also performs a check for migrations & prints them
  # as warnings
  # this should fail, e.g., if the db is not reachable
+ _check_setup.IS_LOADING = True
  setup_django(self)
+ _check_setup.IS_LOADING = False
  return True, ""
@@ -98,7 +98,7 @@ def setup_instance_from_store(store: InstanceSettingsStore) -> InstanceSettings:
  name=store.name,
  storage=ssettings,
  db=_null_to_value(store.db),
- schema=_null_to_value(store.schema_str),
+ modules=_null_to_value(store.schema_str),
  git_repo=_null_to_value(store.git_repo),
  keep_artifacts_local=store.keep_artifacts_local, # type: ignore
  )
@@ -7,6 +7,7 @@ import string
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Literal

+ import fsspec
  from lamin_utils import logger

  from ._aws_credentials import HOSTED_REGIONS, get_aws_credentials_manager
@@ -24,6 +25,10 @@ if TYPE_CHECKING:

  IS_INITIALIZED_KEY = ".lamindb/_is_initialized"

+ # a list of supported fsspec protocols
+ # rename file to local before showing to a user
+ VALID_PROTOCOLS = ("file", "gs", "s3", "hf", "http", "https")
+

  def base62(n_char: int) -> str:
  """Like nanoid without hyphen and underscore."""
@@ -114,16 +119,11 @@ def init_storage(
  root_str = f"s3://lamin-{region}/{uid}"
  else:
  root_str = f"s3://lamin-hosted-test/{uid}"
- elif root_str.startswith(("gs://", "s3://", "hf://")):
- pass
- else: # local path
- try:
- _ = Path(root_str)
- except Exception as e:
- logger.error(
- "`storage` is not a valid local, GCP storage, AWS S3 path or Hugging Face path"
- )
- raise e
+ elif (input_protocol := fsspec.utils.get_protocol(root_str)) not in VALID_PROTOCOLS:
+ valid_protocols = ("local",) + VALID_PROTOCOLS[1:] # show local instead of file
+ raise ValueError(
+ f"Protocol {input_protocol} is not supported, valid protocols are {', '.join(valid_protocols)}"
+ )
  ssettings = StorageSettings(
  uid=uid,
  root=root_str,
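
For the hunk above: instead of matching URL prefixes by hand, `init_storage` now classifies the root via `fsspec.utils.get_protocol` and rejects anything outside VALID_PROTOCOLS. A small sketch of how that lookup behaves on a few made-up roots:

    import fsspec

    VALID_PROTOCOLS = ("file", "gs", "s3", "hf", "http", "https")

    for root in ("./local-folder", "s3://my-bucket/prefix", "ftp://host/data"):  # hypothetical roots
        protocol = fsspec.utils.get_protocol(root)
        print(f"{root!r} -> {protocol!r}, supported: {protocol in VALID_PROTOCOLS}")
    # './local-folder' -> 'file', supported: True
    # 's3://my-bucket/prefix' -> 's3', supported: True
    # 'ftp://host/data' -> 'ftp', supported: False  (init_storage would raise ValueError)
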
@@ -227,10 +227,10 @@ class StorageSettings:

  @property
  def record(self) -> Any:
- """Storage record in current instance."""
+ """Storage record in the current instance."""
  if self._record is None:
  # dynamic import because of import order
- from lnschema_core.models import Storage
+ from lamindb.models import Storage

  from ._settings import settings

@@ -299,14 +299,15 @@ class StorageSettings:
  return self._region

  @property
- def type(self) -> Literal["local", "s3", "gs"]:
+ def type(self) -> Literal["local", "s3", "gs", "hf", "http", "https"]:
  """AWS S3 vs. Google Cloud vs. local.

- Returns the protocol as a string: "local", "s3", "gs".
+ Returns the protocol as a string: "local", "s3", "gs", "http", "https".
  """
  import fsspec

  convert = {"file": "local"}
+ # init_storage checks that the root protocol belongs to VALID_PROTOCOLS
  protocol = fsspec.utils.get_protocol(self.root_as_str)
  return convert.get(protocol, protocol) # type: ignore

@@ -345,5 +346,5 @@ class StorageSettings:
  return self.root / filekey

  def local_filepath(self, filekey: UPathStr) -> UPath:
- """Local (cache) filepath from filekey: `local(filepath(...))`."""
+ """Local (cache) filepath from filekey."""
  return self.cloud_to_local(self.key_to_filepath(filekey))
@@ -48,7 +48,7 @@ class UserSettings:
  @property
  def id(self):
  """Integer id valid in current intance."""
- from lnschema_core.users import current_user_id
+ from lamindb.base.users import current_user_id

  # there is no cache needed here because current_user_id()
  # has its own cache
@@ -17,7 +17,7 @@ RENAME = {"name": "source", "description": "source_name"}

  def write_bionty_sources(isettings: InstanceSettings) -> None:
  """Write bionty sources to Source table."""
- if "bionty" not in isettings.schema:
+ if "bionty" not in isettings.modules:
  return None
  import shutil

@@ -79,7 +79,7 @@ def load_bionty_sources(isettings: InstanceSettings | None = None):
  # not setting up bionty sources
  return None
  if isettings is not None:
- if "bionty" not in isettings.schema:
+ if "bionty" not in isettings.modules:
  # no need to setup anything
  return None

@@ -96,7 +96,7 @@ def load_bionty_sources(isettings: InstanceSettings | None = None):
  for kwargs in active_records:
  for db_field, base_col in RENAME.items():
  kwargs[base_col] = kwargs.pop(db_field)
- # TODO: non-bionty schema?
+ # TODO: non-bionty modules?
  kwargs["entity"] = kwargs["entity"].replace("bionty.", "")
  write_yaml(
  parse_currently_used_sources(active_records),
@@ -5,10 +5,8 @@ import builtins
  import os
  from pathlib import Path
  import time
- from lamin_utils import logger
- from ._settings_store import current_instance_settings_file
  from ._settings_instance import InstanceSettings
- import sys
+

  IS_RUN_FROM_IPYTHON = getattr(builtins, "__IPYTHON__", False)
  IS_SETUP = False
@@ -54,14 +52,15 @@ def setup_django(
  }
  from .._init_instance import get_schema_module_name

- schema_names = ["core"] + list(isettings.schema)
+ module_names = ["core"] + list(isettings.modules)
  raise_import_error = True if init else False
- installed_apps = [
+ installed_apps = ["django.contrib.contenttypes"]
+ installed_apps += [
  package_name
- for n in schema_names
+ for name in module_names
  if (
  package_name := get_schema_module_name(
- n, raise_import_error=raise_import_error
+ name, raise_import_error=raise_import_error
  )
  )
  is not None
@@ -12,6 +12,7 @@ from __future__ import annotations

  import base64
  import hashlib
+ import json
  from concurrent.futures import ThreadPoolExecutor
  from typing import TYPE_CHECKING

@@ -40,11 +41,21 @@ def b16_to_b64(s: str):
  return to_b64_str(base64.b16decode(s.strip('"'), casefold=True))


+ def hash_string(string: str) -> str:
+ # as we're truncating (not here) at 22 b64, we choose md5 over sha512
+ return to_b64_str(hashlib.md5(string.encode("utf-8")).digest())
+
+
  # a lot to read about this: lamin-notes/2022/hashing
  def hash_set(s: set[str]) -> str:
- bstr = ":".join(sorted(s)).encode("utf-8")
- # as we're truncating at 22 b64, we choose md5 over sha512
- return to_b64_str(hashlib.md5(bstr).digest())[:HASH_LENGTH]
+ join_s = ":".join(sorted(s))
+ return hash_string(join_s)[:HASH_LENGTH]
+
+
+ def hash_dict(d: dict) -> str:
+ return to_b64_str(hashlib.md5(json.dumps(d, sort_keys=True).encode()).digest())[
+ :HASH_LENGTH
+ ]


  def hash_from_hashes_list(hashes: Iterable[str]) -> str:
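
For the hunk above: `hash_dict` canonicalizes the dict with `json.dumps(..., sort_keys=True)` before hashing, so key order cannot change the digest. An approximate, self-contained sketch; `to_b64_str` is simplified and `HASH_LENGTH = 22` is an assumption taken from the truncation comment above:

    import base64
    import hashlib
    import json

    HASH_LENGTH = 22  # assumed truncation length, per the "22 b64" comment

    def to_b64_str(bstr: bytes) -> str:
        # simplified stand-in for the package's helper
        return base64.b64encode(bstr).decode()

    def hash_dict(d: dict) -> str:
        # sort_keys=True canonicalizes the dict, so key order does not affect the digest
        return to_b64_str(hashlib.md5(json.dumps(d, sort_keys=True).encode()).digest())[:HASH_LENGTH]

    assert hash_dict({"a": 1, "b": 2}) == hash_dict({"b": 2, "a": 1})
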
@@ -111,6 +122,6 @@ def hash_dir(path: Path):
  hashes, sizes = zip(*hashes_sizes)

  hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
- n_objects = len(hashes)
+ n_files = len(hashes)
  size = sum(sizes)
- return size, hash, hash_type, n_objects
+ return size, hash, hash_type, n_files