lamindb_setup 0.80.1__py3-none-any.whl → 0.81.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
lamindb_setup/__init__.py CHANGED
@@ -33,7 +33,7 @@ Modules & settings:
 
  """
 
- __version__ = "0.80.1" # denote a release candidate for 0.1.0 with 0.1rc1
+ __version__ = "0.81.1" # denote a release candidate for 0.1.0 with 0.1rc1
 
  import os as _os
  import sys as _sys
lamindb_setup/core/_settings_storage.py CHANGED
@@ -114,13 +114,15 @@ def init_storage(
  root_str = f"s3://lamin-{region}/{uid}"
  else:
  root_str = f"s3://lamin-hosted-test/{uid}"
- elif root_str.startswith(("gs://", "s3://")):
+ elif root_str.startswith(("gs://", "s3://", "hf://")):
  pass
  else: # local path
  try:
  _ = Path(root_str)
  except Exception as e:
- logger.error("`storage` is not a valid local, GCP storage or AWS S3 path")
+ logger.error(
+ "`storage` is not a valid local, GCP storage, AWS S3 path or Hugging Face path"
+ )
  raise e
  ssettings = StorageSettings(
  uid=uid,
@@ -161,6 +163,7 @@ def init_storage(
  # only newly created
  if hub_record_status == "hub-record-created" and ssettings._uuid is not None:
  delete_storage_record(ssettings._uuid, access_token=access_token) # type: ignore
+ hub_record_status = "hub-record-not-created"
  ssettings._instance_id = None
  return ssettings, hub_record_status
 
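The first hunk means a Hugging Face root is now dispatched like the other supported cloud prefixes instead of falling through to local-path validation. A minimal standalone sketch of that dispatch (the helper name `classify_storage_root` and the example roots are illustrative, not part of the package):

```python
from pathlib import Path

SUPPORTED_CLOUD_PREFIXES = ("gs://", "s3://", "hf://")  # "hf://" is the new addition


def classify_storage_root(root_str: str) -> str:
    """Return 'cloud' for supported cloud prefixes, otherwise validate as a local path."""
    if root_str.startswith(SUPPORTED_CLOUD_PREFIXES):
        return "cloud"
    # mirrors the fallback branch above: anything else must at least parse as a Path
    Path(root_str)
    return "local"


print(classify_storage_root("hf://datasets/some-org/some-dataset"))  # cloud
print(classify_storage_root("./local-storage"))  # local
```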
lamindb_setup/core/hashing.py CHANGED
@@ -47,14 +47,14 @@ def hash_set(s: set[str]) -> str:
  return to_b64_str(hashlib.md5(bstr).digest())[:HASH_LENGTH]
 
 
- def hash_md5s_from_dir(hashes: Iterable[str]) -> tuple[str, str]:
+ def hash_from_hashes_list(hashes: Iterable[str]) -> str:
  # need to sort below because we don't want the order of parsing the dir to
  # affect the hash
  digests = b"".join(
  hashlib.md5(hash.encode("utf-8")).digest() for hash in sorted(hashes)
  )
  digest = hashlib.md5(digests).digest()
- return to_b64_str(digest)[:HASH_LENGTH], "md5-d"
+ return to_b64_str(digest)[:HASH_LENGTH]
 
 
  def hash_code(file_path: UPathStr):
@@ -110,7 +110,7 @@ def hash_dir(path: Path):
  hashes_sizes = map(hash_size, files)
  hashes, sizes = zip(*hashes_sizes)
 
- hash, hash_type = hash_md5s_from_dir(hashes)
+ hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
  n_objects = len(hashes)
  size = sum(sizes)
  return size, hash, hash_type, n_objects
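The renamed `hash_from_hashes_list` keeps the same directory-hashing scheme but now returns only the hash, leaving the `"md5-d"` hash type to callers such as `hash_dir`. A self-contained sketch of the scheme (the URL-safe base64 encoding and the 22-character truncation are assumptions standing in for the package's `to_b64_str` and `HASH_LENGTH`):

```python
import base64
import hashlib
from collections.abc import Iterable

HASH_LENGTH = 22  # assumed truncation length


def hash_from_hashes_list_sketch(hashes: Iterable[str]) -> str:
    """md5 over the sorted per-file hashes, so traversal order cannot change the result."""
    digests = b"".join(hashlib.md5(h.encode("utf-8")).digest() for h in sorted(hashes))
    digest = hashlib.md5(digests).digest()
    return base64.urlsafe_b64encode(digest).decode().strip("=")[:HASH_LENGTH]


# callers attach the hash type themselves, as hash_dir does above
dir_hash, hash_type = hash_from_hashes_list_sketch(["hashA", "hashB", "hashC"]), "md5-d"
print(dir_hash, hash_type)
```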
lamindb_setup/core/upath.py CHANGED
@@ -4,6 +4,7 @@
  from __future__ import annotations
 
  import os
+ import warnings
  from collections import defaultdict
  from datetime import datetime, timezone
  from functools import partial
@@ -18,7 +19,7 @@ from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
  from upath.implementations.local import LocalPath
 
  from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
- from .hashing import HASH_LENGTH, b16_to_b64, hash_md5s_from_dir
+ from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list
 
  if TYPE_CHECKING:
  from .types import UPathStr
@@ -340,22 +341,34 @@ def synchronize(
  timestamp: float | None = None,
  ):
  """Sync to a local destination path."""
+ protocol = self.protocol
  # optimize the number of network requests
  if timestamp is not None:
  is_dir = False
  exists = True
  cloud_mts = timestamp
  else:
- # perform only one network request to check existence, type and timestamp
- try:
- cloud_mts = self.modified.timestamp()
- is_dir = False
- exists = True
- except FileNotFoundError:
- exists = False
- except IsADirectoryError:
- is_dir = True
- exists = True
+ # hf requires special treatment
+ if protocol == "hf":
+ try:
+ stat_hf = self.stat().as_info()
+ is_dir = stat_hf["type"] == "directory"
+ exists = True
+ if not is_dir:
+ cloud_mts = stat_hf["last_commit"].date.timestamp()
+ except FileNotFoundError:
+ exists = False
+ else:
+ # perform only one network request to check existence, type and timestamp
+ try:
+ cloud_mts = self.modified.timestamp()
+ is_dir = False
+ exists = True
+ except FileNotFoundError:
+ exists = False
+ except IsADirectoryError:
+ is_dir = True
+ exists = True
 
  if not exists:
  warn_or_error = f"The original path {self} does not exist anymore."
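For an `hf://` path, a single `stat().as_info()` call yields the fsspec info dict that the new branch reads, including `type` and `last_commit`. A hedged usage sketch (the dataset path is hypothetical, and `huggingface_hub` must be installed so that fsspec can resolve the `hf` protocol):

```python
from upath import UPath

# hypothetical dataset file; requires huggingface_hub for the "hf" filesystem
path = UPath("hf://datasets/some-org/some-dataset/data.parquet")

info = path.stat().as_info()  # one request: type, size, blob_id, last_commit, ...
if info["type"] != "directory":
    cloud_mtime = info["last_commit"].date.timestamp()
    print(cloud_mtime)
```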
@@ -373,14 +386,18 @@ def synchronize(
  # synchronization logic for directories
  if is_dir:
  files = self.fs.find(str(self), detail=True)
- protocol_modified = {"s3": "LastModified", "gs": "mtime"}
- modified_key = protocol_modified.get(self.protocol, None)
- if modified_key is None:
- raise ValueError(f"Can't synchronize a directory for {self.protocol}.")
+ if protocol == "s3":
+ get_modified = lambda file_stat: file_stat["LastModified"]
+ elif protocol == "gs":
+ get_modified = lambda file_stat: file_stat["mtime"]
+ elif protocol == "hf":
+ get_modified = lambda file_stat: file_stat["last_commit"].date
+ else:
+ raise ValueError(f"Can't synchronize a directory for {protocol}.")
  if objectpath.exists():
  destination_exists = True
  cloud_mts_max = max(
- file[modified_key] for file in files.values()
+ get_modified(file) for file in files.values()
  ).timestamp()
  local_mts = [
  file.stat().st_mtime for file in objectpath.rglob("*") if file.is_file()
@@ -405,9 +422,8 @@ def synchronize(
  for file, stat in callback.wrap(files.items()):
  file_key = PurePosixPath(file).relative_to(self.path).as_posix()
  origin_file_keys.append(file_key)
- timestamp = stat[modified_key].timestamp()
-
- origin = f"{self.protocol}://{file}"
+ timestamp = get_modified(stat).timestamp()
+ origin = f"{protocol}://{file}"
  destination = objectpath / file_key
  child = callback.branched(origin, destination.as_posix())
  UPath(origin, **self.storage_options).synchronize(
@@ -439,6 +455,10 @@ def synchronize(
  objectpath.parent.mkdir(parents=True, exist_ok=True)
  need_synchronize = True
  if need_synchronize:
+ # hf has sync filesystem
+ # on sync filesystems ChildProgressCallback.branched()
+ # returns the default callback
+ # this is why a difference between s3 and hf in progress bars
  self.download_to(
  objectpath, recursive=False, print_progress=False, callback=callback
  )
@@ -689,6 +709,12 @@ Args:
  pathlike: A string or Path to a local/cloud file/directory/folder.
  """
 
+ # suppress the warning from upath about hf (huggingface) filesystem
+ # not being explicitly implemented in upath
+ warnings.filterwarnings(
+ "ignore", module="upath", message=".*'hf' filesystem not explicitly implemented.*"
+ )
+
 
  def create_path(path: UPath, access_token: str | None = None) -> UPath:
  path = UPath(path)
@@ -698,38 +724,60 @@ def create_path(path: UPath, access_token: str | None = None) -> UPath:
  return get_aws_credentials_manager().enrich_path(path, access_token)
 
 
- def get_stat_file_cloud(stat: dict) -> tuple[int, str, str]:
+ def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
  size = stat["size"]
- etag = stat["ETag"]
- # small files
- if "-" not in etag:
- # only store hash for non-multipart uploads
- # we can't rapidly validate multi-part uploaded files client-side
- # we can add more logic later down-the-road
- hash = b16_to_b64(etag)
+ hash, hash_type = None, None
+ # gs, use md5Hash instead of etag for now
+ if "md5Hash" in stat:
+ # gs hash is already in base64
+ hash = stat["md5Hash"].strip('"=')
  hash_type = "md5"
- else:
- stripped_etag, suffix = etag.split("-")
- suffix = suffix.strip('"')
- hash = b16_to_b64(stripped_etag)
- hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
- return size, hash[:HASH_LENGTH], hash_type
+ # hf
+ elif "blob_id" in stat:
+ hash = b16_to_b64(stat["blob_id"])
+ hash_type = "sha1"
+ # s3
+ elif "ETag" in stat:
+ etag = stat["ETag"]
+ # small files
+ if "-" not in etag:
+ # only store hash for non-multipart uploads
+ # we can't rapidly validate multi-part uploaded files client-side
+ # we can add more logic later down-the-road
+ hash = b16_to_b64(etag)
+ hash_type = "md5"
+ else:
+ stripped_etag, suffix = etag.split("-")
+ suffix = suffix.strip('"')
+ hash = b16_to_b64(stripped_etag)
+ hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
+ if hash is not None:
+ hash = hash[:HASH_LENGTH]
+ return size, hash, hash_type
 
 
- def get_stat_dir_cloud(path: UPath) -> tuple[int, str, str, int]:
- sizes = []
- md5s = []
+ def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
  objects = path.fs.find(path.as_posix(), detail=True)
+ hash, hash_type = None, None
+ compute_list_hash = True
  if path.protocol == "s3":
  accessor = "ETag"
  elif path.protocol == "gs":
  accessor = "md5Hash"
+ elif path.protocol == "hf":
+ accessor = "blob_id"
+ else:
+ compute_list_hash = False
+ sizes = []
+ hashes = []
  for object in objects.values():
  sizes.append(object["size"])
- md5s.append(object[accessor].strip('"='))
+ if compute_list_hash:
+ hashes.append(object[accessor].strip('"='))
  size = sum(sizes)
- hash, hash_type = hash_md5s_from_dir(md5s)
- n_objects = len(md5s)
+ n_objects = len(sizes)
+ if compute_list_hash:
+ hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
  return size, hash, hash_type, n_objects
 
{lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: lamindb_setup
- Version: 0.80.1
+ Version: 0.81.1
  Summary: Setup & configure LaminDB.
  Author-email: Lamin Labs <open-source@lamin.ai>
  Requires-Python: >=3.9
@@ -19,7 +19,7 @@ Requires-Dist: supabase==2.2.1
  Requires-Dist: psutil
  Requires-Dist: urllib3<2 ; extra == "aws"
  Requires-Dist: aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == "aws"
- Requires-Dist: s3fs>=2023.12.2,<=2024.6.1 ; extra == "aws"
+ Requires-Dist: s3fs>=2023.12.2,<=2024.10.0 ; extra == "aws"
  Requires-Dist: line_profiler ; extra == "dev"
  Requires-Dist: pyjwt<3.0.0 ; extra == "dev"
  Requires-Dist: psycopg2-binary ; extra == "dev"
@@ -31,7 +31,7 @@ Requires-Dist: pytest-xdist ; extra == "dev"
  Requires-Dist: nbproject-test>=0.4.3 ; extra == "dev"
  Requires-Dist: pandas ; extra == "dev"
  Requires-Dist: django-schema-graph ; extra == "erdiagram"
- Requires-Dist: gcsfs>=2023.12.2,<=2024.6.1 ; extra == "gcp"
+ Requires-Dist: gcsfs>=2023.12.2,<=2024.10.0 ; extra == "gcp"
  Project-URL: Home, https://github.com/laminlabs/lamindb-setup
  Provides-Extra: aws
  Provides-Extra: dev
{lamindb_setup-0.80.1.dist-info → lamindb_setup-0.81.1.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
- lamindb_setup/__init__.py,sha256=kc1LSkxZIe6SeFBsho6Xg6WyYxKcsUYoLNgQlQSRnxo,1714
+ lamindb_setup/__init__.py,sha256=dFAjI0r6-lM4WoJ3n0uo6iCdKHvnV-20y76RufgIFCQ,1714
  lamindb_setup/_cache.py,sha256=1XnM-V_KprbjpgPY7Bg3FYn53Iz_2_fEgcMOaSdKKbg,1332
  lamindb_setup/_check.py,sha256=28PcG8Kp6OpjSLSi1r2boL2Ryeh6xkaCL87HFbjs6GA,129
  lamindb_setup/_check_setup.py,sha256=6cSfpmVOSgU7YiVHfJpBTGTQ7rrnwunt1pJT_jkgNM8,3196
@@ -31,17 +31,17 @@ lamindb_setup/core/_settings.py,sha256=mpGsSb98UsBedLsW2RuowZ17EP2tI2XRGPztqrJtr
  lamindb_setup/core/_settings_instance.py,sha256=ajcq9zRNE598tTqyMkMqaEOubVfFeE998DPtbgyzK3A,18801
  lamindb_setup/core/_settings_load.py,sha256=5OpghcbkrK9KBM_0Iu-61FTI76UbOpPkkJpUittXS-w,4098
  lamindb_setup/core/_settings_save.py,sha256=rxGxgaK5i9exKqSJERQQyY1WZio20meoQJoYXlVW-1w,3138
- lamindb_setup/core/_settings_storage.py,sha256=15B7taJF1zxJ1_qAb67NuXkTFvO2TRTWMt6KTzDf1mw,11875
+ lamindb_setup/core/_settings_storage.py,sha256=CYwGZm0fKYN7eLLsU-sOtOKG7HzswQVjTWb0ooHKcNg,11990
  lamindb_setup/core/_settings_store.py,sha256=WcsgOmgnu9gztcrhp-N4OONNZyxICHV8M0HdJllTaEo,2219
  lamindb_setup/core/_settings_user.py,sha256=iz0MqFLKXqm8LYx_CHmr02_oNvYWFLIxKkJLdpS5W08,1476
  lamindb_setup/core/_setup_bionty_sources.py,sha256=o2L5Ww8TKgSqJtL4cGUcpJwLNYxA9BZgddhCMCu_E2g,3428
  lamindb_setup/core/cloud_sqlite_locker.py,sha256=i6TrT7HG0lqliPvZTlsZ_uplPaqhPBbabyfeR32SkA8,7107
  lamindb_setup/core/django.py,sha256=E4U9nUlV2kHd-G5v6iSdFGAAWixlQDxOFwMwOMG9xfw,3864
  lamindb_setup/core/exceptions.py,sha256=4NpLUNUIfXYVTFX2FvLZF8RW34exk2Vn2X3G4YhnTRg,276
- lamindb_setup/core/hashing.py,sha256=bkuvZyAuC7-Y_qZumJd_rybF-upJ5J3KxnKiymRUifw,3148
+ lamindb_setup/core/hashing.py,sha256=26dtak7XgmrWa_D1zuDyxObRQcriMtnc1yEigkKASmM,3142
  lamindb_setup/core/types.py,sha256=zJii2le38BJUmsNVvzDrbzGYr0yaeb-9Rw9IKmsBr3k,523
- lamindb_setup/core/upath.py,sha256=q6WvpdXO_-Ajl5qjU4CIf0Q1ZbYxA0Po54LG9iHYn28,27151
- lamindb_setup-0.80.1.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
- lamindb_setup-0.80.1.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
- lamindb_setup-0.80.1.dist-info/METADATA,sha256=kMZ0UUKGc3sxehsLkXFPedNRYpo_VNNHKA-iu7I5W4I,1743
- lamindb_setup-0.80.1.dist-info/RECORD,,
+ lamindb_setup/core/upath.py,sha256=4rCC0EOe-UaCexRDReQ5D_zs13Zi626pqn1slYNvISE,28945
+ lamindb_setup-0.81.1.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
+ lamindb_setup-0.81.1.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
+ lamindb_setup-0.81.1.dist-info/METADATA,sha256=yvD9AXAL_lFbagwrcvhId-DMCbgFHtWxZ8fVxobspDM,1745
+ lamindb_setup-0.81.1.dist-info/RECORD,,