lamindb_setup 0.80.1__py3-none-any.whl → 0.81.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb_setup/__init__.py CHANGED
@@ -33,7 +33,7 @@ Modules & settings:
33
33
 
34
34
  """
35
35
 
36
- __version__ = "0.80.1" # denote a release candidate for 0.1.0 with 0.1rc1
36
+ __version__ = "0.81.0" # denote a release candidate for 0.1.0 with 0.1rc1
37
37
 
38
38
  import os as _os
39
39
  import sys as _sys
@@ -114,13 +114,15 @@ def init_storage(
114
114
  root_str = f"s3://lamin-{region}/{uid}"
115
115
  else:
116
116
  root_str = f"s3://lamin-hosted-test/{uid}"
117
- elif root_str.startswith(("gs://", "s3://")):
117
+ elif root_str.startswith(("gs://", "s3://", "hf://")):
118
118
  pass
119
119
  else: # local path
120
120
  try:
121
121
  _ = Path(root_str)
122
122
  except Exception as e:
123
- logger.error("`storage` is not a valid local, GCP storage or AWS S3 path")
123
+ logger.error(
124
+ "`storage` is not a valid local, GCP storage, AWS S3 path or Hugging Face path"
125
+ )
124
126
  raise e
125
127
  ssettings = StorageSettings(
126
128
  uid=uid,
@@ -161,6 +163,7 @@ def init_storage(
161
163
  # only newly created
162
164
  if hub_record_status == "hub-record-created" and ssettings._uuid is not None:
163
165
  delete_storage_record(ssettings._uuid, access_token=access_token) # type: ignore
166
+ hub_record_status = "hub-record-not-created"
164
167
  ssettings._instance_id = None
165
168
  return ssettings, hub_record_status
166
169
 
@@ -47,14 +47,14 @@ def hash_set(s: set[str]) -> str:
47
47
  return to_b64_str(hashlib.md5(bstr).digest())[:HASH_LENGTH]
48
48
 
49
49
 
50
- def hash_md5s_from_dir(hashes: Iterable[str]) -> tuple[str, str]:
50
+ def hash_from_hashes_list(hashes: Iterable[str]) -> str:
51
51
  # need to sort below because we don't want the order of parsing the dir to
52
52
  # affect the hash
53
53
  digests = b"".join(
54
54
  hashlib.md5(hash.encode("utf-8")).digest() for hash in sorted(hashes)
55
55
  )
56
56
  digest = hashlib.md5(digests).digest()
57
- return to_b64_str(digest)[:HASH_LENGTH], "md5-d"
57
+ return to_b64_str(digest)[:HASH_LENGTH]
58
58
 
59
59
 
60
60
  def hash_code(file_path: UPathStr):
@@ -110,7 +110,7 @@ def hash_dir(path: Path):
110
110
  hashes_sizes = map(hash_size, files)
111
111
  hashes, sizes = zip(*hashes_sizes)
112
112
 
113
- hash, hash_type = hash_md5s_from_dir(hashes)
113
+ hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
114
114
  n_objects = len(hashes)
115
115
  size = sum(sizes)
116
116
  return size, hash, hash_type, n_objects
@@ -18,7 +18,7 @@ from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
18
18
  from upath.implementations.local import LocalPath
19
19
 
20
20
  from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
21
- from .hashing import HASH_LENGTH, b16_to_b64, hash_md5s_from_dir
21
+ from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list
22
22
 
23
23
  if TYPE_CHECKING:
24
24
  from .types import UPathStr
@@ -340,22 +340,34 @@ def synchronize(
340
340
  timestamp: float | None = None,
341
341
  ):
342
342
  """Sync to a local destination path."""
343
+ protocol = self.protocol
343
344
  # optimize the number of network requests
344
345
  if timestamp is not None:
345
346
  is_dir = False
346
347
  exists = True
347
348
  cloud_mts = timestamp
348
349
  else:
349
- # perform only one network request to check existence, type and timestamp
350
- try:
351
- cloud_mts = self.modified.timestamp()
352
- is_dir = False
353
- exists = True
354
- except FileNotFoundError:
355
- exists = False
356
- except IsADirectoryError:
357
- is_dir = True
358
- exists = True
350
+ # hf requires special treatment
351
+ if protocol == "hf":
352
+ try:
353
+ stat_hf = self.stat().as_info()
354
+ is_dir = stat_hf["type"] == "directory"
355
+ exists = True
356
+ if not is_dir:
357
+ cloud_mts = stat_hf["last_commit"].date.timestamp()
358
+ except FileNotFoundError:
359
+ exists = False
360
+ else:
361
+ # perform only one network request to check existence, type and timestamp
362
+ try:
363
+ cloud_mts = self.modified.timestamp()
364
+ is_dir = False
365
+ exists = True
366
+ except FileNotFoundError:
367
+ exists = False
368
+ except IsADirectoryError:
369
+ is_dir = True
370
+ exists = True
359
371
 
360
372
  if not exists:
361
373
  warn_or_error = f"The original path {self} does not exist anymore."
@@ -373,14 +385,18 @@ def synchronize(
373
385
  # synchronization logic for directories
374
386
  if is_dir:
375
387
  files = self.fs.find(str(self), detail=True)
376
- protocol_modified = {"s3": "LastModified", "gs": "mtime"}
377
- modified_key = protocol_modified.get(self.protocol, None)
378
- if modified_key is None:
379
- raise ValueError(f"Can't synchronize a directory for {self.protocol}.")
388
+ if protocol == "s3":
389
+ get_modified = lambda file_stat: file_stat["LastModified"]
390
+ elif protocol == "gs":
391
+ get_modified = lambda file_stat: file_stat["mtime"]
392
+ elif protocol == "hf":
393
+ get_modified = lambda file_stat: file_stat["last_commit"].date
394
+ else:
395
+ raise ValueError(f"Can't synchronize a directory for {protocol}.")
380
396
  if objectpath.exists():
381
397
  destination_exists = True
382
398
  cloud_mts_max = max(
383
- file[modified_key] for file in files.values()
399
+ get_modified(file) for file in files.values()
384
400
  ).timestamp()
385
401
  local_mts = [
386
402
  file.stat().st_mtime for file in objectpath.rglob("*") if file.is_file()
@@ -405,9 +421,8 @@ def synchronize(
405
421
  for file, stat in callback.wrap(files.items()):
406
422
  file_key = PurePosixPath(file).relative_to(self.path).as_posix()
407
423
  origin_file_keys.append(file_key)
408
- timestamp = stat[modified_key].timestamp()
409
-
410
- origin = f"{self.protocol}://{file}"
424
+ timestamp = get_modified(stat).timestamp()
425
+ origin = f"{protocol}://{file}"
411
426
  destination = objectpath / file_key
412
427
  child = callback.branched(origin, destination.as_posix())
413
428
  UPath(origin, **self.storage_options).synchronize(
@@ -439,6 +454,10 @@ def synchronize(
439
454
  objectpath.parent.mkdir(parents=True, exist_ok=True)
440
455
  need_synchronize = True
441
456
  if need_synchronize:
457
+ # hf has sync filesystem
458
+ # on sync filesystems ChildProgressCallback.branched()
459
+ # returns the default callback
460
+ # this is why there is a difference between s3 and hf in progress bars
442
461
  self.download_to(
443
462
  objectpath, recursive=False, print_progress=False, callback=callback
444
463
  )
@@ -698,38 +717,60 @@ def create_path(path: UPath, access_token: str | None = None) -> UPath:
698
717
  return get_aws_credentials_manager().enrich_path(path, access_token)
699
718
 
700
719
 
701
- def get_stat_file_cloud(stat: dict) -> tuple[int, str, str]:
720
+ def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
702
721
  size = stat["size"]
703
- etag = stat["ETag"]
704
- # small files
705
- if "-" not in etag:
706
- # only store hash for non-multipart uploads
707
- # we can't rapidly validate multi-part uploaded files client-side
708
- # we can add more logic later down-the-road
709
- hash = b16_to_b64(etag)
722
+ hash, hash_type = None, None
723
+ # gs, use md5Hash instead of etag for now
724
+ if "md5Hash" in stat:
725
+ # gs hash is already in base64
726
+ hash = stat["md5Hash"].strip('"=')
710
727
  hash_type = "md5"
711
- else:
712
- stripped_etag, suffix = etag.split("-")
713
- suffix = suffix.strip('"')
714
- hash = b16_to_b64(stripped_etag)
715
- hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
716
- return size, hash[:HASH_LENGTH], hash_type
728
+ # hf
729
+ elif "blob_id" in stat:
730
+ hash = b16_to_b64(stat["blob_id"])
731
+ hash_type = "sha1"
732
+ # s3
733
+ elif "ETag" in stat:
734
+ etag = stat["ETag"]
735
+ # small files
736
+ if "-" not in etag:
737
+ # only store hash for non-multipart uploads
738
+ # we can't rapidly validate multi-part uploaded files client-side
739
+ # we can add more logic later down-the-road
740
+ hash = b16_to_b64(etag)
741
+ hash_type = "md5"
742
+ else:
743
+ stripped_etag, suffix = etag.split("-")
744
+ suffix = suffix.strip('"')
745
+ hash = b16_to_b64(stripped_etag)
746
+ hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
747
+ if hash is not None:
748
+ hash = hash[:HASH_LENGTH]
749
+ return size, hash, hash_type
717
750
 
718
751
 
719
- def get_stat_dir_cloud(path: UPath) -> tuple[int, str, str, int]:
720
- sizes = []
721
- md5s = []
752
+ def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
722
753
  objects = path.fs.find(path.as_posix(), detail=True)
754
+ hash, hash_type = None, None
755
+ compute_list_hash = True
723
756
  if path.protocol == "s3":
724
757
  accessor = "ETag"
725
758
  elif path.protocol == "gs":
726
759
  accessor = "md5Hash"
760
+ elif path.protocol == "hf":
761
+ accessor = "blob_id"
762
+ else:
763
+ compute_list_hash = False
764
+ sizes = []
765
+ hashes = []
727
766
  for object in objects.values():
728
767
  sizes.append(object["size"])
729
- md5s.append(object[accessor].strip('"='))
768
+ if compute_list_hash:
769
+ hashes.append(object[accessor].strip('"='))
730
770
  size = sum(sizes)
731
- hash, hash_type = hash_md5s_from_dir(md5s)
732
- n_objects = len(md5s)
771
+ n_objects = len(sizes)
772
+ if compute_list_hash:
773
+ hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
733
774
  return size, hash, hash_type, n_objects
734
775
 
735
776
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lamindb_setup
3
- Version: 0.80.1
3
+ Version: 0.81.0
4
4
  Summary: Setup & configure LaminDB.
5
5
  Author-email: Lamin Labs <open-source@lamin.ai>
6
6
  Requires-Python: >=3.9
@@ -19,7 +19,7 @@ Requires-Dist: supabase==2.2.1
19
19
  Requires-Dist: psutil
20
20
  Requires-Dist: urllib3<2 ; extra == "aws"
21
21
  Requires-Dist: aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == "aws"
22
- Requires-Dist: s3fs>=2023.12.2,<=2024.6.1 ; extra == "aws"
22
+ Requires-Dist: s3fs>=2023.12.2,<=2024.10.0 ; extra == "aws"
23
23
  Requires-Dist: line_profiler ; extra == "dev"
24
24
  Requires-Dist: pyjwt<3.0.0 ; extra == "dev"
25
25
  Requires-Dist: psycopg2-binary ; extra == "dev"
@@ -31,7 +31,7 @@ Requires-Dist: pytest-xdist ; extra == "dev"
31
31
  Requires-Dist: nbproject-test>=0.4.3 ; extra == "dev"
32
32
  Requires-Dist: pandas ; extra == "dev"
33
33
  Requires-Dist: django-schema-graph ; extra == "erdiagram"
34
- Requires-Dist: gcsfs>=2023.12.2,<=2024.6.1 ; extra == "gcp"
34
+ Requires-Dist: gcsfs>=2023.12.2,<=2024.10.0 ; extra == "gcp"
35
35
  Project-URL: Home, https://github.com/laminlabs/lamindb-setup
36
36
  Provides-Extra: aws
37
37
  Provides-Extra: dev
@@ -1,4 +1,4 @@
1
- lamindb_setup/__init__.py,sha256=kc1LSkxZIe6SeFBsho6Xg6WyYxKcsUYoLNgQlQSRnxo,1714
1
+ lamindb_setup/__init__.py,sha256=Xm__DIYPUbJSjykjcT4bC-mR75b1GFYVGZ0owhVfCyI,1714
2
2
  lamindb_setup/_cache.py,sha256=1XnM-V_KprbjpgPY7Bg3FYn53Iz_2_fEgcMOaSdKKbg,1332
3
3
  lamindb_setup/_check.py,sha256=28PcG8Kp6OpjSLSi1r2boL2Ryeh6xkaCL87HFbjs6GA,129
4
4
  lamindb_setup/_check_setup.py,sha256=6cSfpmVOSgU7YiVHfJpBTGTQ7rrnwunt1pJT_jkgNM8,3196
@@ -31,17 +31,17 @@ lamindb_setup/core/_settings.py,sha256=mpGsSb98UsBedLsW2RuowZ17EP2tI2XRGPztqrJtr
31
31
  lamindb_setup/core/_settings_instance.py,sha256=ajcq9zRNE598tTqyMkMqaEOubVfFeE998DPtbgyzK3A,18801
32
32
  lamindb_setup/core/_settings_load.py,sha256=5OpghcbkrK9KBM_0Iu-61FTI76UbOpPkkJpUittXS-w,4098
33
33
  lamindb_setup/core/_settings_save.py,sha256=rxGxgaK5i9exKqSJERQQyY1WZio20meoQJoYXlVW-1w,3138
34
- lamindb_setup/core/_settings_storage.py,sha256=15B7taJF1zxJ1_qAb67NuXkTFvO2TRTWMt6KTzDf1mw,11875
34
+ lamindb_setup/core/_settings_storage.py,sha256=CYwGZm0fKYN7eLLsU-sOtOKG7HzswQVjTWb0ooHKcNg,11990
35
35
  lamindb_setup/core/_settings_store.py,sha256=WcsgOmgnu9gztcrhp-N4OONNZyxICHV8M0HdJllTaEo,2219
36
36
  lamindb_setup/core/_settings_user.py,sha256=iz0MqFLKXqm8LYx_CHmr02_oNvYWFLIxKkJLdpS5W08,1476
37
37
  lamindb_setup/core/_setup_bionty_sources.py,sha256=o2L5Ww8TKgSqJtL4cGUcpJwLNYxA9BZgddhCMCu_E2g,3428
38
38
  lamindb_setup/core/cloud_sqlite_locker.py,sha256=i6TrT7HG0lqliPvZTlsZ_uplPaqhPBbabyfeR32SkA8,7107
39
39
  lamindb_setup/core/django.py,sha256=E4U9nUlV2kHd-G5v6iSdFGAAWixlQDxOFwMwOMG9xfw,3864
40
40
  lamindb_setup/core/exceptions.py,sha256=4NpLUNUIfXYVTFX2FvLZF8RW34exk2Vn2X3G4YhnTRg,276
41
- lamindb_setup/core/hashing.py,sha256=bkuvZyAuC7-Y_qZumJd_rybF-upJ5J3KxnKiymRUifw,3148
41
+ lamindb_setup/core/hashing.py,sha256=26dtak7XgmrWa_D1zuDyxObRQcriMtnc1yEigkKASmM,3142
42
42
  lamindb_setup/core/types.py,sha256=zJii2le38BJUmsNVvzDrbzGYr0yaeb-9Rw9IKmsBr3k,523
43
- lamindb_setup/core/upath.py,sha256=q6WvpdXO_-Ajl5qjU4CIf0Q1ZbYxA0Po54LG9iHYn28,27151
44
- lamindb_setup-0.80.1.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
45
- lamindb_setup-0.80.1.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
46
- lamindb_setup-0.80.1.dist-info/METADATA,sha256=kMZ0UUKGc3sxehsLkXFPedNRYpo_VNNHKA-iu7I5W4I,1743
47
- lamindb_setup-0.80.1.dist-info/RECORD,,
43
+ lamindb_setup/core/upath.py,sha256=GD-EW9QSqJH023ox53hPDvjE86hFjXVhb0MSEU02HeY,28702
44
+ lamindb_setup-0.81.0.dist-info/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
45
+ lamindb_setup-0.81.0.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
46
+ lamindb_setup-0.81.0.dist-info/METADATA,sha256=mEUNPAf6Q4NfqeO2YuUIlADbKkLQLOFFzL-BNkvk3dY,1745
47
+ lamindb_setup-0.81.0.dist-info/RECORD,,