lamindb 0.63.5__py3-none-any.whl → 0.64.1__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
lamindb/_save.py CHANGED
@@ -11,12 +11,12 @@ from django.db import transaction
  from django.utils.functional import partition
  from lamin_utils import logger
  from lamindb_setup.dev.upath import print_hook
- from lnschema_core.models import File, Registry
+ from lnschema_core.models import Artifact, Registry

- from lamindb.dev.storage import store_object
  from lamindb.dev.storage.file import (
- auto_storage_key_from_file,
+ auto_storage_key_from_artifact,
  delete_storage_using_key,
+ store_artifact,
  )

  try:
@@ -73,20 +73,20 @@ def save(

  # previously, this was all set based,
  # but models without primary keys aren't hashable
- # we distinguish between files and non-files
- # for files, we want to bulk-upload
+ # we distinguish between artifacts and non-artifacts
+ # for artifacts, we want to bulk-upload
  # rather than upload one-by-one
- non_files, files = partition(lambda r: isinstance(r, File), records)
- if non_files:
+ non_artifacts, artifacts = partition(lambda r: isinstance(r, Artifact), records)
+ if non_artifacts:
  # first save all records that do not yet have a primary key without
  # recursing parents
- _, non_files_without_pk = partition(lambda r: r.pk is None, non_files)
- bulk_create(non_files_without_pk, ignore_conflicts=ignore_conflicts)
- non_files_with_parents = [
- r for r in non_files_without_pk if hasattr(r, "_parents")
+ _, non_artifacts_without_pk = partition(lambda r: r.pk is None, non_artifacts)
+ bulk_create(non_artifacts_without_pk, ignore_conflicts=ignore_conflicts)
+ non_artifacts_with_parents = [
+ r for r in non_artifacts_without_pk if hasattr(r, "_parents")
  ]

- if len(non_files_with_parents) > 0 and kwargs.get("parents") is not False:
+ if len(non_artifacts_with_parents) > 0 and kwargs.get("parents") is not False:
  # this can only happen within lnschema_bionty right now!!
  # we might extend to core lamindb later
  import lnschema_bionty as lb
@@ -105,14 +105,14 @@ def save(
  "you can switch this off via: lb.settings.auto_save_parents ="
  " False"
  )
- for record in non_files_with_parents:
+ for record in non_artifacts_with_parents:
  record._save_ontology_parents(mute=True)

- if files:
+ if artifacts:
  with transaction.atomic():
- for record in files:
+ for record in artifacts:
  record._save_skip_storage()
- store_files(files)
+ store_artifacts(artifacts)

  # this function returns None as potentially 10k records might be saved
  # refreshing all of them from the DB would mean a severe performance penalty
@@ -128,27 +128,27 @@ def bulk_create(records: Iterable[Registry], ignore_conflicts: Optional[bool] =
  orm.objects.bulk_create(records, ignore_conflicts=ignore_conflicts)


- # This is also used within File.save()
- def check_and_attempt_upload(file: File) -> Optional[Exception]:
- # if File object is either newly instantiated or replace() was called on
+ # This is also used within Artifact.save()
+ def check_and_attempt_upload(artifact: Artifact) -> Optional[Exception]:
+ # if Artifact object is either newly instantiated or replace() was called on
  # a local env it will have a _local_filepath and needs to be uploaded
- if hasattr(file, "_local_filepath"):
+ if hasattr(artifact, "_local_filepath"):
  try:
- upload_data_object(file)
+ upload_artifact(artifact)
  except Exception as exception:
- logger.warning(f"could not upload file: {file}")
+ logger.warning(f"could not upload artifact: {artifact}")
  return exception
  # copies (if on-disk) or moves the temporary file (if in-memory) to the cache
- copy_or_move_to_cache(file)
+ copy_or_move_to_cache(artifact)
  # after successful upload, we should remove the attribute so that another call
  # call to save won't upload again, the user should call replace() then
- del file._local_filepath
+ del artifact._local_filepath
  # returning None means proceed (either success or no action needed)
  return None


- def copy_or_move_to_cache(file: File):
- local_path = file._local_filepath
+ def copy_or_move_to_cache(artifact: Artifact):
+ local_path = artifact._local_filepath

  # in-memory zarr or on-disk zarr
  if local_path is None or not local_path.is_file():
@@ -164,7 +164,7 @@ def copy_or_move_to_cache(file: File):
  return None

  # maybe create something like storage.key_to_local(key) later to simplfy
- storage_key = auto_storage_key_from_file(file)
+ storage_key = auto_storage_key_from_artifact(artifact)
  storage_path = lamindb_setup.settings.storage.key_to_filepath(storage_key)
  cache_path = lamindb_setup.settings.storage.cloud_to_local_no_update(storage_path)
  cache_path.parent.mkdir(parents=True, exist_ok=True)
@@ -178,58 +178,58 @@ def copy_or_move_to_cache(file: File):
  os.utime(cache_path, times=(mts, mts))


- # This is also used within File.save()
- def check_and_attempt_clearing(file: File) -> Optional[Exception]:
+ # This is also used within Artifact.save()
+ def check_and_attempt_clearing(artifact: Artifact) -> Optional[Exception]:
  # this is a clean-up operation after replace() was called
  # this will only evaluate to True if replace() was called
- if hasattr(file, "_clear_storagekey"):
+ if hasattr(artifact, "_clear_storagekey"):
  try:
- if file._clear_storagekey is not None:
- delete_storage_using_key(file, file._clear_storagekey)
+ if artifact._clear_storagekey is not None:
+ delete_storage_using_key(artifact, artifact._clear_storagekey)
  logger.success(
- f"deleted stale object at storage key {file._clear_storagekey}"
+ f"deleted stale object at storage key {artifact._clear_storagekey}"
  )
- file._clear_storagekey = None
+ artifact._clear_storagekey = None
  except Exception as exception:
  return exception
  # returning None means proceed (either success or no action needed)
  return None


- def store_files(files: Iterable[File]) -> None:
- """Upload files in a list of database-committed files to storage.
+ def store_artifacts(artifacts: Iterable[Artifact]) -> None:
+ """Upload artifacts in a list of database-committed artifacts to storage.

- If any upload fails, subsequent files are cleaned up from the DB.
+ If any upload fails, subsequent artifacts are cleaned up from the DB.
  """
  exception: Optional[Exception] = None
  # because uploads might fail, we need to maintain a new list
  # of the succeeded uploads
- stored_files = []
+ stored_artifacts = []

- # upload new local files
- for file in files:
- exception = check_and_attempt_upload(file)
+ # upload new local artifacts
+ for artifact in artifacts:
+ exception = check_and_attempt_upload(artifact)
  if exception is not None:
  break
- stored_files += [file]
- exception = check_and_attempt_clearing(file)
+ stored_artifacts += [artifact]
+ exception = check_and_attempt_clearing(artifact)
  if exception is not None:
- logger.warning(f"clean up of {file._clear_storagekey} failed")
+ logger.warning(f"clean up of {artifact._clear_storagekey} failed")
  break

  if exception is not None:
- # clean up metadata for files not uploaded to storage
+ # clean up metadata for artifacts not uploaded to storage
  with transaction.atomic():
- for file in files:
- if file not in stored_files:
- file._delete_skip_storage()
- error_message = prepare_error_message(files, stored_files, exception)
+ for artifact in artifacts:
+ if artifact not in stored_artifacts:
+ artifact._delete_skip_storage()
+ error_message = prepare_error_message(artifacts, stored_artifacts, exception)
  raise RuntimeError(error_message)
  return None


- def prepare_error_message(records, stored_files, exception) -> str:
- if len(records) == 1 or len(stored_files) == 0:
+ def prepare_error_message(records, stored_artifacts, exception) -> str:
+ if len(records) == 1 or len(stored_artifacts) == 0:
  error_message = (
  "No entries were uploaded or committed"
  " to the database. See error message:\n\n"
@@ -239,7 +239,7 @@ def prepare_error_message(records, stored_files, exception) -> str:
  "The following entries have been"
  " successfully uploaded and committed to the database:\n"
  )
- for record in stored_files:
+ for record in stored_artifacts:
  error_message += (
  f"- {', '.join(record.__repr__().split(', ')[:3]) + ', ...)'}\n"
  )
@@ -248,24 +248,24 @@ def prepare_error_message(records, stored_files, exception) -> str:
  return error_message


- def upload_data_object(file) -> None:
+ def upload_artifact(artifact) -> None:
  """Store and add file and its linked entries."""
  # do NOT hand-craft the storage key!
- file_storage_key = auto_storage_key_from_file(file)
+ artifact_storage_key = auto_storage_key_from_artifact(artifact)
  storage_path = lamindb_setup.settings.instance.storage.key_to_filepath(
- file_storage_key
+ artifact_storage_key
  )
- msg = f"storing file '{file.uid}' at '{storage_path}'"
+ msg = f"storing artifact '{artifact.uid}' at '{storage_path}'"
  if (
- file.suffix in {".zarr", ".zrad"}
- and hasattr(file, "_memory_rep")
- and file._memory_rep is not None
+ artifact.suffix in {".zarr", ".zrad"}
+ and hasattr(artifact, "_memory_rep")
+ and artifact._memory_rep is not None
  ):
  logger.save(msg)
  print_progress = partial(
- print_hook, filepath=file_storage_key, action="uploading"
+ print_hook, filepath=artifact_storage_key, action="uploading"
  )
- write_adata_zarr(file._memory_rep, storage_path, callback=print_progress)
- elif hasattr(file, "_to_store") and file._to_store:
+ write_adata_zarr(artifact._memory_rep, storage_path, callback=print_progress)
+ elif hasattr(artifact, "_to_store") and artifact._to_store:
  logger.save(msg)
- store_object(file._local_filepath, file_storage_key)
+ store_artifact(artifact._local_filepath, artifact_storage_key)
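
Note: this release renames File to Artifact throughout the save path above; store_files and upload_data_object become store_artifacts and upload_artifact. A minimal usage sketch of the renamed bulk-save entry point, assuming lamindb >= 0.64 with a loaded instance; the file paths and descriptions are hypothetical placeholders:

import lamindb as ln

# hypothetical local files to register as artifacts
artifacts = [
    ln.Artifact("data/sample1.fcs", description="sample 1"),
    ln.Artifact("data/sample2.fcs", description="sample 2"),
]

# save() above partitions records into artifacts and non-artifacts,
# commits artifact metadata in one transaction, then bulk-uploads
# payloads via store_artifacts(); metadata is rolled back if an upload fails
ln.save(artifacts)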
lamindb/dev/_data.py CHANGED
@@ -4,11 +4,11 @@ from typing import Any, Dict, Iterable, List, Optional, Union
  from lamin_utils import colors, logger
  from lamindb_setup.dev._docs import doc_args
  from lnschema_core.models import (
+ Artifact,
  Data,
  Dataset,
  Feature,
  FeatureSet,
- File,
  Registry,
  Run,
  ULabel,
@@ -54,7 +54,7 @@ def add_transform_to_kwargs(kwargs: Dict[str, Any], run: Run):
  kwargs["transform"] = run.transform


- def save_feature_sets(self: Union[File, Dataset]) -> None:
+ def save_feature_sets(self: Union[Artifact, Dataset]) -> None:
  if hasattr(self, "_feature_sets"):
  saved_feature_sets = {}
  for key, feature_set in self._feature_sets.items():
@@ -72,7 +72,7 @@ def save_feature_sets(self: Union[File, Dataset]) -> None:
  )


- def save_feature_set_links(self: Union[File, Dataset]) -> None:
+ def save_feature_set_links(self: Union[Artifact, Dataset]) -> None:
  from lamindb._save import bulk_create

  Data = self.__class__
@@ -116,7 +116,7 @@ def describe(self: Data):
  "initial_version": "🔖",
  "file": "📄",
  }
- if len(foreign_key_fields) > 0: # always True for File and Dataset
+ if len(foreign_key_fields) > 0: # always True for Artifact and Dataset
  record_msg = f"{colors.green(model_name)}{__repr__(self, include_foreign_keys=False).lstrip(model_name)}" # noqa
  msg += f"{record_msg}\n\n"

@@ -229,7 +229,7 @@ def add_labels(
  " feature=ln.Feature(name='my_feature'))"
  )
  if feature.registries is not None:
- orm_dict = dict_schema_name_to_model_name(File)
+ orm_dict = dict_schema_name_to_model_name(Artifact)
  for reg in feature.registries.split("|"):
  orm = orm_dict.get(reg)
  records_validated += orm.from_values(records, field=field)
@@ -398,14 +398,15 @@ def _track_run_input(
  if run is None:
  raise ValueError(
  "No run context set. Call ln.track() or link input to a"
- " run object via `run.input_files.add(file)`"
+ " run object via `run.input_artifacts.add(artifact)`"
  )
  # avoid adding the same run twice
  run.save()
- if data_class_name == "file":
- LinkORM = run.input_files.through
+ if data_class_name == "artifact":
+ LinkORM = run.input_artifacts.through
  links = [
- LinkORM(run_id=run.id, file_id=data_id) for data_id in input_data_ids
+ LinkORM(run_id=run.id, artifact_id=data_id)
+ for data_id in input_data_ids
  ]
  else:
  LinkORM = run.input_datasets.through
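
The _track_run_input change above switches from run.input_files to run.input_artifacts, and the error message now recommends run.input_artifacts.add(artifact). A minimal sketch of that explicit linking, assuming an existing run and artifact; the filter values are hypothetical:

import lamindb as ln

# hypothetical lookups of a previously saved artifact and run
artifact = ln.Artifact.filter(description="sample 1").one()
run = ln.Run.filter().first()

# link the artifact as a run input explicitly, as the updated error message suggests
run.input_artifacts.add(artifact)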
@@ -2,7 +2,7 @@ from typing import Dict, Union

  import numpy as np
  from lamin_utils import colors
- from lnschema_core.models import Data, Dataset, Feature, File
+ from lnschema_core.models import Artifact, Data, Dataset, Feature

  from .._feature_set import FeatureSet
  from .._query_set import QuerySet
@@ -15,15 +15,15 @@ from .._registry import (
  )
  from .._save import save

- def get_host_id_field(host: Union[File, Dataset]) -> str:
- if isinstance(host, File):
- host_id_field = "file_id"
+ def get_host_id_field(host: Union[Artifact, Dataset]) -> str:
+ if isinstance(host, Artifact):
+ host_id_field = "artifact_id"
  else:
  host_id_field = "dataset_id"
  return host_id_field


- def get_accessor_by_orm(host: Union[File, Dataset]) -> Dict:
+ def get_accessor_by_orm(host: Union[Artifact, Dataset]) -> Dict:
  dictionary = {
  field.related_model.__get_name_with_schema__(): field.name
  for field in host._meta.related_objects
@@ -56,7 +56,7 @@ def get_feature_set_by_slot(host) -> Dict:


  def get_label_links(
- host: Union[File, Dataset], registry: str, feature: Feature
+ host: Union[Artifact, Dataset], registry: str, feature: Feature
  ) -> QuerySet:
  host_id_field = get_host_id_field(host)
  kwargs = {host_id_field: host.id, "feature_id": feature.id}
@@ -68,7 +68,7 @@ def get_label_links(
  return link_records


- def get_feature_set_links(host: Union[File, Dataset]) -> QuerySet:
+ def get_feature_set_links(host: Union[Artifact, Dataset]) -> QuerySet:
  host_id_field = get_host_id_field(host)
  kwargs = {host_id_field: host.id}
  feature_set_links = host.feature_sets.through.objects.filter(**kwargs)
@@ -124,7 +124,7 @@ class FeatureManager:
  See :class:`~lamindb.dev.Data` for more information.
  """

- def __init__(self, host: Union[File, Dataset]):
+ def __init__(self, host: Union[Artifact, Dataset]):
  self._host = host
  self._feature_set_by_slot = get_feature_set_by_slot(host)
  self._accessor_by_orm = get_accessor_by_orm(host)
@@ -160,7 +160,7 @@ class FeatureManager:
  """
  if self._host._state.adding:
  raise ValueError(
- "Please save the file or dataset before adding a feature set!"
+ "Please save the artifact or dataset before adding a feature set!"
  )
  host_db = self._host._state.db
  feature_set.save(using=host_db)
@@ -180,7 +180,7 @@ class FeatureManager:
  self._feature_set_by_slot[slot] = feature_set

  def _add_from(self, data: Data):
- """Transfer features from a file or dataset."""
+ """Transfer features from a artifact or dataset."""
  for slot, feature_set in data.features._feature_set_by_slot.items():
  members = feature_set.members
  registry = members[0].__class__
@@ -2,7 +2,7 @@ from typing import Dict, List, Optional, Union

  import numpy as np
  from lamin_utils import colors, logger
- from lnschema_core.models import Data, Dataset, Feature, File, Registry
+ from lnschema_core.models import Artifact, Data, Dataset, Feature, Registry

  from .._feature_set import dict_related_model_to_related_name
  from .._from_values import _print_values
@@ -103,7 +103,7 @@ class LabelManager:
  See :class:`~lamindb.dev.Data` for more information.
  """

- def __init__(self, host: Union[File, Dataset]):
+ def __init__(self, host: Union[Artifact, Dataset]):
  self._host = host

  def __repr__(self) -> str:
@@ -150,9 +150,9 @@ class LabelManager:
  """Transfer labels from a file or dataset.

  Examples:
- >>> file1 = ln.File(pd.DataFrame(index=[0, 1]))
+ >>> file1 = ln.Artifact(pd.DataFrame(index=[0, 1]))
  >>> file1.save()
- >>> file2 = ln.File(pd.DataFrame(index=[2, 3]))
+ >>> file2 = ln.Artifact(pd.DataFrame(index=[2, 3]))
  >>> file2.save()
  >>> ulabels = ln.ULabel.from_values(["Label1", "Label2"], field="name")
  >>> ln.save(ulabels)
@@ -555,7 +555,7 @@ class run_context:
  else:
  # check whether there was an update
  if (
- transform.source_file_id is not None
+ transform.source_code_id is not None
  or transform.latest_report_id is not None
  ):
  if os.getenv("LAMIN_TESTING") is None:
@@ -572,7 +572,7 @@ class run_context:
  else:
  logger.warning(
  "not tracking this transform, either increase version or delete"
- " the saved transform.source_file and transform.latest_report"
+ " the saved transform.source_code and transform.latest_report"
  )
  return False
  if transform.name != name or transform.short_name != short_name:
lamindb/dev/_settings.py CHANGED
@@ -29,7 +29,7 @@ class Settings:
  self._verbosity_int: int = 1 # success-level logging
  logger.set_verbosity(self._verbosity_int)

- upon_file_create_if_hash_exists: Literal[
+ upon_artifact_create_if_hash_exists: Literal[
  "warn_return_existing", "error", "warn_create_new"
  ] = "warn_return_existing"
  """Behavior if file hash exists (default `"warn_return_existing"`).
@@ -61,10 +61,11 @@ class Settings:
  """
  silence_file_run_transform_warning: bool = False
  """Silence warning about missing run & transform during file creation."""
- file_use_virtual_keys: bool = True
- """The `key` parameter in :class:`~lamindb.File` is treated as a virtual storage key.
+ artifact_use_virtual_keys: bool = True
+ """Treat `key` parameter in :class:`~lamindb.Artifact` as virtual.

- If `True`, the `key` is **not** used to construct file paths.
+ If `True`, the `key` is **not** used to construct file paths, but file paths are
+ based on the `uid` of artifact.
  """

  @property
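
The setting file_use_virtual_keys is renamed to artifact_use_virtual_keys and keeps its default of True. A minimal sketch of reading and toggling it, assuming ln.settings exposes the Settings instance shown above:

import lamindb as ln

# default per this diff: keys are virtual and storage paths derive from the artifact uid
print(ln.settings.artifact_use_virtual_keys)  # True

# opt back into constructing storage paths from the `key` parameter
ln.settings.artifact_use_virtual_keys = False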
lamindb/dev/_view_tree.py CHANGED
@@ -3,7 +3,7 @@ from collections import defaultdict
  from typing import Iterable

  from lamindb_setup import settings as setup_settings
- from lnschema_core.models import File, Storage
+ from lnschema_core.models import Artifact, Storage


  def view_tree(
@@ -18,7 +18,7 @@
  print("queryset")
  qs = cls
  storage_ids = qs.list("storage_id")
- elif cls == File:
+ elif cls == Artifact:
  print("file")
  qs = cls.filter(storage_id=setup_settings.storage.id).all()
  storage_ids = Storage.filter().list("id")
@@ -30,9 +30,9 @@
  storage_id: storages.get(id=storage_id).root for storage_id in storage_ids
  }
  keys = set()
- for file in qs:
- root = storage_roots.get(file.storage_id, "")
- keys.add(f"{root}/{file.key}")
+ for artifact in qs:
+ root = storage_roots.get(artifact.storage_id, "")
+ keys.add(f"{root}/{artifact.key}")

  _view_tree(
  keys=keys,
@@ -12,7 +12,7 @@ from .._settings import settings


  def file_fcs() -> Path:
- """Example FCS file."""
+ """Example FCS artifact."""
  filepath, _ = urlretrieve(
  "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
  )
@@ -93,25 +93,25 @@ def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(


  def file_fastq(in_storage_root=False) -> Path:
- """Mini mock fastq file."""
+ """Mini mock fastq artifact."""
  basedir = Path(".") if not in_storage_root else settings.storage
  filepath = basedir / "input.fastq.gz"
  with open(filepath, "w") as f:
- f.write("Mock fastq file.")
+ f.write("Mock fastq artifact.")
  return filepath


  def file_bam(in_storage_root=False) -> Path: # pragma: no cover
- """Mini mock bam file."""
+ """Mini mock bam artifact."""
  basedir = Path(".") if not in_storage_root else settings.storage
  filepath = basedir / "output.bam"
  with open(filepath, "w") as f:
- f.write("Mock bam file.")
+ f.write("Mock bam artifact.")
  return filepath


  def file_mini_csv(in_storage_root=False) -> Path:
- """Mini csv file."""
+ """Mini csv artifact."""
  basedir = Path(".") if not in_storage_root else settings.storage
  filepath = basedir / "mini.csv"
  df = pd.DataFrame([1, 2, 3], columns=["test"])
lamindb/dev/hashing.py CHANGED
@@ -10,7 +10,7 @@

  import base64
  import hashlib
- from typing import Set, Tuple
+ from typing import List, Set, Tuple


  def to_b64_str(bstr: bytes):
@@ -29,6 +29,16 @@ def hash_set(s: Set[str]) -> str:
  return to_b64_str(hashlib.md5(bstr).digest())[:20]


+ def hash_md5s_from_dir(etags: List[str]) -> Tuple[str, str]:
+ # need to sort below because we don't want the order of parsing the dir to
+ # affect the hash
+ digests = b"".join(
+ hashlib.md5(etag.encode("utf-8")).digest() for etag in sorted(etags)
+ )
+ digest = hashlib.md5(digests).digest()
+ return to_b64_str(digest)[:22], "md5-d"
+
+
  def hash_file(file_path, chunk_size=50 * 1024 * 1024) -> Tuple[str, str]:
  chunks = []
  with open(file_path, "rb") as fp:
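
The new hash_md5s_from_dir() folds per-object MD5 digests (for example, S3 ETags) into a single directory-level hash; sorting the etags first makes the result independent of listing order. A small sketch of that invariant, assuming the function is importable from lamindb.dev.hashing as added above; the etag values are made up:

from lamindb.dev.hashing import hash_md5s_from_dir

etags = ["9e107d9d372bb6826bd81d3542a419d6", "e4d909c290d0fb1ca068ffaddf22cbd0"]

digest1, hash_type = hash_md5s_from_dir(etags)
digest2, _ = hash_md5s_from_dir(list(reversed(etags)))

assert digest1 == digest2  # order-independent because etags are sorted before hashing
assert hash_type == "md5-d"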
@@ -10,5 +10,5 @@ from lamindb_setup.dev.upath import LocalPathClasses, UPath, infer_filesystem

  from ._anndata_sizes import size_adata
  from ._backed_access import AnnDataAccessor, BackedAccessor
- from .file import delete_storage, load_to_memory, store_object
+ from .file import delete_storage, load_to_memory
  from .object import infer_suffix, write_to_file
@@ -19,10 +19,10 @@ from fsspec.core import OpenFile
  from fsspec.implementations.local import LocalFileSystem
  from lamin_utils import logger
  from lamindb_setup.dev.upath import UPath, infer_filesystem
- from lnschema_core import File
+ from lnschema_core import Artifact
  from packaging import version

- from lamindb.dev.storage.file import filepath_from_file
+ from lamindb.dev.storage.file import filepath_from_artifact

  anndata_version_parse = version.parse(anndata_version)

@@ -684,12 +684,12 @@ class BackedAccessor:


  def backed_access(
- file_or_filepath: Union[File, Path]
+ artifact_or_filepath: Union[Artifact, Path]
  ) -> Union[AnnDataAccessor, BackedAccessor]:
- if isinstance(file_or_filepath, File):
- filepath = filepath_from_file(file_or_filepath)
+ if isinstance(artifact_or_filepath, Artifact):
+ filepath = filepath_from_artifact(artifact_or_filepath)
  else:
- filepath = file_or_filepath
+ filepath = artifact_or_filepath
  name = filepath.name

  if filepath.suffix in (".h5", ".hdf5", ".h5ad"):