lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_feature_set.py CHANGED
@@ -118,7 +118,7 @@ def from_values(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  raise_validation_error: bool = True,
  ) -> FeatureSet:
  """{}""" # noqa: D415
@@ -139,7 +139,7 @@ def from_values(
  not_validated_values = values_array[~validated]
  msg = (
  f"These values could not be validated: {not_validated_values.tolist()}\n"
- f"If there are no typos, add them to their registry: {registry}"
+ f"If there are no typos, add them to their registry: {registry.__name__}"
  )
  if raise_validation_error:
  raise ValidationError(msg)
@@ -149,7 +149,7 @@ def from_values(
  validated_values,
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
@@ -168,7 +168,7 @@ def from_df(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  ) -> FeatureSet | None:
  """{}""" # noqa: D415
  registry = field.field.model
@@ -189,7 +189,7 @@ def from_df(
  df.columns[validated],
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
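
The change in this file is the rename of the `public_source=` keyword to `source=` in `FeatureSet.from_values` and `FeatureSet.from_df` (plus a clearer validation-error message). A minimal, hypothetical migration sketch, assuming a lamindb instance with the bionty schema enabled; the lookup of `source_record` is illustrative only:

import bionty as bt
import lamindb as ln

# pick the ontology source to validate against (illustrative query)
source_record = bt.Source.filter(entity="Gene", organism="human").first()

# 0.74.x: ln.FeatureSet.from_values(..., public_source=source_record)
# 0.75.x: the keyword is now `source`
feature_set = ln.FeatureSet.from_values(
    ["ENSG00000139618", "ENSG00000141510"],
    field=bt.Gene.ensembl_gene_id,
    organism="human",
    source=source_record,
)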
lamindb/_filter.py CHANGED
@@ -21,9 +21,9 @@ def filter(Record: type[Record], **expressions) -> QuerySet:
  ):
  visibility = "visibility"
  if not any(e.startswith(visibility) for e in expressions):
- expressions[
- visibility
- ] = VisibilityChoice.default.value # default visibility
+ expressions[visibility] = (
+ VisibilityChoice.default.value
+ ) # default visibility
  # if visibility is None, do not apply a filter
  # otherwise, it would mean filtering for NULL values, which doesn't make
  # sense for a non-NULLABLE column
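
This hunk is a pure reformatting of the default-visibility injection; the behavior described in the comments is unchanged. A short sketch of that contract, assuming the `VisibilityChoice` values in lnschema_core (1 default, 0 hidden):

import lamindb as ln

# no visibility expression: the default visibility is injected automatically
visible = ln.Artifact.filter(suffix=".ipynb")

# an explicit visibility expression disables the injection
hidden = ln.Artifact.filter(visibility=0)

# visibility=None: no visibility filter is applied at all
everything = ln.Artifact.filter(visibility=None)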
lamindb/_finish.py CHANGED
@@ -80,8 +80,8 @@ def save_run_context_core(

  # for scripts, things are easy
  is_consecutive = True
- is_notebook = transform.type == TransformType.notebook
- source_code_path = filepath
+ is_notebook = transform.type == "notebook"
+ _source_code_artifact_path = filepath
  # for notebooks, we need more work
  if is_notebook:
  try:
@@ -134,12 +134,12 @@ def save_run_context_core(
  )
  # strip the output from the notebook to create the source code file
  # first, copy the notebook file to a temporary file in the cache
- source_code_path = ln_setup.settings.storage.cache_dir / filepath.name
- shutil.copy2(filepath, source_code_path) # copy
+ _source_code_artifact_path = ln_setup.settings.storage.cache_dir / filepath.name
+ shutil.copy2(filepath, _source_code_artifact_path) # copy
  subprocess.run(
  [
  "nbstripout",
- source_code_path,
+ _source_code_artifact_path,
  "--extra-keys",
  "metadata.version metadata.kernelspec metadata.language_info metadata.pygments_lexer metadata.name metadata.file_extension",
  ],
@@ -152,31 +152,34 @@ def save_run_context_core(
  transform_family = transform.versions
  if len(transform_family) > 0:
  for prev_transform in transform_family.order_by("-created_at"):
- if prev_transform.latest_report_id is not None:
- prev_report = prev_transform.latest_report
- if prev_transform.source_code_id is not None:
- prev_source = prev_transform.source_code
+ if (
+ prev_transform.latest_run is not None
+ and prev_transform.latest_run.report_id is not None
+ ):
+ prev_report = prev_transform.latest_run.report
+ if prev_transform._source_code_artifact_id is not None:
+ prev_source = prev_transform._source_code_artifact
  ln.settings.creation.artifact_silence_missing_run_warning = True

  # track source code
- if transform.source_code_id is not None:
+ if transform._source_code_artifact_id is not None:
  # check if the hash of the transform source code matches
  # (for scripts, we already run the same logic in track() - we can deduplicate the call at some point)
- hash, _ = hash_file(source_code_path) # ignore hash_type for now
- if hash != transform.source_code.hash:
+ hash, _ = hash_file(_source_code_artifact_path) # ignore hash_type for now
+ if hash != transform._source_code_artifact.hash:
  if os.getenv("LAMIN_TESTING") is None:
  # in test, auto-confirm overwrite
  response = input(
- f"You are about to replace (overwrite) existing source code (hash '{transform.source_code.hash}') for transform version"
+ f"You are about to replace (overwrite) existing source code (hash '{transform._source_code_artifact.hash}') for transform version"
  f" '{transform.version}'. Proceed? (y/n)"
  )
  else:
  response = "y"
  if response == "y":
- transform.source_code.replace(source_code_path)
- transform.source_code.save(upload=True)
+ transform._source_code_artifact.replace(_source_code_artifact_path)
+ transform._source_code_artifact.save(upload=True)
  logger.success(
- f"replaced transform.source_code: {transform.source_code}"
+ f"replaced transform._source_code_artifact: {transform._source_code_artifact}"
  )
  else:
  logger.warning("Please re-run `ln.track()` to make a new version")
@@ -184,17 +187,19 @@ def save_run_context_core(
  else:
  logger.important("source code is already saved")
  else:
- source_code = ln.Artifact(
- source_code_path,
+ _source_code_artifact = ln.Artifact(
+ _source_code_artifact_path,
  description=f"Source of transform {transform.uid}",
  version=transform.version,
  is_new_version_of=prev_source,
  visibility=0, # hidden file
  run=False,
  )
- source_code.save(upload=True, print_progress=False)
- transform.source_code = source_code
- logger.debug(f"saved transform.source_code: {transform.source_code}")
+ _source_code_artifact.save(upload=True, print_progress=False)
+ transform._source_code_artifact = _source_code_artifact
+ logger.debug(
+ f"saved transform._source_code_artifact: {transform._source_code_artifact}"
+ )

  # track environment
  env_path = ln_setup.settings.storage.cache_dir / f"run_env_pip_{run.uid}.txt"
@@ -257,8 +262,9 @@ def save_run_context_core(
  run.report = report_file
  run.is_consecutive = is_consecutive
  run.save()
- transform.latest_report = run.report
- logger.debug(f"saved transform.latest_report: {transform.latest_report}")
+ logger.debug(
+ f"saved transform.latest_run.report: {transform.latest_run.report}"
+ )
  transform.save()

  # finalize
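
Most of this file's changes are an internal rename: the transform's source code is now stored on the private `_source_code_artifact` field (previously `source_code`), and the run report is reached through `latest_run` instead of a `latest_report` field on the transform. A hedged access sketch for code that previously read these attributes; the record lookup is hypothetical:

import lamindb as ln

transform = ln.Transform.filter(name="my analysis").first()  # hypothetical lookup
if transform is not None:
    # 0.74.x: transform.source_code and transform.latest_report
    source_code_artifact = transform._source_code_artifact  # private field, may be None
    report = transform.latest_run.report if transform.latest_run is not None else None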
lamindb/_from_values.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations

- from typing import TYPE_CHECKING, Any, Iterable
+ from typing import TYPE_CHECKING, Iterable

  import pandas as pd
  from django.core.exceptions import FieldDoesNotExist
@@ -19,9 +19,9 @@ def get_or_create_records(
  field: StrField,
  *,
  create: bool = False,
- from_public: bool = False,
+ from_source: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  mute: bool = False,
  ) -> list[Record]:
  """Get or create records from iterables."""
@@ -34,8 +34,8 @@ def get_or_create_records(
  kwargs: dict = {}
  if organism is not None:
  kwargs["organism"] = organism
- if public_source is not None:
- kwargs["public_source"] = public_source
+ if source is not None:
+ kwargs["source"] = source
  settings.creation.search_names = False
  try:
  iterable_idx = index_iterable(iterable)
@@ -47,8 +47,17 @@ def get_or_create_records(

  # new records to be created based on new values
  if len(nonexist_values) > 0:
- if from_public:
- records_bionty, unmapped_values = create_records_from_public(
+ if source:
+ from_source = not source.in_db
+ elif (
+ records
+ and hasattr(records[0], "source_id")
+ and records[0].source_id
+ and records[0].source.in_db
+ ):
+ from_source = False
+ if from_source:
+ records_bionty, unmapped_values = create_records_from_source(
  iterable_idx=nonexist_values,
  field=field,
  msg=msg,
@@ -58,7 +67,7 @@ def get_or_create_records(
  if len(records_bionty) > 0:
  msg = ""
  for record in records_bionty:
- record._from_public = True
+ record._from_source = True
  records += records_bionty
  else:
  unmapped_values = nonexist_values
@@ -75,7 +84,7 @@ def get_or_create_records(
  f"{colors.red('did not create')} {name} record{s} for "
  f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
  )
- if Record.__module__.startswith("lnschema_bionty.") or Record == ULabel:
+ if Record.__module__.startswith("bionty.") or Record == ULabel:
  if isinstance(iterable, pd.Series):
  feature = iterable.name
  feature_name = None
@@ -100,8 +109,8 @@ def get_existing_records(
  model = field.field.model
  condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
  # existing records matching is agnostic to the bionty source
- if "public_source" in condition:
- condition.pop("public_source")
+ if "source" in condition:
+ condition.pop("source")

  # standardize based on the DB reference
  # log synonyms mapped terms
@@ -109,7 +118,7 @@ def get_existing_records(
  iterable_idx,
  field=field,
  organism=kwargs.get("organism"),
- public_source=kwargs.get("public_source"),
+ source=kwargs.get("source"),
  mute=True,
  )
  syn_mapper = result.synonyms_mapper
@@ -174,7 +183,7 @@ def get_existing_records(
  return records, nonexist_values, msg


- def create_records_from_public(
+ def create_records_from_source(
  iterable_idx: pd.Index,
  field: StrField,
  msg: str = "",
@@ -184,7 +193,8 @@ def create_records_from_public(
  model = field.field.model
  records: list = []
  # populate additional fields from bionty
- from lnschema_bionty._bionty import get_public_source_record
+ from bionty._bionty import get_source_record
+ from bionty.core._bionty import filter_bionty_df_columns

  # create the corresponding bionty object from model
  try:
@@ -195,17 +205,20 @@ def create_records_from_public(
  organism = "human"
  elif iterable_idx[0].startswith("ENSMUSG"):
  organism = "mouse"
- public_ontology = model.public(
- organism=organism, public_source=kwargs.get("public_source")
- )
+ public_ontology = model.public(organism=organism, source=kwargs.get("source"))
  except Exception:
  # for custom records that are not created from public sources
  return records, iterable_idx
- # add public_source record to the kwargs
- kwargs.update({"public_source": get_public_source_record(public_ontology)})
+ # add source record to the kwargs
+ source_record = get_source_record(public_ontology)
+ if source_record is not None and source_record.in_db:
+ # skips the creation of records from public if the source is already in the db
+ return records, iterable_idx
+
+ kwargs.update({"source": source_record})

  # filter the columns in bionty df based on fields
- bionty_df = _filter_bionty_df_columns(model=model, public_ontology=public_ontology)
+ bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)

  # standardize in the bionty reference
  result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
@@ -301,43 +314,6 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
  return print_values


- def _filter_bionty_df_columns(model: Record, public_ontology: Any) -> pd.DataFrame:
- bionty_df = pd.DataFrame()
- if public_ontology is not None:
- model_field_names = {i.name for i in model._meta.fields}
- # parents needs to be added here as relationships aren't in fields
- model_field_names.add("parents")
- bionty_df = public_ontology.df().reset_index()
- if model.__name__ == "Gene":
- # groupby ensembl_gene_id and concat ncbi_gene_ids
- groupby_id_col = (
- "ensembl_gene_id" if "ensembl_gene_id" in bionty_df else "stable_id"
- )
- bionty_df.drop(
- columns=["hgnc_id", "mgi_id", "index"], errors="ignore", inplace=True
- )
- bionty_df.drop_duplicates([groupby_id_col, "ncbi_gene_id"], inplace=True)
- bionty_df["ncbi_gene_id"] = bionty_df["ncbi_gene_id"].fillna("")
- bionty_df = (
- bionty_df.groupby(groupby_id_col)
- .agg(
- {
- "symbol": "first",
- "ncbi_gene_id": "|".join,
- "biotype": "first",
- "description": "first",
- "synonyms": "first",
- }
- )
- .reset_index()
- )
- bionty_df.rename(columns={"ncbi_gene_id": "ncbi_gene_ids"}, inplace=True)
- # rename definition to description for the lnschema_bionty
- bionty_df.rename(columns={"definition": "description"}, inplace=True)
- bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
- return bionty_df
-
-
  def _bulk_create_dicts_from_df(
  keys: set | list, column_name: str, df: pd.DataFrame
  ) -> tuple[dict, str]:
@@ -359,9 +335,9 @@ def _bulk_create_dicts_from_df(
  return df.reset_index().to_dict(orient="records"), multi_msg


- def _has_organism_field(orm: Record) -> bool:
+ def _has_organism_field(registry: type[Record]) -> bool:
  try:
- orm._meta.get_field("organism")
+ registry._meta.get_field("organism")
  return True
  except FieldDoesNotExist:
  return False
@@ -370,12 +346,17 @@ def _has_organism_field(orm: Record) -> bool:
  def _get_organism_record(
  field: StrField, organism: str | Record, force: bool = False
  ) -> Record:
- model = field.field.model
- check = True if force else field.field.name != "ensembl_gene_id"
-
- if _has_organism_field(model) and check:
- from lnschema_bionty._bionty import create_or_get_organism_record
-
- organism_record = create_or_get_organism_record(organism=organism, orm=model)
+ registry = field.field.model
+ check = True
+ if not force and hasattr(registry, "_ontology_id_field"):
+ check = field.field.name != registry._ontology_id_field
+ # e.g. bionty.CellMarker has "name" as _ontology_id_field
+ if not registry._ontology_id_field.endswith("id"):
+ check = True
+
+ if _has_organism_field(registry) and check:
+ from bionty._bionty import create_or_get_organism_record
+
+ organism_record = create_or_get_organism_record(organism=organism, orm=registry)
  if organism_record is not None:
  return organism_record
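
The net effect of the `public_source` → `source` renames plus the new branching is that public-source lookup is skipped whenever the requested `source` (or the source of already-created records) is loaded into the instance (`in_db`). A small restatement of that decision logic, written as a hypothetical helper rather than the library API:

def resolve_from_source(from_source: bool, source, records) -> bool:
    """Restate the branching added to get_or_create_records above."""
    if source is not None:
        # an in-db source means records should come from the instance, not from public
        return not source.in_db
    if records and getattr(records[0], "source_id", None) and records[0].source.in_db:
        # existing records already point to an in-db source
        return False
    return from_source  # otherwise keep the caller's flag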
lamindb/_is_versioned.py CHANGED
@@ -16,7 +16,7 @@ def _add_to_version_family(
  ):
  old_uid = self.uid
  new_uid, version = get_uid_from_old_version(is_new_version_of, version)
- if self.__class__.__name__ == "Artifact" and self.key_is_virtual:
+ if self.__class__.__name__ == "Artifact" and self._key_is_virtual:
  old_path = self.path
  new_path = get_new_path_from_uid(
  old_path=old_path, old_uid=old_uid, new_uid=new_uid
lamindb/_parents.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations

  import builtins
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Literal

  import lamindb_setup as ln_setup
  from lamin_utils import logger
@@ -10,7 +10,7 @@ from lnschema_core.models import HasParents, format_field_value

  from lamindb._utils import attach_func_to_class_method

- from ._record import get_default_str_field
+ from ._record import get_name_field

  if TYPE_CHECKING:
  from lnschema_core.types import StrField
@@ -61,7 +61,7 @@ def view_parents(
  distance: int = 5,
  ):
  if field is None:
- field = get_default_str_field(self)
+ field = get_name_field(self)
  if not isinstance(field, str):
  field = field.field.name

@@ -137,10 +137,14 @@ def view_lineage(data: Artifact | Collection, with_children: bool = True) -> Non


  def _view_parents(
- record: Record, field: str, with_children: bool = False, distance: int = 100
+ record: Record,
+ field: str,
+ with_children: bool = False,
+ distance: int = 100,
+ attr_name: Literal["parents", "predecessors"] = "parents",
  ):
  """Graph of parents."""
- if not hasattr(record, "parents"):
+ if not hasattr(record, attr_name):
  raise NotImplementedError(
  f"Parents view is not supported for {record.__class__.__name__}!"
  )
@@ -149,13 +153,17 @@ def _view_parents(

  df_edges = None
  df_edges_parents = _df_edges_from_parents(
- record=record, field=field, distance=distance
+ record=record, field=field, distance=distance, attr_name=attr_name
  )
  if df_edges_parents is not None:
  df_edges = df_edges_parents
  if with_children:
  df_edges_children = _df_edges_from_parents(
- record=record, field=field, distance=distance, children=True
+ record=record,
+ field=field,
+ distance=distance,
+ children=True,
+ attr_name=attr_name,
  )
  if df_edges_children is not None:
  if df_edges is not None:
@@ -197,12 +205,18 @@ def _view_parents(
  _view(u)


- def _get_parents(record: Record, field: str, distance: int, children: bool = False):
+ def _get_parents(
+ record: Record,
+ field: str,
+ distance: int,
+ children: bool = False,
+ attr_name: Literal["parents", "predecessors"] = "parents",
+ ):
  """Recursively get parent records within a distance."""
  if children:
- key = "parents"
+ key = attr_name
  else:
- key = "children"
+ key = "children" if attr_name == "parents" else "successors" # type: ignore
  model = record.__class__
  condition = f"{key}__{field}"
  results = model.filter(**{condition: record.__getattribute__(field)}).all()
@@ -228,12 +242,23 @@ def _get_parents(record: Record, field: str, distance: int, children: bool = Fal


  def _df_edges_from_parents(
- record: Record, field: str, distance: int, children: bool = False
+ record: Record,
+ field: str,
+ distance: int,
+ children: bool = False,
+ attr_name: Literal["parents", "predecessors"] = "parents",
  ):
  """Construct a DataFrame of edges as the input of graphviz.Digraph."""
- key = "children" if children else "parents"
+ if attr_name == "parents":
+ key = "children" if children else "parents"
+ else:
+ key = "successors" if children else "predecessors"
  parents = _get_parents(
- record=record, field=field, distance=distance, children=children
+ record=record,
+ field=field,
+ distance=distance,
+ children=children,
+ attr_name=attr_name,
  )
  all = record.__class__.objects
  records = parents | all.filter(id=record.id)
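
The new `attr_name` parameter lets the same graph helpers walk either `parents`/`children` (ontology-style registries) or `predecessors`/`successors` (presumably for `Transform` lineage). A hedged sketch of calling the private helper directly with the new parameter; the record lookup is hypothetical:

import lamindb as ln
from lamindb._parents import _view_parents  # private helper changed above

transform = ln.Transform.filter(name="my analysis").first()  # hypothetical lookup
if transform is not None:
    _view_parents(transform, field="name", with_children=True, attr_name="predecessors")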