lamindb 0.74.3__py3-none-any.whl → 0.75.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
lamindb/_curate.py CHANGED
@@ -9,7 +9,6 @@ from lamin_utils import colors, logger
  from lamindb_setup.core._docs import doc_args
  from lnschema_core import (
  Artifact,
- Collection,
  Feature,
  Record,
  Run,
@@ -92,6 +91,7 @@ class DataFrameCurator:
  using: The reference instance containing registries to validate against.
  verbosity: The verbosity level.
  organism: The organism name.
+ sources: A dictionary mapping column names to Source records.

  Examples:
  >>> import bionty as bt
@@ -109,6 +109,7 @@ class DataFrameCurator:
  using: str | None = None,
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> None:
  from lamindb.core._settings import settings

@@ -121,6 +122,9 @@ class DataFrameCurator:
  self._collection = None
  self._validated = False
  self._kwargs = {"organism": organism} if organism else {}
+ if sources is None:
+ sources = {}
+ self._sources = sources
  self._save_columns()

  @property
@@ -158,6 +162,7 @@ class DataFrameCurator:
  save_function="add_new_from_columns",
  using=self._using,
  validated_only=False,
+ source=self._sources.get("columns"),
  **kwargs,
  )

@@ -172,6 +177,7 @@ class DataFrameCurator:
  using=self._using,
  validated_only=validated_only,
  df=self._df, # Get the Feature type from df
+ source=self._sources.get("columns"),
  **kwargs,
  )

@@ -222,6 +228,7 @@ class DataFrameCurator:
  key=categorical,
  using=self._using,
  validated_only=validated_only,
+ sources=self._sources.get(categorical),
  **kwargs,
  )

@@ -242,6 +249,7 @@ class DataFrameCurator:
  self._df,
  fields=self.fields,
  using=self._using,
+ sources=self._sources,
  **self._kwargs,
  )
  return self._validated
@@ -283,41 +291,6 @@ class DataFrameCurator:

  return self._artifact

- def save_collection(
- self,
- artifact: Artifact | Iterable[Artifact],
- name: str,
- description: str | None = None,
- reference: str | None = None,
- reference_type: str | None = None,
- ) -> Collection:
- """Save a collection from artifact/artifacts.
-
- Args:
- artifact: One or several saved Artifacts.
- name: Title of the publication.
- description: Description of the publication.
- reference: Accession number (e.g. GSE#, E-MTAB#, etc.).
- reference_type: Source type (e.g. GEO, ArrayExpress, SRA, etc.).
- """
- collection = Collection(
- artifact,
- name=name,
- description=description,
- reference=reference,
- reference_type=reference_type,
- )
- slug = ln_setup.settings.instance.slug
- if collection._state.adding:
- collection.save()
- else: # pragma: no cover
- collection.save()
- logger.warning(f"collection already exists in {colors.italic(slug)}!")
- if ln_setup.settings.instance.is_remote: # pragma: no cover
- logger.print(f"go to https://lamin.ai/{slug}/collection/{collection.uid}")
- self._collection = collection
- return collection
-
  def clean_up_failed_runs(self):
  """Clean up previous failed runs that don't save any outputs."""
  from lamindb.core._run_context import run_context
@@ -338,6 +311,7 @@ class AnnDataCurator(DataFrameCurator):
  using: A reference LaminDB instance.
  verbosity: The verbosity level.
  organism: The organism name.
+ sources: A dictionary mapping ``.obs.columns`` to Source records.

  Examples:
  >>> import bionty as bt
@@ -357,11 +331,14 @@ class AnnDataCurator(DataFrameCurator):
  using: str = "default",
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> None:
  from lamindb_setup.core import upath

  from ._artifact import data_is_anndata

+ if sources is None:
+ sources = {}
  if not data_is_anndata(data):
  raise ValueError(
  "data has to be an AnnData object or a path to AnnData-like"
@@ -381,6 +358,7 @@ class AnnDataCurator(DataFrameCurator):
  using=using,
  verbosity=verbosity,
  organism=organism,
+ sources=sources,
  )
  self._obs_fields = categoricals
  self._save_from_var_index(validated_only=True, **self._kwargs)
@@ -421,6 +399,7 @@ class AnnDataCurator(DataFrameCurator):
  using=self._using,
  validated_only=validated_only,
  organism=organism,
+ source=self._sources.get("var_index"),
  )

  def add_new_from_var_index(self, organism: str | None = None, **kwargs):
@@ -455,7 +434,11 @@ class AnnDataCurator(DataFrameCurator):
  **self._kwargs,
  )
  validated_obs = validate_categories_in_df(
- self._adata.obs, fields=self.categoricals, using=self._using, **self._kwargs
+ self._adata.obs,
+ fields=self.categoricals,
+ using=self._using,
+ sources=self._sources,
+ **self._kwargs,
  )
  self._validated = validated_var and validated_obs
  return self._validated
@@ -519,7 +502,11 @@ class MuDataCurator:
  using: str = "default",
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> None:
+ if sources is None:
+ sources = {}
+ self._sources = sources
  self._mdata = mdata
  self._kwargs = {"organism": organism} if organism else {}
  self._var_fields = var_index
@@ -534,6 +521,7 @@ class MuDataCurator:
  categoricals=self._obs_fields.get(modality, {}),
  using=using,
  verbosity=verbosity,
+ sources=self._sources.get(modality),
  **self._kwargs,
  )
  for modality in self._modalities
@@ -713,7 +701,11 @@ class MuDataCurator:
  else:
  obs = self._mdata[modality].obs
  validated_obs &= validate_categories_in_df(
- obs, fields=fields, using=self._using, **self._kwargs
+ obs,
+ fields=fields,
+ using=self._using,
+ sources=self._sources.get(modality),
+ **self._kwargs,
  )
  self._validated = validated_var and validated_obs
  return self._validated
@@ -776,6 +768,7 @@ class Curate:
  using: str = "default",
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> AnnDataCurator:
  """{}""" # noqa: D415
  return AnnDataCurator(
@@ -785,6 +778,7 @@ class Curate:
  using=using,
  verbosity=verbosity,
  organism=organism,
+ sources=sources,
  )

  @classmethod
@@ -848,6 +842,7 @@ def validate_categories(
  key: str,
  using: str | None = None,
  organism: str | None = None,
+ source: Record | None = None,
  ) -> bool:
  """Validate ontology terms in a pandas series using LaminDB registries."""
  from lamindb._from_values import _print_values
@@ -862,6 +857,7 @@ def validate_categories(

  registry = field.field.model
  filter_kwargs = check_registry_organism(registry, organism)
+ filter_kwargs.update({"source": source} if source else {})

  # Inspect the default instance
  inspect_result = standardize_and_inspect(
@@ -927,9 +923,12 @@ def validate_categories_in_df(
  df: pd.DataFrame,
  fields: dict[str, FieldAttr],
  using: str | None = None,
+ sources: dict[str, Record] = None,
  **kwargs,
  ) -> bool:
  """Validate categories in DataFrame columns using LaminDB registries."""
+ if sources is None:
+ sources = {}
  validated = True
  for key, field in fields.items():
  validated &= validate_categories(
@@ -937,6 +936,7 @@
  field=field,
  key=key,
  using=using,
+ source=sources.get(key),
  **kwargs,
  )
  return validated
@@ -998,13 +998,13 @@ def save_artifact(
  organism,
  )

- if artifact.accessor == "DataFrame":
+ if artifact._accessor == "DataFrame":
  artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
- elif artifact.accessor == "AnnData":
+ elif artifact._accessor == "AnnData":
  artifact.features._add_set_from_anndata(
  var_field=columns_field, **feature_kwargs
  )
- elif artifact.accessor == "MuData":
+ elif artifact._accessor == "MuData":
  artifact.features._add_set_from_mudata(
  var_fields=columns_field, **feature_kwargs
  )
@@ -1021,7 +1021,7 @@ def save_artifact(
  labels = registry.from_values(df[key], field=field, **filter_kwargs)
  artifact.labels.add(labels, feature)

- if artifact.accessor == "MuData":
+ if artifact._accessor == "MuData":
  for modality, modality_fields in fields.items():
  if modality == "obs":
  _add_labels(data, artifact, modality_fields)
@@ -1046,6 +1046,7 @@ def update_registry(
  df: pd.DataFrame | None = None,
  organism: str | None = None,
  dtype: str | None = None,
+ source: Record | None = None,
  **kwargs,
  ) -> list[Record]:
  """Save features or labels records in the default instance from the using instance.
@@ -1060,6 +1061,7 @@
  df: A DataFrame to save labels from.
  organism: The organism name.
  dtype: The type of the feature.
+ source: The source record.
  kwargs: Additional keyword arguments to pass to the registry model to create new records.
  """
  from lamindb._save import save as ln_save
@@ -1067,6 +1069,7 @@

  registry = field.field.model
  filter_kwargs = check_registry_organism(registry, organism)
+ filter_kwargs.update({"source": source} if source else {})

  verbosity = settings.verbosity
  try:
@@ -1098,6 +1101,10 @@
  if non_validated_labels
  else []
  )
+ # here we check to only save the public records if they are from the specified source
+ # TODO: this if shouldn't be needed
+ if source:
+ public_records = [r for r in public_records if r.source == source]
  ln_save(public_records)
  labels_saved["from public"] = [
  getattr(r, field.field.name) for r in public_records
@@ -1119,7 +1126,11 @@
  if registry == Feature:
  init_kwargs["dtype"] = "cat" if dtype is None else dtype
  non_validated_records.append(
- registry(**init_kwargs, **filter_kwargs, **kwargs)
+ registry(
+ **init_kwargs,
+ **{k: v for k, v in filter_kwargs.items() if k != "source"},
+ **{k: v for k, v in kwargs.items() if k != "sources"},
+ )
  )
  ln_save(non_validated_records)

@@ -1242,7 +1253,7 @@ def _save_organism(name: str): # pragma: no cover

  organism = bt.Organism.filter(name=name).one_or_none()
  if organism is None:
- organism = bt.Organism.from_public(name=name)
+ organism = bt.Organism.from_source(name=name)
  if organism is None:
  raise ValueError(
  f"Organism '{name}' not found\n"
lamindb/_feature_set.py CHANGED
@@ -118,7 +118,7 @@ def from_values(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  raise_validation_error: bool = True,
  ) -> FeatureSet:
  """{}""" # noqa: D415
@@ -139,7 +139,7 @@ def from_values(
  not_validated_values = values_array[~validated]
  msg = (
  f"These values could not be validated: {not_validated_values.tolist()}\n"
- f"If there are no typos, add them to their registry: {registry}"
+ f"If there are no typos, add them to their registry: {registry.__name__}"
  )
  if raise_validation_error:
  raise ValidationError(msg)
@@ -149,7 +149,7 @@
  validated_values,
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
@@ -168,7 +168,7 @@ def from_df(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  ) -> FeatureSet | None:
  """{}""" # noqa: D415
  registry = field.field.model
@@ -189,7 +189,7 @@ def from_df(
  df.columns[validated],
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
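In `FeatureSet.from_values` and `FeatureSet.from_df`, the `public_source` keyword becomes `source`, and the validation-error message now prints the registry class name rather than its full repr. A hedged sketch of the renamed keyword in use (the DataFrame `df` and the Source lookup are assumptions, not taken from this diff):

import bionty as bt
import lamindb as ln

# Illustrative: pin the gene annotation source when building a feature set
# from DataFrame columns; the filter fields depend on your registered sources.
gene_source = bt.Source.filter(entity="Gene", organism="human").last()
feature_set = ln.FeatureSet.from_df(
    df,
    field=bt.Gene.symbol,
    organism="human",
    source=gene_source,  # was `public_source` in 0.74.x
)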
lamindb/_filter.py CHANGED
@@ -21,9 +21,9 @@ def filter(Record: type[Record], **expressions) -> QuerySet:
  ):
  visibility = "visibility"
  if not any(e.startswith(visibility) for e in expressions):
- expressions[
- visibility
- ] = VisibilityChoice.default.value # default visibility
+ expressions[visibility] = (
+ VisibilityChoice.default.value
+ ) # default visibility
  # if visibility is None, do not apply a filter
  # otherwise, it would mean filtering for NULL values, which doesn't make
  # sense for a non-NULLABLE column
lamindb/_finish.py CHANGED
@@ -80,8 +80,8 @@ def save_run_context_core(

  # for scripts, things are easy
  is_consecutive = True
- is_notebook = transform.type == TransformType.notebook
- source_code_path = filepath
+ is_notebook = transform.type == "notebook"
+ _source_code_artifact_path = filepath
  # for notebooks, we need more work
  if is_notebook:
  try:
@@ -134,12 +134,12 @@ def save_run_context_core(
  )
  # strip the output from the notebook to create the source code file
  # first, copy the notebook file to a temporary file in the cache
- source_code_path = ln_setup.settings.storage.cache_dir / filepath.name
- shutil.copy2(filepath, source_code_path) # copy
+ _source_code_artifact_path = ln_setup.settings.storage.cache_dir / filepath.name
+ shutil.copy2(filepath, _source_code_artifact_path) # copy
  subprocess.run(
  [
  "nbstripout",
- source_code_path,
+ _source_code_artifact_path,
  "--extra-keys",
  "metadata.version metadata.kernelspec metadata.language_info metadata.pygments_lexer metadata.name metadata.file_extension",
  ],
@@ -152,31 +152,34 @@ def save_run_context_core(
  transform_family = transform.versions
  if len(transform_family) > 0:
  for prev_transform in transform_family.order_by("-created_at"):
- if prev_transform.latest_report_id is not None:
- prev_report = prev_transform.latest_report
- if prev_transform.source_code_id is not None:
- prev_source = prev_transform.source_code
+ if (
+ prev_transform.latest_run is not None
+ and prev_transform.latest_run.report_id is not None
+ ):
+ prev_report = prev_transform.latest_run.report
+ if prev_transform._source_code_artifact_id is not None:
+ prev_source = prev_transform._source_code_artifact
  ln.settings.creation.artifact_silence_missing_run_warning = True

  # track source code
- if transform.source_code_id is not None:
+ if transform._source_code_artifact_id is not None:
  # check if the hash of the transform source code matches
  # (for scripts, we already run the same logic in track() - we can deduplicate the call at some point)
- hash, _ = hash_file(source_code_path) # ignore hash_type for now
- if hash != transform.source_code.hash:
+ hash, _ = hash_file(_source_code_artifact_path) # ignore hash_type for now
+ if hash != transform._source_code_artifact.hash:
  if os.getenv("LAMIN_TESTING") is None:
  # in test, auto-confirm overwrite
  response = input(
- f"You are about to replace (overwrite) existing source code (hash '{transform.source_code.hash}') for transform version"
+ f"You are about to replace (overwrite) existing source code (hash '{transform._source_code_artifact.hash}') for transform version"
  f" '{transform.version}'. Proceed? (y/n)"
  )
  else:
  response = "y"
  if response == "y":
- transform.source_code.replace(source_code_path)
- transform.source_code.save(upload=True)
+ transform._source_code_artifact.replace(_source_code_artifact_path)
+ transform._source_code_artifact.save(upload=True)
  logger.success(
- f"replaced transform.source_code: {transform.source_code}"
+ f"replaced transform._source_code_artifact: {transform._source_code_artifact}"
  )
  else:
  logger.warning("Please re-run `ln.track()` to make a new version")
@@ -184,17 +187,19 @@ def save_run_context_core(
  else:
  logger.important("source code is already saved")
  else:
- source_code = ln.Artifact(
- source_code_path,
+ _source_code_artifact = ln.Artifact(
+ _source_code_artifact_path,
  description=f"Source of transform {transform.uid}",
  version=transform.version,
  is_new_version_of=prev_source,
  visibility=0, # hidden file
  run=False,
  )
- source_code.save(upload=True, print_progress=False)
- transform.source_code = source_code
- logger.debug(f"saved transform.source_code: {transform.source_code}")
+ _source_code_artifact.save(upload=True, print_progress=False)
+ transform._source_code_artifact = _source_code_artifact
+ logger.debug(
+ f"saved transform._source_code_artifact: {transform._source_code_artifact}"
+ )

  # track environment
  env_path = ln_setup.settings.storage.cache_dir / f"run_env_pip_{run.uid}.txt"
@@ -257,8 +262,9 @@ def save_run_context_core(
  run.report = report_file
  run.is_consecutive = is_consecutive
  run.save()
- transform.latest_report = run.report
- logger.debug(f"saved transform.latest_report: {transform.latest_report}")
+ logger.debug(
+ f"saved transform.latest_run.report: {transform.latest_run.report}"
+ )
  transform.save()

  # finalize
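The `_finish.py` changes stop writing the report and source code onto the transform itself: the report is now reached through `transform.latest_run.report`, and the source-code file is stored in the private `_source_code_artifact` field. A hedged sketch of fetching a notebook report after this change (the name filter is illustrative):

import lamindb as ln

transform = ln.Transform.filter(name="My analysis").one_or_none()
if transform is not None and transform.latest_run is not None:
    report = transform.latest_run.report  # was transform.latest_report in 0.74.x
    if report is not None:
        print(report.path)  # path of the saved report artifact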
lamindb/_from_values.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations

- from typing import TYPE_CHECKING, Any, Iterable
+ from typing import TYPE_CHECKING, Iterable

  import pandas as pd
  from django.core.exceptions import FieldDoesNotExist
@@ -19,9 +19,9 @@ def get_or_create_records(
  field: StrField,
  *,
  create: bool = False,
- from_public: bool = False,
+ from_source: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  mute: bool = False,
  ) -> list[Record]:
  """Get or create records from iterables."""
@@ -34,8 +34,8 @@ def get_or_create_records(
  kwargs: dict = {}
  if organism is not None:
  kwargs["organism"] = organism
- if public_source is not None:
- kwargs["public_source"] = public_source
+ if source is not None:
+ kwargs["source"] = source
  settings.creation.search_names = False
  try:
  iterable_idx = index_iterable(iterable)
@@ -47,8 +47,17 @@ def get_or_create_records(

  # new records to be created based on new values
  if len(nonexist_values) > 0:
- if from_public:
- records_bionty, unmapped_values = create_records_from_public(
+ if source:
+ from_source = not source.in_db
+ elif (
+ records
+ and hasattr(records[0], "source_id")
+ and records[0].source_id
+ and records[0].source.in_db
+ ):
+ from_source = False
+ if from_source:
+ records_bionty, unmapped_values = create_records_from_source(
  iterable_idx=nonexist_values,
  field=field,
  msg=msg,
@@ -58,7 +67,7 @@
  if len(records_bionty) > 0:
  msg = ""
  for record in records_bionty:
- record._from_public = True
+ record._from_source = True
  records += records_bionty
  else:
  unmapped_values = nonexist_values
@@ -75,7 +84,7 @@
  f"{colors.red('did not create')} {name} record{s} for "
  f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
  )
- if Record.__module__.startswith("lnschema_bionty.") or Record == ULabel:
+ if Record.__module__.startswith("bionty.") or Record == ULabel:
  if isinstance(iterable, pd.Series):
  feature = iterable.name
  feature_name = None
@@ -100,8 +109,8 @@ def get_existing_records(
  model = field.field.model
  condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
  # existing records matching is agnostic to the bionty source
- if "public_source" in condition:
- condition.pop("public_source")
+ if "source" in condition:
+ condition.pop("source")

  # standardize based on the DB reference
  # log synonyms mapped terms
@@ -109,7 +118,7 @@
  iterable_idx,
  field=field,
  organism=kwargs.get("organism"),
- public_source=kwargs.get("public_source"),
+ source=kwargs.get("source"),
  mute=True,
  )
  syn_mapper = result.synonyms_mapper
@@ -174,7 +183,7 @@ def get_existing_records(
  return records, nonexist_values, msg


- def create_records_from_public(
+ def create_records_from_source(
  iterable_idx: pd.Index,
  field: StrField,
  msg: str = "",
@@ -184,7 +193,7 @@ def create_records_from_public(
  model = field.field.model
  records: list = []
  # populate additional fields from bionty
- from lnschema_bionty._bionty import get_public_source_record
+ from bionty._bionty import get_source_record
+ from bionty.core._bionty import filter_bionty_df_columns

  # create the corresponding bionty object from model
  try:
@@ -195,17 +205,20 @@
  organism = "human"
  elif iterable_idx[0].startswith("ENSMUSG"):
  organism = "mouse"
- public_ontology = model.public(
- organism=organism, public_source=kwargs.get("public_source")
- )
+ public_ontology = model.public(organism=organism, source=kwargs.get("source"))
  except Exception:
  # for custom records that are not created from public sources
  return records, iterable_idx
- # add public_source record to the kwargs
- kwargs.update({"public_source": get_public_source_record(public_ontology)})
+ # add source record to the kwargs
+ source_record = get_source_record(public_ontology)
+ if source_record is not None and source_record.in_db:
+ # skips the creation of records from public if the source is already in the db
+ return records, iterable_idx
+
+ kwargs.update({"source": source_record})

  # filter the columns in bionty df based on fields
- bionty_df = _filter_bionty_df_columns(model=model, public_ontology=public_ontology)
+ bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)

  # standardize in the bionty reference
  result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
@@ -301,43 +314,6 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
  return print_values


- def _filter_bionty_df_columns(model: Record, public_ontology: Any) -> pd.DataFrame:
- bionty_df = pd.DataFrame()
- if public_ontology is not None:
- model_field_names = {i.name for i in model._meta.fields}
- # parents needs to be added here as relationships aren't in fields
- model_field_names.add("parents")
- bionty_df = public_ontology.df().reset_index()
- if model.__name__ == "Gene":
- # groupby ensembl_gene_id and concat ncbi_gene_ids
- groupby_id_col = (
- "ensembl_gene_id" if "ensembl_gene_id" in bionty_df else "stable_id"
- )
- bionty_df.drop(
- columns=["hgnc_id", "mgi_id", "index"], errors="ignore", inplace=True
- )
- bionty_df.drop_duplicates([groupby_id_col, "ncbi_gene_id"], inplace=True)
- bionty_df["ncbi_gene_id"] = bionty_df["ncbi_gene_id"].fillna("")
- bionty_df = (
- bionty_df.groupby(groupby_id_col)
- .agg(
- {
- "symbol": "first",
- "ncbi_gene_id": "|".join,
- "biotype": "first",
- "description": "first",
- "synonyms": "first",
- }
- )
- .reset_index()
- )
- bionty_df.rename(columns={"ncbi_gene_id": "ncbi_gene_ids"}, inplace=True)
- # rename definition to description for the lnschema_bionty
- bionty_df.rename(columns={"definition": "description"}, inplace=True)
- bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
- return bionty_df
-
-
  def _bulk_create_dicts_from_df(
  keys: set | list, column_name: str, df: pd.DataFrame
  ) -> tuple[dict, str]:
@@ -359,7 +335,7 @@ def _bulk_create_dicts_from_df(
  return df.reset_index().to_dict(orient="records"), multi_msg


- def _has_organism_field(orm: Record) -> bool:
+ def _has_organism_field(orm: type[Record]) -> bool:
  try:
  orm._meta.get_field("organism")
  return True
@@ -371,10 +347,15 @@ def _get_organism_record(
  field: StrField, organism: str | Record, force: bool = False
  ) -> Record:
  model = field.field.model
- check = True if force else field.field.name != "ensembl_gene_id"
+ check = True
+ if not force and hasattr(model, "_ontology_id_field"):
+ check = field.field.name != model._ontology_id_field
+ # e.g. bionty.CellMarker has "name" as _ontology_id_field
+ if not model._ontology_id_field.endswith("id"):
+ check = True

  if _has_organism_field(model) and check:
- from lnschema_bionty._bionty import create_or_get_organism_record
+ from bionty._bionty import create_or_get_organism_record

  organism_record = create_or_get_organism_record(organism=organism, orm=model)
  if organism_record is not None:
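Throughout `_from_values.py`, imports move from `lnschema_bionty` to the `bionty` package, the `public_source`/`from_public` terminology becomes `source`/`from_source`, the Gene-specific bionty-dataframe filtering is delegated to `bionty.core`, and record creation from a public ontology is now skipped when the resolved Source is already in the database. A hedged sketch of what the rename means in user code (the ontology term and the Source filter are illustrative, not taken from this diff):

import bionty as bt

# 0.74.x: cell_type = bt.CellType.from_public(name="T cell")
# 0.75.0 onwards, the same lookup goes through from_source:
cell_type = bt.CellType.from_source(name="T cell")

# an explicit Source record can be passed to pin the ontology version
cl_source = bt.Source.filter(entity="CellType").last()
cell_type_pinned = bt.CellType.from_source(name="T cell", source=cl_source)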