lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_feature.py CHANGED
@@ -1,38 +1,66 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING
3
+ from typing import TYPE_CHECKING, Any, Literal, get_args
4
4
 
5
5
  import lamindb_setup as ln_setup
6
6
  import pandas as pd
7
+ from lamin_utils import logger
7
8
  from lamindb_setup.core._docs import doc_args
8
- from lnschema_core.models import Artifact, Feature
9
+ from lnschema_core.models import Artifact, Feature, Record
10
+ from lnschema_core.types import FeatureDtype
9
11
  from pandas.api.types import CategoricalDtype, is_string_dtype
10
12
 
11
- from ._query_set import RecordsList
13
+ from lamindb.core.exceptions import ValidationError
14
+
15
+ from ._query_set import RecordList
12
16
  from ._utils import attach_func_to_class_method
13
17
  from .core._settings import settings
14
18
  from .core.schema import dict_schema_name_to_model_name
15
19
 
16
20
  if TYPE_CHECKING:
17
- from lnschema_core.types import FieldAttr
18
-
19
- FEATURE_TYPES = {
20
- "number": "number",
21
- "int": "int",
22
- "float": "float",
23
- "bool": "bool",
24
- "str": "cat",
25
- "object": "cat",
26
- }
21
+ from collections.abc import Iterable
27
22
 
28
-
29
- def convert_numpy_dtype_to_lamin_feature_type(dtype, str_as_cat: bool = True) -> str:
30
- orig_type = dtype.name
31
- # strip precision qualifiers
32
- type = "".join(i for i in orig_type if not i.isdigit())
33
- if type == "object" or type == "str":
34
- type = "cat" if str_as_cat else "str"
35
- return type
23
+ from lnschema_core.types import FieldAttr
24
+ from pandas.core.dtypes.base import ExtensionDtype
25
+
26
+
27
+ FEATURE_DTYPES = set(get_args(FeatureDtype))
28
+
29
+
30
+ def get_dtype_str_from_dtype(dtype: Any) -> str:
31
+ if not isinstance(dtype, list) and dtype.__name__ in FEATURE_DTYPES:
32
+ dtype_str = dtype.__name__
33
+ else:
34
+ error_message = "dtype has to be of type Record or list[Record]"
35
+ if isinstance(dtype, Record):
36
+ dtype = [dtype]
37
+ elif not isinstance(dtype, list):
38
+ raise ValueError(error_message)
39
+ registries_str = ""
40
+ for registry in dtype:
41
+ if not hasattr(registry, "__get_name_with_schema__"):
42
+ raise ValueError(error_message)
43
+ registries_str += registry.__get_name_with_schema__() + "|"
44
+ dtype_str = f'cat[{registries_str.rstrip("|")}]'
45
+ return dtype_str
46
+
47
+
48
+ def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
49
+ if is_string_dtype(pandas_dtype):
50
+ if not isinstance(pandas_dtype, CategoricalDtype):
51
+ dtype = "str"
52
+ else:
53
+ dtype = "cat"
54
+ # there are string-like categoricals and "pure" categoricals (pd.Categorical)
55
+ elif isinstance(pandas_dtype, CategoricalDtype):
56
+ dtype = "cat"
57
+ else:
58
+ # strip precision qualifiers
59
+ dtype = "".join(dt for dt in pandas_dtype.name if not dt.isdigit())
60
+ if dtype.startswith("datetime"):
61
+ dtype = dtype.split("[")[0]
62
+ assert dtype in FEATURE_DTYPES # noqa: S101
63
+ return dtype
36
64
 
37
65
 
38
66
  def __init__(self, *args, **kwargs):
@@ -45,28 +73,16 @@ def __init__(self, *args, **kwargs):
45
73
  dtype: type | str = kwargs.pop("dtype") if "dtype" in kwargs else None
46
74
  # cast type
47
75
  if dtype is None:
48
- raise ValueError("Please pass dtype!")
76
+ raise ValueError(f"Please pass dtype, one of {FEATURE_DTYPES}")
49
77
  elif dtype is not None:
50
78
  if not isinstance(dtype, str):
51
- if not isinstance(dtype, list) and dtype.__name__ in FEATURE_TYPES:
52
- dtype_str = FEATURE_TYPES[dtype.__name__]
53
- else:
54
- if not isinstance(dtype, list):
55
- raise ValueError("dtype has to be a list of Record types")
56
- registries_str = ""
57
- for cls in dtype:
58
- if not hasattr(cls, "__get_name_with_schema__"):
59
- raise ValueError("each element of the list has to be a Record")
60
- registries_str += cls.__get_name_with_schema__() + "|"
61
- dtype_str = f'cat[{registries_str.rstrip("|")}]'
79
+ dtype_str = get_dtype_str_from_dtype(dtype)
62
80
  else:
63
81
  dtype_str = dtype
64
82
  # add validation that a registry actually exists
65
- if dtype_str not in FEATURE_TYPES.values() and not dtype_str.startswith(
66
- "cat"
67
- ):
83
+ if dtype_str not in FEATURE_DTYPES and not dtype_str.startswith("cat"):
68
84
  raise ValueError(
69
- f"dtype is {dtype_str} but has to be one of 'number', 'int', 'float', 'cat', 'bool', 'cat[...]'!"
85
+ f"dtype is {dtype_str} but has to be one of {FEATURE_DTYPES}!"
70
86
  )
71
87
  if dtype_str != "cat" and dtype_str.startswith("cat"):
72
88
  registries_str = dtype_str.replace("cat[", "").rstrip("]")
@@ -79,6 +95,27 @@ def __init__(self, *args, **kwargs):
79
95
  )
80
96
  kwargs["dtype"] = dtype_str
81
97
  super(Feature, self).__init__(*args, **kwargs)
98
+ if not self._state.adding:
99
+ if not (
100
+ self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype
101
+ ):
102
+ raise ValidationError(
103
+ f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype}"
104
+ )
105
+
106
+
107
+ def suggest_categorical_for_str_iterable(
108
+ iterable: Iterable[str], key: str = None
109
+ ) -> str:
110
+ c = pd.Categorical(iterable)
111
+ message = ""
112
+ if len(c.categories) < len(c):
113
+ if key != "":
114
+ key_note = f" for feature {key}"
115
+ else:
116
+ key_note = ""
117
+ message = f"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'"
118
+ return message
82
119
 
83
120
 
84
121
  def categoricals_from_df(df: pd.DataFrame) -> dict:
@@ -90,42 +127,31 @@ def categoricals_from_df(df: pd.DataFrame) -> dict:
90
127
  if isinstance(df[col].dtype, CategoricalDtype)
91
128
  }
92
129
  for key in string_cols:
93
- c = pd.Categorical(df[key])
94
- if len(c.categories) < len(c):
95
- categoricals[key] = c
130
+ message = suggest_categorical_for_str_iterable(df[key], key)
131
+ if message:
132
+ logger.warning(message)
96
133
  return categoricals
97
134
 
98
135
 
99
136
  @classmethod # type:ignore
100
137
  @doc_args(Feature.from_df.__doc__)
101
- def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordsList:
138
+ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
102
139
  """{}""" # noqa: D415
103
140
  field = Feature.name if field is None else field
141
+ registry = field.field.model
142
+ if registry != Feature:
143
+ raise ValueError("field must be a Feature FieldAttr!")
104
144
  categoricals = categoricals_from_df(df)
105
-
106
145
  dtypes = {}
107
- # categoricals_with_unmapped_categories = {} # type: ignore
108
146
  for name, col in df.items():
109
147
  if name in categoricals:
110
148
  dtypes[name] = "cat"
111
149
  else:
112
- dtypes[name] = convert_numpy_dtype_to_lamin_feature_type(col.dtype)
113
-
114
- # silence the warning "loaded record with exact same name "
115
- verbosity = settings.verbosity
116
- try:
117
- settings.verbosity = "error"
118
-
119
- registry = field.field.model
120
- if registry != Feature:
121
- raise ValueError("field must be a Feature FieldAttr!")
122
- # create records for all features including non-validated
150
+ dtypes[name] = convert_pandas_dtype_to_lamin_dtype(col.dtype)
151
+ with logger.mute(): # silence the warning "loaded record with exact same name "
123
152
  features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]
124
- finally:
125
- settings.verbosity = verbosity
126
-
127
153
  assert len(features) == len(df.columns) # noqa: S101
128
- return RecordsList(features)
154
+ return RecordList(features)
129
155
 
130
156
 
131
157
  @doc_args(Feature.save.__doc__)
lamindb/_feature_set.py CHANGED
@@ -10,7 +10,7 @@ from lamindb_setup.core.hashing import hash_set
10
10
  from lnschema_core import Feature, FeatureSet, Record, ids
11
11
  from lnschema_core.types import FieldAttr, ListLike
12
12
 
13
- from ._feature import convert_numpy_dtype_to_lamin_feature_type
13
+ from ._feature import convert_pandas_dtype_to_lamin_dtype
14
14
  from ._record import init_self_from_db
15
15
  from ._utils import attach_func_to_class_method
16
16
  from .core.exceptions import ValidationError
@@ -26,7 +26,7 @@ if TYPE_CHECKING:
26
26
 
27
27
  from ._query_set import QuerySet
28
28
 
29
- NUMBER_TYPE = "number"
29
+ NUMBER_TYPE = "num"
30
30
  DICT_KEYS_TYPE = type({}.keys()) # type: ignore
31
31
 
32
32
 
@@ -179,13 +179,15 @@ def from_df(
179
179
  logger.warning("no validated features, skip creating feature set")
180
180
  return None
181
181
  if registry == Feature:
182
- validated_features = Feature.from_df(df.loc[:, validated])
182
+ validated_features = Feature.from_values(
183
+ df.columns, field=field, organism=organism
184
+ )
183
185
  feature_set = FeatureSet(validated_features, name=name, dtype=None)
184
186
  else:
185
187
  dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
186
188
  if len(set(dtypes)) != 1:
187
189
  raise ValueError(f"data types are heterogeneous: {set(dtypes)}")
188
- dtype = convert_numpy_dtype_to_lamin_feature_type(dtypes[0])
190
+ dtype = convert_pandas_dtype_to_lamin_dtype(dtypes[0])
189
191
  validated_features = registry.from_values(
190
192
  df.columns[validated],
191
193
  field=field,
lamindb/_finish.py CHANGED
@@ -8,6 +8,8 @@ import lamindb_setup as ln_setup
8
8
  from lamin_utils import logger
9
9
  from lamindb_setup.core.hashing import hash_file
10
10
 
11
+ from lamindb.core.exceptions import NotebookNotSaved
12
+
11
13
  if TYPE_CHECKING:
12
14
  from pathlib import Path
13
15
 
@@ -16,6 +18,20 @@ if TYPE_CHECKING:
16
18
  from ._query_set import QuerySet
17
19
 
18
20
 
21
+ def get_r_save_notebook_message() -> str:
22
+ return f"Please save the notebook in RStudio (shortcut `{get_shortcut()}`) within 2 sec before calling `db$finish()`"
23
+
24
+
25
+ def get_shortcut() -> str:
26
+ import platform
27
+
28
+ return "CMD + s" if platform.system() == "Darwin" else "CTRL + s"
29
+
30
+
31
+ def get_seconds_since_modified(filepath) -> float:
32
+ return datetime.now().timestamp() - filepath.stat().st_mtime
33
+
34
+
19
35
  # this is from the get_title function in nbproject
20
36
  # should be moved into lamindb sooner or later
21
37
  def prepare_notebook(
@@ -82,6 +98,29 @@ def notebook_to_script(
82
98
  script_path.write_text(py_content)
83
99
 
84
100
 
101
+ # removes NotebookNotSaved error message from notebook html
102
+ def clean_r_notebook_html(file_path: Path) -> tuple[str | None, Path]:
103
+ import re
104
+
105
+ cleaned_content = (
106
+ file_path.read_text()
107
+ ) # at this point cleaned_content is still raw
108
+ pattern_title = r"<title>(.*?)</title>"
109
+ title_match = re.search(pattern_title, cleaned_content)
110
+ title_text = None
111
+ if title_match:
112
+ title_text = title_match.group(1)
113
+ pattern_h1 = f"<h1[^>]*>{re.escape(title_text)}</h1>"
114
+ cleaned_content = re.sub(pattern_title, "", cleaned_content)
115
+ cleaned_content = re.sub(pattern_h1, "", cleaned_content)
116
+ cleaned_content = cleaned_content.replace(
117
+ f"NotebookNotSaved: {get_r_save_notebook_message()}", ""
118
+ )
119
+ cleaned_path = file_path.parent / (f"{file_path.stem}.cleaned{file_path.suffix}")
120
+ cleaned_path.write_text(cleaned_content)
121
+ return title_text, cleaned_path
122
+
123
+
85
124
  def save_context_core(
86
125
  *,
87
126
  run: Run,
@@ -104,7 +143,9 @@ def save_context_core(
104
143
  # for scripts, things are easy
105
144
  is_consecutive = True
106
145
  is_ipynb = filepath.suffix == ".ipynb"
146
+ is_r_notebook = filepath.suffix in {".qmd", ".Rmd"}
107
147
  source_code_path = filepath
148
+ report_path: Path | None = None
108
149
  # for notebooks, we need more work
109
150
  if is_ipynb:
110
151
  try:
@@ -139,12 +180,21 @@ def save_context_core(
139
180
  ".ipynb", ".py"
140
181
  )
141
182
  notebook_to_script(transform, filepath, source_code_path)
183
+ elif is_r_notebook:
184
+ if filepath.with_suffix(".nb.html").exists():
185
+ report_path = filepath.with_suffix(".nb.html")
186
+ elif filepath.with_suffix(".html").exists():
187
+ report_path = filepath.with_suffix(".html")
188
+ else:
189
+ logger.warning(
190
+ f"no {filepath.with_suffix('.nb.html')} found, save your manually rendered .html report via the CLI: lamin save {filepath}"
191
+ )
142
192
  ln.settings.creation.artifact_silence_missing_run_warning = True
143
193
  # track source code
144
194
  hash, _ = hash_file(source_code_path) # ignore hash_type for now
145
195
  if (
146
196
  transform._source_code_artifact_id is not None
147
- or transform.source_code is not None # equivalent to transform.hash is not None
197
+ or transform.hash is not None # .hash is equivalent to .transform
148
198
  ):
149
199
  # check if the hash of the transform source code matches
150
200
  # (for scripts, we already run the same logic in track() - we can deduplicate the call at some point)
@@ -165,7 +215,7 @@ def save_context_core(
165
215
  logger.warning("Please re-run `ln.track()` to make a new version")
166
216
  return "rerun-the-notebook"
167
217
  else:
168
- logger.important("source code is already saved")
218
+ logger.debug("source code is already saved")
169
219
  else:
170
220
  transform.source_code = source_code_path.read_text()
171
221
  transform.hash = hash
@@ -198,10 +248,15 @@ def save_context_core(
198
248
  run.finished_at = datetime.now(timezone.utc)
199
249
 
200
250
  # track report and set is_consecutive
201
- if not is_ipynb:
202
- run.is_consecutive = True
203
- run.save()
204
- else:
251
+ if report_path is not None:
252
+ if not from_cli:
253
+ if get_seconds_since_modified(report_path) > 2 and not ln_setup._TESTING:
254
+ # this can happen when auto-knitting an html with RStudio
255
+ raise NotebookNotSaved(get_r_save_notebook_message())
256
+ if is_r_notebook:
257
+ title_text, report_path = clean_r_notebook_html(report_path)
258
+ if title_text is not None:
259
+ transform.name = title_text
205
260
  if run.report_id is not None:
206
261
  hash, _ = hash_file(report_path) # ignore hash_type for now
207
262
  if hash != run.report.hash:
@@ -210,7 +265,7 @@ def save_context_core(
210
265
  )
211
266
  if response == "y":
212
267
  run.report.replace(report_path)
213
- run.report.save(upload=True)
268
+ run.report.save(upload=True, print_progress=False)
214
269
  else:
215
270
  logger.important("keeping old report")
216
271
  else:
@@ -224,11 +279,13 @@ def save_context_core(
224
279
  )
225
280
  report_file.save(upload=True, print_progress=False)
226
281
  run.report = report_file
227
- run.is_consecutive = is_consecutive
228
- run.save()
229
282
  logger.debug(
230
283
  f"saved transform.latest_run.report: {transform.latest_run.report}"
231
284
  )
285
+ run.is_consecutive = is_consecutive
286
+
287
+ # save both run & transform records if we arrive here
288
+ run.save()
232
289
  transform.save()
233
290
 
234
291
  # finalize
@@ -250,11 +307,9 @@ def save_context_core(
250
307
  f"go to: https://lamin.ai/{identifier}/transform/{transform.uid}"
251
308
  )
252
309
  if not from_cli:
253
- thing, name = (
254
- ("notebook", "notebook.ipynb") if is_ipynb else ("script", "script.py")
255
- )
310
+ thing = "notebook" if (is_ipynb or is_r_notebook) else "script"
256
311
  logger.important(
257
- f"if you want to update your {thing} without re-running it, use `lamin save {name}`"
312
+ f"if you want to update your {thing} without re-running it, use `lamin save {filepath}`"
258
313
  )
259
314
  # because run & transform changed, update the global context
260
315
  context._run = run
lamindb/_from_values.py CHANGED
@@ -5,7 +5,9 @@ from typing import TYPE_CHECKING
5
5
  import pandas as pd
6
6
  from django.core.exceptions import FieldDoesNotExist
7
7
  from lamin_utils import colors, logger
8
- from lnschema_core.models import Feature, Field, Record, ULabel
8
+ from lnschema_core.models import Record
9
+
10
+ from lamindb._query_set import RecordList
9
11
 
10
12
  from .core._settings import settings
11
13
 
@@ -25,11 +27,11 @@ def get_or_create_records(
25
27
  organism: Record | str | None = None,
26
28
  source: Record | None = None,
27
29
  mute: bool = False,
28
- ) -> list[Record]:
30
+ ) -> RecordList:
29
31
  """Get or create records from iterables."""
30
32
  registry = field.field.model
31
33
  if create:
32
- return [registry(**{field.field.name: value}) for value in iterable]
34
+ return RecordList([registry(**{field.field.name: value}) for value in iterable])
33
35
  creation_search_names = settings.creation.search_names
34
36
  organism = _get_organism_record(field, organism)
35
37
  settings.creation.search_names = False
@@ -112,7 +114,7 @@ def get_or_create_records(
112
114
  # for record in records:
113
115
  # record._feature = feature_name
114
116
  # logger.debug(f"added default feature '{feature_name}'")
115
- return records
117
+ return RecordList(records)
116
118
  finally:
117
119
  settings.creation.search_names = creation_search_names
118
120
 
@@ -305,7 +307,9 @@ def index_iterable(iterable: Iterable) -> pd.Index:
305
307
  return idx[(idx != "") & (~idx.isnull())]
306
308
 
307
309
 
308
- def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
310
+ def _print_values(
311
+ names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'"
312
+ ) -> str:
309
313
  if isinstance(names, dict):
310
314
  items = {
311
315
  f"{key}: {value}": None
@@ -319,7 +323,7 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
319
323
  unique_items = list(items.keys())
320
324
 
321
325
  if quotes:
322
- unique_items = [f"'{item}'" for item in unique_items]
326
+ unique_items = [f"{sep}{item}{sep}" for item in unique_items]
323
327
 
324
328
  print_values = ", ".join(unique_items[:n])
325
329