PyPI - lamindb - Versions diffs - 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl - Mend

lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

lamindb/__init__.py +1 -1
lamindb/_artifact.py +6 -3
lamindb/_can_curate.py +3 -1
lamindb/_collection.py +1 -1
lamindb/_curate.py +387 -318
lamindb/_feature.py +84 -58
lamindb/_feature_set.py +6 -4
lamindb/_finish.py +68 -13
lamindb/_from_values.py +10 -6
lamindb/_query_set.py +321 -102
lamindb/_record.py +5 -3
lamindb/_save.py +1 -0
lamindb/_view.py +105 -9
lamindb/core/__init__.py +2 -2
lamindb/core/_context.py +9 -13
lamindb/core/_data.py +58 -88
lamindb/core/_describe.py +139 -0
lamindb/core/_django.py +5 -6
lamindb/core/_feature_manager.py +408 -198
lamindb/core/_label_manager.py +147 -109
lamindb/core/datasets/__init__.py +31 -2
lamindb/core/datasets/_core.py +0 -27
lamindb/core/datasets/_small.py +100 -0
lamindb/core/exceptions.py +1 -1
lamindb/core/storage/paths.py +9 -4
lamindb/core/types.py +12 -2
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/METADATA +7 -8
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/RECORD +30 -28
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/LICENSE +0 -0
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/WHEEL +0 -0

lamindb/_feature.py CHANGED Viewed

@@ -1,38 +1,66 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Literal, get_args
 import lamindb_setup as ln_setup
 import pandas as pd
+from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
-from lnschema_core.models import Artifact, Feature
+from lnschema_core.models import Artifact, Feature, Record
+from lnschema_core.types import FeatureDtype
 from pandas.api.types import CategoricalDtype, is_string_dtype
-from ._query_set import RecordsList
+from lamindb.core.exceptions import ValidationError
+from ._query_set import RecordList
 from ._utils import attach_func_to_class_method
 from .core._settings import settings
 from .core.schema import dict_schema_name_to_model_name
 if TYPE_CHECKING:
-    from lnschema_core.types import FieldAttr
-FEATURE_TYPES = {
-    "number": "number",
-    "int": "int",
-    "float": "float",
-    "bool": "bool",
-    "str": "cat",
-    "object": "cat",
-}
+    from collections.abc import Iterable
-def convert_numpy_dtype_to_lamin_feature_type(dtype, str_as_cat: bool = True) -> str:
-    orig_type = dtype.name
-    # strip precision qualifiers
-    type = "".join(i for i in orig_type if not i.isdigit())
-    if type == "object" or type == "str":
-        type = "cat" if str_as_cat else "str"
-    return type
+    from lnschema_core.types import FieldAttr
+    from pandas.core.dtypes.base import ExtensionDtype
+FEATURE_DTYPES = set(get_args(FeatureDtype))
+def get_dtype_str_from_dtype(dtype: Any) -> str:
+    if not isinstance(dtype, list) and dtype.__name__ in FEATURE_DTYPES:
+        dtype_str = dtype.__name__
+    else:
+        error_message = "dtype has to be of type Record or list[Record]"
+        if isinstance(dtype, Record):
+            dtype = [dtype]
+        elif not isinstance(dtype, list):
+            raise ValueError(error_message)
+        registries_str = ""
+        for registry in dtype:
+            if not hasattr(registry, "__get_name_with_schema__"):
+                raise ValueError(error_message)
+            registries_str += registry.__get_name_with_schema__() + "|"
+        dtype_str = f'cat[{registries_str.rstrip("|")}]'
+    return dtype_str
+def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
+    if is_string_dtype(pandas_dtype):
+        if not isinstance(pandas_dtype, CategoricalDtype):
+            dtype = "str"
+        else:
+            dtype = "cat"
+    # there are string-like categoricals and "pure" categoricals (pd.Categorical)
+    elif isinstance(pandas_dtype, CategoricalDtype):
+        dtype = "cat"
+    else:
+        # strip precision qualifiers
+        dtype = "".join(dt for dt in pandas_dtype.name if not dt.isdigit())
+    if dtype.startswith("datetime"):
+        dtype = dtype.split("[")[0]
+    assert dtype in FEATURE_DTYPES  # noqa: S101
+    return dtype
 def __init__(self, *args, **kwargs):
@@ -45,28 +73,16 @@ def __init__(self, *args, **kwargs):
     dtype: type | str = kwargs.pop("dtype") if "dtype" in kwargs else None
     # cast type
     if dtype is None:
-        raise ValueError("Please pass dtype!")
+        raise ValueError(f"Please pass dtype, one of {FEATURE_DTYPES}")
     elif dtype is not None:
         if not isinstance(dtype, str):
-            if not isinstance(dtype, list) and dtype.__name__ in FEATURE_TYPES:
-                dtype_str = FEATURE_TYPES[dtype.__name__]
-            else:
-                if not isinstance(dtype, list):
-                    raise ValueError("dtype has to be a list of Record types")
-                registries_str = ""
-                for cls in dtype:
-                    if not hasattr(cls, "__get_name_with_schema__"):
-                        raise ValueError("each element of the list has to be a Record")
-                    registries_str += cls.__get_name_with_schema__() + "|"
-                dtype_str = f'cat[{registries_str.rstrip("|")}]'
+            dtype_str = get_dtype_str_from_dtype(dtype)
         else:
             dtype_str = dtype
             # add validation that a registry actually exists
-            if dtype_str not in FEATURE_TYPES.values() and not dtype_str.startswith(
-                "cat"
-            ):
+            if dtype_str not in FEATURE_DTYPES and not dtype_str.startswith("cat"):
                 raise ValueError(
-                    f"dtype is {dtype_str} but has to be one of 'number', 'int', 'float', 'cat', 'bool', 'cat[...]'!"
+                    f"dtype is {dtype_str} but has to be one of {FEATURE_DTYPES}!"
                 )
             if dtype_str != "cat" and dtype_str.startswith("cat"):
                 registries_str = dtype_str.replace("cat[", "").rstrip("]")
@@ -79,6 +95,27 @@ def __init__(self, *args, **kwargs):
                             )
     kwargs["dtype"] = dtype_str
     super(Feature, self).__init__(*args, **kwargs)
+    if not self._state.adding:
+        if not (
+            self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype
+        ):
+            raise ValidationError(
+                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype}"
+            )
+def suggest_categorical_for_str_iterable(
+    iterable: Iterable[str], key: str = None
+) -> str:
+    c = pd.Categorical(iterable)
+    message = ""
+    if len(c.categories) < len(c):
+        if key != "":
+            key_note = f" for feature {key}"
+        else:
+            key_note = ""
+        message = f"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'"
+    return message
 def categoricals_from_df(df: pd.DataFrame) -> dict:
@@ -90,42 +127,31 @@ def categoricals_from_df(df: pd.DataFrame) -> dict:
         if isinstance(df[col].dtype, CategoricalDtype)
     }
     for key in string_cols:
-        c = pd.Categorical(df[key])
-        if len(c.categories) < len(c):
-            categoricals[key] = c
+        message = suggest_categorical_for_str_iterable(df[key], key)
+        if message:
+            logger.warning(message)
     return categoricals
 @classmethod  # type:ignore
 @doc_args(Feature.from_df.__doc__)
-def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordsList:
+def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
     """{}"""  # noqa: D415
     field = Feature.name if field is None else field
+    registry = field.field.model
+    if registry != Feature:
+        raise ValueError("field must be a Feature FieldAttr!")
     categoricals = categoricals_from_df(df)
     dtypes = {}
-    # categoricals_with_unmapped_categories = {}  # type: ignore
     for name, col in df.items():
         if name in categoricals:
             dtypes[name] = "cat"
         else:
-            dtypes[name] = convert_numpy_dtype_to_lamin_feature_type(col.dtype)
-    # silence the warning "loaded record with exact same name "
-    verbosity = settings.verbosity
-    try:
-        settings.verbosity = "error"
-        registry = field.field.model
-        if registry != Feature:
-            raise ValueError("field must be a Feature FieldAttr!")
-        # create records for all features including non-validated
+            dtypes[name] = convert_pandas_dtype_to_lamin_dtype(col.dtype)
+    with logger.mute():  # silence the warning "loaded record with exact same name "
         features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]
-    finally:
-        settings.verbosity = verbosity
     assert len(features) == len(df.columns)  # noqa: S101
-    return RecordsList(features)
+    return RecordList(features)
 @doc_args(Feature.save.__doc__)

lamindb/_feature_set.py CHANGED Viewed

@@ -10,7 +10,7 @@ from lamindb_setup.core.hashing import hash_set
 from lnschema_core import Feature, FeatureSet, Record, ids
 from lnschema_core.types import FieldAttr, ListLike
-from ._feature import convert_numpy_dtype_to_lamin_feature_type
+from ._feature import convert_pandas_dtype_to_lamin_dtype
 from ._record import init_self_from_db
 from ._utils import attach_func_to_class_method
 from .core.exceptions import ValidationError
@@ -26,7 +26,7 @@ if TYPE_CHECKING:
     from ._query_set import QuerySet
-NUMBER_TYPE = "number"
+NUMBER_TYPE = "num"
 DICT_KEYS_TYPE = type({}.keys())  # type: ignore
@@ -179,13 +179,15 @@ def from_df(
             logger.warning("no validated features, skip creating feature set")
         return None
     if registry == Feature:
-        validated_features = Feature.from_df(df.loc[:, validated])
+        validated_features = Feature.from_values(
+            df.columns, field=field, organism=organism
+        )
         feature_set = FeatureSet(validated_features, name=name, dtype=None)
     else:
         dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
         if len(set(dtypes)) != 1:
             raise ValueError(f"data types are heterogeneous: {set(dtypes)}")
-        dtype = convert_numpy_dtype_to_lamin_feature_type(dtypes[0])
+        dtype = convert_pandas_dtype_to_lamin_dtype(dtypes[0])
         validated_features = registry.from_values(
             df.columns[validated],
             field=field,

lamindb/_finish.py CHANGED Viewed

@@ -8,6 +8,8 @@ import lamindb_setup as ln_setup
 from lamin_utils import logger
 from lamindb_setup.core.hashing import hash_file
+from lamindb.core.exceptions import NotebookNotSaved
 if TYPE_CHECKING:
     from pathlib import Path
@@ -16,6 +18,20 @@ if TYPE_CHECKING:
     from ._query_set import QuerySet
+def get_r_save_notebook_message() -> str:
+    return f"Please save the notebook in RStudio (shortcut `{get_shortcut()}`) within 2 sec before calling `db$finish()`"
+def get_shortcut() -> str:
+    import platform
+    return "CMD + s" if platform.system() == "Darwin" else "CTRL + s"
+def get_seconds_since_modified(filepath) -> float:
+    return datetime.now().timestamp() - filepath.stat().st_mtime
 # this is from the get_title function in nbproject
 # should be moved into lamindb sooner or later
 def prepare_notebook(
@@ -82,6 +98,29 @@ def notebook_to_script(
     script_path.write_text(py_content)
+# removes NotebookNotSaved error message from notebook html
+def clean_r_notebook_html(file_path: Path) -> tuple[str | None, Path]:
+    import re
+    cleaned_content = (
+        file_path.read_text()
+    )  # at this point cleaned_content is still raw
+    pattern_title = r"<title>(.*?)</title>"
+    title_match = re.search(pattern_title, cleaned_content)
+    title_text = None
+    if title_match:
+        title_text = title_match.group(1)
+        pattern_h1 = f"<h1[^>]*>{re.escape(title_text)}</h1>"
+        cleaned_content = re.sub(pattern_title, "", cleaned_content)
+        cleaned_content = re.sub(pattern_h1, "", cleaned_content)
+    cleaned_content = cleaned_content.replace(
+        f"NotebookNotSaved: {get_r_save_notebook_message()}", ""
+    )
+    cleaned_path = file_path.parent / (f"{file_path.stem}.cleaned{file_path.suffix}")
+    cleaned_path.write_text(cleaned_content)
+    return title_text, cleaned_path
 def save_context_core(
     *,
     run: Run,
@@ -104,7 +143,9 @@ def save_context_core(
     # for scripts, things are easy
     is_consecutive = True
     is_ipynb = filepath.suffix == ".ipynb"
+    is_r_notebook = filepath.suffix in {".qmd", ".Rmd"}
     source_code_path = filepath
+    report_path: Path | None = None
     # for notebooks, we need more work
     if is_ipynb:
         try:
@@ -139,12 +180,21 @@ def save_context_core(
             ".ipynb", ".py"
         )
         notebook_to_script(transform, filepath, source_code_path)
+    elif is_r_notebook:
+        if filepath.with_suffix(".nb.html").exists():
+            report_path = filepath.with_suffix(".nb.html")
+        elif filepath.with_suffix(".html").exists():
+            report_path = filepath.with_suffix(".html")
+        else:
+            logger.warning(
+                f"no {filepath.with_suffix('.nb.html')} found, save your manually rendered .html report via the CLI: lamin save {filepath}"
+            )
     ln.settings.creation.artifact_silence_missing_run_warning = True
     # track source code
     hash, _ = hash_file(source_code_path)  # ignore hash_type for now
     if (
         transform._source_code_artifact_id is not None
-        or transform.source_code is not None  # equivalent to transform.hash is not None
+        or transform.hash is not None  # .hash is equivalent to .transform
     ):
         # check if the hash of the transform source code matches
         # (for scripts, we already run the same logic in track() - we can deduplicate the call at some point)
@@ -165,7 +215,7 @@ def save_context_core(
                 logger.warning("Please re-run `ln.track()` to make a new version")
                 return "rerun-the-notebook"
         else:
-            logger.important("source code is already saved")
+            logger.debug("source code is already saved")
     else:
         transform.source_code = source_code_path.read_text()
         transform.hash = hash
@@ -198,10 +248,15 @@ def save_context_core(
         run.finished_at = datetime.now(timezone.utc)
     # track report and set is_consecutive
-    if not is_ipynb:
-        run.is_consecutive = True
-        run.save()
-    else:
+    if report_path is not None:
+        if not from_cli:
+            if get_seconds_since_modified(report_path) > 2 and not ln_setup._TESTING:
+                # this can happen when auto-knitting an html with RStudio
+                raise NotebookNotSaved(get_r_save_notebook_message())
+        if is_r_notebook:
+            title_text, report_path = clean_r_notebook_html(report_path)
+            if title_text is not None:
+                transform.name = title_text
         if run.report_id is not None:
             hash, _ = hash_file(report_path)  # ignore hash_type for now
             if hash != run.report.hash:
@@ -210,7 +265,7 @@ def save_context_core(
                 )
                 if response == "y":
                     run.report.replace(report_path)
-                    run.report.save(upload=True)
+                    run.report.save(upload=True, print_progress=False)
                 else:
                     logger.important("keeping old report")
             else:
@@ -224,11 +279,13 @@ def save_context_core(
             )
             report_file.save(upload=True, print_progress=False)
             run.report = report_file
-        run.is_consecutive = is_consecutive
-        run.save()
         logger.debug(
             f"saved transform.latest_run.report: {transform.latest_run.report}"
         )
+    run.is_consecutive = is_consecutive
+    # save both run & transform records if we arrive here
+    run.save()
     transform.save()
     # finalize
@@ -250,11 +307,9 @@ def save_context_core(
             f"go to: https://lamin.ai/{identifier}/transform/{transform.uid}"
         )
         if not from_cli:
-            thing, name = (
-                ("notebook", "notebook.ipynb") if is_ipynb else ("script", "script.py")
-            )
+            thing = "notebook" if (is_ipynb or is_r_notebook) else "script"
             logger.important(
-                f"if you want to update your {thing} without re-running it, use `lamin save {name}`"
+                f"if you want to update your {thing} without re-running it, use `lamin save {filepath}`"
             )
     # because run & transform changed, update the global context
     context._run = run

lamindb/_from_values.py CHANGED Viewed

@@ -5,7 +5,9 @@ from typing import TYPE_CHECKING
 import pandas as pd
 from django.core.exceptions import FieldDoesNotExist
 from lamin_utils import colors, logger
-from lnschema_core.models import Feature, Field, Record, ULabel
+from lnschema_core.models import Record
+from lamindb._query_set import RecordList
 from .core._settings import settings
@@ -25,11 +27,11 @@ def get_or_create_records(
     organism: Record | str | None = None,
     source: Record | None = None,
     mute: bool = False,
-) -> list[Record]:
+) -> RecordList:
     """Get or create records from iterables."""
     registry = field.field.model
     if create:
-        return [registry(**{field.field.name: value}) for value in iterable]
+        return RecordList([registry(**{field.field.name: value}) for value in iterable])
     creation_search_names = settings.creation.search_names
     organism = _get_organism_record(field, organism)
     settings.creation.search_names = False
@@ -112,7 +114,7 @@ def get_or_create_records(
         #             for record in records:
         #                 record._feature = feature_name
         #         logger.debug(f"added default feature '{feature_name}'")
-        return records
+        return RecordList(records)
     finally:
         settings.creation.search_names = creation_search_names
@@ -305,7 +307,9 @@ def index_iterable(iterable: Iterable) -> pd.Index:
     return idx[(idx != "") & (~idx.isnull())]
-def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
+def _print_values(
+    names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'"
+) -> str:
     if isinstance(names, dict):
         items = {
             f"{key}: {value}": None
@@ -319,7 +323,7 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
     unique_items = list(items.keys())
     if quotes:
-        unique_items = [f"'{item}'" for item in unique_items]
+        unique_items = [f"{sep}{item}{sep}" for item in unique_items]
     print_values = ", ".join(unique_items[:n])

lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl

lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl