lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +6 -3
- lamindb/_can_curate.py +3 -1
- lamindb/_collection.py +1 -1
- lamindb/_curate.py +387 -318
- lamindb/_feature.py +84 -58
- lamindb/_feature_set.py +6 -4
- lamindb/_finish.py +68 -13
- lamindb/_from_values.py +10 -6
- lamindb/_query_set.py +321 -102
- lamindb/_record.py +5 -3
- lamindb/_save.py +1 -0
- lamindb/_view.py +105 -9
- lamindb/core/__init__.py +2 -2
- lamindb/core/_context.py +9 -13
- lamindb/core/_data.py +58 -88
- lamindb/core/_describe.py +139 -0
- lamindb/core/_django.py +5 -6
- lamindb/core/_feature_manager.py +408 -198
- lamindb/core/_label_manager.py +147 -109
- lamindb/core/datasets/__init__.py +31 -2
- lamindb/core/datasets/_core.py +0 -27
- lamindb/core/datasets/_small.py +100 -0
- lamindb/core/exceptions.py +1 -1
- lamindb/core/storage/paths.py +9 -4
- lamindb/core/types.py +12 -2
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/METADATA +7 -8
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/RECORD +30 -28
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/LICENSE +0 -0
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/WHEEL +0 -0
lamindb/_query_set.py
CHANGED
@@ -1,19 +1,22 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import re
|
3
4
|
from collections import UserList
|
4
5
|
from collections.abc import Iterable
|
5
6
|
from collections.abc import Iterable as IterableType
|
6
|
-
from typing import TYPE_CHECKING, Any, NamedTuple
|
7
|
+
from typing import TYPE_CHECKING, Any, Generic, NamedTuple, TypeVar
|
7
8
|
|
8
9
|
import pandas as pd
|
9
10
|
from django.db import models
|
10
|
-
from django.db.models import F
|
11
|
-
from
|
11
|
+
from django.db.models import F, ForeignKey, ManyToManyField
|
12
|
+
from django.db.models.fields.related import ForeignObjectRel
|
13
|
+
from lamin_utils import logger
|
12
14
|
from lamindb_setup.core._docs import doc_args
|
13
15
|
from lnschema_core.models import (
|
14
16
|
Artifact,
|
15
17
|
CanCurate,
|
16
18
|
Collection,
|
19
|
+
Feature,
|
17
20
|
IsVersioned,
|
18
21
|
Record,
|
19
22
|
Registry,
|
@@ -24,6 +27,8 @@ from lnschema_core.models import (
|
|
24
27
|
|
25
28
|
from .core.exceptions import DoesNotExist
|
26
29
|
|
30
|
+
T = TypeVar("T")
|
31
|
+
|
27
32
|
if TYPE_CHECKING:
|
28
33
|
from collections.abc import Iterable
|
29
34
|
|
@@ -34,6 +39,9 @@ class MultipleResultsFound(Exception):
|
|
34
39
|
pass
|
35
40
|
|
36
41
|
|
42
|
+
pd.set_option("display.max_columns", 200)
|
43
|
+
|
44
|
+
|
37
45
|
# def format_and_convert_to_local_time(series: pd.Series):
|
38
46
|
# tzinfo = datetime.now().astimezone().tzinfo
|
39
47
|
# timedelta = tzinfo.utcoffset(datetime.now()) # type: ignore
|
@@ -155,21 +163,295 @@ def get(
|
|
155
163
|
return registry.objects.using(qs.db).get(**expressions)
|
156
164
|
|
157
165
|
|
158
|
-
class
|
166
|
+
class RecordList(UserList, Generic[T]):
|
159
167
|
"""Is ordered, can't be queried, but has `.df()`."""
|
160
168
|
|
161
|
-
def __init__(self, records: Iterable[
|
162
|
-
|
169
|
+
def __init__(self, records: Iterable[T]):
|
170
|
+
if isinstance(records, list):
|
171
|
+
self.data = records # Direct assignment if already a list, no copy
|
172
|
+
else:
|
173
|
+
super().__init__(records) # Let UserList handle the conversion
|
163
174
|
|
164
175
|
def df(self) -> pd.DataFrame:
|
165
176
|
keys = get_keys_from_df(self.data, self.data[0].__class__)
|
166
177
|
values = [record.__dict__ for record in self.data]
|
167
178
|
return pd.DataFrame(values, columns=keys)
|
168
179
|
|
169
|
-
def one(self) ->
|
180
|
+
def one(self) -> T:
|
170
181
|
"""Exactly one result. Throws error if there are more or none."""
|
171
182
|
return one_helper(self)
|
172
183
|
|
184
|
+
def save(self) -> RecordList[T]:
|
185
|
+
"""Save all records to the database."""
|
186
|
+
from lamindb._save import save
|
187
|
+
|
188
|
+
save(self)
|
189
|
+
return self
|
190
|
+
|
191
|
+
|
192
|
+
def get_basic_field_names(
|
193
|
+
qs: QuerySet, include: list[str], features: bool | list[str] = False
|
194
|
+
) -> list[str]:
|
195
|
+
exclude_field_names = ["updated_at"]
|
196
|
+
field_names = [
|
197
|
+
field.name
|
198
|
+
for field in qs.model._meta.fields
|
199
|
+
if (
|
200
|
+
not isinstance(field, models.ForeignKey)
|
201
|
+
and field.name not in exclude_field_names
|
202
|
+
)
|
203
|
+
]
|
204
|
+
field_names += [
|
205
|
+
f"{field.name}_id"
|
206
|
+
for field in qs.model._meta.fields
|
207
|
+
if isinstance(field, models.ForeignKey)
|
208
|
+
]
|
209
|
+
for field_name in [
|
210
|
+
"version",
|
211
|
+
"is_latest",
|
212
|
+
"run_id",
|
213
|
+
"created_at",
|
214
|
+
"created_by_id",
|
215
|
+
"updated_at",
|
216
|
+
]:
|
217
|
+
if field_name in field_names:
|
218
|
+
field_names.remove(field_name)
|
219
|
+
field_names.append(field_name)
|
220
|
+
if field_names[0] != "uid" and "uid" in field_names:
|
221
|
+
field_names.remove("uid")
|
222
|
+
field_names.insert(0, "uid")
|
223
|
+
if include or features:
|
224
|
+
subset_field_names = field_names[:4]
|
225
|
+
intersection = set(field_names) & set(include)
|
226
|
+
subset_field_names += list(intersection)
|
227
|
+
field_names = subset_field_names
|
228
|
+
return field_names
|
229
|
+
|
230
|
+
|
231
|
+
def get_feature_annotate_kwargs(show_features: bool | list[str]) -> dict[str, Any]:
|
232
|
+
features = Feature.filter()
|
233
|
+
if isinstance(show_features, list):
|
234
|
+
features.filter(name__in=show_features)
|
235
|
+
# Get the categorical features
|
236
|
+
cat_feature_types = {
|
237
|
+
feature.dtype.replace("cat[", "").replace("]", "")
|
238
|
+
for feature in features
|
239
|
+
if feature.dtype.startswith("cat[")
|
240
|
+
}
|
241
|
+
# Get relationships of labels and features
|
242
|
+
link_models_on_models = {
|
243
|
+
getattr(
|
244
|
+
Artifact, obj.related_name
|
245
|
+
).through.__get_name_with_schema__(): obj.related_model.__get_name_with_schema__()
|
246
|
+
for obj in Artifact._meta.related_objects
|
247
|
+
if obj.related_model.__get_name_with_schema__() in cat_feature_types
|
248
|
+
}
|
249
|
+
link_models_on_models["ArtifactULabel"] = "ULabel"
|
250
|
+
link_attributes_on_models = {
|
251
|
+
obj.related_name: link_models_on_models[
|
252
|
+
obj.related_model.__get_name_with_schema__()
|
253
|
+
]
|
254
|
+
for obj in Artifact._meta.related_objects
|
255
|
+
if obj.related_model.__get_name_with_schema__() in link_models_on_models
|
256
|
+
}
|
257
|
+
# Prepare Django's annotate for features
|
258
|
+
annotate_kwargs = {}
|
259
|
+
for link_attr, feature_type in link_attributes_on_models.items():
|
260
|
+
annotate_kwargs[f"{link_attr}__feature__name"] = F(
|
261
|
+
f"{link_attr}__feature__name"
|
262
|
+
)
|
263
|
+
field_name = (
|
264
|
+
feature_type.split(".")[1] if "." in feature_type else feature_type
|
265
|
+
).lower()
|
266
|
+
annotate_kwargs[f"{link_attr}__{field_name}__name"] = F(
|
267
|
+
f"{link_attr}__{field_name}__name"
|
268
|
+
)
|
269
|
+
|
270
|
+
annotate_kwargs["_feature_values__feature__name"] = F(
|
271
|
+
"_feature_values__feature__name"
|
272
|
+
)
|
273
|
+
annotate_kwargs["_feature_values__value"] = F("_feature_values__value")
|
274
|
+
return annotate_kwargs
|
275
|
+
|
276
|
+
|
277
|
+
# https://claude.ai/share/16280046-6ae5-4f6a-99ac-dec01813dc3c
|
278
|
+
def analyze_lookup_cardinality(
|
279
|
+
model_class: Record, lookup_paths: list[str] | None
|
280
|
+
) -> dict[str, str]:
|
281
|
+
"""Analyze lookup cardinality.
|
282
|
+
|
283
|
+
Analyzes Django model lookups to determine if they will result in
|
284
|
+
one-to-one or one-to-many relationships when used in annotations.
|
285
|
+
|
286
|
+
Args:
|
287
|
+
model_class: The Django model class to analyze
|
288
|
+
lookup_paths: List of lookup paths (e.g. ["created_by__name", "ulabels__name"]); if None, an empty dictionary is returned
|
289
|
+
|
290
|
+
Returns:
|
291
|
+
Dictionary mapping lookup paths to either 'one' or 'many'
|
292
|
+
"""
|
293
|
+
result = {} # type: ignore
|
294
|
+
if lookup_paths is None:
|
295
|
+
return result
|
296
|
+
for lookup_path in lookup_paths:
|
297
|
+
parts = lookup_path.split("__")
|
298
|
+
current_model = model_class
|
299
|
+
is_many = False
|
300
|
+
|
301
|
+
# Walk through each part of the lookup path
|
302
|
+
for part in parts[:-1]: # Exclude the last part as it's an attribute
|
303
|
+
field = None
|
304
|
+
|
305
|
+
# Handle reverse relations
|
306
|
+
for f in current_model._meta.get_fields():
|
307
|
+
if isinstance(f, ForeignObjectRel) and f.get_accessor_name() == part:
|
308
|
+
field = f
|
309
|
+
is_many = not f.one_to_one
|
310
|
+
if hasattr(f, "field"):
|
311
|
+
current_model = f.field.model
|
312
|
+
break
|
313
|
+
|
314
|
+
# Handle forward relations
|
315
|
+
if field is None:
|
316
|
+
field = current_model._meta.get_field(part)
|
317
|
+
if isinstance(field, ManyToManyField):
|
318
|
+
is_many = True
|
319
|
+
current_model = field.remote_field.model
|
320
|
+
elif isinstance(field, ForeignKey):
|
321
|
+
current_model = field.remote_field.model
|
322
|
+
|
323
|
+
result[lookup_path] = "many" if is_many else "one"
|
324
|
+
|
325
|
+
return result
|
326
|
+
|
327
|
+
|
328
|
+
# https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003
|
329
|
+
# https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2
|
330
|
+
def reshape_annotate_result(
|
331
|
+
field_names: list[str],
|
332
|
+
df: pd.DataFrame,
|
333
|
+
extra_columns: dict[str, str] | None = None,
|
334
|
+
features: bool | list[str] = False,
|
335
|
+
) -> pd.DataFrame:
|
336
|
+
"""Reshapes experimental data with optional feature handling.
|
337
|
+
|
338
|
+
Parameters:
|
339
|
+
field_names: List of basic fields to include in result
|
340
|
+
df: Input dataframe with experimental data
|
341
|
+
extra_columns: Dict specifying additional columns to process with types ('one' or 'many')
|
342
|
+
e.g., {'ulabels__name': 'many', 'created_by__name': 'one'}
|
343
|
+
features: If False, skip feature processing. If True, process all features.
|
344
|
+
If list of strings, only process specified features.
|
345
|
+
|
346
|
+
Returns:
|
347
|
+
DataFrame with reshaped data
|
348
|
+
"""
|
349
|
+
extra_columns = extra_columns or {}
|
350
|
+
|
351
|
+
# Initialize result with basic fields
|
352
|
+
result = df[field_names].drop_duplicates(subset=["id"])
|
353
|
+
|
354
|
+
# Process features if requested
|
355
|
+
if features:
|
356
|
+
# Handle _feature_values if columns exist
|
357
|
+
feature_cols = ["_feature_values__feature__name", "_feature_values__value"]
|
358
|
+
if all(col in df.columns for col in feature_cols):
|
359
|
+
feature_values = process_feature_values(df, features)
|
360
|
+
if not feature_values.empty:
|
361
|
+
for col in feature_values.columns:
|
362
|
+
if col in result.columns:
|
363
|
+
continue
|
364
|
+
result.insert(4, col, feature_values[col])
|
365
|
+
|
366
|
+
# Handle links features if they exist
|
367
|
+
links_features = [
|
368
|
+
col
|
369
|
+
for col in df.columns
|
370
|
+
if "feature__name" in col and col.startswith("links_")
|
371
|
+
]
|
372
|
+
|
373
|
+
if links_features:
|
374
|
+
result = process_links_features(df, result, links_features, features)
|
375
|
+
|
376
|
+
# Process extra columns
|
377
|
+
if extra_columns:
|
378
|
+
result = process_extra_columns(df, result, extra_columns)
|
379
|
+
|
380
|
+
return result
|
381
|
+
|
382
|
+
|
383
|
+
def process_feature_values(
|
384
|
+
df: pd.DataFrame, features: bool | list[str]
|
385
|
+
) -> pd.DataFrame:
|
386
|
+
"""Process _feature_values columns."""
|
387
|
+
feature_values = df.groupby(["id", "_feature_values__feature__name"])[
|
388
|
+
"_feature_values__value"
|
389
|
+
].agg(set)
|
390
|
+
|
391
|
+
# Filter features if specific ones requested
|
392
|
+
if isinstance(features, list):
|
393
|
+
feature_values = feature_values[
|
394
|
+
feature_values.index.get_level_values(
|
395
|
+
"_feature_values__feature__name"
|
396
|
+
).isin(features)
|
397
|
+
]
|
398
|
+
|
399
|
+
return feature_values.unstack().reset_index()
|
400
|
+
|
401
|
+
|
402
|
+
def process_links_features(
|
403
|
+
df: pd.DataFrame,
|
404
|
+
result: pd.DataFrame,
|
405
|
+
feature_cols: list[str],
|
406
|
+
features: bool | list[str],
|
407
|
+
) -> pd.DataFrame:
|
408
|
+
"""Process links_XXX feature columns."""
|
409
|
+
# this loops over different entities that might be linked under a feature
|
410
|
+
for feature_col in feature_cols:
|
411
|
+
prefix = re.match(r"links_(.+?)__feature__name", feature_col).group(1)
|
412
|
+
|
413
|
+
value_cols = [
|
414
|
+
col
|
415
|
+
for col in df.columns
|
416
|
+
if col.startswith(f"links_{prefix}__")
|
417
|
+
and col.endswith("__name")
|
418
|
+
and "feature__name" not in col
|
419
|
+
]
|
420
|
+
|
421
|
+
if not value_cols:
|
422
|
+
continue
|
423
|
+
|
424
|
+
value_col = value_cols[0]
|
425
|
+
feature_names = df[feature_col].unique()
|
426
|
+
feature_names = feature_names[~pd.isna(feature_names)]
|
427
|
+
|
428
|
+
# Filter features if specific ones requested
|
429
|
+
if isinstance(features, list):
|
430
|
+
feature_names = [f for f in feature_names if f in features]
|
431
|
+
|
432
|
+
for feature_name in feature_names:
|
433
|
+
mask = df[feature_col] == feature_name
|
434
|
+
feature_values = df[mask].groupby("id")[value_col].agg(set)
|
435
|
+
result.insert(4, feature_name, result["id"].map(feature_values))
|
436
|
+
|
437
|
+
return result
|
438
|
+
|
439
|
+
|
440
|
+
def process_extra_columns(
|
441
|
+
df: pd.DataFrame, result: pd.DataFrame, extra_columns: dict[str, str]
|
442
|
+
) -> pd.DataFrame:
|
443
|
+
"""Process additional columns based on their specified types."""
|
444
|
+
for col, col_type in extra_columns.items():
|
445
|
+
if col not in df.columns:
|
446
|
+
continue
|
447
|
+
if col in result.columns:
|
448
|
+
continue
|
449
|
+
|
450
|
+
values = df.groupby("id")[col].agg(set if col_type == "many" else "first")
|
451
|
+
result.insert(4, col, result["id"].map(values))
|
452
|
+
|
453
|
+
return result
|
454
|
+
|
173
455
|
|
174
456
|
class QuerySet(models.QuerySet):
|
175
457
|
"""Sets of records returned by queries.
|
@@ -180,108 +462,45 @@ class QuerySet(models.QuerySet):
|
|
180
462
|
|
181
463
|
Examples:
|
182
464
|
|
183
|
-
>>>
|
184
|
-
>>> queryset =
|
465
|
+
>>> ULabel(name="my label").save()
|
466
|
+
>>> queryset = ULabel.filter(name="my label")
|
185
467
|
>>> queryset
|
186
468
|
"""
|
187
469
|
|
188
470
|
@doc_args(Record.df.__doc__)
|
189
471
|
def df(
|
190
|
-
self,
|
472
|
+
self,
|
473
|
+
include: str | list[str] | None = None,
|
474
|
+
features: bool | list[str] = False,
|
191
475
|
) -> pd.DataFrame:
|
192
476
|
"""{}""" # noqa: D415
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
field_names.remove(field_name)
|
211
|
-
field_names.append(field_name)
|
212
|
-
if field_names[0] != "uid" and "uid" in field_names:
|
213
|
-
field_names.remove("uid")
|
214
|
-
field_names.insert(0, "uid")
|
215
|
-
# create the dataframe
|
216
|
-
df = pd.DataFrame(self.values(), columns=field_names)
|
217
|
-
# if len(df) > 0 and "updated_at" in df:
|
218
|
-
# df.updated_at = format_and_convert_to_local_time(df.updated_at)
|
219
|
-
# if len(df) > 0 and "started_at" in df:
|
220
|
-
# df.started_at = format_and_convert_to_local_time(df.started_at)
|
221
|
-
pk_name = self.model._meta.pk.name
|
222
|
-
pk_column_name = pk_name if pk_name in df.columns else f"{pk_name}_id"
|
223
|
-
if pk_column_name in df.columns:
|
224
|
-
df = df.set_index(pk_column_name)
|
477
|
+
if include is None:
|
478
|
+
include = []
|
479
|
+
elif isinstance(include, str):
|
480
|
+
include = [include]
|
481
|
+
field_names = get_basic_field_names(self, include, features)
|
482
|
+
annotate_kwargs = {}
|
483
|
+
if features:
|
484
|
+
annotate_kwargs.update(get_feature_annotate_kwargs(features))
|
485
|
+
if include:
|
486
|
+
include = include.copy()[::-1]
|
487
|
+
include_kwargs = {s: F(s) for s in include if s not in field_names}
|
488
|
+
annotate_kwargs.update(include_kwargs)
|
489
|
+
if annotate_kwargs:
|
490
|
+
queryset = self.annotate(**annotate_kwargs)
|
491
|
+
else:
|
492
|
+
queryset = self
|
493
|
+
df = pd.DataFrame(queryset.values(*field_names, *list(annotate_kwargs.keys())))
|
225
494
|
if len(df) == 0:
|
226
|
-
|
495
|
+
df = pd.DataFrame({}, columns=field_names)
|
227
496
|
return df
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
field_name = split[0]
|
236
|
-
if len(split) > 1:
|
237
|
-
lookup_str = "__".join(split[1:])
|
238
|
-
else:
|
239
|
-
lookup_str = "id"
|
240
|
-
Record = self.model
|
241
|
-
field = getattr(Record, field_name)
|
242
|
-
if isinstance(field.field, models.ManyToManyField):
|
243
|
-
related_ORM = (
|
244
|
-
field.field.model
|
245
|
-
if field.field.model != Record
|
246
|
-
else field.field.related_model
|
247
|
-
)
|
248
|
-
if Record == related_ORM:
|
249
|
-
left_side_link_model = f"from_{Record.__name__.lower()}"
|
250
|
-
values_expression = (
|
251
|
-
f"to_{Record.__name__.lower()}__{lookup_str}"
|
252
|
-
)
|
253
|
-
else:
|
254
|
-
left_side_link_model = f"{Record.__name__.lower()}"
|
255
|
-
values_expression = (
|
256
|
-
f"{related_ORM.__name__.lower()}__{lookup_str}"
|
257
|
-
)
|
258
|
-
link_df = pd.DataFrame(
|
259
|
-
field.through.objects.using(self.db).values(
|
260
|
-
left_side_link_model, values_expression
|
261
|
-
)
|
262
|
-
)
|
263
|
-
if link_df.shape[0] == 0:
|
264
|
-
logger.warning(
|
265
|
-
f"{colors.yellow(expression)} is not shown because no values are found"
|
266
|
-
)
|
267
|
-
continue
|
268
|
-
link_groupby = link_df.groupby(left_side_link_model)[
|
269
|
-
values_expression
|
270
|
-
].apply(list)
|
271
|
-
df = pd.concat((link_groupby, df), axis=1, join=join)
|
272
|
-
df.rename(columns={values_expression: expression}, inplace=True)
|
273
|
-
else:
|
274
|
-
# the F() based implementation could also work for many-to-many,
|
275
|
-
# would need to test what is faster
|
276
|
-
df_anno = pd.DataFrame(
|
277
|
-
self.annotate(expression=F(expression)).values(
|
278
|
-
pk_column_name, "expression"
|
279
|
-
)
|
280
|
-
)
|
281
|
-
df_anno = df_anno.set_index(pk_column_name)
|
282
|
-
df_anno.rename(columns={"expression": expression}, inplace=True)
|
283
|
-
df = pd.concat((df_anno, df), axis=1, join=join)
|
284
|
-
return df
|
497
|
+
extra_cols = analyze_lookup_cardinality(self.model, include) # type: ignore
|
498
|
+
df_reshaped = reshape_annotate_result(field_names, df, extra_cols, features)
|
499
|
+
pk_name = self.model._meta.pk.name
|
500
|
+
pk_column_name = pk_name if pk_name in df.columns else f"{pk_name}_id"
|
501
|
+
if pk_column_name in df_reshaped.columns:
|
502
|
+
df_reshaped = df_reshaped.set_index(pk_column_name)
|
503
|
+
return df_reshaped
|
285
504
|
|
286
505
|
def delete(self, *args, **kwargs):
|
287
506
|
"""Delete all records in the query set."""
|
@@ -335,8 +554,8 @@ class QuerySet(models.QuerySet):
|
|
335
554
|
"""At most one result. Returns it if there is one, otherwise returns ``None``.
|
336
555
|
|
337
556
|
Examples:
|
338
|
-
>>>
|
339
|
-
>>>
|
557
|
+
>>> ULabel.filter(name="benchmark").one_or_none()
|
558
|
+
>>> ULabel.filter(name="non existing label").one_or_none()
|
340
559
|
"""
|
341
560
|
if len(self) == 0:
|
342
561
|
return None
|
lamindb/_record.py
CHANGED
@@ -264,14 +264,14 @@ def get(
|
|
264
264
|
def df(
|
265
265
|
cls,
|
266
266
|
include: str | list[str] | None = None,
|
267
|
-
|
267
|
+
features: bool | list[str] = False,
|
268
268
|
limit: int = 100,
|
269
269
|
) -> pd.DataFrame:
|
270
270
|
"""{}""" # noqa: D415
|
271
271
|
query_set = cls.filter()
|
272
272
|
if hasattr(cls, "updated_at"):
|
273
273
|
query_set = query_set.order_by("-updated_at")
|
274
|
-
return query_set[:limit].df(include=include,
|
274
|
+
return query_set[:limit].df(include=include, features=features)
|
275
275
|
|
276
276
|
|
277
277
|
def _search(
|
@@ -345,7 +345,9 @@ def _search(
|
|
345
345
|
ranks.append(sub_rank)
|
346
346
|
# startswith and avoid matching string with " " on the right
|
347
347
|
# mostly for truncated
|
348
|
-
startswith_expr = regex_lookup(
|
348
|
+
startswith_expr = regex_lookup(
|
349
|
+
field_expr, rf"(?:^|.*\|){string}[^ ]*(?:\|.*|$)"
|
350
|
+
)
|
349
351
|
startswith_rank = Cast(startswith_expr, output_field=IntegerField()) * 8
|
350
352
|
ranks.append(startswith_rank)
|
351
353
|
# match as sub-phrase from the left, mostly for truncated
|
lamindb/_save.py
CHANGED
@@ -112,6 +112,7 @@ def bulk_create(records: Iterable[Record], ignore_conflicts: bool | None = False
|
|
112
112
|
records_by_orm[record.__class__].append(record)
|
113
113
|
for registry, records in records_by_orm.items():
|
114
114
|
registry.objects.bulk_create(records, ignore_conflicts=ignore_conflicts)
|
115
|
+
# records[:] = created # In-place list update; does not seem to be necessary
|
115
116
|
|
116
117
|
|
117
118
|
def bulk_update(records: Iterable[Record], ignore_conflicts: bool | None = False):
|
lamindb/_view.py
CHANGED
@@ -3,22 +3,107 @@ from __future__ import annotations
|
|
3
3
|
import builtins
|
4
4
|
import importlib
|
5
5
|
import inspect
|
6
|
+
from typing import TYPE_CHECKING
|
6
7
|
|
8
|
+
from IPython.display import HTML, display
|
7
9
|
from lamin_utils import colors, logger
|
8
10
|
from lamindb_setup import settings
|
9
11
|
from lamindb_setup._init_instance import get_schema_module_name
|
10
|
-
from lnschema_core import Record
|
12
|
+
from lnschema_core import Feature, Record
|
13
|
+
|
14
|
+
from lamindb.core import FeatureValue, ParamValue
|
15
|
+
|
16
|
+
from ._feature import convert_pandas_dtype_to_lamin_dtype
|
17
|
+
|
18
|
+
if TYPE_CHECKING:
|
19
|
+
import pandas as pd
|
11
20
|
|
12
21
|
is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
|
13
22
|
|
14
23
|
|
24
|
+
def display_df_with_descriptions(
|
25
|
+
df: pd.DataFrame, descriptions: dict[str, str] | None = None
|
26
|
+
):
|
27
|
+
if descriptions is None:
|
28
|
+
display(df)
|
29
|
+
return None
|
30
|
+
|
31
|
+
# Start building HTML table
|
32
|
+
html = '<table class="dataframe">'
|
33
|
+
|
34
|
+
# Create header with title and description rows
|
35
|
+
html += "<thead>"
|
36
|
+
|
37
|
+
# Column names row
|
38
|
+
html += "<tr>"
|
39
|
+
html += '<th class="header-title index-header"></th>' # Index header
|
40
|
+
for col in df.columns:
|
41
|
+
html += f'<th class="header-title">{col}</th>'
|
42
|
+
html += "</tr>"
|
43
|
+
|
44
|
+
# Descriptions row
|
45
|
+
html += "<tr>"
|
46
|
+
html += f'<th class="header-desc index-header">{df.index.name or ""}</th>' # Index column
|
47
|
+
for col in df.columns:
|
48
|
+
desc = descriptions.get(col, "")
|
49
|
+
html += f'<th class="header-desc">{desc}</th>'
|
50
|
+
html += "</tr>"
|
51
|
+
|
52
|
+
html += "</thead>"
|
53
|
+
|
54
|
+
# Add body rows
|
55
|
+
html += "<tbody>"
|
56
|
+
for idx, row in df.iterrows():
|
57
|
+
html += "<tr>"
|
58
|
+
html += f'<th class="row-index">{idx}</th>' # Index value
|
59
|
+
for col in df.columns:
|
60
|
+
html += f"<td>{row[col]}</td>"
|
61
|
+
html += "</tr>"
|
62
|
+
html += "</tbody>"
|
63
|
+
html += "</table>"
|
64
|
+
|
65
|
+
# Add CSS styles
|
66
|
+
styled_html = f"""
|
67
|
+
<style>
|
68
|
+
.dataframe {{
|
69
|
+
border-collapse: collapse;
|
70
|
+
margin: 10px 0;
|
71
|
+
}}
|
72
|
+
.dataframe th, .dataframe td {{
|
73
|
+
border: 1px solid #ddd;
|
74
|
+
padding: 8px;
|
75
|
+
text-align: left;
|
76
|
+
}}
|
77
|
+
.header-title {{
|
78
|
+
font-weight: bold;
|
79
|
+
}}
|
80
|
+
.header-desc {{
|
81
|
+
color: #666;
|
82
|
+
font-weight: normal;
|
83
|
+
}}
|
84
|
+
.row-index {{
|
85
|
+
font-weight: bold;
|
86
|
+
}}
|
87
|
+
.index-header {{
|
88
|
+
font-weight: bold;
|
89
|
+
}}
|
90
|
+
</style>
|
91
|
+
{html}
|
92
|
+
"""
|
93
|
+
return display(HTML(styled_html))
|
94
|
+
|
95
|
+
|
15
96
|
def view(
|
16
|
-
|
97
|
+
df: pd.DataFrame | None = None,
|
98
|
+
limit: int = 7,
|
99
|
+
schema: str | None = None,
|
100
|
+
registries: list[str] | None = None,
|
17
101
|
) -> None:
|
18
|
-
"""View
|
102
|
+
"""View metadata.
|
19
103
|
|
20
104
|
Args:
|
21
|
-
|
105
|
+
df: A DataFrame to display.
|
106
|
+
limit: Maximum number of latest records to display per registry.
|
22
107
|
schema: Schema module to view. Defaults to
|
23
108
|
`None` and displays all schema modules.
|
24
109
|
registries: List of Record names. Defaults to
|
@@ -27,6 +112,16 @@ def view(
|
|
27
112
|
Examples:
|
28
113
|
>>> ln.view()
|
29
114
|
"""
|
115
|
+
if df is not None:
|
116
|
+
descriptions = {
|
117
|
+
col_name: convert_pandas_dtype_to_lamin_dtype(dtype)
|
118
|
+
for col_name, dtype in df.dtypes.to_dict().items()
|
119
|
+
}
|
120
|
+
feature_dtypes = dict(Feature.objects.values_list("name", "dtype"))
|
121
|
+
descriptions.update(feature_dtypes)
|
122
|
+
display_df_with_descriptions(df, descriptions)
|
123
|
+
return None
|
124
|
+
|
30
125
|
if is_run_from_ipython:
|
31
126
|
from IPython.display import display as show
|
32
127
|
else:
|
@@ -39,6 +134,9 @@ def view(
|
|
39
134
|
|
40
135
|
for schema_name in schema_names:
|
41
136
|
schema_module = importlib.import_module(get_schema_module_name(schema_name))
|
137
|
+
# the below is necessary because a schema module might not have been
|
138
|
+
# explicitly accessed
|
139
|
+
importlib.reload(schema_module)
|
42
140
|
|
43
141
|
all_registries = {
|
44
142
|
registry
|
@@ -47,6 +145,8 @@ def view(
|
|
47
145
|
and issubclass(registry, Record)
|
48
146
|
and registry is not Record
|
49
147
|
}
|
148
|
+
if schema_name == "core":
|
149
|
+
all_registries.update({FeatureValue, ParamValue})
|
50
150
|
if registries is not None:
|
51
151
|
filtered_registries = {
|
52
152
|
registry
|
@@ -62,11 +162,7 @@ def view(
|
|
62
162
|
logger.print(section)
|
63
163
|
logger.print("*" * len(section_no_color))
|
64
164
|
for registry in sorted(filtered_registries, key=lambda x: x.__name__):
|
65
|
-
|
66
|
-
df = registry.filter().order_by("-updated_at")[:n].df()
|
67
|
-
else:
|
68
|
-
# need to adjust in the future
|
69
|
-
df = registry.df().iloc[-n:]
|
165
|
+
df = registry.df(limit=limit)
|
70
166
|
if df.shape[0] > 0:
|
71
167
|
logger.print(colors.blue(colors.bold(registry.__name__)))
|
72
168
|
show(df)
|