datachain 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +3 -4
- datachain/cache.py +10 -4
- datachain/catalog/catalog.py +35 -15
- datachain/cli.py +37 -32
- datachain/data_storage/metastore.py +24 -0
- datachain/data_storage/warehouse.py +3 -1
- datachain/job.py +56 -0
- datachain/lib/arrow.py +19 -7
- datachain/lib/clip.py +89 -66
- datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
- datachain/lib/convert/sql_to_python.py +23 -0
- datachain/lib/convert/values_to_tuples.py +51 -33
- datachain/lib/data_model.py +6 -27
- datachain/lib/dataset_info.py +70 -0
- datachain/lib/dc.py +646 -152
- datachain/lib/file.py +117 -15
- datachain/lib/image.py +1 -1
- datachain/lib/meta_formats.py +14 -2
- datachain/lib/model_store.py +3 -2
- datachain/lib/pytorch.py +10 -7
- datachain/lib/signal_schema.py +39 -14
- datachain/lib/text.py +2 -1
- datachain/lib/udf.py +56 -5
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +4 -3
- datachain/node.py +11 -8
- datachain/query/dataset.py +66 -147
- datachain/query/dispatch.py +15 -13
- datachain/query/schema.py +2 -0
- datachain/query/session.py +4 -4
- datachain/sql/functions/array.py +12 -0
- datachain/sql/functions/string.py +8 -0
- datachain/torch/__init__.py +1 -1
- datachain/utils.py +45 -0
- datachain-0.2.12.dist-info/METADATA +412 -0
- {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/RECORD +40 -45
- {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
- datachain/lib/feature_registry.py +0 -77
- datachain/lib/gpt4_vision.py +0 -97
- datachain/lib/hf_image_to_text.py +0 -97
- datachain/lib/hf_pipeline.py +0 -90
- datachain/lib/image_transform.py +0 -103
- datachain/lib/iptc_exif_xmp.py +0 -76
- datachain/lib/unstructured.py +0 -41
- datachain/text/__init__.py +0 -3
- datachain-0.2.10.dist-info/METADATA +0 -430
- {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
- {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
datachain/lib/clip.py
CHANGED
@@ -31,7 +31,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
     )
 
 
-def similarity_scores(
+def clip_similarity_scores(
     images: Union[None, "Image.Image", list["Image.Image"]],
     text: Union[None, str, list[str]],
     model: Any,
@@ -43,71 +43,91 @@ def similarity_scores(
     """
     Calculate CLIP similarity scores between one or more images and/or text.
 
-
-        images: Images to use as inputs.
-        text: Text to use as inputs.
-        model: Model from clip or open_clip packages.
-        preprocess: Image preprocessor to apply.
-        tokenizer: Text tokenizer.
-        prob: Compute softmax probabilities.
-        image_to_text: Whether to compute for image-to-text or text-to-image. Ignored
-            only one of images or text provided.
[old lines 55-110, the remainder of the previous docstring, were also removed; their text is not captured in this view]
+    Parameters:
+        images : Images to use as inputs.
+        text : Text to use as inputs.
+        model : Model from clip or open_clip packages.
+        preprocess : Image preprocessor to apply.
+        tokenizer : Text tokenizer.
+        prob : Compute softmax probabilities.
+        image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
+            if only one of images or text provided.
+
+
+    Example:
+        Using https://github.com/openai/CLIP
+        ```py
+        >>> import clip
+        >>> model, preprocess = clip.load("ViT-B/32")
+        >>> similarity_scores(img, "cat", model, preprocess, clip.tokenize)
+        [[21.813]]
+        ```
+
+        Using https://github.com/mlfoundations/open_clip
+        ```py
+        >>> import open_clip
+        >>> model, _, preprocess = open_clip.create_model_and_transforms(
+        ...     "ViT-B-32", pretrained="laion2b_s34b_b79k"
+        ... )
+        >>> tokenizer = open_clip.get_tokenizer("ViT-B-32")
+        >>> similarity_scores(img, "cat", model, preprocess, tokenizer)
+        [[21.813]]
+        ```
+
+        Using https://huggingface.co/docs/transformers/en/model_doc/clip
+        ```py
+        >>> from transformers import CLIPProcessor, CLIPModel
+        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        >>> scores = similarity_scores(
+        ...     img, "cat", model, processor.image_processor, processor.tokenizer
+        ... )
+        [[21.813]]
+        ```
+
+        Image -> list of text
+        ```py
+        >>> similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)
+        [[21.813, 35.313]]
+        ```
+
+        List of images -> text
+        ```py
+        >>> similarity_scores([img1, img2], "cat", model, preprocess, tokenizer)
+        [[21.813], [83.123]]
+        ```
+
+        List of images -> list of text
+        ```py
+        >>> similarity_scores(
+        ...     [img1, img2], ["cat", "dog"], model, preprocess, tokenizer)
+        ... )
+        [[21.813, 35.313], [83.123, 34.843]]
+        ```
+
+        List of images -> list of images
+        ```py
+        >>> similarity_scores([img1, img2], None, model, preprocess, tokenizer)
+        [[94.189, 37.092]]
+        ```
+
+        List of text -> list of text
+        ```py
+        >>> similarity_scores(None, ["cat", "dog"], model, preprocess, tokenizer)
+        [[67.334, 23.588]]
+        ```
+
+        Text -> list of images
+        ```py
+        >>> similarity_scores([img1, img2], "cat", ..., image_to_text=False)
+        [[19.708, 19.842]]
+        ```
+
+        Show scores as softmax probabilities
+        ```py
+        >>> similarity_scores(img, ["cat", "dog"], ..., prob=True)
+        [[0.423, 0.577]]
+        ```
     """
 
     with torch.no_grad():
@@ -144,3 +164,6 @@ def similarity_scores(
         scores = logits
 
     return scores.tolist()
+
+
+similarity_scores = clip_similarity_scores
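Taken together, this is a rename with a backward-compatible alias: `clip_similarity_scores` is the new public name, and the module-level assignment keeps `similarity_scores` importable. A minimal sketch of both call paths, assuming `open_clip` is installed and `img` is a PIL image loaded elsewhere (neither is part of this diff):

```py
# Sketch only: `img` is assumed to be a PIL image prepared by the caller.
import open_clip

from datachain.lib.clip import clip_similarity_scores, similarity_scores

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# New name introduced in 0.2.12.
scores = clip_similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)

# Old imports keep working: the alias points at the same function object.
assert similarity_scores is clip_similarity_scores
```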
datachain/lib/convert/{type_converter.py → python_to_sql.py}
RENAMED

@@ -19,7 +19,7 @@ from datachain.sql.types import (
     String,
 )
 
-
+PYTHON_TO_SQL = {
     int: Int64,
     str: String,
     Literal: String,
@@ -34,14 +34,14 @@ TYPE_TO_DATACHAIN = {
 }
 
 
-def convert_to_db_type(typ):  # noqa: PLR0911
+def python_to_sql(typ):  # noqa: PLR0911
     if inspect.isclass(typ):
         if issubclass(typ, SQLType):
             return typ
         if issubclass(typ, Enum):
             return str
 
-    res =
+    res = PYTHON_TO_SQL.get(typ)
     if res:
         return res
 
@@ -59,19 +59,19 @@ def convert_to_db_type(typ):  # noqa: PLR0911
         if ModelStore.is_pydantic(args0):
             return Array(JSON())
 
-        next_type = convert_to_db_type(args0)
+        next_type = python_to_sql(args0)
         return Array(next_type)
 
     if orig is Annotated:
         # Ignoring annotations
-        return convert_to_db_type(args[0])
+        return python_to_sql(args[0])
 
     if inspect.isclass(orig) and issubclass(dict, orig):
         return JSON
 
     if orig == Union:
         if len(args) == 2 and (type(None) in args):
-            return convert_to_db_type(args[0])
+            return python_to_sql(args[0])
 
     if _is_json_inside_union(orig, args):
         return JSON
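Based on the hunks above, the renamed helper's behavior is unchanged: plain Python types go through the `PYTHON_TO_SQL` table, `Optional[X]` strips the `None` arm, and `list[X]` converts the element type and wraps it in `Array`. A rough sketch (expected results shown as comments, not verified against this exact build):

```py
from typing import Optional

from datachain.lib.convert.python_to_sql import python_to_sql

print(python_to_sql(int))            # Int64, via the PYTHON_TO_SQL table
print(python_to_sql(Optional[str]))  # String: Union[X, None] converts X
print(python_to_sql(list[float]))    # Array(Float): element type wrapped in Array
```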
datachain/lib/convert/sql_to_python.py
ADDED

@@ -0,0 +1,23 @@
+from datetime import datetime
+from typing import Any
+
+from sqlalchemy import ARRAY, JSON, Boolean, DateTime, Float, Integer, String
+
+from datachain.data_storage.sqlite import Column
+
+SQL_TO_PYTHON = {
+    String: str,
+    Integer: int,
+    Float: float,
+    Boolean: bool,
+    DateTime: datetime,
+    ARRAY: list,
+    JSON: dict,
+}
+
+
+def sql_to_python(args_map: dict[str, Column]) -> dict[str, Any]:
+    return {
+        k: SQL_TO_PYTHON.get(type(v.type), str)  # type: ignore[union-attr]
+        for k, v in args_map.items()
+    }
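Note that the lookup uses the exact class of each column's type (`type(v.type)`), so SQLAlchemy subclasses such as `Text` fall through to the `str` default rather than matching `String`. A usage sketch with plain SQLAlchemy columns standing in for datachain's `Column` alias (column names are hypothetical):

```py
import sqlalchemy as sa

from datachain.lib.convert.sql_to_python import sql_to_python

cols = {
    "path": sa.Column("path", sa.String()),
    "size": sa.Column("size", sa.Integer()),
    "is_latest": sa.Column("is_latest", sa.Boolean()),
    "notes": sa.Column("notes", sa.Text()),  # String subclass: falls back to str
}
print(sql_to_python(cols))
# {'path': <class 'str'>, 'size': <class 'int'>,
#  'is_latest': <class 'bool'>, 'notes': <class 'str'>}
```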
datachain/lib/convert/values_to_tuples.py
CHANGED

@@ -9,41 +9,16 @@ class ValuesToTupleError(DataChainParamsError):
     def __init__(self, ds_name, msg):
         if ds_name:
             ds_name = f"' {ds_name}'"
-        super().__init__(f"Cannot convert features for dataset{ds_name}: {msg}")
+        super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
 
 
-def values_to_tuples(
+def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
     **fr_map,
 ) -> tuple[Any, Any, Any]:
-    types_map = {}
-    length = -1
-    for k, v in fr_map.items():
-        if not isinstance(v, Sequence) or isinstance(v, str):
-            raise ValuesToTupleError(ds_name, f"features '{k}' is not a sequence")
-        len_ = len(v)
-
-        if len_ == 0:
-            raise ValuesToTupleError(ds_name, f"feature '{k}' is empty list")
-
-        if length < 0:
-            length = len_
-        elif length != len_:
-            raise ValuesToTupleError(
-                ds_name,
-                f"feature '{k}' should have length {length} while {len_} is given",
-            )
-        typ = type(v[0])
-        if not is_chain_type(typ):
-            raise ValuesToTupleError(
-                ds_name,
-                f"feature '{k}' has unsupported type '{typ.__name__}'."
-                f" Please use Feature types: {DataTypeNames}",
-            )
-        types_map[k] = typ
     if output:
-        if not isinstance(output, Sequence):
+        if not isinstance(output, (Sequence, str, dict)):
             if len(fr_map) != 1:
                 raise ValuesToTupleError(
                     ds_name,
@@ -58,21 +33,64 @@ def values_to_tuples(
             key: str = next(iter(fr_map.keys()))
             output = {key: output}  # type: ignore[dict-item]
 
+        if not isinstance(output, dict):
+            raise ValuesToTupleError(
+                ds_name,
+                "output type must be dict[str, DataType] while "
+                f"'{type(output).__name__}' is given",
+            )
+
         if len(output) != len(fr_map):
             raise ValuesToTupleError(
                 ds_name,
                 f"number of outputs '{len(output)}' should match"
-                f" number of features '{len(fr_map)}'",
+                f" number of signals '{len(fr_map)}'",
             )
-
+
+    types_map = {}
+    length = -1
+    for k, v in fr_map.items():
+        if not isinstance(v, Sequence) or isinstance(v, str):
+            raise ValuesToTupleError(ds_name, f"signals '{k}' is not a sequence")
+        len_ = len(v)
+
+        if output:
+            if k not in output:  # type: ignore[operator]
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"signal '{k}' is not present in the output",
+                )
+        else:
+            if len_ == 0:
+                raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
+
+            typ = type(v[0])
+            if not is_chain_type(typ):
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"signal '{k}' has unsupported type '{typ.__name__}'."
+                    f" Please use DataModel types: {DataTypeNames}",
+                )
+            types_map[k] = typ
+
+        if length < 0:
+            length = len_
+        elif length != len_:
             raise ValuesToTupleError(
                 ds_name,
-                "output type must be dict[str, DataType] while "
-                f"'{type(output).__name__}' is given",
+                f"signal '{k}' should have length {length} while {len_} is given",
             )
-
+
+    if not output:
         output = types_map  # type: ignore[assignment]
 
+    if not output:
+        raise ValuesToTupleError(
+            ds_name,
+            "output type must be dict[str, DataType] while empty is given"
+            " and no signals are provided",
+        )
+
     output_types: list[type] = list(output.values())  # type: ignore[union-attr,call-arg,arg-type]
     if len(output) > 1:  # type: ignore[arg-type]
         tuple_type = tuple(output_types)
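The net change: terminology moves from "features" to "signals", output validation now runs before the per-signal loop, and when an explicit `output` is given the element types come from it rather than from `type(v[0])`, which also lifts the old empty-list restriction in that case. A sketch of the inferred-types path; the return value is annotated only as a 3-tuple, and the layout assumed here (tuple type, output map, row tuples) follows prior releases:

```py
from datachain.lib.convert.values_to_tuples import values_to_tuples

# No explicit output: types are inferred from the first element of each signal.
tuple_type, output, tuples = values_to_tuples(
    "fib",
    fib=[1, 1, 2, 3, 5],
    odd=[True, True, False, True, True],
)
print(output)  # {'fib': <class 'int'>, 'odd': <class 'bool'>}

# Mismatched signal lengths now raise:
# "Cannot convert signals for dataset' fib': signal 'odd' should have length 5 ..."
```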
datachain/lib/data_model.py
CHANGED
@@ -1,14 +1,11 @@
 from collections.abc import Sequence
 from datetime import datetime
-from typing import TYPE_CHECKING, Union, get_args, get_origin
+from typing import ClassVar, Union, get_args, get_origin
 
 from pydantic import BaseModel
 
 from datachain.lib.model_store import ModelStore
 
-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
-
 StandardType = Union[
     type[int],
     type[str],
@@ -24,18 +21,14 @@ DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
 
 
 class DataModel(BaseModel):
-
+    """Pydantic model wrapper that registers model with `DataChain`."""
 
-
-    """Getting value from data. It's used in conjunction with method that operate
-    with raw data such as to_pytorch(). In contrast to method that operated with
-    data structures such as pydantic"""
-    return
+    _version: ClassVar[int] = 1
 
     @classmethod
     def __pydantic_init_subclass__(cls):
         """It automatically registers every declared DataModel child class."""
-        ModelStore.
+        ModelStore.register(cls)
 
     @staticmethod
     def register(models: Union[DataType, Sequence[DataType]]):
@@ -44,25 +37,11 @@ class DataModel(BaseModel):
         if not isinstance(models, Sequence):
             models = [models]
         for val in models:
-            ModelStore.
-
-
-class FileBasic(DataModel):
-    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
-        pass
-
-    def open(self):
-        raise NotImplementedError
-
-    def read(self):
-        with self.open() as stream:
-            return stream.read()
-
-    def get_value(self):
-        return self.read()
+            ModelStore.register(val)
 
 
 def is_chain_type(t: type) -> bool:
+    """Return true if type is supported by `DataChain`."""
     if ModelStore.is_pydantic(t):
         return True
     if any(t is ft or t is get_args(ft)[0] for ft in get_args(StandardType)):
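After this cleanup, `DataModel` is a thin registration layer: subclassing triggers `ModelStore.register` via `__pydantic_init_subclass__`, and the static `register` helper covers models declared outside the class hierarchy. A minimal sketch with a hypothetical user model:

```py
from datachain.lib.data_model import DataModel, is_chain_type


class BBox(DataModel):
    # Declaring the subclass is enough: __pydantic_init_subclass__
    # registers it with ModelStore automatically.
    x: float
    y: float


# Explicit registration remains available (e.g. for plain pydantic models)
# and accepts a single model or a sequence.
DataModel.register(BBox)

assert is_chain_type(BBox)  # pydantic models are chain types
assert is_chain_type(int)   # StandardType members are too
```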
datachain/lib/dataset_info.py
ADDED

@@ -0,0 +1,70 @@
+import json
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from pydantic import Field, field_validator
+
+from datachain.dataset import DatasetRecord, DatasetStatus, DatasetVersion
+from datachain.job import Job
+from datachain.lib.data_model import DataModel
+from datachain.utils import TIME_ZERO
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+
+class DatasetInfo(DataModel):
+    name: str
+    version: int = Field(default=1)
+    status: int = Field(default=DatasetStatus.CREATED)
+    created_at: datetime = Field(default=TIME_ZERO)
+    finished_at: Optional[datetime] = Field(default=None)
+    num_objects: Optional[int] = Field(default=None)
+    size: Optional[int] = Field(default=None)
+    params: dict[str, str] = Field(default=dict)
+    metrics: dict[str, Any] = Field(default=dict)
+
+    @staticmethod
+    def _validate_dict(
+        v: Optional[Union[str, dict]],
+    ) -> dict:
+        if v is None or v == "":
+            return {}
+        if isinstance(v, str):
+            try:
+                return json.loads(v)
+            except Exception as e:  # noqa: BLE001
+                raise ValueError(
+                    f"Unable to convert string '{v}' to dict for Dataset feature: {e}"
+                ) from None
+        return v
+
+    # Workaround for empty JSONs converted to empty strings in some DBs.
+    @field_validator("params", mode="before")
+    @classmethod
+    def validate_location(cls, v):
+        return cls._validate_dict(v)
+
+    @field_validator("metrics", mode="before")
+    @classmethod
+    def validate_metrics(cls, v):
+        return cls._validate_dict(v)
+
+    @classmethod
+    def from_models(
+        cls,
+        dataset: DatasetRecord,
+        version: DatasetVersion,
+        job: Optional[Job],
+    ) -> "Self":
+        return cls(
+            name=dataset.name,
+            version=version.version,
+            status=version.status,
+            created_at=version.created_at,
+            finished_at=version.finished_at,
+            num_objects=version.num_objects,
+            size=version.size,
+            params=job.params if job else {},
+            metrics=job.metrics if job else {},
+        )