datachain 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +17 -8
- datachain/catalog/catalog.py +5 -5
- datachain/cli.py +0 -2
- datachain/data_storage/schema.py +5 -5
- datachain/data_storage/sqlite.py +1 -1
- datachain/data_storage/warehouse.py +7 -7
- datachain/lib/arrow.py +25 -8
- datachain/lib/clip.py +6 -11
- datachain/lib/convert/__init__.py +0 -0
- datachain/lib/convert/flatten.py +67 -0
- datachain/lib/convert/type_converter.py +96 -0
- datachain/lib/convert/unflatten.py +69 -0
- datachain/lib/convert/values_to_tuples.py +85 -0
- datachain/lib/data_model.py +74 -0
- datachain/lib/dc.py +192 -167
- datachain/lib/feature_registry.py +36 -10
- datachain/lib/file.py +41 -41
- datachain/lib/gpt4_vision.py +1 -9
- datachain/lib/hf_image_to_text.py +9 -17
- datachain/lib/hf_pipeline.py +4 -12
- datachain/lib/image.py +2 -18
- datachain/lib/image_transform.py +0 -1
- datachain/lib/iptc_exif_xmp.py +8 -15
- datachain/lib/meta_formats.py +1 -5
- datachain/lib/model_store.py +77 -0
- datachain/lib/pytorch.py +9 -21
- datachain/lib/signal_schema.py +120 -58
- datachain/lib/text.py +5 -16
- datachain/lib/udf.py +114 -30
- datachain/lib/udf_signature.py +5 -5
- datachain/lib/webdataset.py +3 -4
- datachain/lib/webdataset_laion.py +2 -3
- datachain/node.py +4 -4
- datachain/query/batch.py +1 -1
- datachain/query/dataset.py +40 -60
- datachain/query/dispatch.py +28 -17
- datachain/query/udf.py +46 -26
- datachain/remote/studio.py +1 -9
- datachain/torch/__init__.py +21 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/METADATA +13 -12
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/RECORD +45 -42
- datachain/image/__init__.py +0 -3
- datachain/lib/cached_stream.py +0 -38
- datachain/lib/claude.py +0 -69
- datachain/lib/feature.py +0 -412
- datachain/lib/feature_utils.py +0 -154
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/LICENSE +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/WHEEL +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/top_level.txt +0 -0
datachain/lib/feature_utils.py
DELETED
|
@@ -1,154 +0,0 @@
|
|
|
1
|
-
import inspect
|
|
2
|
-
import string
|
|
3
|
-
from collections.abc import Sequence
|
|
4
|
-
from enum import Enum
|
|
5
|
-
from typing import Any, Union, get_args, get_origin
|
|
6
|
-
|
|
7
|
-
from pydantic import BaseModel, create_model
|
|
8
|
-
|
|
9
|
-
from datachain.lib.feature import (
|
|
10
|
-
TYPE_TO_DATACHAIN,
|
|
11
|
-
Feature,
|
|
12
|
-
FeatureType,
|
|
13
|
-
FeatureTypeNames,
|
|
14
|
-
convert_type_to_datachain,
|
|
15
|
-
)
|
|
16
|
-
from datachain.lib.utils import DataChainParamsError
|
|
17
|
-
|
|
18
|
-
# Name prefix constant; it is not referenced anywhere else in this module.
# NOTE(review): presumably used for auto-generated feature names — confirm against importers.
AUTO_FEATURE_PREFIX = "_auto_fr"
|
|
19
|
-
# Alphabet of ASCII digits followed by lowercase ASCII letters ("0-9a-z").
# It is not referenced anywhere else in this module.
SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class FeatureToTupleError(DataChainParamsError):
    """Raised when keyword feature sequences cannot be converted to tuples."""

    def __init__(self, ds_name, msg):
        """Compose the error message.

        Args:
            ds_name: Name of the dataset being processed. When falsy (empty
                string or ``None``) no dataset name is put into the message.
            msg: Description of what went wrong.
        """
        # The separating space belongs before the opening quote, so the
        # message reads "... for dataset 'name': ..." and not
        # "... for dataset' name': ...".
        quoted_name = f" '{ds_name}'" if ds_name else ""
        super().__init__(f"Cannot convert features for dataset{quoted_name}: {msg}")
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# Memoization table for pydantic_to_feature(): maps a pydantic model class to
# the Feature subclass generated for it, so each model is converted only once.
feature_cache: dict[type[BaseModel], type[Feature]] = {}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
    """Return a ``Feature`` subclass mirroring the pydantic model ``data_cls``.

    The generated class keeps the name of ``data_cls`` and derives from both
    ``data_cls`` and ``Feature``. Field annotations that are not keys of
    ``TYPE_TO_DATACHAIN`` are translated with ``_to_feature_type``; field
    defaults are carried over unchanged. Results are memoized in
    ``feature_cache``, so repeated calls return the same class object.
    """
    cached = feature_cache.get(data_cls)
    if cached is not None:
        return cached

    field_definitions = {
        field_name: (
            (
                info.annotation
                if info.annotation in TYPE_TO_DATACHAIN
                else _to_feature_type(info.annotation)
            ),
            info.default,
        )
        for field_name, info in data_cls.model_fields.items()
    }

    feature_cls = create_model(
        data_cls.__name__,
        __base__=(data_cls, Feature),  # type: ignore[call-overload]
        **field_definitions,
    )
    feature_cache[data_cls] = feature_cls
    return feature_cls
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _to_feature_type(anno):
    """Translate a field annotation into one usable on a ``Feature`` class.

    Steps, in order:

    * ``Enum`` subclasses are mapped to ``str``.
    * ``list[X]`` is unwrapped to ``X`` (only the first type argument is
      used), converted, and wrapped back into ``list[...]`` at the end.
    * If ``convert_type_to_datachain`` accepts the annotation, or
      ``Feature.is_feature`` reports it as a feature, it is kept as is.
    * Otherwise a generic whose origin is a key of ``TYPE_TO_DATACHAIN`` is
      converted recursively, ``Optional[X]`` is replaced by the conversion
      of ``X``, and anything else is handed to ``pydantic_to_feature``.

    Args:
        anno: The annotation to translate.

    Returns:
        The translated annotation.
    """
    if inspect.isclass(anno) and issubclass(anno, Enum):
        return str

    is_list = get_origin(anno) is list
    if is_list:
        # get_args() always returns a tuple; only its first element matters.
        anno = get_args(anno)[0]

    try:
        convert_type_to_datachain(anno)
    except TypeError:
        # TypeError is treated as "not directly convertible": fall back to
        # feature / generic / Optional / pydantic handling below.
        if not Feature.is_feature(anno):  # type: ignore[arg-type]
            orig = get_origin(anno)
            if orig in TYPE_TO_DATACHAIN:
                # NOTE(review): recurses with the same annotation; relies on
                # the recursive call taking a different path — confirm.
                anno = _to_feature_type(anno)
            else:
                args = get_args(anno) if orig == Union else ()
                if len(args) == 2 and type(None) in args:
                    # Optional[X]: convert the member that is not None,
                    # whichever position it is in. Assign instead of
                    # returning so list[Optional[X]] keeps its list wrapper.
                    inner = args[1] if args[0] is type(None) else args[0]
                    anno = _to_feature_type(inner)
                else:
                    anno = pydantic_to_feature(anno)  # type: ignore[arg-type]

    if is_list:
        anno = list[anno]  # type: ignore[valid-type]
    return anno
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def features_to_tuples(
    ds_name: str = "",
    output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
    **fr_map,
) -> tuple[Any, Any, Any]:
    """Turn keyword-named feature sequences into row values plus their types.

    Args:
        ds_name: Dataset name, used only in error messages.
        output: Optional output specification. Either a single type (allowed
            only when exactly one feature is given) or a dict mapping names
            to types with one entry per feature. When falsy, the types are
            inferred from the first element of every sequence.
        **fr_map: Feature name -> non-empty, non-string sequence of values.
            All sequences must have the same length.

    Returns:
        A ``(res_type, output, res_values)`` tuple. With several features
        ``res_type`` is ``tuple[...]`` of the output types and ``res_values``
        is the list of zipped rows; with one feature they are that feature's
        type and its original sequence. ``output`` is the name -> type dict
        that was used.

    Raises:
        FeatureToTupleError: If a value is not a sequence, is empty, has a
            different length than the others, has an unsupported element
            type, or if ``output`` does not describe the given features.
    """
    types_map = {}
    length = -1
    for k, v in fr_map.items():
        if not isinstance(v, Sequence) or isinstance(v, str):
            raise FeatureToTupleError(ds_name, f"features '{k}' is not a sequence")
        len_ = len(v)

        if len_ == 0:
            raise FeatureToTupleError(ds_name, f"feature '{k}' is empty list")

        if length < 0:
            # First feature fixes the expected length for all the others.
            length = len_
        elif length != len_:
            raise FeatureToTupleError(
                ds_name,
                f"feature '{k}' should have length {length} while {len_} is given",
            )
        typ = type(v[0])
        if not Feature.is_feature_type(typ):
            raise FeatureToTupleError(
                ds_name,
                f"feature '{k}' has unsupported type '{typ.__name__}'."
                f" Please use Feature types: {FeatureTypeNames}",
            )
        types_map[k] = typ

    if output:
        # A dict is not a Sequence, so it must be excluded here explicitly;
        # otherwise a valid dict spec would be treated as a single type.
        if not isinstance(output, (Sequence, str, dict)):
            if len(fr_map) != 1:
                raise FeatureToTupleError(
                    ds_name,
                    f"only one output type was specified, {len(fr_map)} expected",
                )
            if not isinstance(output, type):
                raise FeatureToTupleError(
                    ds_name,
                    f"output must specify a type while '{output}' was given",
                )

            key: str = next(iter(fr_map.keys()))
            output = {key: output}  # type: ignore[dict-item]

        # From here on only a dict is usable (``.values()`` is called below),
        # so anything else is rejected with a clear message.
        if not isinstance(output, dict):
            raise FeatureToTupleError(
                ds_name,
                "output type must be dict[str, FeatureType] while "
                f"'{type(output).__name__}' is given",
            )

        if len(output) != len(fr_map):
            raise FeatureToTupleError(
                ds_name,
                f"number of outputs '{len(output)}' should match"
                f" number of features '{len(fr_map)}'",
            )
    else:
        output = types_map

    output_types: list[type] = list(output.values())  # type: ignore[union-attr,arg-type]
    if len(output) > 1:
        tuple_type = tuple(output_types)
        res_type = tuple[tuple_type]  # type: ignore[valid-type]
        res_values = list(zip(*fr_map.values()))
    else:
        res_type = output_types[0]  # type: ignore[misc]
        res_values = next(iter(fr_map.values()))

    return res_type, output, res_values
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|