datachain 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release.


This version of datachain might be problematic.

Files changed (50)
  1. datachain/__init__.py +17 -8
  2. datachain/catalog/catalog.py +5 -5
  3. datachain/cli.py +0 -2
  4. datachain/data_storage/schema.py +5 -5
  5. datachain/data_storage/sqlite.py +1 -1
  6. datachain/data_storage/warehouse.py +7 -7
  7. datachain/lib/arrow.py +25 -8
  8. datachain/lib/clip.py +6 -11
  9. datachain/lib/convert/__init__.py +0 -0
  10. datachain/lib/convert/flatten.py +67 -0
  11. datachain/lib/convert/type_converter.py +96 -0
  12. datachain/lib/convert/unflatten.py +69 -0
  13. datachain/lib/convert/values_to_tuples.py +85 -0
  14. datachain/lib/data_model.py +74 -0
  15. datachain/lib/dc.py +192 -167
  16. datachain/lib/feature_registry.py +36 -10
  17. datachain/lib/file.py +41 -41
  18. datachain/lib/gpt4_vision.py +1 -9
  19. datachain/lib/hf_image_to_text.py +9 -17
  20. datachain/lib/hf_pipeline.py +4 -12
  21. datachain/lib/image.py +2 -18
  22. datachain/lib/image_transform.py +0 -1
  23. datachain/lib/iptc_exif_xmp.py +8 -15
  24. datachain/lib/meta_formats.py +1 -5
  25. datachain/lib/model_store.py +77 -0
  26. datachain/lib/pytorch.py +9 -21
  27. datachain/lib/signal_schema.py +120 -58
  28. datachain/lib/text.py +5 -16
  29. datachain/lib/udf.py +114 -30
  30. datachain/lib/udf_signature.py +5 -5
  31. datachain/lib/webdataset.py +3 -4
  32. datachain/lib/webdataset_laion.py +2 -3
  33. datachain/node.py +4 -4
  34. datachain/query/batch.py +1 -1
  35. datachain/query/dataset.py +40 -60
  36. datachain/query/dispatch.py +28 -17
  37. datachain/query/udf.py +46 -26
  38. datachain/remote/studio.py +1 -9
  39. datachain/torch/__init__.py +21 -0
  40. {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/METADATA +13 -12
  41. {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/RECORD +45 -42
  42. datachain/image/__init__.py +0 -3
  43. datachain/lib/cached_stream.py +0 -38
  44. datachain/lib/claude.py +0 -69
  45. datachain/lib/feature.py +0 -412
  46. datachain/lib/feature_utils.py +0 -154
  47. {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/LICENSE +0 -0
  48. {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/WHEEL +0 -0
  49. {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/entry_points.txt +0 -0
  50. {datachain-0.2.9.dist-info → datachain-0.2.10.dist-info}/top_level.txt +0 -0
datachain/lib/feature_utils.py (deleted)
@@ -1,154 +0,0 @@
-import inspect
-import string
-from collections.abc import Sequence
-from enum import Enum
-from typing import Any, Union, get_args, get_origin
-
-from pydantic import BaseModel, create_model
-
-from datachain.lib.feature import (
-    TYPE_TO_DATACHAIN,
-    Feature,
-    FeatureType,
-    FeatureTypeNames,
-    convert_type_to_datachain,
-)
-from datachain.lib.utils import DataChainParamsError
-
-AUTO_FEATURE_PREFIX = "_auto_fr"
-SUFFIX_SYMBOLS = string.digits + string.ascii_lowercase
-
-
-class FeatureToTupleError(DataChainParamsError):
-    def __init__(self, ds_name, msg):
-        if ds_name:
-            ds_name = f"' {ds_name}'"
-        super().__init__(f"Cannot convert features for dataset{ds_name}: {msg}")
-
-
-feature_cache: dict[type[BaseModel], type[Feature]] = {}
-
-
-def pydantic_to_feature(data_cls: type[BaseModel]) -> type[Feature]:
-    if data_cls in feature_cache:
-        return feature_cache[data_cls]
-
-    fields = {}
-    for name, field_info in data_cls.model_fields.items():
-        anno = field_info.annotation
-        if anno not in TYPE_TO_DATACHAIN:
-            anno = _to_feature_type(anno)
-        fields[name] = (anno, field_info.default)
-
-    cls = create_model(
-        data_cls.__name__,
-        __base__=(data_cls, Feature),  # type: ignore[call-overload]
-        **fields,
-    )
-    feature_cache[data_cls] = cls
-    return cls
-
-
-def _to_feature_type(anno):
-    if inspect.isclass(anno) and issubclass(anno, Enum):
-        return str
-
-    orig = get_origin(anno)
-    if orig is list:
-        anno = get_args(anno)  # type: ignore[assignment]
-        if isinstance(anno, Sequence):
-            anno = anno[0]  # type: ignore[unreachable]
-        is_list = True
-    else:
-        is_list = False
-
-    try:
-        convert_type_to_datachain(anno)
-    except TypeError:
-        if not Feature.is_feature(anno):  # type: ignore[arg-type]
-            orig = get_origin(anno)
-            if orig in TYPE_TO_DATACHAIN:
-                anno = _to_feature_type(anno)
-            else:
-                if orig == Union:
-                    args = get_args(anno)
-                    if len(args) == 2 and (type(None) in args):
-                        return _to_feature_type(args[0])
-
-                anno = pydantic_to_feature(anno)  # type: ignore[arg-type]
-    if is_list:
-        anno = list[anno]  # type: ignore[valid-type]
-    return anno
-
-
-def features_to_tuples(
-    ds_name: str = "",
-    output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
-    **fr_map,
-) -> tuple[Any, Any, Any]:
-    types_map = {}
-    length = -1
-    for k, v in fr_map.items():
-        if not isinstance(v, Sequence) or isinstance(v, str):
-            raise FeatureToTupleError(ds_name, f"features '{k}' is not a sequence")
-        len_ = len(v)
-
-        if len_ == 0:
-            raise FeatureToTupleError(ds_name, f"feature '{k}' is empty list")
-
-        if length < 0:
-            length = len_
-        elif length != len_:
-            raise FeatureToTupleError(
-                ds_name,
-                f"feature '{k}' should have length {length} while {len_} is given",
-            )
-        typ = type(v[0])
-        if not Feature.is_feature_type(typ):
-            raise FeatureToTupleError(
-                ds_name,
-                f"feature '{k}' has unsupported type '{typ.__name__}'."
-                f" Please use Feature types: {FeatureTypeNames}",
-            )
-        types_map[k] = typ
-    if output:
-        if not isinstance(output, Sequence) and not isinstance(output, str):
-            if len(fr_map) != 1:
-                raise FeatureToTupleError(
-                    ds_name,
-                    f"only one output type was specified, {len(fr_map)} expected",
-                )
-            if not isinstance(output, type):
-                raise FeatureToTupleError(
-                    ds_name,
-                    f"output must specify a type while '{output}' was given",
-                )
-
-            key: str = next(iter(fr_map.keys()))
-            output = {key: output}  # type: ignore[dict-item]
-
-        if len(output) != len(fr_map):
-            raise FeatureToTupleError(
-                ds_name,
-                f"number of outputs '{len(output)}' should match"
-                f" number of features '{len(fr_map)}'",
-            )
-        if isinstance(output, dict):
-            raise FeatureToTupleError(
-                ds_name,
-                "output type must be dict[str, FeatureType] while "
-                f"'{type(output).__name__}' is given",
-            )
-    else:
-        output = types_map
-
-    output_types: list[type] = list(output.values())  # type: ignore[union-attr,arg-type]
-    if len(output) > 1:
-        tuple_type = tuple(output_types)
-        res_type = tuple[tuple_type]  # type: ignore[valid-type]
-        res_values = list(zip(*fr_map.values()))
-    else:
-        res_type = output_types[0]  # type: ignore[misc]
-        res_values = next(iter(fr_map.values()))
-
-    return res_type, output, res_values
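For context on item 46 above: the deleted features_to_tuples helper converted keyword-mapped value sequences into a row type plus row tuples, and the file list suggests datachain/lib/convert/values_to_tuples.py takes over that role in 0.2.10. Below is a minimal usage sketch against the removed 0.2.9 code; the column names and values are illustrative and not taken from the package.

# Sketch only: runs against datachain 0.2.9, where datachain/lib/feature_utils.py
# still exists; the module is removed in 0.2.10.
from datachain.lib.feature_utils import features_to_tuples

# Two equally sized "columns" of plain Python values (assuming str and int
# count as supported feature types here).
res_type, output, res_values = features_to_tuples(
    ds_name="example",        # used only in error messages
    name=["a.txt", "b.txt"],  # illustrative column
    size=[1, 2],              # illustrative column
)

# With no explicit `output`, types are inferred from the first element of
# each sequence:
#   res_type   == tuple[str, int]
#   output     == {"name": str, "size": int}
#   res_values == [("a.txt", 1), ("b.txt", 2)]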