datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/func/array.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
|
-
from typing import Any, Union
|
|
2
|
+
from typing import Any
|
|
3
3
|
|
|
4
|
+
from datachain.query.schema import Column
|
|
4
5
|
from datachain.sql.functions import array
|
|
5
6
|
|
|
6
7
|
from .func import Func
|
|
7
8
|
|
|
8
9
|
|
|
9
|
-
def cosine_distance(*args: Union[str, Sequence]) -> Func:
|
|
10
|
+
def cosine_distance(*args: str | Column | Func | Sequence) -> Func:
|
|
10
11
|
"""
|
|
11
|
-
|
|
12
|
+
Returns the cosine distance between two vectors.
|
|
12
13
|
|
|
13
14
|
The cosine distance is derived from the cosine similarity, which measures the angle
|
|
14
15
|
between two vectors. This function returns the dissimilarity between the vectors,
|
|
@@ -16,29 +17,33 @@ def cosine_distance(*args: Union[str, Sequence]) -> Func:
|
|
|
16
17
|
indicate higher dissimilarity.
|
|
17
18
|
|
|
18
19
|
Args:
|
|
19
|
-
args (str | Sequence): Two vectors to compute the cosine
|
|
20
|
+
args (str | Column | Func | Sequence): Two vectors to compute the cosine
|
|
21
|
+
distance between.
|
|
20
22
|
If a string is provided, it is assumed to be the name of the column vector.
|
|
23
|
+
If a Column is provided, it is assumed to be an array column.
|
|
24
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
21
25
|
If a sequence is provided, it is assumed to be a vector of values.
|
|
22
26
|
|
|
23
27
|
Returns:
|
|
24
|
-
Func: A Func object that represents the cosine_distance function.
|
|
28
|
+
Func: A `Func` object that represents the cosine_distance function.
|
|
25
29
|
|
|
26
30
|
Example:
|
|
27
31
|
```py
|
|
28
32
|
target_embedding = [0.1, 0.2, 0.3]
|
|
29
33
|
dc.mutate(
|
|
30
34
|
cos_dist1=func.cosine_distance("embedding", target_embedding),
|
|
31
|
-
cos_dist2=func.cosine_distance(
|
|
35
|
+
cos_dist2=func.cosine_distance(dc.C("emb1"), "emb2"),
|
|
36
|
+
cos_dist3=func.cosine_distance(target_embedding, [0.4, 0.5, 0.6]),
|
|
32
37
|
)
|
|
33
38
|
```
|
|
34
39
|
|
|
35
40
|
Notes:
|
|
36
41
|
- Ensure both vectors have the same number of elements.
|
|
37
|
-
-
|
|
42
|
+
- The result column will always be of type float.
|
|
38
43
|
"""
|
|
39
44
|
cols, func_args = [], []
|
|
40
45
|
for arg in args:
|
|
41
|
-
if isinstance(arg, str):
|
|
46
|
+
if isinstance(arg, (str, Column, Func)):
|
|
42
47
|
cols.append(arg)
|
|
43
48
|
else:
|
|
44
49
|
func_args.append(list(arg))
|
|
@@ -57,37 +62,41 @@ def cosine_distance(*args: Union[str, Sequence]) -> Func:
|
|
|
57
62
|
)
|
|
58
63
|
|
|
59
64
|
|
|
60
|
-
def euclidean_distance(*args: Union[str, Sequence]) -> Func:
|
|
65
|
+
def euclidean_distance(*args: str | Column | Func | Sequence) -> Func:
|
|
61
66
|
"""
|
|
62
|
-
|
|
67
|
+
Returns the Euclidean distance between two vectors.
|
|
63
68
|
|
|
64
69
|
The Euclidean distance is the straight-line distance between two points
|
|
65
70
|
in Euclidean space. This function returns the distance between the two vectors.
|
|
66
71
|
|
|
67
72
|
Args:
|
|
68
|
-
args (str | Sequence): Two vectors to compute the Euclidean
|
|
73
|
+
args (str | Column | Func | Sequence): Two vectors to compute the Euclidean
|
|
74
|
+
distance between.
|
|
69
75
|
If a string is provided, it is assumed to be the name of the column vector.
|
|
76
|
+
If a Column is provided, it is assumed to be an array column.
|
|
77
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
70
78
|
If a sequence is provided, it is assumed to be a vector of values.
|
|
71
79
|
|
|
72
80
|
Returns:
|
|
73
|
-
Func: A Func object that represents the euclidean_distance function.
|
|
81
|
+
Func: A `Func` object that represents the euclidean_distance function.
|
|
74
82
|
|
|
75
83
|
Example:
|
|
76
84
|
```py
|
|
77
85
|
target_embedding = [0.1, 0.2, 0.3]
|
|
78
86
|
dc.mutate(
|
|
79
87
|
eu_dist1=func.euclidean_distance("embedding", target_embedding),
|
|
80
|
-
eu_dist2=func.euclidean_distance(
|
|
88
|
+
eu_dist2=func.euclidean_distance(dc.C("emb1"), "emb2"),
|
|
89
|
+
eu_dist3=func.euclidean_distance(target_embedding, [0.4, 0.5, 0.6]),
|
|
81
90
|
)
|
|
82
91
|
```
|
|
83
92
|
|
|
84
93
|
Notes:
|
|
85
94
|
- Ensure both vectors have the same number of elements.
|
|
86
|
-
-
|
|
95
|
+
- The result column will always be of type float.
|
|
87
96
|
"""
|
|
88
97
|
cols, func_args = [], []
|
|
89
98
|
for arg in args:
|
|
90
|
-
if isinstance(arg, str):
|
|
99
|
+
if isinstance(arg, (str, Column, Func)):
|
|
91
100
|
cols.append(arg)
|
|
92
101
|
else:
|
|
93
102
|
func_args.append(list(arg))
|
|
@@ -106,31 +115,33 @@ def euclidean_distance(*args: Union[str, Sequence]) -> Func:
|
|
|
106
115
|
)
|
|
107
116
|
|
|
108
117
|
|
|
109
|
-
def length(arg: Union[str, Sequence, Func]) -> Func:
|
|
118
|
+
def length(arg: str | Column | Func | Sequence) -> Func:
|
|
110
119
|
"""
|
|
111
120
|
Returns the length of the array.
|
|
112
121
|
|
|
113
122
|
Args:
|
|
114
|
-
arg (str |
|
|
123
|
+
arg (str | Column | Func | Sequence): Array to compute the length of.
|
|
115
124
|
If a string is provided, it is assumed to be the name of the array column.
|
|
116
|
-
If a
|
|
125
|
+
If a Column is provided, it is assumed to be an array column.
|
|
117
126
|
If a Func is provided, it is assumed to be a function returning an array.
|
|
127
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
118
128
|
|
|
119
129
|
Returns:
|
|
120
|
-
Func: A Func object that represents the array length function.
|
|
130
|
+
Func: A `Func` object that represents the array length function.
|
|
121
131
|
|
|
122
132
|
Example:
|
|
123
133
|
```py
|
|
124
134
|
dc.mutate(
|
|
125
135
|
len1=func.array.length("signal.values"),
|
|
126
|
-
len2=func.array.length(
|
|
136
|
+
len2=func.array.length(dc.C("signal.values")),
|
|
137
|
+
len3=func.array.length([1, 2, 3, 4, 5]),
|
|
127
138
|
)
|
|
128
139
|
```
|
|
129
140
|
|
|
130
|
-
|
|
131
|
-
-
|
|
141
|
+
Notes:
|
|
142
|
+
- The result column will always be of type int.
|
|
132
143
|
"""
|
|
133
|
-
if isinstance(arg, (str, Func)):
|
|
144
|
+
if isinstance(arg, (str, Column, Func)):
|
|
134
145
|
cols = [arg]
|
|
135
146
|
args = None
|
|
136
147
|
else:
|
|
@@ -140,35 +151,41 @@ def length(arg: Union[str, Sequence, Func]) -> Func:
|
|
|
140
151
|
return Func("length", inner=array.length, cols=cols, args=args, result_type=int)
|
|
141
152
|
|
|
142
153
|
|
|
143
|
-
def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
|
|
154
|
+
def contains(arr: str | Column | Func | Sequence, elem: Any) -> Func:
|
|
144
155
|
"""
|
|
145
|
-
Checks whether the
|
|
156
|
+
Checks whether the array contains the specified element.
|
|
146
157
|
|
|
147
158
|
Args:
|
|
148
|
-
arr (str |
|
|
159
|
+
arr (str | Column | Func | Sequence): Array to check for the element.
|
|
149
160
|
If a string is provided, it is assumed to be the name of the array column.
|
|
150
|
-
If a
|
|
161
|
+
If a Column is provided, it is assumed to be an array column.
|
|
151
162
|
If a Func is provided, it is assumed to be a function returning an array.
|
|
163
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
152
164
|
elem (Any): Element to check for in the array.
|
|
153
165
|
|
|
154
166
|
Returns:
|
|
155
|
-
Func: A Func object that represents the contains function. Result of the
|
|
156
|
-
function will be 1 if the element is present in the array,
|
|
167
|
+
Func: A `Func` object that represents the contains function. Result of the
|
|
168
|
+
function will be `1` if the element is present in the array,
|
|
169
|
+
and `0` otherwise.
|
|
157
170
|
|
|
158
171
|
Example:
|
|
159
172
|
```py
|
|
160
173
|
dc.mutate(
|
|
161
174
|
contains1=func.array.contains("signal.values", 3),
|
|
162
|
-
contains2=func.array.contains(
|
|
175
|
+
contains2=func.array.contains(dc.C("signal.values"), 7),
|
|
176
|
+
contains3=func.array.contains([1, 2, 3, 4, 5], 7),
|
|
163
177
|
)
|
|
164
178
|
```
|
|
179
|
+
|
|
180
|
+
Notes:
|
|
181
|
+
- The result column will always be of type int.
|
|
165
182
|
"""
|
|
166
183
|
|
|
167
184
|
def inner(arg):
|
|
168
185
|
is_json = type(elem) in [list, dict]
|
|
169
186
|
return array.contains(arg, elem, is_json)
|
|
170
187
|
|
|
171
|
-
if isinstance(arr, (str, Func)):
|
|
188
|
+
if isinstance(arr, (str, Column, Func)):
|
|
172
189
|
cols = [arr]
|
|
173
190
|
args = None
|
|
174
191
|
else:
|
|
@@ -178,31 +195,218 @@ def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
|
|
|
178
195
|
return Func("contains", inner=inner, cols=cols, args=args, result_type=int)
|
|
179
196
|
|
|
180
197
|
|
|
181
|
-
def
|
|
198
|
+
def slice(
|
|
199
|
+
arr: str | Column | Func | Sequence,
|
|
200
|
+
offset: int,
|
|
201
|
+
length: int | None = None,
|
|
202
|
+
) -> Func:
|
|
182
203
|
"""
|
|
183
|
-
|
|
204
|
+
Returns a slice of the array starting from the specified offset.
|
|
184
205
|
|
|
185
206
|
Args:
|
|
186
|
-
|
|
207
|
+
arr (str | Column | Func | Sequence): Array to slice.
|
|
187
208
|
If a string is provided, it is assumed to be the name of the array column.
|
|
209
|
+
If a Column is provided, it is assumed to be an array column.
|
|
210
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
211
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
212
|
+
offset (int): Starting position of the slice (0-based).
|
|
213
|
+
length (int, optional): Number of elements to include in the slice.
|
|
214
|
+
If not provided, returns all elements from offset to the end.
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
Func: A `Func` object that represents the slice function.
|
|
218
|
+
|
|
219
|
+
Example:
|
|
220
|
+
```py
|
|
221
|
+
dc.mutate(
|
|
222
|
+
slice1=func.array.slice("signal.values", 1, 3),
|
|
223
|
+
slice2=func.array.slice(dc.C("signal.values"), 2),
|
|
224
|
+
slice3=func.array.slice([1, 2, 3, 4, 5], 1, 2),
|
|
225
|
+
)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Notes:
|
|
229
|
+
- The result column will be of type array with the same element type
|
|
230
|
+
as the input.
|
|
231
|
+
"""
|
|
232
|
+
|
|
233
|
+
def inner(arg):
|
|
234
|
+
if length is not None:
|
|
235
|
+
return array.slice(arg, offset, length)
|
|
236
|
+
return array.slice(arg, offset)
|
|
237
|
+
|
|
238
|
+
def element_type(el):
|
|
239
|
+
if isinstance(el, list):
|
|
240
|
+
try:
|
|
241
|
+
return list[element_type(el[0])]
|
|
242
|
+
except IndexError:
|
|
243
|
+
# if the array is empty, return list[str] as default type
|
|
244
|
+
return list[str]
|
|
245
|
+
return type(el)
|
|
246
|
+
|
|
247
|
+
def type_from_args(arr, *_):
|
|
248
|
+
if isinstance(arr, list):
|
|
249
|
+
try:
|
|
250
|
+
return list[element_type(arr[0])]
|
|
251
|
+
except IndexError:
|
|
252
|
+
pass
|
|
253
|
+
# if not an array or array is empty, return list[str] as default type
|
|
254
|
+
return list[str]
|
|
255
|
+
|
|
256
|
+
if isinstance(arr, (str, Column, Func)):
|
|
257
|
+
cols = [arr]
|
|
258
|
+
args = None
|
|
259
|
+
else:
|
|
260
|
+
cols = None
|
|
261
|
+
args = [arr]
|
|
262
|
+
|
|
263
|
+
return Func(
|
|
264
|
+
"slice",
|
|
265
|
+
inner=inner,
|
|
266
|
+
cols=cols,
|
|
267
|
+
args=args,
|
|
268
|
+
from_array=True,
|
|
269
|
+
is_array=True,
|
|
270
|
+
type_from_args=type_from_args,
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def join(
|
|
275
|
+
arr: str | Column | Func | Sequence,
|
|
276
|
+
sep: str = "",
|
|
277
|
+
) -> Func:
|
|
278
|
+
"""
|
|
279
|
+
Returns a string that is the concatenation of the elements of the array.
|
|
280
|
+
|
|
281
|
+
Args:
|
|
282
|
+
arr (str | Column | Func | Sequence): Array to join.
|
|
283
|
+
If a string is provided, it is assumed to be the name of the array column.
|
|
284
|
+
If a Column is provided, it is assumed to be an array column.
|
|
285
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
286
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
287
|
+
sep (str): Separator to use for the concatenation. Default is an empty string.
|
|
288
|
+
|
|
289
|
+
Returns:
|
|
290
|
+
Func: A `Func` object that represents the join function.
|
|
291
|
+
|
|
292
|
+
Example:
|
|
293
|
+
```py
|
|
294
|
+
dc.mutate(
|
|
295
|
+
join1=func.array.join("signal.values", ":"),
|
|
296
|
+
join2=func.array.join(dc.C("signal.values"), ","),
|
|
297
|
+
join3=func.array.join(["1", "2", "3", "4", "5"], "/"),
|
|
298
|
+
)
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
Notes:
|
|
302
|
+
- The result column will always be of type string.
|
|
303
|
+
"""
|
|
304
|
+
|
|
305
|
+
def inner(arg):
|
|
306
|
+
return array.join(arg, sep)
|
|
307
|
+
|
|
308
|
+
if isinstance(arr, (str, Column, Func)):
|
|
309
|
+
cols = [arr]
|
|
310
|
+
args = None
|
|
311
|
+
else:
|
|
312
|
+
cols = None
|
|
313
|
+
args = [arr]
|
|
314
|
+
|
|
315
|
+
return Func(
|
|
316
|
+
"join",
|
|
317
|
+
inner=inner,
|
|
318
|
+
cols=cols,
|
|
319
|
+
args=args,
|
|
320
|
+
from_array=True,
|
|
321
|
+
result_type=str,
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def get_element(arg: str | Column | Func | Sequence, index: int) -> Func:
|
|
326
|
+
"""
|
|
327
|
+
Returns the element at the given index from the array.
|
|
328
|
+
If the index is out of bounds, it returns None or columns default value.
|
|
329
|
+
|
|
330
|
+
Args:
|
|
331
|
+
arg (str | Column | Func | Sequence): Array to get the element from.
|
|
332
|
+
If a string is provided, it is assumed to be the name of the array column.
|
|
333
|
+
If a Column is provided, it is assumed to be an array column.
|
|
334
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
335
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
336
|
+
index (int): Index of the element to get from the array.
|
|
337
|
+
|
|
338
|
+
Returns:
|
|
339
|
+
Func: A `Func` object that represents the array get_element function.
|
|
340
|
+
|
|
341
|
+
Example:
|
|
342
|
+
```py
|
|
343
|
+
dc.mutate(
|
|
344
|
+
first_el=func.array.get_element("signal.values", 0),
|
|
345
|
+
second_el=func.array.get_element(dc.C("signal.values"), 1),
|
|
346
|
+
third_el=func.array.get_element([1, 2, 3, 4, 5], 2),
|
|
347
|
+
)
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
Notes:
|
|
351
|
+
- The result column will always be the same type as the elements of the array.
|
|
352
|
+
"""
|
|
353
|
+
|
|
354
|
+
def type_from_args(arr, _):
|
|
355
|
+
if isinstance(arr, list):
|
|
356
|
+
try:
|
|
357
|
+
return type(arr[0])
|
|
358
|
+
except IndexError:
|
|
359
|
+
return str # if the array is empty, return str as default type
|
|
360
|
+
return None
|
|
361
|
+
|
|
362
|
+
cols: str | Column | Func | Sequence | None
|
|
363
|
+
args: str | Column | Func | Sequence | int
|
|
364
|
+
|
|
365
|
+
if isinstance(arg, (str, Column, Func)):
|
|
366
|
+
cols = [arg]
|
|
367
|
+
args = [index]
|
|
368
|
+
else:
|
|
369
|
+
cols = None
|
|
370
|
+
args = [arg, index]
|
|
371
|
+
|
|
372
|
+
return Func(
|
|
373
|
+
"get_element",
|
|
374
|
+
inner=array.get_element,
|
|
375
|
+
cols=cols,
|
|
376
|
+
args=args,
|
|
377
|
+
from_array=True,
|
|
378
|
+
type_from_args=type_from_args,
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def sip_hash_64(arg: str | Column | Func | Sequence) -> Func:
|
|
383
|
+
"""
|
|
384
|
+
Returns the SipHash-64 hash of the array.
|
|
385
|
+
|
|
386
|
+
Args:
|
|
387
|
+
arg (str | Column | Func | Sequence): Array to compute the SipHash-64 hash of.
|
|
388
|
+
If a string is provided, it is assumed to be the name of the array column.
|
|
389
|
+
If a Column is provided, it is assumed to be an array column.
|
|
390
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
188
391
|
If a sequence is provided, it is assumed to be an array of values.
|
|
189
392
|
|
|
190
393
|
Returns:
|
|
191
|
-
Func: A Func object that represents the sip_hash_64 function.
|
|
394
|
+
Func: A `Func` object that represents the sip_hash_64 function.
|
|
192
395
|
|
|
193
396
|
Example:
|
|
194
397
|
```py
|
|
195
398
|
dc.mutate(
|
|
196
399
|
hash1=func.sip_hash_64("signal.values"),
|
|
197
|
-
hash2=func.sip_hash_64(
|
|
400
|
+
hash2=func.sip_hash_64(dc.C("signal.values")),
|
|
401
|
+
hash3=func.sip_hash_64([1, 2, 3, 4, 5]),
|
|
198
402
|
)
|
|
199
403
|
```
|
|
200
404
|
|
|
201
405
|
Note:
|
|
202
406
|
- This function is only available for the ClickHouse warehouse.
|
|
203
|
-
-
|
|
407
|
+
- The result column will always be of type int.
|
|
204
408
|
"""
|
|
205
|
-
if isinstance(arg, str):
|
|
409
|
+
if isinstance(arg, (str, Column, Func)):
|
|
206
410
|
cols = [arg]
|
|
207
411
|
args = None
|
|
208
412
|
else:
|
datachain/func/base.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from abc import ABCMeta, abstractmethod
|
|
2
|
-
from
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
3
4
|
|
|
4
5
|
if TYPE_CHECKING:
|
|
5
6
|
from sqlalchemy import TableClause
|
|
@@ -12,12 +13,14 @@ class Function:
|
|
|
12
13
|
__metaclass__ = ABCMeta
|
|
13
14
|
|
|
14
15
|
name: str
|
|
16
|
+
cols: Sequence
|
|
17
|
+
args: Sequence
|
|
15
18
|
|
|
16
19
|
@abstractmethod
|
|
17
20
|
def get_column(
|
|
18
21
|
self,
|
|
19
|
-
signals_schema:
|
|
20
|
-
label:
|
|
21
|
-
table:
|
|
22
|
+
signals_schema: "SignalSchema | None" = None,
|
|
23
|
+
label: str | None = None,
|
|
24
|
+
table: "TableClause | None" = None,
|
|
22
25
|
) -> "Column":
|
|
23
26
|
pass
|