datachain 0.18.4__py3-none-any.whl → 0.18.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +2 -10
- datachain/client/azure.py +1 -1
- datachain/client/gcs.py +1 -1
- datachain/client/s3.py +5 -3
- datachain/data_storage/metastore.py +87 -42
- datachain/func/aggregate.py +64 -38
- datachain/func/array.py +102 -73
- datachain/func/conditional.py +71 -51
- datachain/func/func.py +1 -1
- datachain/func/numeric.py +55 -36
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +59 -37
- datachain/func/window.py +7 -8
- datachain/lib/dc/datachain.py +9 -0
- datachain/model/ultralytics/bbox.py +6 -4
- datachain/model/ultralytics/pose.py +6 -4
- datachain/model/ultralytics/segment.py +6 -4
- {datachain-0.18.4.dist-info → datachain-0.18.5.dist-info}/METADATA +3 -3
- {datachain-0.18.4.dist-info → datachain-0.18.5.dist-info}/RECORD +24 -24
- {datachain-0.18.4.dist-info → datachain-0.18.5.dist-info}/WHEEL +1 -1
- {datachain-0.18.4.dist-info → datachain-0.18.5.dist-info}/entry_points.txt +0 -0
- {datachain-0.18.4.dist-info → datachain-0.18.5.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.18.4.dist-info → datachain-0.18.5.dist-info}/top_level.txt +0 -0
datachain/func/array.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
2
|
from typing import Any, Optional, Union
|
|
3
3
|
|
|
4
|
+
from datachain.query.schema import Column
|
|
4
5
|
from datachain.sql.functions import array
|
|
5
6
|
|
|
6
7
|
from .func import Func
|
|
7
8
|
|
|
8
9
|
|
|
9
|
-
def cosine_distance(*args: Union[str, Sequence]) -> Func:
|
|
10
|
+
def cosine_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
|
|
10
11
|
"""
|
|
11
|
-
|
|
12
|
+
Returns the cosine distance between two vectors.
|
|
12
13
|
|
|
13
14
|
The cosine distance is derived from the cosine similarity, which measures the angle
|
|
14
15
|
between two vectors. This function returns the dissimilarity between the vectors,
|
|
@@ -16,29 +17,33 @@ def cosine_distance(*args: Union[str, Sequence]) -> Func:
|
|
|
16
17
|
indicate higher dissimilarity.
|
|
17
18
|
|
|
18
19
|
Args:
|
|
19
|
-
args (str | Sequence): Two vectors to compute the cosine
|
|
20
|
+
args (str | Column | Func | Sequence): Two vectors to compute the cosine
|
|
21
|
+
distance between.
|
|
20
22
|
If a string is provided, it is assumed to be the name of the column vector.
|
|
23
|
+
If a Column is provided, it is assumed to be an array column.
|
|
24
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
21
25
|
If a sequence is provided, it is assumed to be a vector of values.
|
|
22
26
|
|
|
23
27
|
Returns:
|
|
24
|
-
Func: A Func object that represents the cosine_distance function.
|
|
28
|
+
Func: A `Func` object that represents the cosine_distance function.
|
|
25
29
|
|
|
26
30
|
Example:
|
|
27
31
|
```py
|
|
28
32
|
target_embedding = [0.1, 0.2, 0.3]
|
|
29
33
|
dc.mutate(
|
|
30
34
|
cos_dist1=func.cosine_distance("embedding", target_embedding),
|
|
31
|
-
cos_dist2=func.cosine_distance(
|
|
35
|
+
cos_dist2=func.cosine_distance(dc.C("emb1"), "emb2"),
|
|
36
|
+
cos_dist3=func.cosine_distance(target_embedding, [0.4, 0.5, 0.6]),
|
|
32
37
|
)
|
|
33
38
|
```
|
|
34
39
|
|
|
35
40
|
Notes:
|
|
36
41
|
- Ensure both vectors have the same number of elements.
|
|
37
|
-
-
|
|
42
|
+
- The result column will always be of type float.
|
|
38
43
|
"""
|
|
39
44
|
cols, func_args = [], []
|
|
40
45
|
for arg in args:
|
|
41
|
-
if isinstance(arg, str):
|
|
46
|
+
if isinstance(arg, (str, Column, Func)):
|
|
42
47
|
cols.append(arg)
|
|
43
48
|
else:
|
|
44
49
|
func_args.append(list(arg))
|
|
@@ -57,37 +62,41 @@ def cosine_distance(*args: Union[str, Sequence]) -> Func:
|
|
|
57
62
|
)
|
|
58
63
|
|
|
59
64
|
|
|
60
|
-
def euclidean_distance(*args: Union[str, Sequence]) -> Func:
|
|
65
|
+
def euclidean_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
|
|
61
66
|
"""
|
|
62
|
-
|
|
67
|
+
Returns the Euclidean distance between two vectors.
|
|
63
68
|
|
|
64
69
|
The Euclidean distance is the straight-line distance between two points
|
|
65
70
|
in Euclidean space. This function returns the distance between the two vectors.
|
|
66
71
|
|
|
67
72
|
Args:
|
|
68
|
-
args (str | Sequence): Two vectors to compute the Euclidean
|
|
73
|
+
args (str | Column | Func | Sequence): Two vectors to compute the Euclidean
|
|
74
|
+
distance between.
|
|
69
75
|
If a string is provided, it is assumed to be the name of the column vector.
|
|
76
|
+
If a Column is provided, it is assumed to be an array column.
|
|
77
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
70
78
|
If a sequence is provided, it is assumed to be a vector of values.
|
|
71
79
|
|
|
72
80
|
Returns:
|
|
73
|
-
Func: A Func object that represents the euclidean_distance function.
|
|
81
|
+
Func: A `Func` object that represents the euclidean_distance function.
|
|
74
82
|
|
|
75
83
|
Example:
|
|
76
84
|
```py
|
|
77
85
|
target_embedding = [0.1, 0.2, 0.3]
|
|
78
86
|
dc.mutate(
|
|
79
87
|
eu_dist1=func.euclidean_distance("embedding", target_embedding),
|
|
80
|
-
eu_dist2=func.euclidean_distance(
|
|
88
|
+
eu_dist2=func.euclidean_distance(dc.C("emb1"), "emb2"),
|
|
89
|
+
eu_dist3=func.euclidean_distance(target_embedding, [0.4, 0.5, 0.6]),
|
|
81
90
|
)
|
|
82
91
|
```
|
|
83
92
|
|
|
84
93
|
Notes:
|
|
85
94
|
- Ensure both vectors have the same number of elements.
|
|
86
|
-
-
|
|
95
|
+
- The result column will always be of type float.
|
|
87
96
|
"""
|
|
88
97
|
cols, func_args = [], []
|
|
89
98
|
for arg in args:
|
|
90
|
-
if isinstance(arg, str):
|
|
99
|
+
if isinstance(arg, (str, Column, Func)):
|
|
91
100
|
cols.append(arg)
|
|
92
101
|
else:
|
|
93
102
|
func_args.append(list(arg))
|
|
@@ -106,31 +115,33 @@ def euclidean_distance(*args: Union[str, Sequence]) -> Func:
|
|
|
106
115
|
)
|
|
107
116
|
|
|
108
117
|
|
|
109
|
-
def length(arg: Union[str,
|
|
118
|
+
def length(arg: Union[str, Column, Func, Sequence]) -> Func:
|
|
110
119
|
"""
|
|
111
120
|
Returns the length of the array.
|
|
112
121
|
|
|
113
122
|
Args:
|
|
114
|
-
arg (str |
|
|
123
|
+
arg (str | Column | Func | Sequence): Array to compute the length of.
|
|
115
124
|
If a string is provided, it is assumed to be the name of the array column.
|
|
116
|
-
If a
|
|
125
|
+
If a Column is provided, it is assumed to be an array column.
|
|
117
126
|
If a Func is provided, it is assumed to be a function returning an array.
|
|
127
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
118
128
|
|
|
119
129
|
Returns:
|
|
120
|
-
Func: A Func object that represents the array length function.
|
|
130
|
+
Func: A `Func` object that represents the array length function.
|
|
121
131
|
|
|
122
132
|
Example:
|
|
123
133
|
```py
|
|
124
134
|
dc.mutate(
|
|
125
135
|
len1=func.array.length("signal.values"),
|
|
126
|
-
len2=func.array.length(
|
|
136
|
+
len2=func.array.length(dc.C("signal.values")),
|
|
137
|
+
len3=func.array.length([1, 2, 3, 4, 5]),
|
|
127
138
|
)
|
|
128
139
|
```
|
|
129
140
|
|
|
130
|
-
|
|
131
|
-
-
|
|
141
|
+
Notes:
|
|
142
|
+
- The result column will always be of type int.
|
|
132
143
|
"""
|
|
133
|
-
if isinstance(arg, (str, Func)):
|
|
144
|
+
if isinstance(arg, (str, Column, Func)):
|
|
134
145
|
cols = [arg]
|
|
135
146
|
args = None
|
|
136
147
|
else:
|
|
@@ -140,35 +151,41 @@ def length(arg: Union[str, Sequence, Func]) -> Func:
|
|
|
140
151
|
return Func("length", inner=array.length, cols=cols, args=args, result_type=int)
|
|
141
152
|
|
|
142
153
|
|
|
143
|
-
def contains(arr: Union[str,
|
|
154
|
+
def contains(arr: Union[str, Column, Func, Sequence], elem: Any) -> Func:
|
|
144
155
|
"""
|
|
145
|
-
Checks whether the
|
|
156
|
+
Checks whether the array contains the specified element.
|
|
146
157
|
|
|
147
158
|
Args:
|
|
148
|
-
arr (str |
|
|
159
|
+
arr (str | Column | Func | Sequence): Array to check for the element.
|
|
149
160
|
If a string is provided, it is assumed to be the name of the array column.
|
|
150
|
-
If a
|
|
161
|
+
If a Column is provided, it is assumed to be an array column.
|
|
151
162
|
If a Func is provided, it is assumed to be a function returning an array.
|
|
163
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
152
164
|
elem (Any): Element to check for in the array.
|
|
153
165
|
|
|
154
166
|
Returns:
|
|
155
|
-
Func: A Func object that represents the contains function. Result of the
|
|
156
|
-
function will be 1 if the element is present in the array,
|
|
167
|
+
Func: A `Func` object that represents the contains function. Result of the
|
|
168
|
+
function will be `1` if the element is present in the array,
|
|
169
|
+
and `0` otherwise.
|
|
157
170
|
|
|
158
171
|
Example:
|
|
159
172
|
```py
|
|
160
173
|
dc.mutate(
|
|
161
174
|
contains1=func.array.contains("signal.values", 3),
|
|
162
|
-
contains2=func.array.contains(
|
|
175
|
+
contains2=func.array.contains(dc.C("signal.values"), 7),
|
|
176
|
+
contains3=func.array.contains([1, 2, 3, 4, 5], 7),
|
|
163
177
|
)
|
|
164
178
|
```
|
|
179
|
+
|
|
180
|
+
Notes:
|
|
181
|
+
- The result column will always be of type int.
|
|
165
182
|
"""
|
|
166
183
|
|
|
167
184
|
def inner(arg):
|
|
168
185
|
is_json = type(elem) in [list, dict]
|
|
169
186
|
return array.contains(arg, elem, is_json)
|
|
170
187
|
|
|
171
|
-
if isinstance(arr, (str, Func)):
|
|
188
|
+
if isinstance(arr, (str, Column, Func)):
|
|
172
189
|
cols = [arr]
|
|
173
190
|
args = None
|
|
174
191
|
else:
|
|
@@ -179,34 +196,38 @@ def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
|
|
|
179
196
|
|
|
180
197
|
|
|
181
198
|
def slice(
|
|
182
|
-
arr: Union[str,
|
|
199
|
+
arr: Union[str, Column, Func, Sequence],
|
|
183
200
|
offset: int,
|
|
184
201
|
length: Optional[int] = None,
|
|
185
202
|
) -> Func:
|
|
186
203
|
"""
|
|
187
|
-
Returns a slice of the array.
|
|
204
|
+
Returns a slice of the array starting from the specified offset.
|
|
188
205
|
|
|
189
206
|
Args:
|
|
190
|
-
arr (str |
|
|
207
|
+
arr (str | Column | Func | Sequence): Array to slice.
|
|
191
208
|
If a string is provided, it is assumed to be the name of the array column.
|
|
192
|
-
If a
|
|
209
|
+
If a Column is provided, it is assumed to be an array column.
|
|
193
210
|
If a Func is provided, it is assumed to be a function returning an array.
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
211
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
212
|
+
offset (int): Starting position of the slice (0-based).
|
|
213
|
+
length (int, optional): Number of elements to include in the slice.
|
|
214
|
+
If not provided, returns all elements from offset to the end.
|
|
197
215
|
|
|
198
216
|
Returns:
|
|
199
|
-
Func: A Func object that represents the slice function.
|
|
200
|
-
function will be a slice of the array starting from the offset
|
|
201
|
-
and with the given length.
|
|
217
|
+
Func: A `Func` object that represents the slice function.
|
|
202
218
|
|
|
203
219
|
Example:
|
|
204
220
|
```py
|
|
205
221
|
dc.mutate(
|
|
206
|
-
|
|
207
|
-
|
|
222
|
+
slice1=func.array.slice("signal.values", 1, 3),
|
|
223
|
+
slice2=func.array.slice(dc.C("signal.values"), 2),
|
|
224
|
+
slice3=func.array.slice([1, 2, 3, 4, 5], 1, 2),
|
|
208
225
|
)
|
|
209
226
|
```
|
|
227
|
+
|
|
228
|
+
Notes:
|
|
229
|
+
- The result column will be of type array with the same element type
|
|
230
|
+
as the input.
|
|
210
231
|
"""
|
|
211
232
|
|
|
212
233
|
def inner(arg):
|
|
@@ -228,11 +249,11 @@ def slice(
|
|
|
228
249
|
try:
|
|
229
250
|
return list[element_type(arr[0])]
|
|
230
251
|
except IndexError:
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
return
|
|
252
|
+
pass
|
|
253
|
+
# if not an array or array is empty, return list[str] as default type
|
|
254
|
+
return list[str]
|
|
234
255
|
|
|
235
|
-
if isinstance(arr, (str, Func)):
|
|
256
|
+
if isinstance(arr, (str, Column, Func)):
|
|
236
257
|
cols = [arr]
|
|
237
258
|
args = None
|
|
238
259
|
else:
|
|
@@ -251,37 +272,40 @@ def slice(
|
|
|
251
272
|
|
|
252
273
|
|
|
253
274
|
def join(
|
|
254
|
-
arr: Union[str,
|
|
275
|
+
arr: Union[str, Column, Func, Sequence],
|
|
255
276
|
sep: str = "",
|
|
256
277
|
) -> Func:
|
|
257
278
|
"""
|
|
258
|
-
Returns a string that is the concatenation of the elements of the array
|
|
279
|
+
Returns a string that is the concatenation of the elements of the array.
|
|
259
280
|
|
|
260
281
|
Args:
|
|
261
|
-
arr (str |
|
|
282
|
+
arr (str | Column | Func | Sequence): Array to join.
|
|
262
283
|
If a string is provided, it is assumed to be the name of the array column.
|
|
263
|
-
If a
|
|
284
|
+
If a Column is provided, it is assumed to be an array column.
|
|
264
285
|
If a Func is provided, it is assumed to be a function returning an array.
|
|
286
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
265
287
|
sep (str): Separator to use for the concatenation. Default is an empty string.
|
|
266
288
|
|
|
267
289
|
Returns:
|
|
268
|
-
Func: A Func object that represents the join function.
|
|
269
|
-
function will be a string that is the concatenation of the elements
|
|
270
|
-
of the array, separated by the given separator.
|
|
290
|
+
Func: A `Func` object that represents the join function.
|
|
271
291
|
|
|
272
292
|
Example:
|
|
273
293
|
```py
|
|
274
294
|
dc.mutate(
|
|
275
|
-
|
|
276
|
-
|
|
295
|
+
join1=func.array.join("signal.values", ":"),
|
|
296
|
+
join2=func.array.join(dc.C("signal.values"), ","),
|
|
297
|
+
join3=func.array.join(["1", "2", "3", "4", "5"], "/"),
|
|
277
298
|
)
|
|
278
299
|
```
|
|
300
|
+
|
|
301
|
+
Notes:
|
|
302
|
+
- The result column will always be of type string.
|
|
279
303
|
"""
|
|
280
304
|
|
|
281
305
|
def inner(arg):
|
|
282
306
|
return array.join(arg, sep)
|
|
283
307
|
|
|
284
|
-
if isinstance(arr, (str, Func)):
|
|
308
|
+
if isinstance(arr, (str, Column, Func)):
|
|
285
309
|
cols = [arr]
|
|
286
310
|
args = None
|
|
287
311
|
else:
|
|
@@ -298,31 +322,33 @@ def join(
|
|
|
298
322
|
)
|
|
299
323
|
|
|
300
324
|
|
|
301
|
-
def get_element(arg: Union[str,
|
|
325
|
+
def get_element(arg: Union[str, Column, Func, Sequence], index: int) -> Func:
|
|
302
326
|
"""
|
|
303
327
|
Returns the element at the given index from the array.
|
|
304
328
|
If the index is out of bounds, it returns None or columns default value.
|
|
305
329
|
|
|
306
330
|
Args:
|
|
307
|
-
arg (str |
|
|
331
|
+
arg (str | Column | Func | Sequence): Array to get the element from.
|
|
308
332
|
If a string is provided, it is assumed to be the name of the array column.
|
|
309
|
-
If a
|
|
333
|
+
If a Column is provided, it is assumed to be an array column.
|
|
310
334
|
If a Func is provided, it is assumed to be a function returning an array.
|
|
335
|
+
If a sequence is provided, it is assumed to be an array of values.
|
|
311
336
|
index (int): Index of the element to get from the array.
|
|
312
337
|
|
|
313
338
|
Returns:
|
|
314
|
-
Func: A Func object that represents the array get_element function.
|
|
339
|
+
Func: A `Func` object that represents the array get_element function.
|
|
315
340
|
|
|
316
341
|
Example:
|
|
317
342
|
```py
|
|
318
343
|
dc.mutate(
|
|
319
344
|
first_el=func.array.get_element("signal.values", 0),
|
|
320
|
-
second_el=func.array.get_element(
|
|
345
|
+
second_el=func.array.get_element(dc.C("signal.values"), 1),
|
|
346
|
+
third_el=func.array.get_element([1, 2, 3, 4, 5], 2),
|
|
321
347
|
)
|
|
322
348
|
```
|
|
323
349
|
|
|
324
|
-
|
|
325
|
-
-
|
|
350
|
+
Notes:
|
|
351
|
+
- The result column will always be the same type as the elements of the array.
|
|
326
352
|
"""
|
|
327
353
|
|
|
328
354
|
def type_from_args(arr, _):
|
|
@@ -333,10 +359,10 @@ def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
|
|
|
333
359
|
return str # if the array is empty, return str as default type
|
|
334
360
|
return None
|
|
335
361
|
|
|
336
|
-
cols: Optional[Union[str,
|
|
337
|
-
args: Union[str,
|
|
362
|
+
cols: Optional[Union[str, Column, Func, Sequence]]
|
|
363
|
+
args: Union[str, Column, Func, Sequence, int]
|
|
338
364
|
|
|
339
|
-
if isinstance(arg, (str, Func)):
|
|
365
|
+
if isinstance(arg, (str, Column, Func)):
|
|
340
366
|
cols = [arg]
|
|
341
367
|
args = [index]
|
|
342
368
|
else:
|
|
@@ -353,31 +379,34 @@ def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
|
|
|
353
379
|
)
|
|
354
380
|
|
|
355
381
|
|
|
356
|
-
def sip_hash_64(arg: Union[str, Sequence]) -> Func:
|
|
382
|
+
def sip_hash_64(arg: Union[str, Column, Func, Sequence]) -> Func:
|
|
357
383
|
"""
|
|
358
|
-
|
|
384
|
+
Returns the SipHash-64 hash of the array.
|
|
359
385
|
|
|
360
386
|
Args:
|
|
361
|
-
arg (str | Sequence): Array to compute the SipHash-64 hash of.
|
|
387
|
+
arg (str | Column | Func | Sequence): Array to compute the SipHash-64 hash of.
|
|
362
388
|
If a string is provided, it is assumed to be the name of the array column.
|
|
389
|
+
If a Column is provided, it is assumed to be an array column.
|
|
390
|
+
If a Func is provided, it is assumed to be a function returning an array.
|
|
363
391
|
If a sequence is provided, it is assumed to be an array of values.
|
|
364
392
|
|
|
365
393
|
Returns:
|
|
366
|
-
Func: A Func object that represents the sip_hash_64 function.
|
|
394
|
+
Func: A `Func` object that represents the sip_hash_64 function.
|
|
367
395
|
|
|
368
396
|
Example:
|
|
369
397
|
```py
|
|
370
398
|
dc.mutate(
|
|
371
399
|
hash1=func.sip_hash_64("signal.values"),
|
|
372
|
-
hash2=func.sip_hash_64(
|
|
400
|
+
hash2=func.sip_hash_64(dc.C("signal.values")),
|
|
401
|
+
hash3=func.sip_hash_64([1, 2, 3, 4, 5]),
|
|
373
402
|
)
|
|
374
403
|
```
|
|
375
404
|
|
|
376
405
|
Note:
|
|
377
406
|
- This function is only available for the ClickHouse warehouse.
|
|
378
|
-
-
|
|
407
|
+
- The result column will always be of type int.
|
|
379
408
|
"""
|
|
380
|
-
if isinstance(arg, str):
|
|
409
|
+
if isinstance(arg, (str, Column, Func)):
|
|
381
410
|
cols = [arg]
|
|
382
411
|
args = None
|
|
383
412
|
else:
|