datachain 0.18.4__py3-none-any.whl → 0.18.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/func/array.py CHANGED
@@ -1,14 +1,15 @@
1
1
  from collections.abc import Sequence
2
2
  from typing import Any, Optional, Union
3
3
 
4
+ from datachain.query.schema import Column
4
5
  from datachain.sql.functions import array
5
6
 
6
7
  from .func import Func
7
8
 
8
9
 
9
- def cosine_distance(*args: Union[str, Sequence]) -> Func:
10
+ def cosine_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
10
11
  """
11
- Computes the cosine distance between two vectors.
12
+ Returns the cosine distance between two vectors.
12
13
 
13
14
  The cosine distance is derived from the cosine similarity, which measures the angle
14
15
  between two vectors. This function returns the dissimilarity between the vectors,
@@ -16,29 +17,33 @@ def cosine_distance(*args: Union[str, Sequence]) -> Func:
16
17
  indicate higher dissimilarity.
17
18
 
18
19
  Args:
19
- args (str | Sequence): Two vectors to compute the cosine distance between.
20
+ args (str | Column | Func | Sequence): Two vectors to compute the cosine
21
+ distance between.
20
22
  If a string is provided, it is assumed to be the name of the column vector.
23
+ If a Column is provided, it is assumed to be an array column.
24
+ If a Func is provided, it is assumed to be a function returning an array.
21
25
  If a sequence is provided, it is assumed to be a vector of values.
22
26
 
23
27
  Returns:
24
- Func: A Func object that represents the cosine_distance function.
28
+ Func: A `Func` object that represents the cosine_distance function.
25
29
 
26
30
  Example:
27
31
  ```py
28
32
  target_embedding = [0.1, 0.2, 0.3]
29
33
  dc.mutate(
30
34
  cos_dist1=func.cosine_distance("embedding", target_embedding),
31
- cos_dist2=func.cosine_distance(target_embedding, [0.4, 0.5, 0.6]),
35
+ cos_dist2=func.cosine_distance(dc.C("emb1"), "emb2"),
36
+ cos_dist3=func.cosine_distance(target_embedding, [0.4, 0.5, 0.6]),
32
37
  )
33
38
  ```
34
39
 
35
40
  Notes:
36
41
  - Ensure both vectors have the same number of elements.
37
- - Result column will always be of type float.
42
+ - The result column will always be of type float.
38
43
  """
39
44
  cols, func_args = [], []
40
45
  for arg in args:
41
- if isinstance(arg, str):
46
+ if isinstance(arg, (str, Column, Func)):
42
47
  cols.append(arg)
43
48
  else:
44
49
  func_args.append(list(arg))
@@ -57,37 +62,41 @@ def cosine_distance(*args: Union[str, Sequence]) -> Func:
57
62
  )
58
63
 
59
64
 
60
- def euclidean_distance(*args: Union[str, Sequence]) -> Func:
65
+ def euclidean_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
61
66
  """
62
- Computes the Euclidean distance between two vectors.
67
+ Returns the Euclidean distance between two vectors.
63
68
 
64
69
  The Euclidean distance is the straight-line distance between two points
65
70
  in Euclidean space. This function returns the distance between the two vectors.
66
71
 
67
72
  Args:
68
- args (str | Sequence): Two vectors to compute the Euclidean distance between.
73
+ args (str | Column | Func | Sequence): Two vectors to compute the Euclidean
74
+ distance between.
69
75
  If a string is provided, it is assumed to be the name of the column vector.
76
+ If a Column is provided, it is assumed to be an array column.
77
+ If a Func is provided, it is assumed to be a function returning an array.
70
78
  If a sequence is provided, it is assumed to be a vector of values.
71
79
 
72
80
  Returns:
73
- Func: A Func object that represents the euclidean_distance function.
81
+ Func: A `Func` object that represents the euclidean_distance function.
74
82
 
75
83
  Example:
76
84
  ```py
77
85
  target_embedding = [0.1, 0.2, 0.3]
78
86
  dc.mutate(
79
87
  eu_dist1=func.euclidean_distance("embedding", target_embedding),
80
- eu_dist2=func.euclidean_distance(target_embedding, [0.4, 0.5, 0.6]),
88
+ eu_dist2=func.euclidean_distance(dc.C("emb1"), "emb2"),
89
+ eu_dist3=func.euclidean_distance(target_embedding, [0.4, 0.5, 0.6]),
81
90
  )
82
91
  ```
83
92
 
84
93
  Notes:
85
94
  - Ensure both vectors have the same number of elements.
86
- - Result column will always be of type float.
95
+ - The result column will always be of type float.
87
96
  """
88
97
  cols, func_args = [], []
89
98
  for arg in args:
90
- if isinstance(arg, str):
99
+ if isinstance(arg, (str, Column, Func)):
91
100
  cols.append(arg)
92
101
  else:
93
102
  func_args.append(list(arg))
@@ -106,31 +115,33 @@ def euclidean_distance(*args: Union[str, Sequence]) -> Func:
106
115
  )
107
116
 
108
117
 
109
- def length(arg: Union[str, Sequence, Func]) -> Func:
118
+ def length(arg: Union[str, Column, Func, Sequence]) -> Func:
110
119
  """
111
120
  Returns the length of the array.
112
121
 
113
122
  Args:
114
- arg (str | Sequence | Func): Array to compute the length of.
123
+ arg (str | Column | Func | Sequence): Array to compute the length of.
115
124
  If a string is provided, it is assumed to be the name of the array column.
116
- If a sequence is provided, it is assumed to be an array of values.
125
+ If a Column is provided, it is assumed to be an array column.
117
126
  If a Func is provided, it is assumed to be a function returning an array.
127
+ If a sequence is provided, it is assumed to be an array of values.
118
128
 
119
129
  Returns:
120
- Func: A Func object that represents the array length function.
130
+ Func: A `Func` object that represents the array length function.
121
131
 
122
132
  Example:
123
133
  ```py
124
134
  dc.mutate(
125
135
  len1=func.array.length("signal.values"),
126
- len2=func.array.length([1, 2, 3, 4, 5]),
136
+ len2=func.array.length(dc.C("signal.values")),
137
+ len3=func.array.length([1, 2, 3, 4, 5]),
127
138
  )
128
139
  ```
129
140
 
130
- Note:
131
- - Result column will always be of type int.
141
+ Notes:
142
+ - The result column will always be of type int.
132
143
  """
133
- if isinstance(arg, (str, Func)):
144
+ if isinstance(arg, (str, Column, Func)):
134
145
  cols = [arg]
135
146
  args = None
136
147
  else:
@@ -140,35 +151,41 @@ def length(arg: Union[str, Sequence, Func]) -> Func:
140
151
  return Func("length", inner=array.length, cols=cols, args=args, result_type=int)
141
152
 
142
153
 
143
- def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
154
+ def contains(arr: Union[str, Column, Func, Sequence], elem: Any) -> Func:
144
155
  """
145
- Checks whether the `arr` array has the `elem` element.
156
+ Checks whether the array contains the specified element.
146
157
 
147
158
  Args:
148
- arr (str | Sequence | Func): Array to check for the element.
159
+ arr (str | Column | Func | Sequence): Array to check for the element.
149
160
  If a string is provided, it is assumed to be the name of the array column.
150
- If a sequence is provided, it is assumed to be an array of values.
161
+ If a Column is provided, it is assumed to be an array column.
151
162
  If a Func is provided, it is assumed to be a function returning an array.
163
+ If a sequence is provided, it is assumed to be an array of values.
152
164
  elem (Any): Element to check for in the array.
153
165
 
154
166
  Returns:
155
- Func: A Func object that represents the contains function. Result of the
156
- function will be 1 if the element is present in the array, and 0 otherwise.
167
+ Func: A `Func` object that represents the contains function. Result of the
168
+ function will be `1` if the element is present in the array,
169
+ and `0` otherwise.
157
170
 
158
171
  Example:
159
172
  ```py
160
173
  dc.mutate(
161
174
  contains1=func.array.contains("signal.values", 3),
162
- contains2=func.array.contains([1, 2, 3, 4, 5], 7),
175
+ contains2=func.array.contains(dc.C("signal.values"), 7),
176
+ contains3=func.array.contains([1, 2, 3, 4, 5], 7),
163
177
  )
164
178
  ```
179
+
180
+ Notes:
181
+ - The result column will always be of type int.
165
182
  """
166
183
 
167
184
  def inner(arg):
168
185
  is_json = type(elem) in [list, dict]
169
186
  return array.contains(arg, elem, is_json)
170
187
 
171
- if isinstance(arr, (str, Func)):
188
+ if isinstance(arr, (str, Column, Func)):
172
189
  cols = [arr]
173
190
  args = None
174
191
  else:
@@ -179,34 +196,38 @@ def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
179
196
 
180
197
 
181
198
  def slice(
182
- arr: Union[str, Sequence, Func],
199
+ arr: Union[str, Column, Func, Sequence],
183
200
  offset: int,
184
201
  length: Optional[int] = None,
185
202
  ) -> Func:
186
203
  """
187
- Returns a slice of the array.
204
+ Returns a slice of the array starting from the specified offset.
188
205
 
189
206
  Args:
190
- arr (str | Sequence | Func): Array to check for the element.
207
+ arr (str | Column | Func | Sequence): Array to slice.
191
208
  If a string is provided, it is assumed to be the name of the array column.
192
- If a sequence is provided, it is assumed to be an array of values.
209
+ If a Column is provided, it is assumed to be an array column.
193
210
  If a Func is provided, it is assumed to be a function returning an array.
194
- offset (int): Offset to start the slice from.
195
- length (int, optional): Length of the slice. If not provided, the slice will
196
- continue to the end of the array.
211
+ If a sequence is provided, it is assumed to be an array of values.
212
+ offset (int): Starting position of the slice (0-based).
213
+ length (int, optional): Number of elements to include in the slice.
214
+ If not provided, returns all elements from offset to the end.
197
215
 
198
216
  Returns:
199
- Func: A Func object that represents the slice function. Result of the
200
- function will be a slice of the array starting from the offset
201
- and with the given length.
217
+ Func: A `Func` object that represents the slice function.
202
218
 
203
219
  Example:
204
220
  ```py
205
221
  dc.mutate(
206
- contains1=func.array.slice("signal.values", 3),
207
- contains2=func.array.slice([1, 2, 3, 4, 5], 1, 3),
222
+ slice1=func.array.slice("signal.values", 1, 3),
223
+ slice2=func.array.slice(dc.C("signal.values"), 2),
224
+ slice3=func.array.slice([1, 2, 3, 4, 5], 1, 2),
208
225
  )
209
226
  ```
227
+
228
+ Notes:
229
+ - The result column will be of type array with the same element type
230
+ as the input.
210
231
  """
211
232
 
212
233
  def inner(arg):
@@ -228,11 +249,11 @@ def slice(
228
249
  try:
229
250
  return list[element_type(arr[0])]
230
251
  except IndexError:
231
- # if the array is empty, return list[str] as default type
232
- return list[str]
233
- return None
252
+ pass
253
+ # if not an array or array is empty, return list[str] as default type
254
+ return list[str]
234
255
 
235
- if isinstance(arr, (str, Func)):
256
+ if isinstance(arr, (str, Column, Func)):
236
257
  cols = [arr]
237
258
  args = None
238
259
  else:
@@ -251,37 +272,40 @@ def slice(
251
272
 
252
273
 
253
274
  def join(
254
- arr: Union[str, Sequence, Func],
275
+ arr: Union[str, Column, Func, Sequence],
255
276
  sep: str = "",
256
277
  ) -> Func:
257
278
  """
258
- Returns a string that is the concatenation of the elements of the array,
279
+ Returns a string that is the concatenation of the elements of the array.
259
280
 
260
281
  Args:
261
- arr (str | Sequence | Func): Array to check for the element.
282
+ arr (str | Column | Func | Sequence): Array to join.
262
283
  If a string is provided, it is assumed to be the name of the array column.
263
- If a sequence is provided, it is assumed to be an array of values.
284
+ If a Column is provided, it is assumed to be an array column.
264
285
  If a Func is provided, it is assumed to be a function returning an array.
286
+ If a sequence is provided, it is assumed to be an array of values.
265
287
  sep (str): Separator to use for the concatenation. Default is an empty string.
266
288
 
267
289
  Returns:
268
- Func: A Func object that represents the join function. Result of the
269
- function will be a string that is the concatenation of the elements
270
- of the array, separated by the given separator.
290
+ Func: A `Func` object that represents the join function.
271
291
 
272
292
  Example:
273
293
  ```py
274
294
  dc.mutate(
275
- contains1=func.array.join("signal.values", ":"),
276
- contains2=func.array.join(["1", "2", "3", "4", "5"], "/"),
295
+ join1=func.array.join("signal.values", ":"),
296
+ join2=func.array.join(dc.C("signal.values"), ","),
297
+ join3=func.array.join(["1", "2", "3", "4", "5"], "/"),
277
298
  )
278
299
  ```
300
+
301
+ Notes:
302
+ - The result column will always be of type string.
279
303
  """
280
304
 
281
305
  def inner(arg):
282
306
  return array.join(arg, sep)
283
307
 
284
- if isinstance(arr, (str, Func)):
308
+ if isinstance(arr, (str, Column, Func)):
285
309
  cols = [arr]
286
310
  args = None
287
311
  else:
@@ -298,31 +322,33 @@ def join(
298
322
  )
299
323
 
300
324
 
301
- def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
325
+ def get_element(arg: Union[str, Column, Func, Sequence], index: int) -> Func:
302
326
  """
303
327
  Returns the element at the given index from the array.
304
328
  If the index is out of bounds, it returns None or the column's default value.
305
329
 
306
330
  Args:
307
- arg (str | Sequence | Func): Array to get the element from.
331
+ arg (str | Column | Func | Sequence): Array to get the element from.
308
332
  If a string is provided, it is assumed to be the name of the array column.
309
- If a sequence is provided, it is assumed to be an array of values.
333
+ If a Column is provided, it is assumed to be an array column.
310
334
  If a Func is provided, it is assumed to be a function returning an array.
335
+ If a sequence is provided, it is assumed to be an array of values.
311
336
  index (int): Index of the element to get from the array.
312
337
 
313
338
  Returns:
314
- Func: A Func object that represents the array get_element function.
339
+ Func: A `Func` object that represents the array get_element function.
315
340
 
316
341
  Example:
317
342
  ```py
318
343
  dc.mutate(
319
344
  first_el=func.array.get_element("signal.values", 0),
320
- second_el=func.array.get_element([1, 2, 3, 4, 5], 1),
345
+ second_el=func.array.get_element(dc.C("signal.values"), 1),
346
+ third_el=func.array.get_element([1, 2, 3, 4, 5], 2),
321
347
  )
322
348
  ```
323
349
 
324
- Note:
325
- - Result column will always be the same type as the elements of the array.
350
+ Notes:
351
+ - The result column will always be the same type as the elements of the array.
326
352
  """
327
353
 
328
354
  def type_from_args(arr, _):
@@ -333,10 +359,10 @@ def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
333
359
  return str # if the array is empty, return str as default type
334
360
  return None
335
361
 
336
- cols: Optional[Union[str, Sequence, Func]]
337
- args: Union[str, Sequence, Func, int]
362
+ cols: Optional[Union[str, Column, Func, Sequence]]
363
+ args: Union[str, Column, Func, Sequence, int]
338
364
 
339
- if isinstance(arg, (str, Func)):
365
+ if isinstance(arg, (str, Column, Func)):
340
366
  cols = [arg]
341
367
  args = [index]
342
368
  else:
@@ -353,31 +379,34 @@ def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
353
379
  )
354
380
 
355
381
 
356
- def sip_hash_64(arg: Union[str, Sequence]) -> Func:
382
+ def sip_hash_64(arg: Union[str, Column, Func, Sequence]) -> Func:
357
383
  """
358
- Computes the SipHash-64 hash of the array.
384
+ Returns the SipHash-64 hash of the array.
359
385
 
360
386
  Args:
361
- arg (str | Sequence): Array to compute the SipHash-64 hash of.
387
+ arg (str | Column | Func | Sequence): Array to compute the SipHash-64 hash of.
362
388
  If a string is provided, it is assumed to be the name of the array column.
389
+ If a Column is provided, it is assumed to be an array column.
390
+ If a Func is provided, it is assumed to be a function returning an array.
363
391
  If a sequence is provided, it is assumed to be an array of values.
364
392
 
365
393
  Returns:
366
- Func: A Func object that represents the sip_hash_64 function.
394
+ Func: A `Func` object that represents the sip_hash_64 function.
367
395
 
368
396
  Example:
369
397
  ```py
370
398
  dc.mutate(
371
399
  hash1=func.sip_hash_64("signal.values"),
372
- hash2=func.sip_hash_64([1, 2, 3, 4, 5]),
400
+ hash2=func.sip_hash_64(dc.C("signal.values")),
401
+ hash3=func.sip_hash_64([1, 2, 3, 4, 5]),
373
402
  )
374
403
  ```
375
404
 
376
405
  Note:
377
406
  - This function is only available for the ClickHouse warehouse.
378
- - Result column will always be of type int.
407
+ - The result column will always be of type int.
379
408
  """
380
- if isinstance(arg, str):
409
+ if isinstance(arg, (str, Column, Func)):
381
410
  cols = [arg]
382
411
  args = None
383
412
  else: