datachain 0.7.7__py3-none-any.whl → 0.7.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

datachain/cli.py CHANGED
@@ -16,7 +16,7 @@ from tabulate import tabulate
  from datachain import Session, utils
  from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
  from datachain.config import Config
- from datachain.error import DataChainError
+ from datachain.error import DataChainError, DatasetNotFoundError
  from datachain.lib.dc import DataChain
  from datachain.studio import (
      edit_studio_dataset,
@@ -1056,7 +1056,10 @@ def rm_dataset(
      all, local, studio = _determine_flavors(studio, local, all, token)

      if all or local:
-         catalog.remove_dataset(name, version=version, force=force)
+         try:
+             catalog.remove_dataset(name, version=version, force=force)
+         except DatasetNotFoundError:
+             print("Dataset not found in local", file=sys.stderr)

      if (all or studio) and token:
          remove_studio_dataset(team, name, version, force)
@@ -1077,7 +1080,10 @@ def edit_dataset(
      all, local, studio = _determine_flavors(studio, local, all, token)

      if all or local:
-         catalog.edit_dataset(name, new_name, description, labels)
+         try:
+             catalog.edit_dataset(name, new_name, description, labels)
+         except DatasetNotFoundError:
+             print("Dataset not found in local", file=sys.stderr)

      if (all or studio) and token:
          edit_studio_dataset(team, name, new_name, description, labels)
datachain/data_storage/metastore.py CHANGED
@@ -725,9 +725,10 @@ class AbstractDBMetastore(AbstractMetastore):

      def list_datasets(self) -> Iterator["DatasetListRecord"]:
          """Lists all datasets."""
-         yield from self._parse_dataset_list(
-             self.db.execute(self._base_list_datasets_query())
+         query = self._base_list_datasets_query().order_by(
+             self._datasets.c.name, self._datasets_versions.c.version
          )
+         yield from self._parse_dataset_list(self.db.execute(query))

      def list_datasets_by_prefix(
          self, prefix: str, conn=None
datachain/func/__init__.py CHANGED
@@ -17,6 +17,7 @@ from .aggregate import (
  )
  from .array import cosine_distance, euclidean_distance, length, sip_hash_64
  from .conditional import greatest, least
+ from .numeric import bit_and, bit_or, bit_xor, int_hash_64
  from .random import rand
  from .window import window

@@ -24,6 +25,9 @@ __all__ = [
      "any_value",
      "array",
      "avg",
+     "bit_and",
+     "bit_or",
+     "bit_xor",
      "case",
      "collect",
      "concat",
@@ -33,6 +37,7 @@ __all__ = [
      "euclidean_distance",
      "first",
      "greatest",
+     "int_hash_64",
      "least",
      "length",
      "literal",
datachain/func/func.py CHANGED
@@ -2,13 +2,15 @@ import inspect
  from collections.abc import Sequence
  from typing import TYPE_CHECKING, Any, Callable, Optional, Union

- from sqlalchemy import BindParameter, Case, ColumnElement, desc
+ from sqlalchemy import BindParameter, Case, ColumnElement, Integer, cast, desc
  from sqlalchemy.ext.hybrid import Comparator
+ from sqlalchemy.sql import func as sa_func

  from datachain.lib.convert.python_to_sql import python_to_sql
  from datachain.lib.convert.sql_to_python import sql_to_python
  from datachain.lib.utils import DataChainColumnError, DataChainParamsError
  from datachain.query.schema import Column, ColumnMeta
+ from datachain.sql.functions import numeric

  from .base import Function

@@ -98,94 +100,232 @@ class Func(Function):
          return list[col_type] if self.is_array else col_type  # type: ignore[valid-type]

      def __add__(self, other: Union[ColT, float]) -> "Func":
-         return math_add(self, other)
+         if isinstance(other, (int, float)):
+             return Func("add", lambda a: a + other, [self])
+         return Func("add", lambda a1, a2: a1 + a2, [self, other])

      def __radd__(self, other: Union[ColT, float]) -> "Func":
-         return math_add(other, self)
+         if isinstance(other, (int, float)):
+             return Func("add", lambda a: other + a, [self])
+         return Func("add", lambda a1, a2: a1 + a2, [other, self])

      def __sub__(self, other: Union[ColT, float]) -> "Func":
-         return math_sub(self, other)
+         if isinstance(other, (int, float)):
+             return Func("sub", lambda a: a - other, [self])
+         return Func("sub", lambda a1, a2: a1 - a2, [self, other])

      def __rsub__(self, other: Union[ColT, float]) -> "Func":
-         return math_sub(other, self)
+         if isinstance(other, (int, float)):
+             return Func("sub", lambda a: other - a, [self])
+         return Func("sub", lambda a1, a2: a1 - a2, [other, self])

      def __mul__(self, other: Union[ColT, float]) -> "Func":
-         return math_mul(self, other)
+         if isinstance(other, (int, float)):
+             return Func("mul", lambda a: a * other, [self])
+         return Func("mul", lambda a1, a2: a1 * a2, [self, other])

      def __rmul__(self, other: Union[ColT, float]) -> "Func":
-         return math_mul(other, self)
+         if isinstance(other, (int, float)):
+             return Func("mul", lambda a: other * a, [self])
+         return Func("mul", lambda a1, a2: a1 * a2, [other, self])

      def __truediv__(self, other: Union[ColT, float]) -> "Func":
-         return math_truediv(self, other)
+         if isinstance(other, (int, float)):
+             return Func("div", lambda a: _truediv(a, other), [self], result_type=float)
+         return Func(
+             "div", lambda a1, a2: _truediv(a1, a2), [self, other], result_type=float
+         )

      def __rtruediv__(self, other: Union[ColT, float]) -> "Func":
-         return math_truediv(other, self)
+         if isinstance(other, (int, float)):
+             return Func("div", lambda a: _truediv(other, a), [self], result_type=float)
+         return Func(
+             "div", lambda a1, a2: _truediv(a1, a2), [other, self], result_type=float
+         )

      def __floordiv__(self, other: Union[ColT, float]) -> "Func":
-         return math_floordiv(self, other)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "floordiv", lambda a: _floordiv(a, other), [self], result_type=int
+             )
+         return Func(
+             "floordiv", lambda a1, a2: _floordiv(a1, a2), [self, other], result_type=int
+         )

      def __rfloordiv__(self, other: Union[ColT, float]) -> "Func":
-         return math_floordiv(other, self)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "floordiv", lambda a: _floordiv(other, a), [self], result_type=int
+             )
+         return Func(
+             "floordiv", lambda a1, a2: _floordiv(a1, a2), [other, self], result_type=int
+         )

      def __mod__(self, other: Union[ColT, float]) -> "Func":
-         return math_mod(self, other)
+         if isinstance(other, (int, float)):
+             return Func("mod", lambda a: a % other, [self], result_type=int)
+         return Func("mod", lambda a1, a2: a1 % a2, [self, other], result_type=int)

      def __rmod__(self, other: Union[ColT, float]) -> "Func":
-         return math_mod(other, self)
-
-     def __pow__(self, other: Union[ColT, float]) -> "Func":
-         return math_pow(self, other)
-
-     def __rpow__(self, other: Union[ColT, float]) -> "Func":
-         return math_pow(other, self)
-
-     def __lshift__(self, other: Union[ColT, float]) -> "Func":
-         return math_lshift(self, other)
-
-     def __rlshift__(self, other: Union[ColT, float]) -> "Func":
-         return math_lshift(other, self)
-
-     def __rshift__(self, other: Union[ColT, float]) -> "Func":
-         return math_rshift(self, other)
-
-     def __rrshift__(self, other: Union[ColT, float]) -> "Func":
-         return math_rshift(other, self)
+         if isinstance(other, (int, float)):
+             return Func("mod", lambda a: other % a, [self], result_type=int)
+         return Func("mod", lambda a1, a2: a1 % a2, [other, self], result_type=int)

      def __and__(self, other: Union[ColT, float]) -> "Func":
-         return math_and(self, other)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "and", lambda a: numeric.bit_and(a, other), [self], result_type=int
+             )
+         return Func(
+             "and",
+             lambda a1, a2: numeric.bit_and(a1, a2),
+             [self, other],
+             result_type=int,
+         )

      def __rand__(self, other: Union[ColT, float]) -> "Func":
-         return math_and(other, self)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "and", lambda a: numeric.bit_and(other, a), [self], result_type=int
+             )
+         return Func(
+             "and",
+             lambda a1, a2: numeric.bit_and(a1, a2),
+             [other, self],
+             result_type=int,
+         )

      def __or__(self, other: Union[ColT, float]) -> "Func":
-         return math_or(self, other)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "or", lambda a: numeric.bit_or(a, other), [self], result_type=int
+             )
+         return Func(
+             "or", lambda a1, a2: numeric.bit_or(a1, a2), [self, other], result_type=int
+         )

      def __ror__(self, other: Union[ColT, float]) -> "Func":
-         return math_or(other, self)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "or", lambda a: numeric.bit_or(other, a), [self], result_type=int
+             )
+         return Func(
+             "or", lambda a1, a2: numeric.bit_or(a1, a2), [other, self], result_type=int
+         )

      def __xor__(self, other: Union[ColT, float]) -> "Func":
-         return math_xor(self, other)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "xor", lambda a: numeric.bit_xor(a, other), [self], result_type=int
+             )
+         return Func(
+             "xor",
+             lambda a1, a2: numeric.bit_xor(a1, a2),
+             [self, other],
+             result_type=int,
+         )

      def __rxor__(self, other: Union[ColT, float]) -> "Func":
-         return math_xor(other, self)
+         if isinstance(other, (int, float)):
+             return Func(
+                 "xor", lambda a: numeric.bit_xor(other, a), [self], result_type=int
+             )
+         return Func(
+             "xor",
+             lambda a1, a2: numeric.bit_xor(a1, a2),
+             [other, self],
+             result_type=int,
+         )
+
+     def __rshift__(self, other: Union[ColT, float]) -> "Func":
+         if isinstance(other, (int, float)):
+             return Func(
+                 "rshift",
+                 lambda a: numeric.bit_rshift(a, other),
+                 [self],
+                 result_type=int,
+             )
+         return Func(
+             "rshift",
+             lambda a1, a2: numeric.bit_rshift(a1, a2),
+             [self, other],
+             result_type=int,
+         )
+
+     def __rrshift__(self, other: Union[ColT, float]) -> "Func":
+         if isinstance(other, (int, float)):
+             return Func(
+                 "rshift",
+                 lambda a: numeric.bit_rshift(other, a),
+                 [self],
+                 result_type=int,
+             )
+         return Func(
+             "rshift",
+             lambda a1, a2: numeric.bit_rshift(a1, a2),
+             [other, self],
+             result_type=int,
+         )
+
+     def __lshift__(self, other: Union[ColT, float]) -> "Func":
+         if isinstance(other, (int, float)):
+             return Func(
+                 "lshift",
+                 lambda a: numeric.bit_lshift(a, other),
+                 [self],
+                 result_type=int,
+             )
+         return Func(
+             "lshift",
+             lambda a1, a2: numeric.bit_lshift(a1, a2),
+             [self, other],
+             result_type=int,
+         )
+
+     def __rlshift__(self, other: Union[ColT, float]) -> "Func":
+         if isinstance(other, (int, float)):
+             return Func(
+                 "lshift",
+                 lambda a: numeric.bit_lshift(other, a),
+                 [self],
+                 result_type=int,
+             )
+         return Func(
+             "lshift",
+             lambda a1, a2: numeric.bit_lshift(a1, a2),
+             [other, self],
+             result_type=int,
+         )

      def __lt__(self, other: Union[ColT, float]) -> "Func":
-         return math_lt(self, other)
+         if isinstance(other, (int, float)):
+             return Func("lt", lambda a: a < other, [self], result_type=bool)
+         return Func("lt", lambda a1, a2: a1 < a2, [self, other], result_type=bool)

      def __le__(self, other: Union[ColT, float]) -> "Func":
-         return math_le(self, other)
+         if isinstance(other, (int, float)):
+             return Func("le", lambda a: a <= other, [self], result_type=bool)
+         return Func("le", lambda a1, a2: a1 <= a2, [self, other], result_type=bool)

      def __eq__(self, other):
-         return math_eq(self, other)
+         if isinstance(other, (int, float)):
+             return Func("eq", lambda a: a == other, [self], result_type=bool)
+         return Func("eq", lambda a1, a2: a1 == a2, [self, other], result_type=bool)

      def __ne__(self, other):
-         return math_ne(self, other)
+         if isinstance(other, (int, float)):
+             return Func("ne", lambda a: a != other, [self], result_type=bool)
+         return Func("ne", lambda a1, a2: a1 != a2, [self, other], result_type=bool)

      def __gt__(self, other: Union[ColT, float]) -> "Func":
-         return math_gt(self, other)
+         if isinstance(other, (int, float)):
+             return Func("gt", lambda a: a > other, [self], result_type=bool)
+         return Func("gt", lambda a1, a2: a1 > a2, [self, other], result_type=bool)

      def __ge__(self, other: Union[ColT, float]) -> "Func":
-         return math_ge(self, other)
+         if isinstance(other, (int, float)):
+             return Func("ge", lambda a: a >= other, [self], result_type=bool)
+         return Func("ge", lambda a1, a2: a1 >= a2, [self, other], result_type=bool)

      def label(self, label: str) -> "Func":
          return Func(
@@ -283,107 +423,12 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
      )


- def math_func(
-     name: str,
-     inner: Callable,
-     params: Sequence[Union[ColT, float]],
-     result_type: Optional["DataType"] = None,
- ) -> Func:
-     """Returns math function from the columns."""
-     cols, args = [], []
-     for arg in params:
-         if isinstance(arg, (int, float)):
-             args.append(arg)
-         else:
-             cols.append(arg)
-     return Func(name, inner, cols=cols, args=args, result_type=result_type)
-
-
- def math_add(*args: Union[ColT, float]) -> Func:
-     """Computes the sum of the column."""
-     return math_func("add", lambda a1, a2: a1 + a2, args)
-
-
- def math_sub(*args: Union[ColT, float]) -> Func:
-     """Computes the diff of the column."""
-     return math_func("sub", lambda a1, a2: a1 - a2, args)
-
-
- def math_mul(*args: Union[ColT, float]) -> Func:
-     """Computes the product of the column."""
-     return math_func("mul", lambda a1, a2: a1 * a2, args)
-
-
- def math_truediv(*args: Union[ColT, float]) -> Func:
-     """Computes the division of the column."""
-     return math_func("div", lambda a1, a2: a1 / a2, args, result_type=float)
-
-
- def math_floordiv(*args: Union[ColT, float]) -> Func:
-     """Computes the floor division of the column."""
-     return math_func("floordiv", lambda a1, a2: a1 // a2, args, result_type=float)
-
-
- def math_mod(*args: Union[ColT, float]) -> Func:
-     """Computes the modulo of the column."""
-     return math_func("mod", lambda a1, a2: a1 % a2, args, result_type=float)
-
-
- def math_pow(*args: Union[ColT, float]) -> Func:
-     """Computes the power of the column."""
-     return math_func("pow", lambda a1, a2: a1**a2, args, result_type=float)
-
-
- def math_lshift(*args: Union[ColT, float]) -> Func:
-     """Computes the left shift of the column."""
-     return math_func("lshift", lambda a1, a2: a1 << a2, args, result_type=int)
-
-
- def math_rshift(*args: Union[ColT, float]) -> Func:
-     """Computes the right shift of the column."""
-     return math_func("rshift", lambda a1, a2: a1 >> a2, args, result_type=int)
-
-
- def math_and(*args: Union[ColT, float]) -> Func:
-     """Computes the logical AND of the column."""
-     return math_func("and", lambda a1, a2: a1 & a2, args, result_type=bool)
-
-
- def math_or(*args: Union[ColT, float]) -> Func:
-     """Computes the logical OR of the column."""
-     return math_func("or", lambda a1, a2: a1 | a2, args, result_type=bool)
-
-
- def math_xor(*args: Union[ColT, float]) -> Func:
-     """Computes the logical XOR of the column."""
-     return math_func("xor", lambda a1, a2: a1 ^ a2, args, result_type=bool)
-
-
- def math_lt(*args: Union[ColT, float]) -> Func:
-     """Computes the less than comparison of the column."""
-     return math_func("lt", lambda a1, a2: a1 < a2, args, result_type=bool)
-
-
- def math_le(*args: Union[ColT, float]) -> Func:
-     """Computes the less than or equal comparison of the column."""
-     return math_func("le", lambda a1, a2: a1 <= a2, args, result_type=bool)
-
-
- def math_eq(*args: Union[ColT, float]) -> Func:
-     """Computes the equality comparison of the column."""
-     return math_func("eq", lambda a1, a2: a1 == a2, args, result_type=bool)
-
-
- def math_ne(*args: Union[ColT, float]) -> Func:
-     """Computes the inequality comparison of the column."""
-     return math_func("ne", lambda a1, a2: a1 != a2, args, result_type=bool)
-
-
- def math_gt(*args: Union[ColT, float]) -> Func:
-     """Computes the greater than comparison of the column."""
-     return math_func("gt", lambda a1, a2: a1 > a2, args, result_type=bool)
+ def _truediv(a, b):
+     # Using sqlalchemy.sql.func.divide here instead of / operator
+     # because of a bug in ClickHouse SQLAlchemy dialect
+     # See https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/335
+     return sa_func.divide(a, b)


- def math_ge(*args: Union[ColT, float]) -> Func:
-     """Computes the greater than or equal comparison of the column."""
-     return math_func("ge", lambda a1, a2: a1 >= a2, args, result_type=bool)
+ def _floordiv(a, b):
+     return cast(_truediv(a, b), Integer)
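
With the math_* helpers gone, every arithmetic, bitwise, and comparison operator on a Func now builds the next Func inline: constants are captured in the lambda, while columns or other Funcs are passed as inputs. A small sketch of what this enables, assuming `func.rand()` returns a Func (as its export alongside these helpers suggests):

```py
from datachain import func

r = func.rand()
bucket = (r % 100).label("bucket")    # modulo against a constant, int result
masked = (r & 0xFF).label("masked")   # routed through numeric.bit_and
shifted = (r >> 3).label("shifted")   # routed through numeric.bit_rshift
ratio = (r / 7).label("ratio")        # uses sa_func.divide via _truediv
```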
datachain/func/numeric.py ADDED
@@ -0,0 +1,162 @@
+ from typing import Union
+
+ from datachain.sql.functions import numeric
+
+ from .func import ColT, Func
+
+
+ def bit_and(*args: Union[ColT, int]) -> Func:
+     """
+     Computes the bitwise AND operation between two values.
+
+     Args:
+         args (str | int): Two values to compute the bitwise AND operation between.
+             If a string is provided, it is assumed to be the name of the column vector.
+             If an integer is provided, it is assumed to be a constant value.
+
+     Returns:
+         Func: A Func object that represents the bitwise AND function.
+
+     Example:
+         ```py
+         dc.mutate(
+             xor1=func.bit_and("signal.values", 0x0F),
+         )
+         ```
+
+     Notes:
+         - Result column will always be of type int.
+     """
+     cols, func_args = [], []
+     for arg in args:
+         if isinstance(arg, int):
+             func_args.append(arg)
+         else:
+             cols.append(arg)
+
+     if len(cols) + len(func_args) != 2:
+         raise ValueError("bit_and() requires exactly two arguments")
+
+     return Func(
+         "bit_and",
+         inner=numeric.bit_and,
+         cols=cols,
+         args=func_args,
+         result_type=int,
+     )
+
+
+ def bit_or(*args: Union[ColT, int]) -> Func:
+     """
+     Computes the bitwise OR operation between two values.
+
+     Args:
+         args (str | int): Two values to compute the bitwise OR operation between.
+             If a string is provided, it is assumed to be the name of the column vector.
+             If an integer is provided, it is assumed to be a constant value.
+
+     Returns:
+         Func: A Func object that represents the bitwise OR function.
+
+     Example:
+         ```py
+         dc.mutate(
+             xor1=func.bit_or("signal.values", 0x0F),
+         )
+         ```
+
+     Notes:
+         - Result column will always be of type int.
+     """
+     cols, func_args = [], []
+     for arg in args:
+         if isinstance(arg, int):
+             func_args.append(arg)
+         else:
+             cols.append(arg)
+
+     if len(cols) + len(func_args) != 2:
+         raise ValueError("bit_or() requires exactly two arguments")
+
+     return Func(
+         "bit_or",
+         inner=numeric.bit_or,
+         cols=cols,
+         args=func_args,
+         result_type=int,
+     )
+
+
+ def bit_xor(*args: Union[ColT, int]) -> Func:
+     """
+     Computes the bitwise XOR operation between two values.
+
+     Args:
+         args (str | int): Two values to compute the bitwise XOR operation between.
+             If a string is provided, it is assumed to be the name of the column vector.
+             If an integer is provided, it is assumed to be a constant value.
+
+     Returns:
+         Func: A Func object that represents the bitwise XOR function.
+
+     Example:
+         ```py
+         dc.mutate(
+             xor1=func.bit_xor("signal.values", 0x0F),
+         )
+         ```
+
+     Notes:
+         - Result column will always be of type int.
+     """
+     cols, func_args = [], []
+     for arg in args:
+         if isinstance(arg, int):
+             func_args.append(arg)
+         else:
+             cols.append(arg)
+
+     if len(cols) + len(func_args) != 2:
+         raise ValueError("bit_xor() requires exactly two arguments")
+
+     return Func(
+         "bit_xor",
+         inner=numeric.bit_xor,
+         cols=cols,
+         args=func_args,
+         result_type=int,
+     )
+
+
+ def int_hash_64(col: Union[ColT, int]) -> Func:
+     """
+     Returns the 64-bit hash of an integer.
+
+     Args:
+         col (str | int): Integer to compute the hash of.
+             If a string is provided, it is assumed to be the name of the column.
+             If an int is provided, it is assumed to be an int literal.
+             If a Func is provided, it is assumed to be a function returning an int.
+
+     Returns:
+         Func: A Func object that represents the 64-bit hash function.
+
+     Example:
+         ```py
+         dc.mutate(
+             val_hash=func.int_hash_64("val"),
+         )
+         ```
+
+     Note:
+         - Result column will always be of type int.
+     """
+     cols, args = [], []
+     if isinstance(col, int):
+         args.append(col)
+     else:
+         cols.append(col)
+
+     return Func(
+         "int_hash_64", inner=numeric.int_hash_64, cols=cols, args=args, result_type=int
+     )
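
Each of the two-argument helpers accepts any mix of column names and integer constants, but the total must be exactly two; anything else raises ValueError. A short illustration (the column names are made up):

```py
from datachain.func import bit_or

bit_or("flags.a", "flags.b")   # two columns
bit_or("flags.a", 0b1000)      # one column and one constant
# bit_or("flags.a")            # would raise ValueError: requires exactly two arguments
```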
datachain/lib/dc.py CHANGED
@@ -1446,6 +1446,7 @@ class DataChain:
              tokenizer=tokenizer,
              tokenizer_kwargs=tokenizer_kwargs,
              num_samples=num_samples,
+             dc_settings=chain._settings,
          )

      def remove_file_signals(self) -> "Self":  # noqa: D102
datachain/lib/pytorch.py CHANGED
@@ -10,8 +10,10 @@ from torchvision.transforms import v2
  from tqdm import tqdm

  from datachain import Session
+ from datachain.asyn import AsyncMapper
  from datachain.catalog import Catalog, get_catalog
  from datachain.lib.dc import DataChain
+ from datachain.lib.settings import Settings
  from datachain.lib.text import convert_text

  if TYPE_CHECKING:
@@ -30,6 +32,8 @@ def label_to_int(value: str, classes: list) -> int:


  class PytorchDataset(IterableDataset):
+     prefetch: int = 2
+
      def __init__(
          self,
          name: str,
@@ -39,6 +43,7 @@ class PytorchDataset(IterableDataset):
          tokenizer: Optional[Callable] = None,
          tokenizer_kwargs: Optional[dict[str, Any]] = None,
          num_samples: int = 0,
+         dc_settings: Optional[Settings] = None,
      ):
          """
          Pytorch IterableDataset that streams DataChain datasets.
@@ -66,6 +71,11 @@ class PytorchDataset(IterableDataset):
              catalog = get_catalog()
          self._init_catalog(catalog)

+         dc_settings = dc_settings or Settings()
+         self.cache = dc_settings.cache
+         if (prefetch := dc_settings.prefetch) is not None:
+             self.prefetch = prefetch
+
      def _init_catalog(self, catalog: "Catalog"):
          # For compatibility with multiprocessing,
          # we can only store params in __init__(), as Catalog isn't picklable
@@ -82,51 +92,58 @@ class PytorchDataset(IterableDataset):
          wh = wh_cls(*wh_args, **wh_kwargs)
          return Catalog(ms, wh, **self._catalog_params)

-     def __iter__(self) -> Iterator[Any]:
-         if self.catalog is None:
-             self.catalog = self._get_catalog()
-         session = Session.get(catalog=self.catalog)
-         total_rank, total_workers = self.get_rank_and_workers()
+     def _rows_iter(self, total_rank: int, total_workers: int):
+         catalog = self._get_catalog()
+         session = Session("PyTorch", catalog=catalog)
          ds = DataChain.from_dataset(
              name=self.name, version=self.version, session=session
-         )
+         ).settings(cache=self.cache, prefetch=self.prefetch)
          ds = ds.remove_file_signals()

          if self.num_samples > 0:
              ds = ds.sample(self.num_samples)
          ds = ds.chunk(total_rank, total_workers)
+         yield from ds.collect()
+
+     def __iter__(self) -> Iterator[Any]:
+         total_rank, total_workers = self.get_rank_and_workers()
+         rows = self._rows_iter(total_rank, total_workers)
+         if self.prefetch > 0:
+             from datachain.lib.udf import _prefetch_input
+
+             rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
+
          desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
-         with tqdm(desc=desc, unit=" rows") as pbar:
-             for row_features in ds.collect():
-                 row = []
-                 for fr in row_features:
-                     if hasattr(fr, "read"):
-                         row.append(fr.read())  # type: ignore[unreachable]
-                     else:
-                         row.append(fr)
-                 # Apply transforms
-                 if self.transform:
-                     try:
-                         if isinstance(self.transform, v2.Transform):
-                             row = self.transform(row)
-                         for i, val in enumerate(row):
-                             if isinstance(val, Image.Image):
-                                 row[i] = self.transform(val)
-                     except ValueError:
-                         logger.warning(
-                             "Skipping transform due to unsupported data types."
-                         )
-                         self.transform = None
-                 if self.tokenizer:
-                     for i, val in enumerate(row):
-                         if isinstance(val, str) or (
-                             isinstance(val, list) and isinstance(val[0], str)
-                         ):
-                             row[i] = convert_text(
-                                 val, self.tokenizer, self.tokenizer_kwargs
-                             ).squeeze(0)  # type: ignore[union-attr]
-                 yield row
-                 pbar.update(1)
+         with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
+             yield from map(self._process_row, rows_it)
+
+     def _process_row(self, row_features):
+         row = []
+         for fr in row_features:
+             if hasattr(fr, "read"):
+                 row.append(fr.read())  # type: ignore[unreachable]
+             else:
+                 row.append(fr)
+         # Apply transforms
+         if self.transform:
+             try:
+                 if isinstance(self.transform, v2.Transform):
+                     row = self.transform(row)
+                 for i, val in enumerate(row):
+                     if isinstance(val, Image.Image):
+                         row[i] = self.transform(val)
+             except ValueError:
+                 logger.warning("Skipping transform due to unsupported data types.")
+                 self.transform = None
+         if self.tokenizer:
+             for i, val in enumerate(row):
+                 if isinstance(val, str) or (
+                     isinstance(val, list) and isinstance(val[0], str)
+                 ):
+                     row[i] = convert_text(
+                         val, self.tokenizer, self.tokenizer_kwargs
+                     ).squeeze(0)  # type: ignore[union-attr]
+         return row

      @staticmethod
      def get_rank_and_workers() -> tuple[int, int]:
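
Because the chain's settings are now forwarded into PytorchDataset (the dc.py hunk above) and rows are prefetched through AsyncMapper, cache and prefetch behaviour can be tuned from the chain itself. A hedged sketch, assuming `to_pytorch()` is the method that builds a PytorchDataset and that a saved dataset named "my-dataset" exists:

```py
from torch.utils.data import DataLoader

from datachain.lib.dc import DataChain

chain = DataChain.from_dataset(name="my-dataset").settings(cache=True, prefetch=4)
loader = DataLoader(chain.to_pytorch(), batch_size=16, num_workers=2)

for batch in loader:
    ...  # rows are prefetched asynchronously inside each worker process
```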
datachain/remote/studio.py CHANGED
@@ -119,18 +119,27 @@ class StudioClient:
              "\tpip install 'datachain[remote]'"
          ) from None

-     def _send_request_msgpack(self, route: str, data: dict[str, Any]) -> Response[Any]:
+     def _send_request_msgpack(
+         self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+     ) -> Response[Any]:
          import msgpack
          import requests

-         response = requests.post(
-             f"{self.url}/{route}",
-             json={**data, "team_name": self.team},
+         kwargs = (
+             {"params": {**data, "team_name": self.team}}
+             if method == "GET"
+             else {"json": {**data, "team_name": self.team}}
+         )
+
+         response = requests.request(
+             method=method,  # type: ignore[arg-type]
+             url=f"{self.url}/{route}",
              headers={
                  "Content-Type": "application/json",
                  "Authorization": f"token {self.token}",
              },
              timeout=self.timeout,
+             **kwargs,  # type: ignore[arg-type]
          )
          ok = response.ok
          if not ok:
@@ -148,7 +157,9 @@ class StudioClient:
          return Response(response_data, ok, message)

      @retry_with_backoff(retries=5)
-     def _send_request(self, route: str, data: dict[str, Any]) -> Response[Any]:
+     def _send_request(
+         self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+     ) -> Response[Any]:
          """
          Function that communicate Studio API.
          It will raise an exception, and try to retry, if 5xx status code is
@@ -157,14 +168,21 @@ class StudioClient:
          """
          import requests

-         response = requests.post(
-             f"{self.url}/{route}",
-             json={**data, "team_name": self.team},
+         kwargs = (
+             {"params": {**data, "team_name": self.team}}
+             if method == "GET"
+             else {"json": {**data, "team_name": self.team}}
+         )
+
+         response = requests.request(
+             method=method,  # type: ignore[arg-type]
+             url=f"{self.url}/{route}",
              headers={
                  "Content-Type": "application/json",
                  "Authorization": f"token {self.token}",
              },
              timeout=self.timeout,
+             **kwargs,  # type: ignore[arg-type]
          )
          try:
              response.raise_for_status()
@@ -222,7 +240,7 @@ class StudioClient:
              yield path, response

      def ls_datasets(self) -> Response[LsData]:
-         return self._send_request("datachain/ls-datasets", {})
+         return self._send_request("datachain/datasets", {}, method="GET")

      def edit_dataset(
          self,
@@ -232,20 +250,14 @@ class StudioClient:
          labels: Optional[list[str]] = None,
      ) -> Response[DatasetInfoData]:
          body = {
+             "new_name": new_name,
              "dataset_name": name,
+             "description": description,
+             "labels": labels,
          }

-         if new_name is not None:
-             body["new_name"] = new_name
-
-         if description is not None:
-             body["description"] = description
-
-         if labels is not None:
-             body["labels"] = labels  # type: ignore[assignment]
-
          return self._send_request(
-             "datachain/edit-dataset",
+             "datachain/datasets",
              body,
          )

@@ -256,12 +268,13 @@ class StudioClient:
          force: Optional[bool] = False,
      ) -> Response[DatasetInfoData]:
          return self._send_request(
-             "datachain/rm-dataset",
+             "datachain/datasets",
              {
                  "dataset_name": name,
                  "version": version,
                  "force": force,
              },
+             method="DELETE",
          )

      def dataset_info(self, name: str) -> Response[DatasetInfoData]:
@@ -272,7 +285,9 @@ class StudioClient:

              return dataset_info

-         response = self._send_request("datachain/dataset-info", {"dataset_name": name})
+         response = self._send_request(
+             "datachain/datasets/info", {"dataset_name": name}, method="GET"
+         )
          if response.ok:
              response.data = _parse_dataset_info(response.data)
          return response
@@ -282,14 +297,16 @@ class StudioClient:
      ) -> Response[DatasetRowsData]:
          req_data = {"dataset_name": name, "dataset_version": version}
          return self._send_request_msgpack(
-             "datachain/dataset-rows",
+             "datachain/datasets/rows",
              {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
+             method="GET",
          )

      def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
          response = self._send_request(
-             "datachain/dataset-stats",
+             "datachain/datasets/stats",
              {"dataset_name": name, "dataset_version": version},
+             method="GET",
          )
          if response.ok:
              response.data = DatasetStats(**response.data)
@@ -299,16 +316,18 @@ class StudioClient:
          self, name: str, version: int
      ) -> Response[DatasetExportSignedUrls]:
          return self._send_request(
-             "datachain/dataset-export",
+             "datachain/datasets/export",
              {"dataset_name": name, "dataset_version": version},
+             method="GET",
          )

      def dataset_export_status(
          self, name: str, version: int
      ) -> Response[DatasetExportStatus]:
          return self._send_request(
-             "datachain/dataset-export-status",
+             "datachain/datasets/export-status",
              {"dataset_name": name, "dataset_version": version},
+             method="GET",
          )

      def upload_file(self, file_name: str, content: bytes) -> Response[FileUploadData]:
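
Both request helpers now take an HTTP method and move the payload into query parameters for GET requests while keeping a JSON body for everything else. The dispatch can be sketched in isolation with plain requests; the URL, token, and team name below are placeholders:

```py
import requests


def send(url: str, token: str, data: dict, team: str, method: str = "POST"):
    # GET requests carry the payload as query params; other methods send a JSON body.
    kwargs = (
        {"params": {**data, "team_name": team}}
        if method == "GET"
        else {"json": {**data, "team_name": team}}
    )
    return requests.request(
        method=method,
        url=url,
        headers={"Authorization": f"token {token}"},
        timeout=30,
        **kwargs,
    )


# e.g. send("https://studio.example.com/api/datachain/datasets", "TOKEN", {}, "my-team", method="GET")
```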
datachain/sql/functions/array.py CHANGED
@@ -38,6 +38,10 @@ class length(GenericFunction):  # noqa: N801


  class sip_hash_64(GenericFunction):  # noqa: N801
+     """
+     Computes the SipHash-64 hash of the array.
+     """
+
      type = Int64()
      package = "hash"
      name = "sip_hash_64"
datachain/sql/functions/numeric.py ADDED
@@ -0,0 +1,43 @@
+ from sqlalchemy.sql.functions import GenericFunction, ReturnTypeFromArgs
+
+ from datachain.sql.types import Int64
+ from datachain.sql.utils import compiler_not_implemented
+
+
+ class bit_and(ReturnTypeFromArgs):  # noqa: N801
+     inherit_cache = True
+
+
+ class bit_or(ReturnTypeFromArgs):  # noqa: N801
+     inherit_cache = True
+
+
+ class bit_xor(ReturnTypeFromArgs):  # noqa: N801
+     inherit_cache = True
+
+
+ class bit_rshift(ReturnTypeFromArgs):  # noqa: N801
+     inherit_cache = True
+
+
+ class bit_lshift(ReturnTypeFromArgs):  # noqa: N801
+     inherit_cache = True
+
+
+ class int_hash_64(GenericFunction):  # noqa: N801
+     """
+     Computes the 64-bit hash of an integer.
+     """
+
+     type = Int64()
+     package = "hash"
+     name = "int_hash_64"
+     inherit_cache = True
+
+
+ compiler_not_implemented(bit_and)
+ compiler_not_implemented(bit_or)
+ compiler_not_implemented(bit_xor)
+ compiler_not_implemented(bit_rshift)
+ compiler_not_implemented(bit_lshift)
+ compiler_not_implemented(int_hash_64)
datachain/sql/sqlite/base.py CHANGED
@@ -15,7 +15,14 @@ from sqlalchemy.sql.elements import literal
  from sqlalchemy.sql.expression import case
  from sqlalchemy.sql.functions import func

- from datachain.sql.functions import aggregate, array, conditional, random, string
+ from datachain.sql.functions import (
+     aggregate,
+     array,
+     conditional,
+     numeric,
+     random,
+     string,
+ )
  from datachain.sql.functions import path as sql_path
  from datachain.sql.selectable import Values, base_values_compiler
  from datachain.sql.sqlite.types import (
@@ -47,6 +54,8 @@ slash = literal("/")
  empty_str = literal("")
  dot = literal(".")

+ MAX_INT64 = 2**64 - 1
+

  def setup():
      global setup_is_complete  # noqa: PLW0603
@@ -89,6 +98,12 @@ def setup():
      compiles(aggregate.group_concat, "sqlite")(compile_group_concat)
      compiles(aggregate.any_value, "sqlite")(compile_any_value)
      compiles(aggregate.collect, "sqlite")(compile_collect)
+     compiles(numeric.bit_and, "sqlite")(compile_bitwise_and)
+     compiles(numeric.bit_or, "sqlite")(compile_bitwise_or)
+     compiles(numeric.bit_xor, "sqlite")(compile_bitwise_xor)
+     compiles(numeric.bit_rshift, "sqlite")(compile_bitwise_rshift)
+     compiles(numeric.bit_lshift, "sqlite")(compile_bitwise_lshift)
+     compiles(numeric.int_hash_64, "sqlite")(compile_int_hash_64)

      if load_usearch_extension(sqlite3.connect(":memory:")):
          compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
@@ -163,6 +178,19 @@ def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
      return orjson.dumps(string.split(sep, maxsplit)).decode("utf-8")


+ def sqlite_int_hash_64(x: int) -> int:
+     """IntHash64 implementation from ClickHouse."""
+     x ^= 0x4CF2D2BAAE6DA887
+     x ^= x >> 33
+     x = (x * 0xFF51AFD7ED558CCD) & MAX_INT64
+     x ^= x >> 33
+     x = (x * 0xC4CEB9FE1A85EC53) & MAX_INT64
+     x ^= x >> 33
+     # SQLite does not support unsigned 64-bit integers,
+     # so we need to convert to signed 64-bit
+     return x if x < 1 << 63 else (x & MAX_INT64) - (1 << 64)
+
+
  def register_user_defined_sql_functions() -> None:
      # Register optional functions if we have the necessary dependencies
      # and otherwise register functions that will raise an exception with
@@ -185,6 +213,21 @@ def register_user_defined_sql_functions() -> None:

      _registered_function_creators["vector_functions"] = create_vector_functions

+     def create_numeric_functions(conn):
+         conn.create_function("divide", 2, lambda a, b: a / b, deterministic=True)
+         conn.create_function("bitwise_and", 2, lambda a, b: a & b, deterministic=True)
+         conn.create_function("bitwise_or", 2, lambda a, b: a | b, deterministic=True)
+         conn.create_function("bitwise_xor", 2, lambda a, b: a ^ b, deterministic=True)
+         conn.create_function(
+             "bitwise_rshift", 2, lambda a, b: a >> b, deterministic=True
+         )
+         conn.create_function(
+             "bitwise_lshift", 2, lambda a, b: a << b, deterministic=True
+         )
+         conn.create_function("int_hash_64", 1, sqlite_int_hash_64, deterministic=True)
+
+     _registered_function_creators["numeric_functions"] = create_numeric_functions
+
      def sqlite_regexp_replace(string: str, pattern: str, replacement: str) -> str:
          return re.sub(pattern, replacement, string)

@@ -316,6 +359,30 @@ def compile_euclidean_distance(element, compiler, **kwargs):
      return f"euclidean_distance({compiler.process(element.clauses, **kwargs)})"


+ def compile_bitwise_and(element, compiler, **kwargs):
+     return compiler.process(func.bitwise_and(*element.clauses.clauses), **kwargs)
+
+
+ def compile_bitwise_or(element, compiler, **kwargs):
+     return compiler.process(func.bitwise_or(*element.clauses.clauses), **kwargs)
+
+
+ def compile_bitwise_xor(element, compiler, **kwargs):
+     return compiler.process(func.bitwise_xor(*element.clauses.clauses), **kwargs)
+
+
+ def compile_bitwise_rshift(element, compiler, **kwargs):
+     return compiler.process(func.bitwise_rshift(*element.clauses.clauses), **kwargs)
+
+
+ def compile_bitwise_lshift(element, compiler, **kwargs):
+     return compiler.process(func.bitwise_lshift(*element.clauses.clauses), **kwargs)
+
+
+ def compile_int_hash_64(element, compiler, **kwargs):
+     return compiler.process(func.int_hash_64(*element.clauses.clauses), **kwargs)
+
+
  def py_json_array_length(arr):
      return len(orjson.loads(arr))

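On SQLite these operations are plain Python UDFs registered on each connection; the compile_* hooks above simply route the SQLAlchemy generic functions to those UDF names. A standalone sketch of what create_numeric_functions() registers, run directly against an in-memory connection:

```py
import sqlite3

conn = sqlite3.connect(":memory:")
# Mirrors two of the registrations from create_numeric_functions() above.
conn.create_function("bitwise_xor", 2, lambda a, b: a ^ b, deterministic=True)
conn.create_function("divide", 2, lambda a, b: a / b, deterministic=True)

print(conn.execute("SELECT bitwise_xor(12, 10), divide(7, 2)").fetchone())
# (6, 3.5)
```
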
datachain/studio.py CHANGED
@@ -155,7 +155,7 @@ def edit_studio_dataset(
      if not response.ok:
          raise_remote_error(response.message)

-     print(f"Dataset {name} updated")
+     print(f"Dataset '{name}' updated in Studio")


  def remove_studio_dataset(
@@ -169,7 +169,7 @@ def remove_studio_dataset(
      if not response.ok:
          raise_remote_error(response.message)

-     print(f"Dataset {name} removed")
+     print(f"Dataset '{name}' removed from Studio")


  def save_config(hostname, token):
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datachain
- Version: 0.7.7
+ Version: 0.7.9
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
  Requires-Dist: onnx==1.16.1; extra == "examples"
- Requires-Dist: ultralytics==8.3.29; extra == "examples"
+ Requires-Dist: ultralytics==8.3.37; extra == "examples"

  ================
  |logo| DataChain
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
  datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
  datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
- datachain/cli.py,sha256=Ysm-6Kb-54FfkN35VJIe5vW7Kik8VGA3wcyCUnqPBHg,42245
+ datachain/cli.py,sha256=wQiYQ_qSVCGvS06pkknT9_FIBdFRzBdeRusW9uXE3vQ,42505
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
  datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
  datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
@@ -14,7 +14,7 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
  datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
  datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/studio.py,sha256=MthVADn-jM2I5TlESOfbzFnKGZjpuk9bM8m2vqOK-C8,7227
+ datachain/studio.py,sha256=Hr0Ha0kou0so4i8i-gWiXC1AYlJ2arI1D55cc7mi3tg,7253
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
  datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
@@ -32,17 +32,18 @@ datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
  datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
  datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
- datachain/data_storage/metastore.py,sha256=EzSsfR_l_84i1AewYygpdsJyzGqEmvXjpeohlYF7h4A,37435
+ datachain/data_storage/metastore.py,sha256=hfTITcesE9XlUTxcCcdDyWGGep-QSjJL9DUxko5QCeI,37524
  datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
  datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
  datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
- datachain/func/__init__.py,sha256=VAN7N2-eCHgidMCFI-fJTkCwdI1U_NIuCOgYc4sfYUQ,812
+ datachain/func/__init__.py,sha256=oz-GbCcp5jnN82u6cghWTGzmU9IQvtvllOof73wE52g,934
  datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
  datachain/func/array.py,sha256=zHDNWuWLA7HVa9FEvQeHhVi00_xqenyleTqcLwkXWBI,5477
  datachain/func/base.py,sha256=wA0sBQAVyN9LPxoo7Ox83peS0zUVnyuKxukwAcjGLfY,534
  datachain/func/conditional.py,sha256=mQroxsoExpBW84Zm5dAYP4OpBblWmzfnF2qJq9rba54,2223
- datachain/func/func.py,sha256=GykhTvNbACFSwaSXsgVlDnqR48kpP_GNAxm3bcq1RYg,12560
+ datachain/func/func.py,sha256=mJ_rOXMpoqnK4-d5eF9boSMx5hWzgKoMLPGpZQqLAfw,15222
+ datachain/func/numeric.py,sha256=GcUX6ijZvzfac8CZrHE0gRc9WCPiutcMLKqNXtbn-Yo,4186
  datachain/func/path.py,sha256=mqN_mfkwv44z2II7DMTp_fGGw95hmTCNls_TOFNpr4k,3155
  datachain/func/random.py,sha256=pENOLj9rSmWfGCnOsUIaCsVC5486zQb66qfQvXaz9Z4,452
  datachain/func/string.py,sha256=NQzaXXYu7yb72HPADy4WrFlcgvTS77L9x7-qvCKJtnk,4522
@@ -52,7 +53,7 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
  datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
  datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
- datachain/lib/dc.py,sha256=t5y5tsYyU7uuk3gEPPhhcDSZ1tL1aHkKG2W54eHiUq8,89492
+ datachain/lib/dc.py,sha256=xqLR4IH_mbuet0FsxBHDsRUg-zR6tO8UZdLQQTLG8EE,89533
  datachain/lib/file.py,sha256=-XMkL6ED1sE7TMhWoMRTEuOXswZJw8X6AEmJDONFP74,15019
  datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
@@ -60,7 +61,7 @@ datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
  datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
- datachain/lib/pytorch.py,sha256=Nh6fUbQMLX8OpZvX4tw4bJjTCQpRKi0jSLgkJnLHdTM,5880
+ datachain/lib/pytorch.py,sha256=QMJO_OGEMvBi2x71vGcG25agLzNwyLmF4Qx5iILlwaM,6350
  datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
  datachain/lib/signal_schema.py,sha256=_uh19nCKhiD9ua8oIN1Q8R9iYv1BZAuqTJCLYVmyW8k,24557
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
@@ -95,7 +96,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
  datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
  datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/remote/studio.py,sha256=jp6NWo7OPUxqO8uYEHP0_XFlmj47rMxC80qKQ7rA3Xk,11024
+ datachain/remote/studio.py,sha256=WiK6fpRAw0a6Dth4XXI0YInEHH4gDU7AUHHDNd3wJzg,11616
  datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
  datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
  datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -104,21 +105,22 @@ datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESm
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
  datachain/sql/functions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/sql/functions/aggregate.py,sha256=3AQdA8YHPFdtCEfwZKQXTT8SlQWdG9gD5PBtGN3Odqs,944
- datachain/sql/functions/array.py,sha256=rvH27SWN9gdh_mFnp0GIiXuCrNW6n8ZbY4I_JUS-_e0,1140
+ datachain/sql/functions/array.py,sha256=Zq59CaMHf_hFapU4kxvy2mwteH344k5Wksxja4MfBks,1204
  datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
+ datachain/sql/functions/numeric.py,sha256=DFTTEWsvBBXwbaaC4zdxhAoqUYwI6nbymG-nzbzdPv8,972
  datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
  datachain/sql/functions/string.py,sha256=DYgiw8XSk7ge7GXvyRI1zbaMruIizNeI-puOjriQGZQ,1148
  datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
- datachain/sql/sqlite/base.py,sha256=X4iEynOAqqvqz8lmgUKvURleKO6aguULgG8RoufKrSk,14772
+ datachain/sql/sqlite/base.py,sha256=eQv2U32jChG9tnYSFE4SS2Mvfb7-W3Ok3Ffhew9qkKI,17254
  datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
  datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.7.7.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.7.7.dist-info/METADATA,sha256=laxYaz9f-PIJ30f3krSjRu45CjyfbnBM8Q4kddXa9dM,18006
- datachain-0.7.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- datachain-0.7.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.7.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.7.7.dist-info/RECORD,,
+ datachain-0.7.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.7.9.dist-info/METADATA,sha256=iu58cwfGQVYTwn53symALXVpe9292EWXdOly2MWuPZY,18006
+ datachain-0.7.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ datachain-0.7.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.7.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.7.9.dist-info/RECORD,,