datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/func/conditional.py
CHANGED
|
@@ -1,46 +1,46 @@
|
|
|
1
|
-
from typing import Optional, Union
|
|
2
|
-
|
|
3
1
|
from sqlalchemy import ColumnElement
|
|
4
2
|
from sqlalchemy import and_ as sql_and
|
|
5
3
|
from sqlalchemy import case as sql_case
|
|
4
|
+
from sqlalchemy import not_ as sql_not
|
|
6
5
|
from sqlalchemy import or_ as sql_or
|
|
7
6
|
|
|
8
7
|
from datachain.lib.utils import DataChainParamsError
|
|
9
8
|
from datachain.query.schema import Column
|
|
10
9
|
from datachain.sql.functions import conditional
|
|
11
10
|
|
|
12
|
-
from .func import
|
|
11
|
+
from .func import Func
|
|
13
12
|
|
|
14
|
-
CaseT =
|
|
13
|
+
CaseT = int | float | complex | bool | str | Func | ColumnElement
|
|
15
14
|
|
|
16
15
|
|
|
17
|
-
def greatest(*args:
|
|
16
|
+
def greatest(*args: str | Column | Func | float) -> Func:
|
|
18
17
|
"""
|
|
19
18
|
Returns the greatest (largest) value from the given input values.
|
|
20
19
|
|
|
21
20
|
Args:
|
|
22
|
-
args (
|
|
21
|
+
args (str | Column | Func | int | float): The values to compare.
|
|
23
22
|
If a string is provided, it is assumed to be the name of the column.
|
|
23
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
24
24
|
If a Func is provided, it is assumed to be a function returning a value.
|
|
25
|
-
If an int
|
|
25
|
+
If an int or float is provided, it is assumed to be a literal.
|
|
26
26
|
|
|
27
27
|
Returns:
|
|
28
|
-
Func: A Func object that represents the greatest function.
|
|
28
|
+
Func: A `Func` object that represents the greatest function.
|
|
29
29
|
|
|
30
30
|
Example:
|
|
31
31
|
```py
|
|
32
32
|
dc.mutate(
|
|
33
|
-
greatest=func.greatest("signal.value", 0),
|
|
33
|
+
greatest=func.greatest(dc.C("signal.value"), "signal.value2", 0.5, 1.0),
|
|
34
34
|
)
|
|
35
35
|
```
|
|
36
36
|
|
|
37
|
-
|
|
38
|
-
-
|
|
37
|
+
Notes:
|
|
38
|
+
- The result column will always be of the same type as the input columns.
|
|
39
39
|
"""
|
|
40
40
|
cols, func_args = [], []
|
|
41
41
|
|
|
42
42
|
for arg in args:
|
|
43
|
-
if isinstance(arg, (str, Func)):
|
|
43
|
+
if isinstance(arg, (str, Column, Func)):
|
|
44
44
|
cols.append(arg)
|
|
45
45
|
else:
|
|
46
46
|
func_args.append(arg)
|
|
@@ -54,33 +54,34 @@ def greatest(*args: Union[ColT, float]) -> Func:
|
|
|
54
54
|
)
|
|
55
55
|
|
|
56
56
|
|
|
57
|
-
def least(*args:
|
|
57
|
+
def least(*args: str | Column | Func | float) -> Func:
|
|
58
58
|
"""
|
|
59
59
|
Returns the least (smallest) value from the given input values.
|
|
60
60
|
|
|
61
61
|
Args:
|
|
62
|
-
args (
|
|
62
|
+
args (str | Column | Func | int | float): The values to compare.
|
|
63
63
|
If a string is provided, it is assumed to be the name of the column.
|
|
64
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
64
65
|
If a Func is provided, it is assumed to be a function returning a value.
|
|
65
|
-
If an int
|
|
66
|
+
If an int or float is provided, it is assumed to be a literal.
|
|
66
67
|
|
|
67
68
|
Returns:
|
|
68
|
-
Func: A Func object that represents the least function.
|
|
69
|
+
Func: A `Func` object that represents the least function.
|
|
69
70
|
|
|
70
71
|
Example:
|
|
71
72
|
```py
|
|
72
73
|
dc.mutate(
|
|
73
|
-
least=func.least("signal.value", 0),
|
|
74
|
+
least=func.least(dc.C("signal.value"), "signal.value2", -1.0, 0),
|
|
74
75
|
)
|
|
75
76
|
```
|
|
76
77
|
|
|
77
|
-
|
|
78
|
-
-
|
|
78
|
+
Notes:
|
|
79
|
+
- The result column will always be of the same type as the input columns.
|
|
79
80
|
"""
|
|
80
81
|
cols, func_args = [], []
|
|
81
82
|
|
|
82
83
|
for arg in args:
|
|
83
|
-
if isinstance(arg, (str, Func)):
|
|
84
|
+
if isinstance(arg, (str, Column, Func)):
|
|
84
85
|
cols.append(arg)
|
|
85
86
|
else:
|
|
86
87
|
func_args.append(arg)
|
|
@@ -91,32 +92,34 @@ def least(*args: Union[ColT, float]) -> Func:
|
|
|
91
92
|
|
|
92
93
|
|
|
93
94
|
def case(
|
|
94
|
-
*args: tuple[
|
|
95
|
+
*args: tuple[ColumnElement | Func | bool, CaseT], else_: CaseT | None = None
|
|
95
96
|
) -> Func:
|
|
96
97
|
"""
|
|
97
|
-
Returns
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
or columns.
|
|
101
|
-
Result type is inferred from condition results.
|
|
98
|
+
Returns a case expression that evaluates a list of conditions and returns
|
|
99
|
+
corresponding results. Results can be Python primitives (string, numbers, booleans),
|
|
100
|
+
nested functions (including case function), or columns.
|
|
102
101
|
|
|
103
102
|
Args:
|
|
104
|
-
args tuple
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
will be None
|
|
103
|
+
args (tuple[ColumnElement | Func | bool, CaseT]): Tuples of (condition, value)
|
|
104
|
+
pairs. Each condition is evaluated in order, and the corresponding value
|
|
105
|
+
is returned for the first condition that evaluates to True.
|
|
106
|
+
else_ (CaseT, optional): Value to return if no conditions are satisfied.
|
|
107
|
+
If omitted and no conditions are satisfied, the result will be None
|
|
108
|
+
(NULL in DB).
|
|
109
109
|
|
|
110
110
|
Returns:
|
|
111
|
-
Func: A Func object that represents the case function.
|
|
111
|
+
Func: A `Func` object that represents the case function.
|
|
112
112
|
|
|
113
113
|
Example:
|
|
114
114
|
```py
|
|
115
115
|
dc.mutate(
|
|
116
|
-
res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
|
|
116
|
+
res=func.case((dc.C("num") > 0, "P"), (dc.C("num") < 0, "N"), else_="Z"),
|
|
117
117
|
)
|
|
118
118
|
```
|
|
119
|
-
|
|
119
|
+
|
|
120
|
+
Notes:
|
|
121
|
+
- The result type is inferred from the values provided in the case statements.
|
|
122
|
+
"""
|
|
120
123
|
supported_types = [int, float, complex, str, bool]
|
|
121
124
|
|
|
122
125
|
def _get_type(val):
|
|
@@ -158,24 +161,20 @@ def case(
|
|
|
158
161
|
return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_)
|
|
159
162
|
|
|
160
163
|
|
|
161
|
-
def ifelse(
|
|
162
|
-
condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT
|
|
163
|
-
) -> Func:
|
|
164
|
+
def ifelse(condition: ColumnElement | Func, if_val: CaseT, else_val: CaseT) -> Func:
|
|
164
165
|
"""
|
|
165
|
-
Returns
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
Result type is inferred from the values.
|
|
166
|
+
Returns an if-else expression that evaluates a condition and returns one
|
|
167
|
+
of two values based on the result. Values can be Python primitives
|
|
168
|
+
(string, numbers, booleans), nested functions, or columns.
|
|
169
169
|
|
|
170
170
|
Args:
|
|
171
|
-
condition (ColumnElement
|
|
172
|
-
if_val (
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
false condition outcome.
|
|
171
|
+
condition (ColumnElement | Func): Condition to evaluate.
|
|
172
|
+
if_val (ColumnElement | Func | literal): Value to return if condition is True.
|
|
173
|
+
else_val (ColumnElement | Func | literal): Value to return if condition
|
|
174
|
+
is False.
|
|
176
175
|
|
|
177
176
|
Returns:
|
|
178
|
-
Func: A Func object that represents the ifelse function.
|
|
177
|
+
Func: A `Func` object that represents the ifelse function.
|
|
179
178
|
|
|
180
179
|
Example:
|
|
181
180
|
```py
|
|
@@ -183,57 +182,69 @@ def ifelse(
|
|
|
183
182
|
res=func.ifelse(isnone("col"), "EMPTY", "NOT_EMPTY")
|
|
184
183
|
)
|
|
185
184
|
```
|
|
185
|
+
|
|
186
|
+
Notes:
|
|
187
|
+
- The result type is inferred from the values provided in the ifelse statement.
|
|
186
188
|
"""
|
|
187
189
|
return case((condition, if_val), else_=else_val)
|
|
188
190
|
|
|
189
191
|
|
|
190
|
-
def isnone(col:
|
|
192
|
+
def isnone(col: str | ColumnElement) -> Func:
    """
    Builds a function that tests whether a column value is `None` (NULL in DB).

    Args:
        col (str | Column): The column to test.
            A string is treated as the name of the column.
            A Column is used as given.

    Returns:
        Func: A `Func` object that represents the isnone function.
            It yields True when the column value is None, otherwise False.

    Example:
        ```py
        dc.mutate(test=ifelse(isnone("col"), "EMPTY", "NOT_EMPTY"))
        ```

    Notes:
        - The result column will always be of type bool.
    """
    # A plain string names a column, so wrap it before building the condition.
    target = Column(col) if isinstance(col, str) else col

    # A missing column object (None) is treated as "always None".
    if target is None:
        condition = True
    else:
        condition = target.is_(None)

    return case((condition, True), else_=False)
|
|
213
218
|
|
|
214
219
|
|
|
215
|
-
def or_(*args:
|
|
220
|
+
def or_(*args: ColumnElement | Func) -> Func:
|
|
216
221
|
"""
|
|
217
222
|
Returns the function that produces conjunction of expressions joined by OR
|
|
218
223
|
logical operator.
|
|
219
224
|
|
|
220
225
|
Args:
|
|
221
226
|
args (ColumnElement | Func): The expressions for OR statement.
|
|
227
|
+
If a string is provided, it is assumed to be the name of the column.
|
|
228
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
229
|
+
If a Func is provided, it is assumed to be a function returning a value.
|
|
222
230
|
|
|
223
231
|
Returns:
|
|
224
|
-
Func: A Func object that represents the
|
|
232
|
+
Func: A `Func` object that represents the OR function.
|
|
225
233
|
|
|
226
234
|
Example:
|
|
227
235
|
```py
|
|
228
236
|
dc.mutate(
|
|
229
|
-
test=ifelse(or_(isnone("name"), C("name") == ''), "Empty", "Not Empty")
|
|
237
|
+
test=ifelse(or_(isnone("name"), dc.C("name") == ''), "Empty", "Not Empty")
|
|
230
238
|
)
|
|
231
239
|
```
|
|
240
|
+
|
|
241
|
+
Notes:
|
|
242
|
+
- The result column will always be of type bool.
|
|
232
243
|
"""
|
|
233
244
|
cols, func_args = [], []
|
|
234
245
|
|
|
235
246
|
for arg in args:
|
|
236
|
-
if isinstance(arg, (str, Func)):
|
|
247
|
+
if isinstance(arg, (str, Column, Func)):
|
|
237
248
|
cols.append(arg)
|
|
238
249
|
else:
|
|
239
250
|
func_args.append(arg)
|
|
@@ -241,16 +252,19 @@ def or_(*args: Union[ColumnElement, Func]) -> Func:
|
|
|
241
252
|
return Func("or", inner=sql_or, cols=cols, args=func_args, result_type=bool)
|
|
242
253
|
|
|
243
254
|
|
|
244
|
-
def and_(*args:
|
|
255
|
+
def and_(*args: ColumnElement | Func) -> Func:
|
|
245
256
|
"""
|
|
246
257
|
Returns the function that produces conjunction of expressions joined by AND
|
|
247
258
|
logical operator.
|
|
248
259
|
|
|
249
260
|
Args:
|
|
250
261
|
args (ColumnElement | Func): The expressions for AND statement.
|
|
262
|
+
If a string is provided, it is assumed to be the name of the column.
|
|
263
|
+
If a Column is provided, it is assumed to be a column in the dataset.
|
|
264
|
+
If a Func is provided, it is assumed to be a function returning a value.
|
|
251
265
|
|
|
252
266
|
Returns:
|
|
253
|
-
Func: A Func object that represents the
|
|
267
|
+
Func: A `Func` object that represents the AND function.
|
|
254
268
|
|
|
255
269
|
Example:
|
|
256
270
|
```py
|
|
@@ -258,6 +272,9 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
|
|
|
258
272
|
test=ifelse(and_(isnone("name"), isnone("surname")), "Empty", "Not Empty")
|
|
259
273
|
)
|
|
260
274
|
```
|
|
275
|
+
|
|
276
|
+
Notes:
|
|
277
|
+
- The result column will always be of type bool.
|
|
261
278
|
"""
|
|
262
279
|
cols, func_args = [], []
|
|
263
280
|
|
|
@@ -268,3 +285,36 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
|
|
|
268
285
|
func_args.append(arg)
|
|
269
286
|
|
|
270
287
|
return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def not_(arg: ColumnElement | Func) -> Func:
    """
    Returns the function that produces the logical NOT of the given expression.

    Args:
        arg (ColumnElement | Func): The expression for NOT statement.
            If a string is provided, it is assumed to be the name of the column.
            If a Column is provided, it is assumed to be a column in the dataset.
            If a Func is provided, it is assumed to be a function returning a value.

    Returns:
        Func: A `Func` object that represents the NOT function.

    Example:
        ```py
        dc.mutate(
            test=not_(dc.C("value") == 5)
        )
        ```

    Notes:
        - The result column will always be of type bool.
    """
    # Route column-like inputs (name, Column, nested Func) through `cols`,
    # matching how `or_` and `and_` classify their arguments; any other
    # expression (e.g. a comparison) is passed through as a plain argument.
    is_column_like = isinstance(arg, (str, Column, Func))
    cols = [arg] if is_column_like else []
    func_args = [] if is_column_like else [arg]

    return Func("not", inner=sql_not, cols=cols, args=func_args, result_type=bool)
|