datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/func/aggregate.py
CHANGED
|
@@ -1,78 +1,87 @@
|
|
|
1
|
-
from typing import Optional
|
|
2
|
-
|
|
3
1
|
from sqlalchemy import func as sa_func
|
|
4
2
|
|
|
3
|
+
from datachain.query.schema import Column
|
|
5
4
|
from datachain.sql.functions import aggregate
|
|
6
5
|
|
|
7
6
|
from .func import Func
|
|
8
7
|
|
|
9
8
|
|
|
10
|
-
def count(col:
|
|
9
|
+
def count(col: str | Column | None = None) -> Func:
|
|
11
10
|
"""
|
|
12
|
-
Returns
|
|
11
|
+
Returns a COUNT aggregate SQL function for the specified column.
|
|
13
12
|
|
|
14
|
-
The COUNT function returns the number of rows
|
|
13
|
+
The COUNT function returns the number of rows, optionally filtered
|
|
14
|
+
by a specific column.
|
|
15
15
|
|
|
16
16
|
Args:
|
|
17
|
-
col (str, optional): The
|
|
18
|
-
|
|
17
|
+
col (str | Column, optional): The column to count.
|
|
18
|
+
If omitted, counts all rows.
|
|
19
|
+
The column can be specified as a string or a `Column` object.
|
|
19
20
|
|
|
20
21
|
Returns:
|
|
21
|
-
Func: A Func object
|
|
22
|
+
Func: A `Func` object representing the COUNT aggregate function.
|
|
22
23
|
|
|
23
24
|
Example:
|
|
24
25
|
```py
|
|
25
26
|
dc.group_by(
|
|
26
|
-
|
|
27
|
+
count1=func.count(),
|
|
28
|
+
count2=func.count("signal.id"),
|
|
29
|
+
count3=func.count(dc.C("signal.category")),
|
|
27
30
|
partition_by="signal.category",
|
|
28
31
|
)
|
|
29
32
|
```
|
|
30
33
|
|
|
31
34
|
Notes:
|
|
32
|
-
-
|
|
35
|
+
- The result column will always have an integer type.
|
|
33
36
|
"""
|
|
34
37
|
return Func(
|
|
35
|
-
"count",
|
|
38
|
+
"count",
|
|
39
|
+
inner=sa_func.count,
|
|
40
|
+
cols=[col] if col is not None else None,
|
|
41
|
+
result_type=int,
|
|
36
42
|
)
|
|
37
43
|
|
|
38
44
|
|
|
39
|
-
def sum(col: str) -> Func:
|
|
45
|
+
def sum(col: str | Column) -> Func:
|
|
40
46
|
"""
|
|
41
|
-
Returns the SUM aggregate SQL function for the
|
|
47
|
+
Returns the SUM aggregate SQL function for the specified column.
|
|
42
48
|
|
|
43
49
|
The SUM function returns the total sum of a numeric column in a table.
|
|
44
50
|
It sums up all the values for the specified column.
|
|
45
51
|
|
|
46
52
|
Args:
|
|
47
|
-
col (str): The name of the column for which to calculate the sum.
|
|
53
|
+
col (str | Column): The name of the column for which to calculate the sum.
|
|
54
|
+
The column can be specified as a string or a `Column` object.
|
|
48
55
|
|
|
49
56
|
Returns:
|
|
50
|
-
Func: A Func object that represents the SUM aggregate function.
|
|
57
|
+
Func: A `Func` object that represents the SUM aggregate function.
|
|
51
58
|
|
|
52
59
|
Example:
|
|
53
60
|
```py
|
|
54
61
|
dc.group_by(
|
|
55
62
|
files_size=func.sum("file.size"),
|
|
63
|
+
total_size=func.sum(dc.C("size")),
|
|
56
64
|
partition_by="signal.category",
|
|
57
65
|
)
|
|
58
66
|
```
|
|
59
67
|
|
|
60
68
|
Notes:
|
|
61
69
|
- The `sum` function should be used on numeric columns.
|
|
62
|
-
-
|
|
70
|
+
- The result column type will be the same as the input column type.
|
|
63
71
|
"""
|
|
64
72
|
return Func("sum", inner=sa_func.sum, cols=[col])
|
|
65
73
|
|
|
66
74
|
|
|
67
|
-
def avg(col: str) -> Func:
|
|
75
|
+
def avg(col: str | Column) -> Func:
|
|
68
76
|
"""
|
|
69
|
-
Returns the AVG aggregate SQL function for the
|
|
77
|
+
Returns the AVG aggregate SQL function for the specified column.
|
|
70
78
|
|
|
71
79
|
The AVG function returns the average of a numeric column in a table.
|
|
72
80
|
It calculates the mean of all values in the specified column.
|
|
73
81
|
|
|
74
82
|
Args:
|
|
75
|
-
col (str): The name of the column for which to calculate the average.
|
|
83
|
+
col (str | Column): The name of the column for which to calculate the average.
|
|
84
|
+
Column can be specified as a string or a `Column` object.
|
|
76
85
|
|
|
77
86
|
Returns:
|
|
78
87
|
Func: A Func object that represents the AVG aggregate function.
|
|
@@ -81,26 +90,28 @@ def avg(col: str) -> Func:
|
|
|
81
90
|
```py
|
|
82
91
|
dc.group_by(
|
|
83
92
|
avg_file_size=func.avg("file.size"),
|
|
93
|
+
avg_signal_value=func.avg(dc.C("signal.value")),
|
|
84
94
|
partition_by="signal.category",
|
|
85
95
|
)
|
|
86
96
|
```
|
|
87
97
|
|
|
88
98
|
Notes:
|
|
89
99
|
- The `avg` function should be used on numeric columns.
|
|
90
|
-
-
|
|
100
|
+
- The result column will always be of type float.
|
|
91
101
|
"""
|
|
92
102
|
return Func("avg", inner=aggregate.avg, cols=[col], result_type=float)
|
|
93
103
|
|
|
94
104
|
|
|
95
|
-
def min(col: str) -> Func:
|
|
105
|
+
def min(col: str | Column) -> Func:
|
|
96
106
|
"""
|
|
97
|
-
Returns the MIN aggregate SQL function for the
|
|
107
|
+
Returns the MIN aggregate SQL function for the specified column.
|
|
98
108
|
|
|
99
109
|
The MIN function returns the smallest value in the specified column.
|
|
100
110
|
It can be used on both numeric and non-numeric columns to find the minimum value.
|
|
101
111
|
|
|
102
112
|
Args:
|
|
103
|
-
col (str): The name of the column for which to find the minimum value.
|
|
113
|
+
col (str | Column): The name of the column for which to find the minimum value.
|
|
114
|
+
Column can be specified as a string or a `Column` object.
|
|
104
115
|
|
|
105
116
|
Returns:
|
|
106
117
|
Func: A Func object that represents the MIN aggregate function.
|
|
@@ -109,18 +120,19 @@ def min(col: str) -> Func:
|
|
|
109
120
|
```py
|
|
110
121
|
dc.group_by(
|
|
111
122
|
smallest_file=func.min("file.size"),
|
|
123
|
+
min_signal=func.min(dc.C("signal")),
|
|
112
124
|
partition_by="signal.category",
|
|
113
125
|
)
|
|
114
126
|
```
|
|
115
127
|
|
|
116
128
|
Notes:
|
|
117
129
|
- The `min` function can be used with numeric, date, and string columns.
|
|
118
|
-
-
|
|
130
|
+
- The result column will have the same type as the input column.
|
|
119
131
|
"""
|
|
120
132
|
return Func("min", inner=sa_func.min, cols=[col])
|
|
121
133
|
|
|
122
134
|
|
|
123
|
-
def max(col: str) -> Func:
|
|
135
|
+
def max(col: str | Column) -> Func:
|
|
124
136
|
"""
|
|
125
137
|
Returns the MAX aggregate SQL function for the given column name.
|
|
126
138
|
|
|
@@ -128,7 +140,8 @@ def max(col: str) -> Func:
|
|
|
128
140
|
It can be used on both numeric and non-numeric columns to find the maximum value.
|
|
129
141
|
|
|
130
142
|
Args:
|
|
131
|
-
col (str): The name of the column for which to find the maximum value.
|
|
143
|
+
col (str | Column): The name of the column for which to find the maximum value.
|
|
144
|
+
Column can be specified as a string or a `Column` object.
|
|
132
145
|
|
|
133
146
|
Returns:
|
|
134
147
|
Func: A Func object that represents the MAX aggregate function.
|
|
@@ -137,18 +150,19 @@ def max(col: str) -> Func:
|
|
|
137
150
|
```py
|
|
138
151
|
dc.group_by(
|
|
139
152
|
largest_file=func.max("file.size"),
|
|
153
|
+
max_signal=func.max(dc.C("signal")),
|
|
140
154
|
partition_by="signal.category",
|
|
141
155
|
)
|
|
142
156
|
```
|
|
143
157
|
|
|
144
158
|
Notes:
|
|
145
159
|
- The `max` function can be used with numeric, date, and string columns.
|
|
146
|
-
-
|
|
160
|
+
- The result column will have the same type as the input column.
|
|
147
161
|
"""
|
|
148
162
|
return Func("max", inner=sa_func.max, cols=[col])
|
|
149
163
|
|
|
150
164
|
|
|
151
|
-
def any_value(col: str) -> Func:
|
|
165
|
+
def any_value(col: str | Column) -> Func:
|
|
152
166
|
"""
|
|
153
167
|
Returns the ANY_VALUE aggregate SQL function for the given column name.
|
|
154
168
|
|
|
@@ -157,7 +171,9 @@ def any_value(col: str) -> Func:
|
|
|
157
171
|
as long as it comes from one of the rows in the group.
|
|
158
172
|
|
|
159
173
|
Args:
|
|
160
|
-
col (str): The name of the column from which to return
|
|
174
|
+
col (str | Column): The name of the column from which to return
|
|
175
|
+
an arbitrary value.
|
|
176
|
+
Column can be specified as a string or a `Column` object.
|
|
161
177
|
|
|
162
178
|
Returns:
|
|
163
179
|
Func: A Func object that represents the ANY_VALUE aggregate function.
|
|
@@ -165,21 +181,22 @@ def any_value(col: str) -> Func:
|
|
|
165
181
|
Example:
|
|
166
182
|
```py
|
|
167
183
|
dc.group_by(
|
|
168
|
-
file_example=func.any_value("file.
|
|
184
|
+
file_example=func.any_value("file.path"),
|
|
185
|
+
signal_example=func.any_value(dc.C("signal.value")),
|
|
169
186
|
partition_by="signal.category",
|
|
170
187
|
)
|
|
171
188
|
```
|
|
172
189
|
|
|
173
190
|
Notes:
|
|
174
191
|
- The `any_value` function can be used with any type of column.
|
|
175
|
-
-
|
|
192
|
+
- The result column will have the same type as the input column.
|
|
176
193
|
- The result of `any_value` is non-deterministic,
|
|
177
194
|
meaning it may return different values for different executions.
|
|
178
195
|
"""
|
|
179
196
|
return Func("any_value", inner=aggregate.any_value, cols=[col])
|
|
180
197
|
|
|
181
198
|
|
|
182
|
-
def collect(col: str) -> Func:
|
|
199
|
+
def collect(col: str | Column) -> Func:
|
|
183
200
|
"""
|
|
184
201
|
Returns the COLLECT aggregate SQL function for the given column name.
|
|
185
202
|
|
|
@@ -188,7 +205,8 @@ def collect(col: str) -> Func:
|
|
|
188
205
|
into a collection, often for further processing or aggregation.
|
|
189
206
|
|
|
190
207
|
Args:
|
|
191
|
-
col (str): The name of the column from which to collect values.
|
|
208
|
+
col (str | Column): The name of the column from which to collect values.
|
|
209
|
+
Column can be specified as a string or a `Column` object.
|
|
192
210
|
|
|
193
211
|
Returns:
|
|
194
212
|
Func: A Func object that represents the COLLECT aggregate function.
|
|
@@ -197,18 +215,19 @@ def collect(col: str) -> Func:
|
|
|
197
215
|
```py
|
|
198
216
|
dc.group_by(
|
|
199
217
|
signals=func.collect("signal"),
|
|
218
|
+
file_paths=func.collect(dc.C("file.path")),
|
|
200
219
|
partition_by="signal.category",
|
|
201
220
|
)
|
|
202
221
|
```
|
|
203
222
|
|
|
204
223
|
Notes:
|
|
205
224
|
- The `collect` function can be used with numeric and string columns.
|
|
206
|
-
-
|
|
225
|
+
- The result column will have an array type.
|
|
207
226
|
"""
|
|
208
227
|
return Func("collect", inner=aggregate.collect, cols=[col], is_array=True)
|
|
209
228
|
|
|
210
229
|
|
|
211
|
-
def concat(col: str, separator="") -> Func:
|
|
230
|
+
def concat(col: str | Column, separator="") -> Func:
|
|
212
231
|
"""
|
|
213
232
|
Returns the CONCAT aggregate SQL function for the given column name.
|
|
214
233
|
|
|
@@ -217,9 +236,10 @@ def concat(col: str, separator="") -> Func:
|
|
|
217
236
|
into a single combined value.
|
|
218
237
|
|
|
219
238
|
Args:
|
|
220
|
-
col (str): The name of the column from which to concatenate values.
|
|
239
|
+
col (str | Column): The name of the column from which to concatenate values.
|
|
240
|
+
Column can be specified as a string or a `Column` object.
|
|
221
241
|
separator (str, optional): The separator to use between concatenated values.
|
|
222
|
-
|
|
242
|
+
Defaults to an empty string.
|
|
223
243
|
|
|
224
244
|
Returns:
|
|
225
245
|
Func: A Func object that represents the CONCAT aggregate function.
|
|
@@ -227,14 +247,15 @@ def concat(col: str, separator="") -> Func:
|
|
|
227
247
|
Example:
|
|
228
248
|
```py
|
|
229
249
|
dc.group_by(
|
|
230
|
-
files=func.concat("file.
|
|
250
|
+
files=func.concat("file.path", separator=", "),
|
|
251
|
+
signals=func.concat(dc.C("signal.name"), separator=" | "),
|
|
231
252
|
partition_by="signal.category",
|
|
232
253
|
)
|
|
233
254
|
```
|
|
234
255
|
|
|
235
256
|
Notes:
|
|
236
257
|
- The `concat` function can be used with string columns.
|
|
237
|
-
-
|
|
258
|
+
- The result column will have a string type.
|
|
238
259
|
"""
|
|
239
260
|
|
|
240
261
|
def inner(arg):
|
|
@@ -325,7 +346,7 @@ def dense_rank() -> Func:
|
|
|
325
346
|
return Func("dense_rank", inner=sa_func.dense_rank, result_type=int, is_window=True)
|
|
326
347
|
|
|
327
348
|
|
|
328
|
-
def first(col: str) -> Func:
|
|
349
|
+
def first(col: str | Column) -> Func:
|
|
329
350
|
"""
|
|
330
351
|
Returns the FIRST_VALUE window function for SQL queries.
|
|
331
352
|
|
|
@@ -334,7 +355,9 @@ def first(col: str) -> Func:
|
|
|
334
355
|
and can be useful for retrieving the leading value in a group of rows.
|
|
335
356
|
|
|
336
357
|
Args:
|
|
337
|
-
col (str): The name of the column from which to retrieve
|
|
358
|
+
col (str | Column): The name of the column from which to retrieve
|
|
359
|
+
the first value.
|
|
360
|
+
Column can be specified as a string or a `Column` object.
|
|
338
361
|
|
|
339
362
|
Returns:
|
|
340
363
|
Func: A Func object that represents the FIRST_VALUE window function.
|
|
@@ -343,7 +366,8 @@ def first(col: str) -> Func:
|
|
|
343
366
|
```py
|
|
344
367
|
window = func.window(partition_by="signal.category", order_by="created_at")
|
|
345
368
|
dc.mutate(
|
|
346
|
-
first_file=func.first("file.
|
|
369
|
+
first_file=func.first("file.path").over(window),
|
|
370
|
+
first_signal=func.first(dc.C("signal.value")).over(window),
|
|
347
371
|
)
|
|
348
372
|
```
|
|
349
373
|
|