datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,78 +1,87 @@
1
- from typing import Optional
2
-
3
1
  from sqlalchemy import func as sa_func
4
2
 
3
+ from datachain.query.schema import Column
5
4
  from datachain.sql.functions import aggregate
6
5
 
7
6
  from .func import Func
8
7
 
9
8
 
10
- def count(col: Optional[str] = None) -> Func:
9
+ def count(col: str | Column | None = None) -> Func:
11
10
  """
12
- Returns the COUNT aggregate SQL function for the given column name.
11
+ Returns a COUNT aggregate SQL function for the specified column.
13
12
 
14
- The COUNT function returns the number of rows in a table.
13
+ The COUNT function returns the number of rows, optionally filtered
14
+ by a specific column.
15
15
 
16
16
  Args:
17
- col (str, optional): The name of the column for which to count rows.
18
- If not provided, it defaults to counting all rows.
17
+ col (str | Column, optional): The column to count.
18
+ If omitted, counts all rows.
19
+ The column can be specified as a string or a `Column` object.
19
20
 
20
21
  Returns:
21
- Func: A Func object that represents the COUNT aggregate function.
22
+ Func: A `Func` object representing the COUNT aggregate function.
22
23
 
23
24
  Example:
24
25
  ```py
25
26
  dc.group_by(
26
- count=func.count(),
27
+ count1=func.count(),
28
+ count2=func.count("signal.id"),
29
+ count3=func.count(dc.C("signal.category")),
27
30
  partition_by="signal.category",
28
31
  )
29
32
  ```
30
33
 
31
34
  Notes:
32
- - Result column will always be of type int.
35
+ - The result column will always have an integer type.
33
36
  """
34
37
  return Func(
35
- "count", inner=sa_func.count, cols=[col] if col else None, result_type=int
38
+ "count",
39
+ inner=sa_func.count,
40
+ cols=[col] if col is not None else None,
41
+ result_type=int,
36
42
  )
37
43
 
38
44
 
39
- def sum(col: str) -> Func:
45
+ def sum(col: str | Column) -> Func:
40
46
  """
41
- Returns the SUM aggregate SQL function for the given column name.
47
+ Returns the SUM aggregate SQL function for the specified column.
42
48
 
43
49
  The SUM function returns the total sum of a numeric column in a table.
44
50
  It sums up all the values for the specified column.
45
51
 
46
52
  Args:
47
- col (str): The name of the column for which to calculate the sum.
53
+ col (str | Column): The name of the column for which to calculate the sum.
54
+ The column can be specified as a string or a `Column` object.
48
55
 
49
56
  Returns:
50
- Func: A Func object that represents the SUM aggregate function.
57
+ Func: A `Func` object that represents the SUM aggregate function.
51
58
 
52
59
  Example:
53
60
  ```py
54
61
  dc.group_by(
55
62
  files_size=func.sum("file.size"),
63
+ total_size=func.sum(dc.C("size")),
56
64
  partition_by="signal.category",
57
65
  )
58
66
  ```
59
67
 
60
68
  Notes:
61
69
  - The `sum` function should be used on numeric columns.
62
- - Result column type will be the same as the input column type.
70
+ - The result column type will be the same as the input column type.
63
71
  """
64
72
  return Func("sum", inner=sa_func.sum, cols=[col])
65
73
 
66
74
 
67
- def avg(col: str) -> Func:
75
+ def avg(col: str | Column) -> Func:
68
76
  """
69
- Returns the AVG aggregate SQL function for the given column name.
77
+ Returns the AVG aggregate SQL function for the specified column.
70
78
 
71
79
  The AVG function returns the average of a numeric column in a table.
72
80
  It calculates the mean of all values in the specified column.
73
81
 
74
82
  Args:
75
- col (str): The name of the column for which to calculate the average.
83
+ col (str | Column): The name of the column for which to calculate the average.
84
+ Column can be specified as a string or a `Column` object.
76
85
 
77
86
  Returns:
78
87
  Func: A Func object that represents the AVG aggregate function.
@@ -81,26 +90,28 @@ def avg(col: str) -> Func:
81
90
  ```py
82
91
  dc.group_by(
83
92
  avg_file_size=func.avg("file.size"),
93
+ avg_signal_value=func.avg(dc.C("signal.value")),
84
94
  partition_by="signal.category",
85
95
  )
86
96
  ```
87
97
 
88
98
  Notes:
89
99
  - The `avg` function should be used on numeric columns.
90
- - Result column will always be of type float.
100
+ - The result column will always be of type float.
91
101
  """
92
102
  return Func("avg", inner=aggregate.avg, cols=[col], result_type=float)
93
103
 
94
104
 
95
- def min(col: str) -> Func:
105
+ def min(col: str | Column) -> Func:
96
106
  """
97
- Returns the MIN aggregate SQL function for the given column name.
107
+ Returns the MIN aggregate SQL function for the specified column.
98
108
 
99
109
  The MIN function returns the smallest value in the specified column.
100
110
  It can be used on both numeric and non-numeric columns to find the minimum value.
101
111
 
102
112
  Args:
103
- col (str): The name of the column for which to find the minimum value.
113
+ col (str | Column): The name of the column for which to find the minimum value.
114
+ Column can be specified as a string or a `Column` object.
104
115
 
105
116
  Returns:
106
117
  Func: A Func object that represents the MIN aggregate function.
@@ -109,18 +120,19 @@ def min(col: str) -> Func:
109
120
  ```py
110
121
  dc.group_by(
111
122
  smallest_file=func.min("file.size"),
123
+ min_signal=func.min(dc.C("signal")),
112
124
  partition_by="signal.category",
113
125
  )
114
126
  ```
115
127
 
116
128
  Notes:
117
129
  - The `min` function can be used with numeric, date, and string columns.
118
- - Result column will have the same type as the input column.
130
+ - The result column will have the same type as the input column.
119
131
  """
120
132
  return Func("min", inner=sa_func.min, cols=[col])
121
133
 
122
134
 
123
- def max(col: str) -> Func:
135
+ def max(col: str | Column) -> Func:
124
136
  """
125
137
  Returns the MAX aggregate SQL function for the given column name.
126
138
 
@@ -128,7 +140,8 @@ def max(col: str) -> Func:
128
140
  It can be used on both numeric and non-numeric columns to find the maximum value.
129
141
 
130
142
  Args:
131
- col (str): The name of the column for which to find the maximum value.
143
+ col (str | Column): The name of the column for which to find the maximum value.
144
+ Column can be specified as a string or a `Column` object.
132
145
 
133
146
  Returns:
134
147
  Func: A Func object that represents the MAX aggregate function.
@@ -137,18 +150,19 @@ def max(col: str) -> Func:
137
150
  ```py
138
151
  dc.group_by(
139
152
  largest_file=func.max("file.size"),
153
+ max_signal=func.max(dc.C("signal")),
140
154
  partition_by="signal.category",
141
155
  )
142
156
  ```
143
157
 
144
158
  Notes:
145
159
  - The `max` function can be used with numeric, date, and string columns.
146
- - Result column will have the same type as the input column.
160
+ - The result column will have the same type as the input column.
147
161
  """
148
162
  return Func("max", inner=sa_func.max, cols=[col])
149
163
 
150
164
 
151
- def any_value(col: str) -> Func:
165
+ def any_value(col: str | Column) -> Func:
152
166
  """
153
167
  Returns the ANY_VALUE aggregate SQL function for the given column name.
154
168
 
@@ -157,7 +171,9 @@ def any_value(col: str) -> Func:
157
171
  as long as it comes from one of the rows in the group.
158
172
 
159
173
  Args:
160
- col (str): The name of the column from which to return an arbitrary value.
174
+ col (str | Column): The name of the column from which to return
175
+ an arbitrary value.
176
+ Column can be specified as a string or a `Column` object.
161
177
 
162
178
  Returns:
163
179
  Func: A Func object that represents the ANY_VALUE aggregate function.
@@ -165,21 +181,22 @@ def any_value(col: str) -> Func:
165
181
  Example:
166
182
  ```py
167
183
  dc.group_by(
168
- file_example=func.any_value("file.name"),
184
+ file_example=func.any_value("file.path"),
185
+ signal_example=func.any_value(dc.C("signal.value")),
169
186
  partition_by="signal.category",
170
187
  )
171
188
  ```
172
189
 
173
190
  Notes:
174
191
  - The `any_value` function can be used with any type of column.
175
- - Result column will have the same type as the input column.
192
+ - The result column will have the same type as the input column.
176
193
  - The result of `any_value` is non-deterministic,
177
194
  meaning it may return different values for different executions.
178
195
  """
179
196
  return Func("any_value", inner=aggregate.any_value, cols=[col])
180
197
 
181
198
 
182
- def collect(col: str) -> Func:
199
+ def collect(col: str | Column) -> Func:
183
200
  """
184
201
  Returns the COLLECT aggregate SQL function for the given column name.
185
202
 
@@ -188,7 +205,8 @@ def collect(col: str) -> Func:
188
205
  into a collection, often for further processing or aggregation.
189
206
 
190
207
  Args:
191
- col (str): The name of the column from which to collect values.
208
+ col (str | Column): The name of the column from which to collect values.
209
+ Column can be specified as a string or a `Column` object.
192
210
 
193
211
  Returns:
194
212
  Func: A Func object that represents the COLLECT aggregate function.
@@ -197,18 +215,19 @@ def collect(col: str) -> Func:
197
215
  ```py
198
216
  dc.group_by(
199
217
  signals=func.collect("signal"),
218
+ file_paths=func.collect(dc.C("file.path")),
200
219
  partition_by="signal.category",
201
220
  )
202
221
  ```
203
222
 
204
223
  Notes:
205
224
  - The `collect` function can be used with numeric and string columns.
206
- - Result column will have an array type.
225
+ - The result column will have an array type.
207
226
  """
208
227
  return Func("collect", inner=aggregate.collect, cols=[col], is_array=True)
209
228
 
210
229
 
211
- def concat(col: str, separator="") -> Func:
230
+ def concat(col: str | Column, separator="") -> Func:
212
231
  """
213
232
  Returns the CONCAT aggregate SQL function for the given column name.
214
233
 
@@ -217,9 +236,10 @@ def concat(col: str, separator="") -> Func:
217
236
  into a single combined value.
218
237
 
219
238
  Args:
220
- col (str): The name of the column from which to concatenate values.
239
+ col (str | Column): The name of the column from which to concatenate values.
240
+ Column can be specified as a string or a `Column` object.
221
241
  separator (str, optional): The separator to use between concatenated values.
222
- Defaults to an empty string.
242
+ Defaults to an empty string.
223
243
 
224
244
  Returns:
225
245
  Func: A Func object that represents the CONCAT aggregate function.
@@ -227,14 +247,15 @@ def concat(col: str, separator="") -> Func:
227
247
  Example:
228
248
  ```py
229
249
  dc.group_by(
230
- files=func.concat("file.name", separator=", "),
250
+ files=func.concat("file.path", separator=", "),
251
+ signals=func.concat(dc.C("signal.name"), separator=" | "),
231
252
  partition_by="signal.category",
232
253
  )
233
254
  ```
234
255
 
235
256
  Notes:
236
257
  - The `concat` function can be used with string columns.
237
- - Result column will have a string type.
258
+ - The result column will have a string type.
238
259
  """
239
260
 
240
261
  def inner(arg):
@@ -325,7 +346,7 @@ def dense_rank() -> Func:
325
346
  return Func("dense_rank", inner=sa_func.dense_rank, result_type=int, is_window=True)
326
347
 
327
348
 
328
- def first(col: str) -> Func:
349
+ def first(col: str | Column) -> Func:
329
350
  """
330
351
  Returns the FIRST_VALUE window function for SQL queries.
331
352
 
@@ -334,7 +355,9 @@ def first(col: str) -> Func:
334
355
  and can be useful for retrieving the leading value in a group of rows.
335
356
 
336
357
  Args:
337
- col (str): The name of the column from which to retrieve the first value.
358
+ col (str | Column): The name of the column from which to retrieve
359
+ the first value.
360
+ Column can be specified as a string or a `Column` object.
338
361
 
339
362
  Returns:
340
363
  Func: A Func object that represents the FIRST_VALUE window function.
@@ -343,7 +366,8 @@ def first(col: str) -> Func:
343
366
  ```py
344
367
  window = func.window(partition_by="signal.category", order_by="created_at")
345
368
  dc.mutate(
346
- first_file=func.first("file.name").over(window),
369
+ first_file=func.first("file.path").over(window),
370
+ first_signal=func.first(dc.C("signal.value")).over(window),
347
371
  )
348
372
  ```
349
373