datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,46 +1,46 @@
1
- from typing import Optional, Union
2
-
3
1
  from sqlalchemy import ColumnElement
4
2
  from sqlalchemy import and_ as sql_and
5
3
  from sqlalchemy import case as sql_case
4
+ from sqlalchemy import not_ as sql_not
6
5
  from sqlalchemy import or_ as sql_or
7
6
 
8
7
  from datachain.lib.utils import DataChainParamsError
9
8
  from datachain.query.schema import Column
10
9
  from datachain.sql.functions import conditional
11
10
 
12
- from .func import ColT, Func
11
+ from .func import Func
13
12
 
14
- CaseT = Union[int, float, complex, bool, str, Func, ColumnElement]
13
+ CaseT = int | float | complex | bool | str | Func | ColumnElement
15
14
 
16
15
 
17
- def greatest(*args: Union[ColT, float]) -> Func:
16
+ def greatest(*args: str | Column | Func | float) -> Func:
18
17
  """
19
18
  Returns the greatest (largest) value from the given input values.
20
19
 
21
20
  Args:
22
- args (ColT | str | int | float | Sequence): The values to compare.
21
+ args (str | Column | Func | int | float): The values to compare.
23
22
  If a string is provided, it is assumed to be the name of the column.
23
+ If a Column is provided, it is assumed to be a column in the dataset.
24
24
  If a Func is provided, it is assumed to be a function returning a value.
25
- If an int, float, or Sequence is provided, it is assumed to be a literal.
25
+ If an int or float is provided, it is assumed to be a literal.
26
26
 
27
27
  Returns:
28
- Func: A Func object that represents the greatest function.
28
+ Func: A `Func` object that represents the greatest function.
29
29
 
30
30
  Example:
31
31
  ```py
32
32
  dc.mutate(
33
- greatest=func.greatest("signal.value", 0),
33
+ greatest=func.greatest(dc.C("signal.value"), "signal.value2", 0.5, 1.0),
34
34
  )
35
35
  ```
36
36
 
37
- Note:
38
- - Result column will always be of the same type as the input columns.
37
+ Notes:
38
+ - The result column will always be of the same type as the input columns.
39
39
  """
40
40
  cols, func_args = [], []
41
41
 
42
42
  for arg in args:
43
- if isinstance(arg, (str, Func)):
43
+ if isinstance(arg, (str, Column, Func)):
44
44
  cols.append(arg)
45
45
  else:
46
46
  func_args.append(arg)
@@ -54,33 +54,34 @@ def greatest(*args: Union[ColT, float]) -> Func:
54
54
  )
55
55
 
56
56
 
57
- def least(*args: Union[ColT, float]) -> Func:
57
+ def least(*args: str | Column | Func | float) -> Func:
58
58
  """
59
59
  Returns the least (smallest) value from the given input values.
60
60
 
61
61
  Args:
62
- args (ColT | str | int | float | Sequence): The values to compare.
62
+ args (str | Column | Func | int | float): The values to compare.
63
63
  If a string is provided, it is assumed to be the name of the column.
64
+ If a Column is provided, it is assumed to be a column in the dataset.
64
65
  If a Func is provided, it is assumed to be a function returning a value.
65
- If an int, float, or Sequence is provided, it is assumed to be a literal.
66
+ If an int or float is provided, it is assumed to be a literal.
66
67
 
67
68
  Returns:
68
- Func: A Func object that represents the least function.
69
+ Func: A `Func` object that represents the least function.
69
70
 
70
71
  Example:
71
72
  ```py
72
73
  dc.mutate(
73
- least=func.least("signal.value", 0),
74
+ least=func.least(dc.C("signal.value"), "signal.value2", -1.0, 0),
74
75
  )
75
76
  ```
76
77
 
77
- Note:
78
- - Result column will always be of the same type as the input columns.
78
+ Notes:
79
+ - The result column will always be of the same type as the input columns.
79
80
  """
80
81
  cols, func_args = [], []
81
82
 
82
83
  for arg in args:
83
- if isinstance(arg, (str, Func)):
84
+ if isinstance(arg, (str, Column, Func)):
84
85
  cols.append(arg)
85
86
  else:
86
87
  func_args.append(arg)
@@ -91,32 +92,34 @@ def least(*args: Union[ColT, float]) -> Func:
91
92
 
92
93
 
93
94
  def case(
94
- *args: tuple[Union[ColumnElement, Func, bool], CaseT], else_: Optional[CaseT] = None
95
+ *args: tuple[ColumnElement | Func | bool, CaseT], else_: CaseT | None = None
95
96
  ) -> Func:
96
97
  """
97
- Returns the case function that produces case expression which has a list of
98
- conditions and corresponding results. Results can be python primitives like string,
99
- numbers or booleans but can also be other nested functions (including case function)
100
- or columns.
101
- Result type is inferred from condition results.
98
+ Returns a case expression that evaluates a list of conditions and returns
99
+ corresponding results. Results can be Python primitives (string, numbers, booleans),
100
+ nested functions (including case function), or columns.
102
101
 
103
102
  Args:
104
- args tuple((ColumnElement | Func | bool),(str | int | float | complex | bool, Func, ColumnElement)):
105
- Tuple of condition and values pair.
106
- else_ (str | int | float | complex | bool, Func): optional else value in case
107
- expression. If omitted, and no case conditions are satisfied, the result
108
- will be None (NULL in DB).
103
+ args (tuple[ColumnElement | Func | bool, CaseT]): Tuples of (condition, value)
104
+ pairs. Each condition is evaluated in order, and the corresponding value
105
+ is returned for the first condition that evaluates to True.
106
+ else_ (CaseT, optional): Value to return if no conditions are satisfied.
107
+ If omitted and no conditions are satisfied, the result will be None
108
+ (NULL in DB).
109
109
 
110
110
  Returns:
111
- Func: A Func object that represents the case function.
111
+ Func: A `Func` object that represents the case function.
112
112
 
113
113
  Example:
114
114
  ```py
115
115
  dc.mutate(
116
- res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
116
+ res=func.case((dc.C("num") > 0, "P"), (dc.C("num") < 0, "N"), else_="Z"),
117
117
  )
118
118
  ```
119
- """ # noqa: E501
119
+
120
+ Notes:
121
+ - The result type is inferred from the values provided in the case statements.
122
+ """
120
123
  supported_types = [int, float, complex, str, bool]
121
124
 
122
125
  def _get_type(val):
@@ -158,24 +161,20 @@ def case(
158
161
  return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_)
159
162
 
160
163
 
161
- def ifelse(
162
- condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT
163
- ) -> Func:
164
+ def ifelse(condition: ColumnElement | Func, if_val: CaseT, else_val: CaseT) -> Func:
164
165
  """
165
- Returns the ifelse function that produces if expression which has a condition
166
- and values for true and false outcome. Results can be one of python primitives
167
- like string, numbers or booleans, but can also be nested functions or columns.
168
- Result type is inferred from the values.
166
+ Returns an if-else expression that evaluates a condition and returns one
167
+ of two values based on the result. Values can be Python primitives
168
+ (string, numbers, booleans), nested functions, or columns.
169
169
 
170
170
  Args:
171
- condition (ColumnElement, Func): Condition which is evaluated.
172
- if_val (str | int | float | complex | bool, Func, ColumnElement): Value for true
173
- condition outcome.
174
- else_val (str | int | float | complex | bool, Func, ColumnElement): Value for
175
- false condition outcome.
171
+ condition (ColumnElement | Func): Condition to evaluate.
172
+ if_val (ColumnElement | Func | literal): Value to return if condition is True.
173
+ else_val (ColumnElement | Func | literal): Value to return if condition
174
+ is False.
176
175
 
177
176
  Returns:
178
- Func: A Func object that represents the ifelse function.
177
+ Func: A `Func` object that represents the ifelse function.
179
178
 
180
179
  Example:
181
180
  ```py
@@ -183,57 +182,69 @@ def ifelse(
183
182
  res=func.ifelse(isnone("col"), "EMPTY", "NOT_EMPTY")
184
183
  )
185
184
  ```
185
+
186
+ Notes:
187
+ - The result type is inferred from the values provided in the ifelse statement.
186
188
  """
187
189
  return case((condition, if_val), else_=else_val)
188
190
 
189
191
 
190
- def isnone(col: Union[str, Column]) -> Func:
192
+ def isnone(col: str | ColumnElement) -> Func:
191
193
  """
192
- Returns True if column value is None, otherwise False.
194
+ Returns a function that checks if the column value is `None` (NULL in DB).
193
195
 
194
196
  Args:
195
197
  col (str | Column): Column to check if it's None or not.
196
198
  If a string is provided, it is assumed to be the name of the column.
199
+ If a Column is provided, it is assumed to be a column in the dataset.
197
200
 
198
201
  Returns:
199
- Func: A Func object that represents the conditional to check if column is None.
202
+ Func: A `Func` object that represents the isnone function.
203
+ Returns True if column value is None, otherwise False.
200
204
 
201
205
  Example:
202
206
  ```py
203
207
  dc.mutate(test=ifelse(isnone("col"), "EMPTY", "NOT_EMPTY"))
204
208
  ```
205
- """
206
- from datachain import C
207
209
 
210
+ Notes:
211
+ - The result column will always be of type bool.
212
+ """
208
213
  if isinstance(col, str):
209
- # if string, it is assumed to be the name of the column
210
- col = C(col)
214
+ # if string is provided, it is assumed to be the name of the column
215
+ col = Column(col)
211
216
 
212
217
  return case((col.is_(None) if col is not None else True, True), else_=False)
213
218
 
214
219
 
215
- def or_(*args: Union[ColumnElement, Func]) -> Func:
220
+ def or_(*args: ColumnElement | Func) -> Func:
216
221
  """
217
222
  Returns the function that produces conjunction of expressions joined by OR
218
223
  logical operator.
219
224
 
220
225
  Args:
221
226
  args (ColumnElement | Func): The expressions for OR statement.
227
+ If a string is provided, it is assumed to be the name of the column.
228
+ If a Column is provided, it is assumed to be a column in the dataset.
229
+ If a Func is provided, it is assumed to be a function returning a value.
222
230
 
223
231
  Returns:
224
- Func: A Func object that represents the or function.
232
+ Func: A `Func` object that represents the OR function.
225
233
 
226
234
  Example:
227
235
  ```py
228
236
  dc.mutate(
229
- test=ifelse(or_(isnone("name"), C("name") == ''), "Empty", "Not Empty")
237
+ test=ifelse(or_(isnone("name"), dc.C("name") == ''), "Empty", "Not Empty")
230
238
  )
231
239
  ```
240
+
241
+ Notes:
242
+ - The result column will always be of type bool.
232
243
  """
233
244
  cols, func_args = [], []
234
245
 
235
246
  for arg in args:
236
- if isinstance(arg, (str, Func)):
247
+ if isinstance(arg, (str, Column, Func)):
237
248
  cols.append(arg)
238
249
  else:
239
250
  func_args.append(arg)
@@ -241,16 +252,19 @@ def or_(*args: Union[ColumnElement, Func]) -> Func:
241
252
  return Func("or", inner=sql_or, cols=cols, args=func_args, result_type=bool)
242
253
 
243
254
 
244
- def and_(*args: Union[ColumnElement, Func]) -> Func:
255
+ def and_(*args: ColumnElement | Func) -> Func:
245
256
  """
246
257
  Returns the function that produces conjunction of expressions joined by AND
247
258
  logical operator.
248
259
 
249
260
  Args:
250
261
  args (ColumnElement | Func): The expressions for AND statement.
262
+ If a string is provided, it is assumed to be the name of the column.
263
+ If a Column is provided, it is assumed to be a column in the dataset.
264
+ If a Func is provided, it is assumed to be a function returning a value.
251
265
 
252
266
  Returns:
253
- Func: A Func object that represents the and function.
267
+ Func: A `Func` object that represents the AND function.
254
268
 
255
269
  Example:
256
270
  ```py
@@ -258,6 +272,9 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
258
272
  test=ifelse(and_(isnone("name"), isnone("surname")), "Empty", "Not Empty")
259
273
  )
260
274
  ```
275
+
276
+ Notes:
277
+ - The result column will always be of type bool.
261
278
  """
262
279
  cols, func_args = [], []
263
280
 
@@ -268,3 +285,36 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
268
285
  func_args.append(arg)
269
286
 
270
287
  return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
288
+
289
+
290
+ def not_(arg: ColumnElement | Func) -> Func:
291
+ """
292
+ Returns the function that produces NOT of the given expressions.
293
+
294
+ Args:
295
+ arg (ColumnElement | Func): The expression for NOT statement.
296
+ If a string is provided, it is assumed to be the name of the column.
297
+ If a Column is provided, it is assumed to be a column in the dataset.
298
+ If a Func is provided, it is assumed to be a function returning a value.
299
+
300
+ Returns:
301
+ Func: A `Func` object that represents the NOT function.
302
+
303
+ Example:
304
+ ```py
305
+ dc.mutate(
306
+ test=not_(C("value") == 5)
307
+ )
308
+ ```
309
+
310
+ Notes:
311
+ - The result column will always be of type bool.
312
+ """
313
+ cols, func_args = [], []
314
+
315
+ if isinstance(arg, (str, Func)):
316
+ cols.append(arg)
317
+ else:
318
+ func_args.append(arg)
319
+
320
+ return Func("not", inner=sql_not, cols=cols, args=func_args, result_type=bool)