datachain 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (46) hide show
  1. datachain/__init__.py +0 -2
  2. datachain/catalog/catalog.py +12 -9
  3. datachain/cli.py +109 -9
  4. datachain/client/fsspec.py +9 -9
  5. datachain/data_storage/metastore.py +63 -11
  6. datachain/data_storage/schema.py +2 -2
  7. datachain/data_storage/sqlite.py +5 -4
  8. datachain/data_storage/warehouse.py +18 -18
  9. datachain/dataset.py +142 -14
  10. datachain/func/__init__.py +49 -0
  11. datachain/{lib/func → func}/aggregate.py +13 -11
  12. datachain/func/array.py +176 -0
  13. datachain/func/base.py +23 -0
  14. datachain/func/conditional.py +81 -0
  15. datachain/func/func.py +384 -0
  16. datachain/func/path.py +110 -0
  17. datachain/func/random.py +23 -0
  18. datachain/func/string.py +154 -0
  19. datachain/func/window.py +49 -0
  20. datachain/lib/arrow.py +24 -12
  21. datachain/lib/data_model.py +25 -9
  22. datachain/lib/dataset_info.py +9 -5
  23. datachain/lib/dc.py +94 -56
  24. datachain/lib/hf.py +1 -1
  25. datachain/lib/signal_schema.py +1 -1
  26. datachain/lib/utils.py +1 -0
  27. datachain/lib/webdataset_laion.py +5 -5
  28. datachain/model/bbox.py +2 -2
  29. datachain/model/pose.py +5 -5
  30. datachain/model/segment.py +2 -2
  31. datachain/nodes_fetcher.py +2 -2
  32. datachain/query/dataset.py +57 -34
  33. datachain/remote/studio.py +40 -8
  34. datachain/sql/__init__.py +0 -2
  35. datachain/sql/functions/__init__.py +0 -26
  36. datachain/sql/selectable.py +11 -5
  37. datachain/sql/sqlite/base.py +11 -2
  38. datachain/studio.py +29 -0
  39. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/METADATA +2 -2
  40. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/RECORD +44 -37
  41. datachain/lib/func/__init__.py +0 -32
  42. datachain/lib/func/func.py +0 -152
  43. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/LICENSE +0 -0
  44. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/WHEEL +0 -0
  45. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.7.1.dist-info → datachain-0.7.3.dist-info}/top_level.txt +0 -0
datachain/func/func.py ADDED
@@ -0,0 +1,384 @@
1
+ import inspect
2
+ from collections.abc import Sequence
3
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
4
+
5
+ from sqlalchemy import BindParameter, ColumnElement, desc
6
+
7
+ from datachain.lib.convert.python_to_sql import python_to_sql
8
+ from datachain.lib.utils import DataChainColumnError, DataChainParamsError
9
+ from datachain.query.schema import Column, ColumnMeta
10
+
11
+ from .base import Function
12
+
13
+ if TYPE_CHECKING:
14
+ from sqlalchemy import TableClause
15
+
16
+ from datachain import DataType
17
+ from datachain.lib.signal_schema import SignalSchema
18
+
19
+ from .window import Window
20
+
21
+
22
+ ColT = Union[str, ColumnElement, "Func"]
23
+
24
+
25
class Func(Function):
    """
    Represents a function to be applied to a column in a SQL query.

    A Func is a lazy description: it stores a callable (`inner`) together with
    the column operands (`cols`) and extra positional arguments (`args`) and only
    builds the actual SQL expression when `get_column()` is called.

    Arithmetic, bitwise and comparison operators are overloaded to return new
    Func objects (via the module-level `math_*` helpers) instead of computing
    a Python value.
    """

    def __init__(
        self,
        name: str,
        inner: Callable,
        cols: Optional[Sequence[ColT]] = None,
        args: Optional[Sequence[Any]] = None,
        result_type: Optional["DataType"] = None,
        is_array: bool = False,
        is_window: bool = False,
        window: Optional["Window"] = None,
        label: Optional[str] = None,
    ) -> None:
        """
        Args:
            name: Function name; used by `__str__` and as the fallback column name.
            inner: Callable invoked as `inner(*cols, *args)` in `get_column()`.
            cols: Column operands (column names, column elements or nested Funcs).
            args: Extra positional arguments passed to `inner` after the columns.
            result_type: Explicit result type; when missing it is inferred from
                the operand columns (see `get_result_type()`).
            is_array: If true, the inferred result type is wrapped as `list[...]`.
            is_window: Marks the function as a window function that needs `over()`.
            window: Window specification attached by `over()`.
            label: Explicit name for the resulting column.
        """
        self.name = name
        self.inner = inner
        # `None` (or any empty sequence) is normalised to an empty list.
        self.cols = cols or []
        self.args = args or []
        self.result_type = result_type
        self.is_array = is_array
        self.is_window = is_window
        self.window = window
        self.col_label = label

    def __str__(self) -> str:
        return self.name + "()"

    def over(self, window: "Window") -> "Func":
        """
        Returns a copy of this function bound to the given window specification.

        The copy is created with the name "over"; all other attributes
        (including the label) are carried over unchanged.

        Raises:
            DataChainParamsError: if this function is not a window function.
        """
        if not self.is_window:
            raise DataChainParamsError(f"{self} doesn't support window (over())")

        return Func(
            "over",
            self.inner,
            self.cols,
            self.args,
            self.result_type,
            self.is_array,
            self.is_window,
            window,
            self.col_label,
        )

    @property
    def _db_cols(self) -> Sequence[ColT]:
        """
        Operand columns with names passed through `ColumnMeta.to_db_name`.

        Nested Funcs and bind parameters are kept as they are; a column element
        contributes its `.name`, a plain string is used directly.
        The list is rebuilt on every access.
        """
        return (
            [
                col
                if isinstance(col, (Func, BindParameter))
                else ColumnMeta.to_db_name(
                    col.name if isinstance(col, ColumnElement) else col
                )
                for col in self.cols
            ]
            if self.cols
            else []
        )

    def _db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
        """
        Infers the result type from the operand columns.

        Returns None when there are no operand columns. Otherwise every operand
        must resolve (via `get_db_col_type`) to the same type as the first one;
        that type is returned, wrapped as `list[...]` when `is_array` is set.

        Raises:
            DataChainColumnError: if the operand columns have different types.
        """
        if not self._db_cols:
            return None

        col_type: type = get_db_col_type(signals_schema, self._db_cols[0])
        for col in self._db_cols[1:]:
            if get_db_col_type(signals_schema, col) != col_type:
                raise DataChainColumnError(
                    str(self),
                    "Columns must have the same type to infer result type",
                )

        return list[col_type] if self.is_array else col_type  # type: ignore[valid-type]

    # ------------------------------------------------------------------
    # Operator overloads.
    #
    # Each operator delegates to the matching module-level `math_*` helper and
    # returns a new Func; nothing is evaluated here. The reflected variants
    # (`__radd__`, `__rsub__`, ...) pass `other` as the first operand.
    # ------------------------------------------------------------------

    def __add__(self, other: Union[ColT, float]) -> "Func":
        return math_add(self, other)

    def __radd__(self, other: Union[ColT, float]) -> "Func":
        return math_add(other, self)

    def __sub__(self, other: Union[ColT, float]) -> "Func":
        return math_sub(self, other)

    def __rsub__(self, other: Union[ColT, float]) -> "Func":
        return math_sub(other, self)

    def __mul__(self, other: Union[ColT, float]) -> "Func":
        return math_mul(self, other)

    def __rmul__(self, other: Union[ColT, float]) -> "Func":
        return math_mul(other, self)

    def __truediv__(self, other: Union[ColT, float]) -> "Func":
        return math_truediv(self, other)

    def __rtruediv__(self, other: Union[ColT, float]) -> "Func":
        return math_truediv(other, self)

    def __floordiv__(self, other: Union[ColT, float]) -> "Func":
        return math_floordiv(self, other)

    def __rfloordiv__(self, other: Union[ColT, float]) -> "Func":
        return math_floordiv(other, self)

    def __mod__(self, other: Union[ColT, float]) -> "Func":
        return math_mod(self, other)

    def __rmod__(self, other: Union[ColT, float]) -> "Func":
        return math_mod(other, self)

    def __pow__(self, other: Union[ColT, float]) -> "Func":
        return math_pow(self, other)

    def __rpow__(self, other: Union[ColT, float]) -> "Func":
        return math_pow(other, self)

    def __lshift__(self, other: Union[ColT, float]) -> "Func":
        return math_lshift(self, other)

    def __rlshift__(self, other: Union[ColT, float]) -> "Func":
        return math_lshift(other, self)

    def __rshift__(self, other: Union[ColT, float]) -> "Func":
        return math_rshift(self, other)

    def __rrshift__(self, other: Union[ColT, float]) -> "Func":
        return math_rshift(other, self)

    def __and__(self, other: Union[ColT, float]) -> "Func":
        return math_and(self, other)

    def __rand__(self, other: Union[ColT, float]) -> "Func":
        return math_and(other, self)

    def __or__(self, other: Union[ColT, float]) -> "Func":
        return math_or(self, other)

    def __ror__(self, other: Union[ColT, float]) -> "Func":
        return math_or(other, self)

    def __xor__(self, other: Union[ColT, float]) -> "Func":
        return math_xor(self, other)

    def __rxor__(self, other: Union[ColT, float]) -> "Func":
        return math_xor(other, self)

    def __lt__(self, other: Union[ColT, float]) -> "Func":
        return math_lt(self, other)

    def __le__(self, other: Union[ColT, float]) -> "Func":
        return math_le(self, other)

    # `==` / `!=` build a comparison Func instead of returning a bool, so they
    # must not be used for ordinary equality checks between Func objects.
    # NOTE(review): no `__hash__` is defined in this class next to `__eq__`, which
    # makes Python set `Func.__hash__` to None (instances are unhashable) -
    # confirm this is intended.
    def __eq__(self, other):
        return math_eq(self, other)

    def __ne__(self, other):
        return math_ne(self, other)

    def __gt__(self, other: Union[ColT, float]) -> "Func":
        return math_gt(self, other)

    def __ge__(self, other: Union[ColT, float]) -> "Func":
        return math_ge(self, other)

    def label(self, label: str) -> "Func":
        """Returns a copy of this function whose result column is named `label`."""
        return Func(
            self.name,
            self.inner,
            self.cols,
            self.args,
            self.result_type,
            self.is_array,
            self.is_window,
            self.window,
            label,
        )

    def get_col_name(self, label: Optional[str] = None) -> str:
        """
        Returns the name to use for the result column.

        Precedence: the `label` argument, then the label stored on the Func,
        then - if there is exactly one operand column - that column's name
        (resolved recursively for a nested Func), and finally the function name.
        """
        if label:
            return label
        if self.col_label:
            return self.col_label
        if (db_cols := self._db_cols) and len(db_cols) == 1:
            if isinstance(db_cols[0], str):
                return db_cols[0]
            if isinstance(db_cols[0], Column):
                return db_cols[0].name
            if isinstance(db_cols[0], Func):
                return db_cols[0].get_col_name()
        return self.name

    def get_result_type(
        self, signals_schema: Optional["SignalSchema"] = None
    ) -> "DataType":
        """
        Returns the explicit result type or infers it from the operand columns.

        Raises:
            DataChainColumnError: if no explicit type is set and the type cannot
                be inferred (no schema given or no operand columns).
        """
        if self.result_type:
            return self.result_type

        if signals_schema and (col_type := self._db_col_type(signals_schema)):
            return col_type

        raise DataChainColumnError(
            str(self),
            "Column name is required to infer result type",
        )

    def get_column(
        self,
        signals_schema: Optional["SignalSchema"] = None,
        label: Optional[str] = None,
        table: Optional["TableClause"] = None,
    ) -> Column:
        """
        Builds the SQL expression for this function.

        Args:
            signals_schema: Schema used to infer the result type when it is not
                set explicitly; also passed down to nested Funcs.
            label: Overrides the name of the resulting column.
            table: Table assigned to columns created from plain string names.

        Raises:
            DataChainColumnError: if the result type cannot be determined.
            DataChainParamsError: if this is a window function without a window.
        """
        col_type = self.get_result_type(signals_schema)
        sql_type = python_to_sql(col_type)

        def get_col(col: ColT) -> ColT:
            # Nested functions are resolved recursively against the same table.
            if isinstance(col, Func):
                return col.get_column(signals_schema, table=table)
            # Plain names become columns; note they are typed with this
            # function's *result* SQL type.
            if isinstance(col, str):
                column = Column(col, sql_type)
                column.table = table
                return column
            # Anything else (e.g. a bind parameter) is used as it is.
            return col

        cols = [get_col(col) for col in self._db_cols]
        # Column operands always come first, followed by the extra arguments.
        func_col = self.inner(*cols, *self.args)

        if self.is_window:
            if not self.window:
                raise DataChainParamsError(
                    f"Window function {self} requires over() clause with a window spec",
                )
            func_col = func_col.over(
                partition_by=self.window.partition_by,
                order_by=(
                    desc(self.window.order_by)
                    if self.window.desc
                    else self.window.order_by
                ),
            )

        # python_to_sql may return either a type class or an instance.
        func_col.type = sql_type() if inspect.isclass(sql_type) else sql_type

        if col_name := self.get_col_name(label):
            func_col = func_col.label(col_name)

        return func_col
270
+
271
+
272
def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
    """
    Resolves the type of a single operand.

    A nested Func reports its own result type; for anything else the type is
    looked up in the signals schema by column name (the `.name` of a column
    element, or the value itself when it is a plain string).
    """
    if not isinstance(col, Func):
        column_name = col.name if isinstance(col, ColumnElement) else col
        return signals_schema.get_column_type(column_name)

    return col.get_result_type(signals_schema)
279
+
280
+
281
def math_func(
    name: str,
    inner: Callable,
    params: Sequence[Union[ColT, float]],
    result_type: Optional["DataType"] = None,
) -> Func:
    """
    Returns math function from the columns.

    Numeric operands (int / float) are stored in the Func's `args`; everything
    else (column names, column elements, nested Funcs) is stored in its `cols`.

    `Func.get_column` calls the inner callable as `inner(*cols, *args)`, i.e.
    with all column operands first. For a non-commutative operation written
    with the number on the left (e.g. `2 - func`, which arrives here as
    `params == (2, func)`) that would silently swap the operands and compute
    `func - 2`. To prevent this, `inner` is wrapped so that the operands are
    put back into their original `params` order before it is called.

    Args:
        name: Name of the resulting Func.
        inner: Callable implementing the operation; receives the operands in
            the same order as they appear in `params`.
        params: Operands of the operation, in their original order.
        result_type: Explicit result type; inferred from columns when missing.

    Returns:
        Func: A Func object that represents the operation.
    """
    cols: list = []
    args: list = []
    # For every position in `params`: was the operand routed to `cols`?
    from_cols: list = []
    for arg in params:
        is_number = isinstance(arg, (int, float))
        (args if is_number else cols).append(arg)
        from_cols.append(not is_number)

    n_cols = len(cols)

    def ordered_inner(*resolved: Any) -> Any:
        # `resolved` is `(*cols, *args)` as supplied by Func.get_column;
        # interleave both groups back into the original operand order.
        col_values = iter(resolved[:n_cols])
        arg_values = iter(resolved[n_cols:])
        ordered = [
            next(col_values) if is_col else next(arg_values) for is_col in from_cols
        ]
        return inner(*ordered)

    return Func(name, ordered_inner, cols=cols, args=args, result_type=result_type)
295
+
296
+
297
def math_add(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "add" that applies `+` to the given operands."""

    def _add(lhs, rhs):
        return lhs + rhs

    return math_func("add", _add, args)
300
+
301
+
302
def math_sub(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "sub" that applies `-` to the given operands."""

    def _sub(lhs, rhs):
        return lhs - rhs

    return math_func("sub", _sub, args)
305
+
306
+
307
def math_mul(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "mul" that applies `*` to the given operands."""

    def _mul(lhs, rhs):
        return lhs * rhs

    return math_func("mul", _mul, args)
310
+
311
+
312
def math_truediv(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "div" that applies `/`; the result type is float."""

    def _truediv(lhs, rhs):
        return lhs / rhs

    return math_func("div", _truediv, args, result_type=float)
315
+
316
+
317
def math_floordiv(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "floordiv" that applies `//`; the result type is float."""

    def _floordiv(lhs, rhs):
        return lhs // rhs

    return math_func("floordiv", _floordiv, args, result_type=float)
320
+
321
+
322
def math_mod(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "mod" that applies `%`; the result type is float."""

    def _mod(lhs, rhs):
        return lhs % rhs

    return math_func("mod", _mod, args, result_type=float)
325
+
326
+
327
def math_pow(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "pow" that applies `**`; the result type is float."""

    def _pow(base, exponent):
        return base**exponent

    return math_func("pow", _pow, args, result_type=float)
330
+
331
+
332
def math_lshift(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "lshift" that applies `<<`; the result type is int."""

    def _lshift(value, bits):
        return value << bits

    return math_func("lshift", _lshift, args, result_type=int)
335
+
336
+
337
def math_rshift(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "rshift" that applies `>>`; the result type is int."""

    def _rshift(value, bits):
        return value >> bits

    return math_func("rshift", _rshift, args, result_type=int)
340
+
341
+
342
def math_and(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "and" that applies `&`; the result type is bool."""

    def _and(lhs, rhs):
        return lhs & rhs

    return math_func("and", _and, args, result_type=bool)
345
+
346
+
347
def math_or(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "or" that applies `|`; the result type is bool."""

    def _or(lhs, rhs):
        return lhs | rhs

    return math_func("or", _or, args, result_type=bool)
350
+
351
+
352
def math_xor(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "xor" that applies `^`; the result type is bool."""

    def _xor(lhs, rhs):
        return lhs ^ rhs

    return math_func("xor", _xor, args, result_type=bool)
355
+
356
+
357
def math_lt(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "lt" that applies `<`; the result type is bool."""

    def _lt(lhs, rhs):
        return lhs < rhs

    return math_func("lt", _lt, args, result_type=bool)
360
+
361
+
362
def math_le(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "le" that applies `<=`; the result type is bool."""

    def _le(lhs, rhs):
        return lhs <= rhs

    return math_func("le", _le, args, result_type=bool)
365
+
366
+
367
def math_eq(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "eq" that applies `==`; the result type is bool."""

    def _eq(lhs, rhs):
        return lhs == rhs

    return math_func("eq", _eq, args, result_type=bool)
370
+
371
+
372
def math_ne(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "ne" that applies `!=`; the result type is bool."""

    def _ne(lhs, rhs):
        return lhs != rhs

    return math_func("ne", _ne, args, result_type=bool)
375
+
376
+
377
def math_gt(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "gt" that applies `>`; the result type is bool."""

    def _gt(lhs, rhs):
        return lhs > rhs

    return math_func("gt", _gt, args, result_type=bool)
380
+
381
+
382
def math_ge(*args: Union[ColT, float]) -> Func:
    """Returns a Func named "ge" that applies `>=`; the result type is bool."""

    def _ge(lhs, rhs):
        return lhs >= rhs

    return math_func("ge", _ge, args, result_type=bool)
datachain/func/path.py ADDED
@@ -0,0 +1,110 @@
1
+ from datachain.sql.functions import path
2
+
3
+ from .func import ColT, Func
4
+
5
+
6
def parent(col: ColT) -> Func:
    """
    Returns the directory component of a posix-style path.

    Args:
        col (str | literal | Func): Path to take the parent of.
            A string is treated as the name of a column.
            A literal is treated as a string literal.
            A Func is treated as a function returning a string.

    Returns:
        Func: A Func object that represents the path parent function.

    Example:
        ```py
        dc.mutate(
            parent=func.path.parent("file.path"),
        )
        ```

    Note:
        - Result column will always be of type string.
    """
    columns = [col]
    return Func("parent", cols=columns, result_type=str, inner=path.parent)
30
+
31
+
32
def name(col: ColT) -> Func:
    """
    Returns the final component of a posix-style path.

    Args:
        col (str | literal | Func): Path to take the name of.
            A string is treated as the name of a column.
            A literal is treated as a string literal.
            A Func is treated as a function returning a string.

    Returns:
        Func: A Func object that represents the path name function.

    Example:
        ```py
        dc.mutate(
            file_name=func.path.name("file.path"),
        )
        ```

    Note:
        - Result column will always be of type string.
    """
    columns = [col]
    return Func("name", cols=columns, result_type=str, inner=path.name)
57
+
58
+
59
def file_stem(col: ColT) -> Func:
    """
    Returns the path without the extension.

    Args:
        col (str | literal | Func): Path to compute the file stem of.
            A string is treated as the name of a column.
            A literal is treated as a string literal.
            A Func is treated as a function returning a string.

    Returns:
        Func: A Func object that represents the file stem function.

    Example:
        ```py
        dc.mutate(
            file_stem=func.path.file_stem("file.path"),
        )
        ```

    Note:
        - Result column will always be of type string.
    """
    columns = [col]
    return Func("file_stem", cols=columns, result_type=str, inner=path.file_stem)
84
+
85
+
86
def file_ext(col: ColT) -> Func:
    """
    Returns the extension of the given path.

    Args:
        col (str | literal | Func): Path to compute the file extension of.
            A string is treated as the name of a column.
            A literal is treated as a string literal.
            A Func is treated as a function returning a string.

    Returns:
        Func: A Func object that represents the file extension function.

    Example:
        ```py
        dc.mutate(
            file_ext=func.path.file_ext("file.path"),
        )
        ```

    Note:
        - Result column will always be of type string.
    """
    columns = [col]
    return Func("file_ext", cols=columns, result_type=str, inner=path.file_ext)
@@ -0,0 +1,23 @@
1
+ from datachain.sql.functions import random
2
+
3
+ from .func import Func
4
+
5
+
6
def rand() -> Func:
    """
    Returns a random integer value.

    Returns:
        Func: A Func object that represents the rand function.

    Example:
        ```py
        dc.mutate(
            rnd=func.random.rand(),
        )
        ```

    Note:
        - Result column will always be of type integer.
    """
    rand_func = Func("rand", result_type=int, inner=random.rand)
    return rand_func
@@ -0,0 +1,154 @@
1
+ from typing import Optional, Union, get_origin
2
+
3
+ from sqlalchemy import literal
4
+
5
+ from datachain.sql.functions import string
6
+
7
+ from .func import Func
8
+
9
+
10
def length(col: Union[str, Func]) -> Func:
    """
    Returns the length of the string.

    Args:
        col (str | literal | Func): String to compute the length of.
            A plain string is treated as the name of a column.
            A literal (e.g. `sqlalchemy.literal("...")`) is treated as a
            string value.
            A Func is treated as a function returning a string.

    Returns:
        Func: A Func object that represents the string length function.

    Example:
        ```py
        dc.mutate(
            len1=func.string.length("file.path"),
            len2=func.string.length(literal("Random string")),
        )
        ```

    Note:
        - Result column will always be of type int.
    """
    columns = [col]
    return Func("length", cols=columns, result_type=int, inner=string.length)
35
+
36
+
37
def split(col: Union[str, Func], sep: str, limit: Optional[int] = None) -> Func:
    """
    Takes a column and split character and returns an array of the parts.

    Args:
        col (str | literal | Func): Column to split.
            A plain string is treated as the name of a column.
            A literal (e.g. `sqlalchemy.literal("...")`) is treated as a
            string value.
            A Func is treated as a function returning a string.
        sep (str): Separator to split the string.
        limit (int, optional): Maximum number of splits to perform.

    Returns:
        Func: A Func object that represents the split function.

    Example:
        ```py
        dc.mutate(
            path_parts=func.string.split("file.path", "/"),
            str_words=func.string.split(literal("Random string"), " "),
        )
        ```

    Note:
        - Result column will always be of type array of strings.
    """

    def inner(arg):
        # Only forward `limit` when the caller actually supplied one.
        if limit is not None:
            return string.split(arg, sep, limit)
        return string.split(arg, sep)

    # The operand is always passed as a column. The former
    # `get_origin(col) is literal` test could never be true for a runtime value
    # (typing.get_origin returns None for non-generic objects), so its
    # `args=[col]` branch was dead code. Func passes bind parameters created by
    # `literal()` through unchanged, so literals keep working via `cols`.
    return Func("split", inner=inner, cols=[col], result_type=list[str])
77
+
78
+
79
def replace(col: Union[str, Func], pattern: str, replacement: str) -> Func:
    """
    Replaces substring with another string.

    Args:
        col (str | literal | Func): Column to perform the replacement on.
            A plain string is treated as the name of a column.
            A literal (e.g. `sqlalchemy.literal("...")`) is treated as a
            string value.
            A Func is treated as a function returning a string.
        pattern (str): Pattern to replace.
        replacement (str): Replacement string.

    Returns:
        Func: A Func object that represents the replace function.

    Example:
        ```py
        dc.mutate(
            signal=func.string.replace("signal.name", "pattern", "replacement"),
        )
        ```

    Note:
        - Result column will always be of type string.
    """

    def inner(arg):
        return string.replace(arg, pattern, replacement)

    # The operand is always passed as a column. The former
    # `get_origin(col) is literal` test could never be true for a runtime value
    # (typing.get_origin returns None for non-generic objects), so its
    # `args=[col]` branch was dead code. Func passes bind parameters created by
    # `literal()` through unchanged, so literals keep working via `cols`.
    return Func("replace", inner=inner, cols=[col], result_type=str)
116
+
117
+
118
def regexp_replace(col: Union[str, Func], regex: str, replacement: str) -> Func:
    r"""
    Replaces substring that match a regular expression.

    Args:
        col (str | literal | Func): Column to perform the replacement on.
            A plain string is treated as the name of a column.
            A literal (e.g. `sqlalchemy.literal("...")`) is treated as a
            string value.
            A Func is treated as a function returning a string.
        regex (str): Regular expression pattern to replace.
        replacement (str): Replacement string.

    Returns:
        Func: A Func object that represents the regexp_replace function.

    Example:
        ```py
        dc.mutate(
            signal=func.string.regexp_replace("signal.name", r"\d+", "X"),
        )
        ```

    Note:
        - Result column will always be of type string.
    """

    def inner(arg):
        return string.regexp_replace(arg, regex, replacement)

    # The operand is always passed as a column. The former
    # `get_origin(col) is literal` test could never be true for a runtime value
    # (typing.get_origin returns None for non-generic objects), so its
    # `args=[col]` branch was dead code. Func passes bind parameters created by
    # `literal()` through unchanged, so literals keep working via `cols`.
    return Func("regexp_replace", inner=inner, cols=[col], result_type=str)