pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/functions/string.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Pixeltable
|
|
2
|
+
Pixeltable UDFs for `StringType`.
|
|
3
3
|
It closely follows the Pandas `pandas.Series.str` API.
|
|
4
4
|
|
|
5
5
|
Example:
|
|
@@ -12,7 +12,12 @@ t.select(t.str_col.capitalize()).collect()
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
import builtins
|
|
15
|
-
|
|
15
|
+
import re
|
|
16
|
+
import textwrap
|
|
17
|
+
from string import whitespace
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
import sqlalchemy as sql
|
|
16
21
|
|
|
17
22
|
import pixeltable as pxt
|
|
18
23
|
from pixeltable.utils.code import local_public_names
|
|
@@ -28,6 +33,11 @@ def capitalize(self: str) -> str:
|
|
|
28
33
|
return self.capitalize()
|
|
29
34
|
|
|
30
35
|
|
|
36
|
+
@capitalize.to_sql
|
|
37
|
+
def _(self: sql.ColumnElement) -> sql.ColumnElement:
|
|
38
|
+
return sql.func.concat(sql.func.upper(sql.func.left(self, 1)), sql.func.lower(sql.func.right(self, -1)))
|
|
39
|
+
|
|
40
|
+
|
|
31
41
|
@pxt.udf(is_method=True)
|
|
32
42
|
def casefold(self: str) -> str:
|
|
33
43
|
"""
|
|
@@ -53,26 +63,45 @@ def center(self: str, width: int, fillchar: str = ' ') -> str:
|
|
|
53
63
|
|
|
54
64
|
|
|
55
65
|
@pxt.udf(is_method=True)
|
|
56
|
-
def contains(self: str,
|
|
66
|
+
def contains(self: str, substr: str, case: bool = True) -> bool:
|
|
57
67
|
"""
|
|
58
|
-
Test if string contains
|
|
68
|
+
Test if string contains a substring.
|
|
59
69
|
|
|
60
70
|
Args:
|
|
61
|
-
|
|
71
|
+
substr: string literal (matched verbatim; use `contains_re` for regular expressions)
|
|
62
72
|
case: if False, ignore case
|
|
63
|
-
flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
|
|
64
|
-
regex: if True, treat pattern as a regular expression
|
|
65
73
|
"""
|
|
66
|
-
if
|
|
67
|
-
|
|
74
|
+
if case:
|
|
75
|
+
return substr in self
|
|
76
|
+
else:
|
|
77
|
+
return substr.lower() in self.lower()
|
|
78
|
+
|
|
68
79
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
80
|
+
@contains.to_sql
|
|
81
|
+
def _(self: sql.ColumnElement, substr: sql.ColumnElement, case: sql.ColumnElement | None = None) -> sql.ColumnElement:
|
|
82
|
+
# Replace all occurrences of `%`, `_`, and `\` with escaped versions
|
|
83
|
+
escaped_substr = sql.func.regexp_replace(substr, r'(%|_|\\)', r'\\\1', 'g')
|
|
84
|
+
if case is None:
|
|
85
|
+
# Default `case` is True, so we do a case-sensitive comparison
|
|
86
|
+
return self.like(sql.func.concat('%', escaped_substr, '%'))
|
|
74
87
|
else:
|
|
75
|
-
|
|
88
|
+
# Toggle case-sensitivity based on the value of `case`
|
|
89
|
+
return sql.case(
|
|
90
|
+
(case, self.like(sql.func.concat('%', escaped_substr, '%'))),
|
|
91
|
+
else_=sql.func.lower(self).like(sql.func.concat('%', sql.func.lower(escaped_substr), '%')),
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@pxt.udf(is_method=True)
|
|
96
|
+
def contains_re(self: str, pattern: str, flags: int = 0) -> bool:
|
|
97
|
+
"""
|
|
98
|
+
Test if string contains a regular expression pattern.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
pattern: regular expression pattern
|
|
102
|
+
flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
|
|
103
|
+
"""
|
|
104
|
+
return bool(re.search(pattern, self, flags))
|
|
76
105
|
|
|
77
106
|
|
|
78
107
|
@pxt.udf(is_method=True)
|
|
@@ -84,22 +113,27 @@ def count(self: str, pattern: str, flags: int = 0) -> int:
|
|
|
84
113
|
pattern: regular expression pattern
|
|
85
114
|
flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
|
|
86
115
|
"""
|
|
87
|
-
import re
|
|
88
|
-
|
|
89
116
|
return builtins.len(re.findall(pattern, self, flags))
|
|
90
117
|
|
|
91
118
|
|
|
92
119
|
@pxt.udf(is_method=True)
|
|
93
|
-
def endswith(self: str,
|
|
120
|
+
def endswith(self: str, substr: str) -> bool:
|
|
94
121
|
"""
|
|
95
122
|
Return `True` if the string ends with the specified suffix, otherwise return `False`.
|
|
96
123
|
|
|
97
124
|
Equivalent to [`str.endswith()`](https://docs.python.org/3/library/stdtypes.html#str.endswith).
|
|
98
125
|
|
|
99
126
|
Args:
|
|
100
|
-
|
|
127
|
+
substr: string literal
|
|
101
128
|
"""
|
|
102
|
-
return self.endswith(
|
|
129
|
+
return self.endswith(substr)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@endswith.to_sql
|
|
133
|
+
def _(self: sql.ColumnElement, substr: sql.ColumnElement) -> sql.ColumnElement:
|
|
134
|
+
# Replace all occurrences of `%`, `_`, and `\` with escaped versions
|
|
135
|
+
escaped_substr = sql.func.regexp_replace(substr, r'(%|_|\\)', r'\\\1', 'g')
|
|
136
|
+
return self.like(sql.func.concat('%', escaped_substr))
|
|
103
137
|
|
|
104
138
|
|
|
105
139
|
@pxt.udf(is_method=True)
|
|
@@ -113,13 +147,11 @@ def fill(self: str, width: int, **kwargs: Any) -> str:
|
|
|
113
147
|
width: Maximum line width.
|
|
114
148
|
kwargs: Additional keyword arguments to pass to `textwrap.fill()`.
|
|
115
149
|
"""
|
|
116
|
-
import textwrap
|
|
117
|
-
|
|
118
150
|
return textwrap.fill(self, width, **kwargs)
|
|
119
151
|
|
|
120
152
|
|
|
121
153
|
@pxt.udf(is_method=True)
|
|
122
|
-
def find(self: str, substr: str, start:
|
|
154
|
+
def find(self: str, substr: str, start: int = 0, end: int | None = None) -> int:
|
|
123
155
|
"""
|
|
124
156
|
Return the lowest index in string where `substr` is found within the slice `s[start:end]`.
|
|
125
157
|
|
|
@@ -133,6 +165,20 @@ def find(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] =
|
|
|
133
165
|
return self.find(substr, start, end)
|
|
134
166
|
|
|
135
167
|
|
|
168
|
+
@find.to_sql
|
|
169
|
+
def _(
|
|
170
|
+
self: sql.ColumnElement, substr: sql.ColumnElement, start: sql.ColumnElement, end: sql.ColumnElement | None = None
|
|
171
|
+
) -> sql.ColumnElement:
|
|
172
|
+
sl = pxt.functions.string.slice._to_sql(self, start, end)
|
|
173
|
+
if sl is None:
|
|
174
|
+
return None
|
|
175
|
+
|
|
176
|
+
strpos = sql.func.strpos(sl, substr)
|
|
177
|
+
return sql.case(
|
|
178
|
+
(strpos == 0, -1), (start >= 0, strpos + start - 1), else_=strpos + sql.func.char_length(self) + start - 1
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
|
|
136
182
|
@pxt.udf(is_method=True)
|
|
137
183
|
def findall(self: str, pattern: str, flags: int = 0) -> list:
|
|
138
184
|
"""
|
|
@@ -144,8 +190,6 @@ def findall(self: str, pattern: str, flags: int = 0) -> list:
|
|
|
144
190
|
pattern: regular expression pattern
|
|
145
191
|
flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
|
|
146
192
|
"""
|
|
147
|
-
import re
|
|
148
|
-
|
|
149
193
|
return re.findall(pattern, self, flags)
|
|
150
194
|
|
|
151
195
|
|
|
@@ -171,8 +215,6 @@ def fullmatch(self: str, pattern: str, case: bool = True, flags: int = 0) -> boo
|
|
|
171
215
|
case: if False, ignore case
|
|
172
216
|
flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
|
|
173
217
|
"""
|
|
174
|
-
import re
|
|
175
|
-
|
|
176
218
|
if not case:
|
|
177
219
|
flags |= re.IGNORECASE
|
|
178
220
|
_ = bool(re.fullmatch(pattern, self, flags))
|
|
@@ -180,7 +222,7 @@ def fullmatch(self: str, pattern: str, case: bool = True, flags: int = 0) -> boo
|
|
|
180
222
|
|
|
181
223
|
|
|
182
224
|
@pxt.udf(is_method=True)
|
|
183
|
-
def index(self: str, substr: str, start:
|
|
225
|
+
def index(self: str, substr: str, start: int = 0, end: int | None = None) -> int:
|
|
184
226
|
"""
|
|
185
227
|
Return the lowest index in string where `substr` is found within the slice `[start:end]`.
|
|
186
228
|
Raises ValueError if `substr` is not found.
|
|
@@ -330,6 +372,11 @@ def len(self: str) -> int:
|
|
|
330
372
|
return builtins.len(self)
|
|
331
373
|
|
|
332
374
|
|
|
375
|
+
@len.to_sql
|
|
376
|
+
def _(self: sql.ColumnElement) -> sql.ColumnElement:
|
|
377
|
+
return sql.func.char_length(self)
|
|
378
|
+
|
|
379
|
+
|
|
333
380
|
@pxt.udf(is_method=True)
|
|
334
381
|
def ljust(self: str, width: int, fillchar: str = ' ') -> str:
|
|
335
382
|
"""
|
|
@@ -355,8 +402,13 @@ def lower(self: str) -> str:
|
|
|
355
402
|
return self.lower()
|
|
356
403
|
|
|
357
404
|
|
|
405
|
+
@lower.to_sql
|
|
406
|
+
def _(self: sql.ColumnElement) -> sql.ColumnElement:
|
|
407
|
+
return sql.func.lower(self)
|
|
408
|
+
|
|
409
|
+
|
|
358
410
|
@pxt.udf(is_method=True)
|
|
359
|
-
def lstrip(self: str, chars:
|
|
411
|
+
def lstrip(self: str, chars: str | None = None) -> str:
|
|
360
412
|
"""
|
|
361
413
|
Return a copy of the string with leading characters removed. The `chars` argument is a string specifying the set of
|
|
362
414
|
characters to be removed. If omitted or `None`, whitespace characters are removed.
|
|
@@ -369,6 +421,11 @@ def lstrip(self: str, chars: Optional[str] = None) -> str:
|
|
|
369
421
|
return self.lstrip(chars)
|
|
370
422
|
|
|
371
423
|
|
|
424
|
+
@lstrip.to_sql
|
|
425
|
+
def _(self: sql.ColumnElement, chars: sql.ColumnElement | None = None) -> sql.ColumnElement:
|
|
426
|
+
return sql.func.ltrim(self, chars if chars is not None else whitespace)
|
|
427
|
+
|
|
428
|
+
|
|
372
429
|
@pxt.udf(is_method=True)
|
|
373
430
|
def match(self: str, pattern: str, case: bool = True, flags: int = 0) -> bool:
|
|
374
431
|
"""
|
|
@@ -379,8 +436,6 @@ def match(self: str, pattern: str, case: bool = True, flags: int = 0) -> bool:
|
|
|
379
436
|
case: if False, ignore case
|
|
380
437
|
flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
|
|
381
438
|
"""
|
|
382
|
-
import re
|
|
383
|
-
|
|
384
439
|
if not case:
|
|
385
440
|
flags |= re.IGNORECASE
|
|
386
441
|
return bool(re.match(pattern, self, flags))
|
|
@@ -440,9 +495,12 @@ def removeprefix(self: str, prefix: str) -> str:
|
|
|
440
495
|
"""
|
|
441
496
|
Remove prefix. If the prefix is not present, returns the original string unchanged.
|
|
442
497
|
"""
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
498
|
+
return self.removeprefix(prefix)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
@removeprefix.to_sql
|
|
502
|
+
def _(self: sql.ColumnElement, prefix: sql.ColumnElement) -> sql.ColumnElement:
|
|
503
|
+
return sql.case((startswith._to_sql(self, prefix), sql.func.right(self, -sql.func.char_length(prefix))), else_=self)
|
|
446
504
|
|
|
447
505
|
|
|
448
506
|
@pxt.udf(is_method=True)
|
|
@@ -450,9 +508,12 @@ def removesuffix(self: str, suffix: str) -> str:
|
|
|
450
508
|
"""
|
|
451
509
|
Remove suffix. If the suffix is not present, returns the original string unchanged.
|
|
452
510
|
"""
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
511
|
+
return self.removesuffix(suffix)
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
@removesuffix.to_sql
|
|
515
|
+
def _(self: sql.ColumnElement, suffix: sql.ColumnElement) -> sql.ColumnElement:
|
|
516
|
+
return sql.case((endswith._to_sql(self, suffix), sql.func.left(self, -sql.func.char_length(suffix))), else_=self)
|
|
456
517
|
|
|
457
518
|
|
|
458
519
|
@pxt.udf(is_method=True)
|
|
@@ -463,36 +524,69 @@ def repeat(self: str, n: int) -> str:
|
|
|
463
524
|
return self * n
|
|
464
525
|
|
|
465
526
|
|
|
527
|
+
@repeat.to_sql
|
|
528
|
+
def _(self: sql.ColumnElement, n: sql.ColumnElement) -> sql.ColumnElement:
|
|
529
|
+
return sql.func.repeat(self, n.cast(sql.types.INT))
|
|
530
|
+
|
|
531
|
+
|
|
466
532
|
@pxt.udf(is_method=True)
|
|
467
|
-
def replace(
|
|
468
|
-
self: str, pattern: str, repl: str, n: int = -1, case: bool = True, flags: int = 0, regex: bool = False
|
|
469
|
-
) -> str:
|
|
533
|
+
def replace(self: str, substr: str, repl: str, n: int | None = None) -> str:
|
|
470
534
|
"""
|
|
471
|
-
Replace occurrences of `
|
|
535
|
+
Replace occurrences of `substr` with `repl`.
|
|
472
536
|
|
|
473
|
-
Equivalent to [`str.replace()`](https://docs.python.org/3/library/stdtypes.html#str.replace)
|
|
474
|
-
[`re.sub()`](https://docs.python.org/3/library/re.html#re.sub), depending on the value of regex.
|
|
537
|
+
Equivalent to [`str.replace()`](https://docs.python.org/3/library/stdtypes.html#str.replace).
|
|
475
538
|
|
|
476
539
|
Args:
|
|
477
|
-
|
|
540
|
+
substr: string literal
|
|
478
541
|
repl: replacement string
|
|
479
|
-
n: number of replacements to make (
|
|
480
|
-
|
|
542
|
+
n: number of replacements to make (if `None`, replace all occurrences)
|
|
543
|
+
"""
|
|
544
|
+
return self.replace(substr, repl, n or -1)
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
@replace.to_sql
|
|
548
|
+
def _(
|
|
549
|
+
self: sql.ColumnElement, substr: sql.ColumnElement, repl: sql.ColumnElement, n: sql.ColumnElement | None = None
|
|
550
|
+
) -> sql.ColumnElement:
|
|
551
|
+
if n is not None:
|
|
552
|
+
return None # SQL does not support bounding the number of replacements
|
|
553
|
+
|
|
554
|
+
return sql.func.replace(self, substr, repl)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
@pxt.udf(is_method=True)
|
|
558
|
+
def replace_re(self: str, pattern: str, repl: str, n: int | None = None, flags: int = 0) -> str:
|
|
559
|
+
"""
|
|
560
|
+
Replace occurrences of a regular expression pattern with `repl`.
|
|
561
|
+
|
|
562
|
+
Equivalent to [`re.sub()`](https://docs.python.org/3/library/re.html#re.sub).
|
|
563
|
+
|
|
564
|
+
Args:
|
|
565
|
+
pattern: regular expression pattern
|
|
566
|
+
repl: replacement string
|
|
567
|
+
n: number of replacements to make (if `None`, replace all occurrences)
|
|
481
568
|
flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
|
|
482
|
-
regex: if True, treat pattern as a regular expression
|
|
483
569
|
"""
|
|
484
|
-
|
|
485
|
-
|
|
570
|
+
return re.sub(pattern, repl, self, count=(n or 0), flags=flags)
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
@pxt.udf(is_method=True)
|
|
574
|
+
def reverse(self: str) -> str:
|
|
575
|
+
"""
|
|
576
|
+
Return a reversed copy of the string.
|
|
577
|
+
|
|
578
|
+
Equivalent to `str[::-1]`.
|
|
579
|
+
"""
|
|
580
|
+
return self[::-1]
|
|
486
581
|
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
return self.replace(pattern, repl, n)
|
|
582
|
+
|
|
583
|
+
@reverse.to_sql
|
|
584
|
+
def _(self: sql.ColumnElement) -> sql.ColumnElement:
|
|
585
|
+
return sql.func.reverse(self)
|
|
492
586
|
|
|
493
587
|
|
|
494
588
|
@pxt.udf(is_method=True)
|
|
495
|
-
def rfind(self: str, substr: str, start:
|
|
589
|
+
def rfind(self: str, substr: str, start: int | None = 0, end: int | None = None) -> int:
|
|
496
590
|
"""
|
|
497
591
|
Return the highest index where `substr` is found, such that `substr` is contained within `[start:end]`.
|
|
498
592
|
|
|
@@ -507,7 +601,7 @@ def rfind(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] =
|
|
|
507
601
|
|
|
508
602
|
|
|
509
603
|
@pxt.udf(is_method=True)
|
|
510
|
-
def rindex(self: str, substr: str, start:
|
|
604
|
+
def rindex(self: str, substr: str, start: int | None = 0, end: int | None = None) -> int:
|
|
511
605
|
"""
|
|
512
606
|
Return the highest index where `substr` is found, such that `substr` is contained within `[start:end]`.
|
|
513
607
|
Raises ValueError if `substr` is not found.
|
|
@@ -544,7 +638,7 @@ def rpartition(self: str, sep: str = ' ') -> list:
|
|
|
544
638
|
|
|
545
639
|
|
|
546
640
|
@pxt.udf(is_method=True)
|
|
547
|
-
def rstrip(self: str, chars:
|
|
641
|
+
def rstrip(self: str, chars: str | None = None) -> str:
|
|
548
642
|
"""
|
|
549
643
|
Return a copy of string with trailing characters removed.
|
|
550
644
|
|
|
@@ -556,8 +650,13 @@ def rstrip(self: str, chars: Optional[str] = None) -> str:
|
|
|
556
650
|
return self.rstrip(chars)
|
|
557
651
|
|
|
558
652
|
|
|
653
|
+
@rstrip.to_sql
|
|
654
|
+
def _(self: sql.ColumnElement, chars: sql.ColumnElement | None = None) -> sql.ColumnElement:
|
|
655
|
+
return sql.func.rtrim(self, chars if chars is not None else whitespace)
|
|
656
|
+
|
|
657
|
+
|
|
559
658
|
@pxt.udf(is_method=True)
|
|
560
|
-
def slice(self: str, start:
|
|
659
|
+
def slice(self: str, start: int | None = None, stop: int | None = None, step: int | None = None) -> str:
|
|
561
660
|
"""
|
|
562
661
|
Return a slice.
|
|
563
662
|
|
|
@@ -569,10 +668,43 @@ def slice(self: str, start: Optional[int] = None, stop: Optional[int] = None, st
|
|
|
569
668
|
return self[start:stop:step]
|
|
570
669
|
|
|
571
670
|
|
|
572
|
-
@
|
|
573
|
-
def
|
|
574
|
-
self:
|
|
575
|
-
|
|
671
|
+
@slice.to_sql
|
|
672
|
+
def _(
|
|
673
|
+
self: sql.ColumnElement,
|
|
674
|
+
start: sql.ColumnElement | None = None,
|
|
675
|
+
stop: sql.ColumnElement | None = None,
|
|
676
|
+
step: sql.ColumnElement | None = None,
|
|
677
|
+
) -> sql.ColumnElement:
|
|
678
|
+
if step is not None:
|
|
679
|
+
return None
|
|
680
|
+
|
|
681
|
+
if start is not None:
|
|
682
|
+
start = start.cast(sql.types.INT) # Postgres won't accept a BIGINT
|
|
683
|
+
start = sql.case(
|
|
684
|
+
(start >= 0, start + 1), # SQL is 1-based, Python is 0-based
|
|
685
|
+
else_=sql.func.char_length(self) + start + 1, # negative index
|
|
686
|
+
)
|
|
687
|
+
start = sql.func.greatest(start, 1)
|
|
688
|
+
|
|
689
|
+
if stop is not None:
|
|
690
|
+
stop = stop.cast(sql.types.INT) # Postgres won't accept a BIGINT
|
|
691
|
+
stop = sql.case(
|
|
692
|
+
(stop >= 0, stop + 1), # SQL is 1-based, Python is 0-based
|
|
693
|
+
else_=sql.func.char_length(self) + stop + 1, # negative index
|
|
694
|
+
)
|
|
695
|
+
stop = sql.func.greatest(stop, 0)
|
|
696
|
+
|
|
697
|
+
if start is None:
|
|
698
|
+
if stop is None:
|
|
699
|
+
return self
|
|
700
|
+
return sql.func.substr(self, 1, stop)
|
|
701
|
+
if stop is None:
|
|
702
|
+
return sql.func.substr(self, start)
|
|
703
|
+
return sql.func.substr(self, start, sql.func.greatest(stop - start, 0))
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
@pxt.udf(is_method=True)
|
|
707
|
+
def slice_replace(self: str, start: int | None = None, stop: int | None = None, repl: str | None = None) -> str:
|
|
576
708
|
"""
|
|
577
709
|
Replace a positional slice with another value.
|
|
578
710
|
|
|
@@ -585,20 +717,27 @@ def slice_replace(
|
|
|
585
717
|
|
|
586
718
|
|
|
587
719
|
@pxt.udf(is_method=True)
|
|
588
|
-
def startswith(self: str,
|
|
720
|
+
def startswith(self: str, substr: str) -> int:
|
|
589
721
|
"""
|
|
590
|
-
Return `True` if string starts with `
|
|
722
|
+
Return `True` if string starts with `substr`, otherwise return `False`.
|
|
591
723
|
|
|
592
724
|
Equivalent to [`str.startswith()`](https://docs.python.org/3/library/stdtypes.html#str.startswith).
|
|
593
725
|
|
|
594
726
|
Args:
|
|
595
|
-
|
|
727
|
+
substr: string literal
|
|
596
728
|
"""
|
|
597
|
-
return self.startswith(
|
|
729
|
+
return self.startswith(substr)
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
@startswith.to_sql
|
|
733
|
+
def _(self: sql.ColumnElement, substr: sql.ColumnElement) -> sql.ColumnElement:
|
|
734
|
+
# Replace all occurrences of `%`, `_`, and `\` with escaped versions
|
|
735
|
+
escaped_substr = sql.func.regexp_replace(substr, r'(%|_|\\)', r'\\\1', 'g')
|
|
736
|
+
return self.like(sql.func.concat(escaped_substr, '%'))
|
|
598
737
|
|
|
599
738
|
|
|
600
739
|
@pxt.udf(is_method=True)
|
|
601
|
-
def strip(self: str, chars:
|
|
740
|
+
def strip(self: str, chars: str | None = None) -> str:
|
|
602
741
|
"""
|
|
603
742
|
Return a copy of string with leading and trailing characters removed.
|
|
604
743
|
|
|
@@ -610,6 +749,11 @@ def strip(self: str, chars: Optional[str] = None) -> str:
|
|
|
610
749
|
return self.strip(chars)
|
|
611
750
|
|
|
612
751
|
|
|
752
|
+
@strip.to_sql
|
|
753
|
+
def _(self: sql.ColumnElement, chars: sql.ColumnElement | None = None) -> sql.ColumnElement:
|
|
754
|
+
return sql.func.trim(self, chars if chars is not None else whitespace)
|
|
755
|
+
|
|
756
|
+
|
|
613
757
|
@pxt.udf(is_method=True)
|
|
614
758
|
def swapcase(self: str) -> str:
|
|
615
759
|
"""
|
|
@@ -641,6 +785,11 @@ def upper(self: str) -> str:
|
|
|
641
785
|
return self.upper()
|
|
642
786
|
|
|
643
787
|
|
|
788
|
+
@upper.to_sql
|
|
789
|
+
def _(self: sql.ColumnElement) -> sql.ColumnElement:
|
|
790
|
+
return sql.func.upper(self)
|
|
791
|
+
|
|
792
|
+
|
|
644
793
|
@pxt.udf(is_method=True)
|
|
645
794
|
def wrap(self: str, width: int, **kwargs: Any) -> list[str]:
|
|
646
795
|
"""
|
|
@@ -653,8 +802,6 @@ def wrap(self: str, width: int, **kwargs: Any) -> list[str]:
|
|
|
653
802
|
width: Maximum line width.
|
|
654
803
|
kwargs: Additional keyword arguments to pass to `textwrap.wrap()`.
|
|
655
804
|
"""
|
|
656
|
-
import textwrap
|
|
657
|
-
|
|
658
805
|
return textwrap.wrap(self, width, **kwargs)
|
|
659
806
|
|
|
660
807
|
|
|
@@ -671,6 +818,29 @@ def zfill(self: str, width: int) -> str:
|
|
|
671
818
|
return self.zfill(width)
|
|
672
819
|
|
|
673
820
|
|
|
821
|
+
def string_splitter(text: Any, separators: str) -> tuple[type[pxt.iterators.ComponentIterator], dict[str, Any]]:
|
|
822
|
+
"""Iterator over chunks of a string. The string is chunked according to the specified `separators`.
|
|
823
|
+
|
|
824
|
+
The iterator yields a `text` field containing the text of the chunk.
|
|
825
|
+
Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
|
|
826
|
+
|
|
827
|
+
Args:
|
|
828
|
+
separators: separators to use to chunk the string. Currently the only supported option is `'sentence'`.
|
|
829
|
+
|
|
830
|
+
Examples:
|
|
831
|
+
This example assumes an existing table `tbl` with a column `text` of type `pxt.String`.
|
|
832
|
+
|
|
833
|
+
Create a view that splits all strings on sentence boundaries:
|
|
834
|
+
|
|
835
|
+
>>> pxt.create_view(
|
|
836
|
+
... 'sentence_chunks',
|
|
837
|
+
... tbl,
|
|
838
|
+
... iterator=string_splitter(tbl.text, separators='sentence')
|
|
839
|
+
... )
|
|
840
|
+
"""
|
|
841
|
+
return pxt.iterators.string.StringSplitter._create(text=text, separators=separators)
|
|
842
|
+
|
|
843
|
+
|
|
674
844
|
__all__ = local_public_names(__name__)
|
|
675
845
|
|
|
676
846
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Pixeltable
|
|
2
|
+
Pixeltable UDFs for `TimestampType`.
|
|
3
3
|
|
|
4
4
|
Usage example:
|
|
5
5
|
```python
|
|
@@ -11,7 +11,6 @@ t.select(t.timestamp_col.year, t.timestamp_col.weekday()).collect()
|
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
from datetime import datetime
|
|
14
|
-
from typing import Optional
|
|
15
14
|
|
|
16
15
|
import sqlalchemy as sql
|
|
17
16
|
|
|
@@ -134,7 +133,8 @@ def astimezone(self: datetime, tz: str) -> datetime:
|
|
|
134
133
|
Convert the datetime to the given time zone.
|
|
135
134
|
|
|
136
135
|
Args:
|
|
137
|
-
tz: The time zone to convert to. Must be a valid time zone name from the
|
|
136
|
+
tz: The time zone to convert to. Must be a valid time zone name from the
|
|
137
|
+
[IANA Time Zone Database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones).
|
|
138
138
|
"""
|
|
139
139
|
from zoneinfo import ZoneInfo
|
|
140
140
|
|
|
@@ -237,12 +237,12 @@ def _(
|
|
|
237
237
|
microsecond: sql.ColumnElement = _SQL_ZERO,
|
|
238
238
|
) -> sql.ColumnElement:
|
|
239
239
|
return sql.func.make_timestamptz(
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
240
|
+
year.cast(sql.Integer),
|
|
241
|
+
month.cast(sql.Integer),
|
|
242
|
+
day.cast(sql.Integer),
|
|
243
|
+
hour.cast(sql.Integer),
|
|
244
|
+
minute.cast(sql.Integer),
|
|
245
|
+
(second + microsecond / 1000000.0).cast(sql.Float),
|
|
246
246
|
)
|
|
247
247
|
|
|
248
248
|
|
|
@@ -271,13 +271,13 @@ def _(
|
|
|
271
271
|
@pxt.udf(is_method=True)
|
|
272
272
|
def replace(
|
|
273
273
|
self: datetime,
|
|
274
|
-
year:
|
|
275
|
-
month:
|
|
276
|
-
day:
|
|
277
|
-
hour:
|
|
278
|
-
minute:
|
|
279
|
-
second:
|
|
280
|
-
microsecond:
|
|
274
|
+
year: int | None = None,
|
|
275
|
+
month: int | None = None,
|
|
276
|
+
day: int | None = None,
|
|
277
|
+
hour: int | None = None,
|
|
278
|
+
minute: int | None = None,
|
|
279
|
+
second: int | None = None,
|
|
280
|
+
microsecond: int | None = None,
|
|
281
281
|
) -> datetime:
|
|
282
282
|
"""
|
|
283
283
|
Return a datetime with the same attributes, except for those attributes given new values by whichever keyword
|