pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `StringType`.
2
+ Pixeltable UDFs for `StringType`.
3
3
  It closely follows the Pandas `pandas.Series.str` API.
4
4
 
5
5
  Example:
@@ -12,7 +12,12 @@ t.select(t.str_col.capitalize()).collect()
12
12
  """
13
13
 
14
14
  import builtins
15
- from typing import Any, Optional
15
+ import re
16
+ import textwrap
17
+ from string import whitespace
18
+ from typing import Any
19
+
20
+ import sqlalchemy as sql
16
21
 
17
22
  import pixeltable as pxt
18
23
  from pixeltable.utils.code import local_public_names
@@ -28,6 +33,11 @@ def capitalize(self: str) -> str:
28
33
  return self.capitalize()
29
34
 
30
35
 
36
+ @capitalize.to_sql
37
+ def _(self: sql.ColumnElement) -> sql.ColumnElement:
38
+ return sql.func.concat(sql.func.upper(sql.func.left(self, 1)), sql.func.lower(sql.func.right(self, -1)))
39
+
40
+
31
41
  @pxt.udf(is_method=True)
32
42
  def casefold(self: str) -> str:
33
43
  """
@@ -53,26 +63,45 @@ def center(self: str, width: int, fillchar: str = ' ') -> str:
53
63
 
54
64
 
55
65
  @pxt.udf(is_method=True)
56
- def contains(self: str, pattern: str, case: bool = True, flags: int = 0, regex: bool = True) -> bool:
66
+ def contains(self: str, substr: str, case: bool = True) -> bool:
57
67
  """
58
- Test if string contains pattern or regex.
68
+ Test if string contains a substring.
59
69
 
60
70
  Args:
61
- pattern: string literal or regular expression
71
+ substr: string literal to search for
62
72
  case: if False, ignore case
63
- flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
64
- regex: if True, treat pattern as a regular expression
65
73
  """
66
- if regex:
67
- import re
74
+ if case:
75
+ return substr in self
76
+ else:
77
+ return substr.lower() in self.lower()
78
+
68
79
 
69
- if not case:
70
- flags |= re.IGNORECASE
71
- return bool(re.search(pattern, self, flags))
72
- elif case:
73
- return pattern in self
80
+ @contains.to_sql
81
+ def _(self: sql.ColumnElement, substr: sql.ColumnElement, case: sql.ColumnElement | None = None) -> sql.ColumnElement:
82
+ # Replace all occurrences of `%`, `_`, and `\` with escaped versions
83
+ escaped_substr = sql.func.regexp_replace(substr, r'(%|_|\\)', r'\\\1', 'g')
84
+ if case is None:
85
+ # Default `case` is True, so we do a case-sensitive comparison
86
+ return self.like(sql.func.concat('%', escaped_substr, '%'))
74
87
  else:
75
- return pattern.lower() in self.lower()
88
+ # Toggle case-sensitivity based on the value of `case`
89
+ return sql.case(
90
+ (case, self.like(sql.func.concat('%', escaped_substr, '%'))),
91
+ else_=sql.func.lower(self).like(sql.func.concat('%', sql.func.lower(escaped_substr), '%')),
92
+ )
93
+
94
+
95
+ @pxt.udf(is_method=True)
96
+ def contains_re(self: str, pattern: str, flags: int = 0) -> bool:
97
+ """
98
+ Test if string contains a regular expression pattern.
99
+
100
+ Args:
101
+ pattern: regular expression pattern
102
+ flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
103
+ """
104
+ return bool(re.search(pattern, self, flags))
76
105
 
77
106
 
78
107
  @pxt.udf(is_method=True)
@@ -84,22 +113,27 @@ def count(self: str, pattern: str, flags: int = 0) -> int:
84
113
  pattern: regular expression pattern
85
114
  flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
86
115
  """
87
- import re
88
-
89
116
  return builtins.len(re.findall(pattern, self, flags))
90
117
 
91
118
 
92
119
  @pxt.udf(is_method=True)
93
- def endswith(self: str, pattern: str) -> bool:
120
+ def endswith(self: str, substr: str) -> bool:
94
121
  """
95
122
  Return `True` if the string ends with the specified suffix, otherwise return `False`.
96
123
 
97
124
  Equivalent to [`str.endswith()`](https://docs.python.org/3/library/stdtypes.html#str.endswith).
98
125
 
99
126
  Args:
100
- pattern: string literal
127
+ substr: string literal
101
128
  """
102
- return self.endswith(pattern)
129
+ return self.endswith(substr)
130
+
131
+
132
+ @endswith.to_sql
133
+ def _(self: sql.ColumnElement, substr: sql.ColumnElement) -> sql.ColumnElement:
134
+ # Replace all occurrences of `%`, `_`, and `\` with escaped versions
135
+ escaped_substr = sql.func.regexp_replace(substr, r'(%|_|\\)', r'\\\1', 'g')
136
+ return self.like(sql.func.concat('%', escaped_substr))
103
137
 
104
138
 
105
139
  @pxt.udf(is_method=True)
@@ -113,13 +147,11 @@ def fill(self: str, width: int, **kwargs: Any) -> str:
113
147
  width: Maximum line width.
114
148
  kwargs: Additional keyword arguments to pass to `textwrap.fill()`.
115
149
  """
116
- import textwrap
117
-
118
150
  return textwrap.fill(self, width, **kwargs)
119
151
 
120
152
 
121
153
  @pxt.udf(is_method=True)
122
- def find(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] = None) -> int:
154
+ def find(self: str, substr: str, start: int = 0, end: int | None = None) -> int:
123
155
  """
124
156
  Return the lowest index in string where `substr` is found within the slice `s[start:end]`.
125
157
 
@@ -133,6 +165,20 @@ def find(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] =
133
165
  return self.find(substr, start, end)
134
166
 
135
167
 
168
+ @find.to_sql
169
+ def _(
170
+ self: sql.ColumnElement, substr: sql.ColumnElement, start: sql.ColumnElement, end: sql.ColumnElement | None = None
171
+ ) -> sql.ColumnElement:
172
+ sl = pxt.functions.string.slice._to_sql(self, start, end)
173
+ if sl is None:
174
+ return None
175
+
176
+ strpos = sql.func.strpos(sl, substr)
177
+ return sql.case(
178
+ (strpos == 0, -1), (start >= 0, strpos + start - 1), else_=strpos + sql.func.char_length(self) + start - 1
179
+ )
180
+
181
+
136
182
  @pxt.udf(is_method=True)
137
183
  def findall(self: str, pattern: str, flags: int = 0) -> list:
138
184
  """
@@ -144,8 +190,6 @@ def findall(self: str, pattern: str, flags: int = 0) -> list:
144
190
  pattern: regular expression pattern
145
191
  flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
146
192
  """
147
- import re
148
-
149
193
  return re.findall(pattern, self, flags)
150
194
 
151
195
 
@@ -171,8 +215,6 @@ def fullmatch(self: str, pattern: str, case: bool = True, flags: int = 0) -> boo
171
215
  case: if False, ignore case
172
216
  flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
173
217
  """
174
- import re
175
-
176
218
  if not case:
177
219
  flags |= re.IGNORECASE
178
220
  _ = bool(re.fullmatch(pattern, self, flags))
@@ -180,7 +222,7 @@ def fullmatch(self: str, pattern: str, case: bool = True, flags: int = 0) -> boo
180
222
 
181
223
 
182
224
  @pxt.udf(is_method=True)
183
- def index(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] = None) -> int:
225
+ def index(self: str, substr: str, start: int = 0, end: int | None = None) -> int:
184
226
  """
185
227
  Return the lowest index in string where `substr` is found within the slice `[start:end]`.
186
228
  Raises ValueError if `substr` is not found.
@@ -330,6 +372,11 @@ def len(self: str) -> int:
330
372
  return builtins.len(self)
331
373
 
332
374
 
375
+ @len.to_sql
376
+ def _(self: sql.ColumnElement) -> sql.ColumnElement:
377
+ return sql.func.char_length(self)
378
+
379
+
333
380
  @pxt.udf(is_method=True)
334
381
  def ljust(self: str, width: int, fillchar: str = ' ') -> str:
335
382
  """
@@ -355,8 +402,13 @@ def lower(self: str) -> str:
355
402
  return self.lower()
356
403
 
357
404
 
405
+ @lower.to_sql
406
+ def _(self: sql.ColumnElement) -> sql.ColumnElement:
407
+ return sql.func.lower(self)
408
+
409
+
358
410
  @pxt.udf(is_method=True)
359
- def lstrip(self: str, chars: Optional[str] = None) -> str:
411
+ def lstrip(self: str, chars: str | None = None) -> str:
360
412
  """
361
413
  Return a copy of the string with leading characters removed. The `chars` argument is a string specifying the set of
362
414
  characters to be removed. If omitted or `None`, whitespace characters are removed.
@@ -369,6 +421,11 @@ def lstrip(self: str, chars: Optional[str] = None) -> str:
369
421
  return self.lstrip(chars)
370
422
 
371
423
 
424
+ @lstrip.to_sql
425
+ def _(self: sql.ColumnElement, chars: sql.ColumnElement | None = None) -> sql.ColumnElement:
426
+ return sql.func.ltrim(self, chars if chars is not None else whitespace)
427
+
428
+
372
429
  @pxt.udf(is_method=True)
373
430
  def match(self: str, pattern: str, case: bool = True, flags: int = 0) -> bool:
374
431
  """
@@ -379,8 +436,6 @@ def match(self: str, pattern: str, case: bool = True, flags: int = 0) -> bool:
379
436
  case: if False, ignore case
380
437
  flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
381
438
  """
382
- import re
383
-
384
439
  if not case:
385
440
  flags |= re.IGNORECASE
386
441
  return bool(re.match(pattern, self, flags))
@@ -440,9 +495,12 @@ def removeprefix(self: str, prefix: str) -> str:
440
495
  """
441
496
  Remove prefix. If the prefix is not present, returns string.
442
497
  """
443
- if self.startswith(prefix):
444
- return self[builtins.len(prefix) :]
445
- return self
498
+ return self.removeprefix(prefix)
499
+
500
+
501
+ @removeprefix.to_sql
502
+ def _(self: sql.ColumnElement, prefix: sql.ColumnElement) -> sql.ColumnElement:
503
+ return sql.case((startswith._to_sql(self, prefix), sql.func.right(self, -sql.func.char_length(prefix))), else_=self)
446
504
 
447
505
 
448
506
  @pxt.udf(is_method=True)
@@ -450,9 +508,12 @@ def removesuffix(self: str, suffix: str) -> str:
450
508
  """
451
509
  Remove suffix. If the suffix is not present, returns the original string unchanged.
452
510
  """
453
- if self.endswith(suffix):
454
- return self[: -builtins.len(suffix)]
455
- return self
511
+ return self.removesuffix(suffix)
512
+
513
+
514
+ @removesuffix.to_sql
515
+ def _(self: sql.ColumnElement, suffix: sql.ColumnElement) -> sql.ColumnElement:
516
+ return sql.case((endswith._to_sql(self, suffix), sql.func.left(self, -sql.func.char_length(suffix))), else_=self)
456
517
 
457
518
 
458
519
  @pxt.udf(is_method=True)
@@ -463,36 +524,69 @@ def repeat(self: str, n: int) -> str:
463
524
  return self * n
464
525
 
465
526
 
527
+ @repeat.to_sql
528
+ def _(self: sql.ColumnElement, n: sql.ColumnElement) -> sql.ColumnElement:
529
+ return sql.func.repeat(self, n.cast(sql.types.INT))
530
+
531
+
466
532
  @pxt.udf(is_method=True)
467
- def replace(
468
- self: str, pattern: str, repl: str, n: int = -1, case: bool = True, flags: int = 0, regex: bool = False
469
- ) -> str:
533
+ def replace(self: str, substr: str, repl: str, n: int | None = None) -> str:
470
534
  """
471
- Replace occurrences of `pattern` with `repl`.
535
+ Replace occurrences of `substr` with `repl`.
472
536
 
473
- Equivalent to [`str.replace()`](https://docs.python.org/3/library/stdtypes.html#str.replace) or
474
- [`re.sub()`](https://docs.python.org/3/library/re.html#re.sub), depending on the value of regex.
537
+ Equivalent to [`str.replace()`](https://docs.python.org/3/library/stdtypes.html#str.replace).
475
538
 
476
539
  Args:
477
- pattern: string literal or regular expression
540
+ substr: string literal
478
541
  repl: replacement string
479
- n: number of replacements to make (-1 for all)
480
- case: if False, ignore case
542
+ n: number of replacements to make (if `None`, replace all occurrences)
543
+ """
544
+ return self.replace(substr, repl, n or -1)
545
+
546
+
547
+ @replace.to_sql
548
+ def _(
549
+ self: sql.ColumnElement, substr: sql.ColumnElement, repl: sql.ColumnElement, n: sql.ColumnElement | None = None
550
+ ) -> sql.ColumnElement:
551
+ if n is not None:
552
+ return None # SQL does not support bounding the number of replacements
553
+
554
+ return sql.func.replace(self, substr, repl)
555
+
556
+
557
+ @pxt.udf(is_method=True)
558
+ def replace_re(self: str, pattern: str, repl: str, n: int | None = None, flags: int = 0) -> str:
559
+ """
560
+ Replace occurrences of a regular expression pattern with `repl`.
561
+
562
+ Equivalent to [`re.sub()`](https://docs.python.org/3/library/re.html#re.sub).
563
+
564
+ Args:
565
+ pattern: regular expression pattern
566
+ repl: replacement string
567
+ n: number of replacements to make (if `None`, replace all occurrences)
481
568
  flags: [flags](https://docs.python.org/3/library/re.html#flags) for the `re` module
482
- regex: if True, treat pattern as a regular expression
483
569
  """
484
- if regex:
485
- import re
570
+ return re.sub(pattern, repl, self, count=(n or 0), flags=flags)
571
+
572
+
573
+ @pxt.udf(is_method=True)
574
+ def reverse(self: str) -> str:
575
+ """
576
+ Return a reversed copy of the string.
577
+
578
+ Equivalent to `str[::-1]`.
579
+ """
580
+ return self[::-1]
486
581
 
487
- if not case:
488
- flags |= re.IGNORECASE
489
- return re.sub(pattern, repl, self, count=(0 if n == -1 else n), flags=flags)
490
- else:
491
- return self.replace(pattern, repl, n)
582
+
583
+ @reverse.to_sql
584
+ def _(self: sql.ColumnElement) -> sql.ColumnElement:
585
+ return sql.func.reverse(self)
492
586
 
493
587
 
494
588
  @pxt.udf(is_method=True)
495
- def rfind(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] = None) -> int:
589
+ def rfind(self: str, substr: str, start: int | None = 0, end: int | None = None) -> int:
496
590
  """
497
591
  Return the highest index where `substr` is found, such that `substr` is contained within `[start:end]`.
498
592
 
@@ -507,7 +601,7 @@ def rfind(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] =
507
601
 
508
602
 
509
603
  @pxt.udf(is_method=True)
510
- def rindex(self: str, substr: str, start: Optional[int] = 0, end: Optional[int] = None) -> int:
604
+ def rindex(self: str, substr: str, start: int | None = 0, end: int | None = None) -> int:
511
605
  """
512
606
  Return the highest index where `substr` is found, such that `substr` is contained within `[start:end]`.
513
607
  Raises ValueError if `substr` is not found.
@@ -544,7 +638,7 @@ def rpartition(self: str, sep: str = ' ') -> list:
544
638
 
545
639
 
546
640
  @pxt.udf(is_method=True)
547
- def rstrip(self: str, chars: Optional[str] = None) -> str:
641
+ def rstrip(self: str, chars: str | None = None) -> str:
548
642
  """
549
643
  Return a copy of string with trailing characters removed.
550
644
 
@@ -556,8 +650,13 @@ def rstrip(self: str, chars: Optional[str] = None) -> str:
556
650
  return self.rstrip(chars)
557
651
 
558
652
 
653
+ @rstrip.to_sql
654
+ def _(self: sql.ColumnElement, chars: sql.ColumnElement | None = None) -> sql.ColumnElement:
655
+ return sql.func.rtrim(self, chars if chars is not None else whitespace)
656
+
657
+
559
658
  @pxt.udf(is_method=True)
560
- def slice(self: str, start: Optional[int] = None, stop: Optional[int] = None, step: Optional[int] = None) -> str:
659
+ def slice(self: str, start: int | None = None, stop: int | None = None, step: int | None = None) -> str:
561
660
  """
562
661
  Return a slice.
563
662
 
@@ -569,10 +668,43 @@ def slice(self: str, start: Optional[int] = None, stop: Optional[int] = None, st
569
668
  return self[start:stop:step]
570
669
 
571
670
 
572
- @pxt.udf(is_method=True)
573
- def slice_replace(
574
- self: str, start: Optional[int] = None, stop: Optional[int] = None, repl: Optional[str] = None
575
- ) -> str:
671
+ @slice.to_sql
672
+ def _(
673
+ self: sql.ColumnElement,
674
+ start: sql.ColumnElement | None = None,
675
+ stop: sql.ColumnElement | None = None,
676
+ step: sql.ColumnElement | None = None,
677
+ ) -> sql.ColumnElement:
678
+ if step is not None:
679
+ return None
680
+
681
+ if start is not None:
682
+ start = start.cast(sql.types.INT) # Postgres won't accept a BIGINT
683
+ start = sql.case(
684
+ (start >= 0, start + 1), # SQL is 1-based, Python is 0-based
685
+ else_=sql.func.char_length(self) + start + 1, # negative index
686
+ )
687
+ start = sql.func.greatest(start, 1)
688
+
689
+ if stop is not None:
690
+ stop = stop.cast(sql.types.INT) # Postgres won't accept a BIGINT
691
+ stop = sql.case(
692
+ (stop >= 0, stop + 1), # SQL is 1-based, Python is 0-based
693
+ else_=sql.func.char_length(self) + stop + 1, # negative index
694
+ )
695
+ stop = sql.func.greatest(stop, 0)
696
+
697
+ if start is None:
698
+ if stop is None:
699
+ return self
700
+ return sql.func.substr(self, 1, stop)
701
+ if stop is None:
702
+ return sql.func.substr(self, start)
703
+ return sql.func.substr(self, start, sql.func.greatest(stop - start, 0))
704
+
705
+
706
+ @pxt.udf(is_method=True)
707
+ def slice_replace(self: str, start: int | None = None, stop: int | None = None, repl: str | None = None) -> str:
576
708
  """
577
709
  Replace a positional slice with another value.
578
710
 
@@ -585,20 +717,27 @@ def slice_replace(
585
717
 
586
718
 
587
719
  @pxt.udf(is_method=True)
588
- def startswith(self: str, pattern: str) -> int:
720
+ def startswith(self: str, substr: str) -> int:
589
721
  """
590
- Return `True` if string starts with `pattern`, otherwise return `False`.
722
+ Return `True` if string starts with `substr`, otherwise return `False`.
591
723
 
592
724
  Equivalent to [`str.startswith()`](https://docs.python.org/3/library/stdtypes.html#str.startswith).
593
725
 
594
726
  Args:
595
- pattern: string literal
727
+ substr: string literal
596
728
  """
597
- return self.startswith(pattern)
729
+ return self.startswith(substr)
730
+
731
+
732
+ @startswith.to_sql
733
+ def _(self: sql.ColumnElement, substr: sql.ColumnElement) -> sql.ColumnElement:
734
+ # Replace all occurrences of `%`, `_`, and `\` with escaped versions
735
+ escaped_substr = sql.func.regexp_replace(substr, r'(%|_|\\)', r'\\\1', 'g')
736
+ return self.like(sql.func.concat(escaped_substr, '%'))
598
737
 
599
738
 
600
739
  @pxt.udf(is_method=True)
601
- def strip(self: str, chars: Optional[str] = None) -> str:
740
+ def strip(self: str, chars: str | None = None) -> str:
602
741
  """
603
742
  Return a copy of string with leading and trailing characters removed.
604
743
 
@@ -610,6 +749,11 @@ def strip(self: str, chars: Optional[str] = None) -> str:
610
749
  return self.strip(chars)
611
750
 
612
751
 
752
+ @strip.to_sql
753
+ def _(self: sql.ColumnElement, chars: sql.ColumnElement | None = None) -> sql.ColumnElement:
754
+ return sql.func.trim(self, chars if chars is not None else whitespace)
755
+
756
+
613
757
  @pxt.udf(is_method=True)
614
758
  def swapcase(self: str) -> str:
615
759
  """
@@ -641,6 +785,11 @@ def upper(self: str) -> str:
641
785
  return self.upper()
642
786
 
643
787
 
788
+ @upper.to_sql
789
+ def _(self: sql.ColumnElement) -> sql.ColumnElement:
790
+ return sql.func.upper(self)
791
+
792
+
644
793
  @pxt.udf(is_method=True)
645
794
  def wrap(self: str, width: int, **kwargs: Any) -> list[str]:
646
795
  """
@@ -653,8 +802,6 @@ def wrap(self: str, width: int, **kwargs: Any) -> list[str]:
653
802
  width: Maximum line width.
654
803
  kwargs: Additional keyword arguments to pass to `textwrap.wrap()`.
655
804
  """
656
- import textwrap
657
-
658
805
  return textwrap.wrap(self, width, **kwargs)
659
806
 
660
807
 
@@ -671,6 +818,29 @@ def zfill(self: str, width: int) -> str:
671
818
  return self.zfill(width)
672
819
 
673
820
 
821
+ def string_splitter(text: Any, separators: str) -> tuple[type[pxt.iterators.ComponentIterator], dict[str, Any]]:
822
+ """Iterator over chunks of a string. The string is chunked according to the specified `separators`.
823
+
824
+ The iterator yields a `text` field containing the text of the chunk.
825
+ Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
826
+
827
+ Args:
828
+ separators: separators to use to chunk the string. Currently the only supported option is `'sentence'`.
829
+
830
+ Examples:
831
+ This example assumes an existing table `tbl` with a column `text` of type `pxt.String`.
832
+
833
+ Create a view that splits all strings on sentence boundaries:
834
+
835
+ >>> pxt.create_view(
836
+ ... 'sentence_chunks',
837
+ ... tbl,
838
+ ... iterator=string_splitter(tbl.text, separators='sentence')
839
+ ... )
840
+ """
841
+ return pxt.iterators.string.StringSplitter._create(text=text, separators=separators)
842
+
843
+
674
844
  __all__ = local_public_names(__name__)
675
845
 
676
846
 
@@ -1,5 +1,5 @@
1
1
  """
2
- Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `TimestampType`.
2
+ Pixeltable UDFs for `TimestampType`.
3
3
 
4
4
  Usage example:
5
5
  ```python
@@ -11,7 +11,6 @@ t.select(t.timestamp_col.year, t.timestamp_col.weekday()).collect()
11
11
  """
12
12
 
13
13
  from datetime import datetime
14
- from typing import Optional
15
14
 
16
15
  import sqlalchemy as sql
17
16
 
@@ -134,7 +133,8 @@ def astimezone(self: datetime, tz: str) -> datetime:
134
133
  Convert the datetime to the given time zone.
135
134
 
136
135
  Args:
137
- tz: The time zone to convert to. Must be a valid time zone name from the IANA Time Zone Database.
136
+ tz: The time zone to convert to. Must be a valid time zone name from the
137
+ [IANA Time Zone Database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones).
138
138
  """
139
139
  from zoneinfo import ZoneInfo
140
140
 
@@ -237,12 +237,12 @@ def _(
237
237
  microsecond: sql.ColumnElement = _SQL_ZERO,
238
238
  ) -> sql.ColumnElement:
239
239
  return sql.func.make_timestamptz(
240
- sql.cast(year, sql.Integer),
241
- sql.cast(month, sql.Integer),
242
- sql.cast(day, sql.Integer),
243
- sql.cast(hour, sql.Integer),
244
- sql.cast(minute, sql.Integer),
245
- sql.cast(second + microsecond / 1000000.0, sql.Float),
240
+ year.cast(sql.Integer),
241
+ month.cast(sql.Integer),
242
+ day.cast(sql.Integer),
243
+ hour.cast(sql.Integer),
244
+ minute.cast(sql.Integer),
245
+ (second + microsecond / 1000000.0).cast(sql.Float),
246
246
  )
247
247
 
248
248
 
@@ -271,13 +271,13 @@ def _(
271
271
  @pxt.udf(is_method=True)
272
272
  def replace(
273
273
  self: datetime,
274
- year: Optional[int] = None,
275
- month: Optional[int] = None,
276
- day: Optional[int] = None,
277
- hour: Optional[int] = None,
278
- minute: Optional[int] = None,
279
- second: Optional[int] = None,
280
- microsecond: Optional[int] = None,
274
+ year: int | None = None,
275
+ month: int | None = None,
276
+ day: int | None = None,
277
+ hour: int | None = None,
278
+ minute: int | None = None,
279
+ second: int | None = None,
280
+ microsecond: int | None = None,
281
281
  ) -> datetime:
282
282
  """
283
283
  Return a datetime with the same attributes, except for those attributes given new values by whichever keyword