pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -4,9 +4,10 @@ import asyncio
4
4
  import datetime
5
5
  import inspect
6
6
  import logging
7
+ import re
7
8
  import sys
8
9
  import time
9
- from typing import Awaitable, Collection, Optional
10
+ from typing import Any, Awaitable, Collection
10
11
 
11
12
  from pixeltable import env, func
12
13
  from pixeltable.config import Config
@@ -34,7 +35,7 @@ class RateLimitsScheduler(Scheduler):
34
35
  get_request_resources_param_names: list[str] # names of parameters of RateLimitsInfo.get_request_resources()
35
36
 
36
37
  # scheduling-related state
37
- pool_info: Optional[env.RateLimitsInfo]
38
+ pool_info: env.RateLimitsInfo | None
38
39
  est_usage: dict[str, int] # value per resource; accumulated estimates since the last util. report
39
40
 
40
41
  num_in_flight: int # unfinished tasks
@@ -76,10 +77,12 @@ class RateLimitsScheduler(Scheduler):
76
77
  self.est_usage = dict.fromkeys(self._resources, 0)
77
78
 
78
79
  async def _main_loop(self) -> None:
79
- item: Optional[RateLimitsScheduler.QueueItem] = None
80
+ item: RateLimitsScheduler.QueueItem | None = None
80
81
  while True:
81
82
  if item is None:
82
83
  item = await self.queue.get()
84
+ assert isinstance(item.request.fn_call.fn, func.CallableFunction)
85
+ assert '_runtime_ctx' in item.request.fn_call.fn.signature.system_parameters
83
86
  if item.num_retries > 0:
84
87
  self.total_retried += 1
85
88
 
@@ -96,12 +99,11 @@ class RateLimitsScheduler(Scheduler):
96
99
  continue
97
100
 
98
101
  # check rate limits
99
- _logger.debug(f'checking rate limits for {self.resource_pool}')
100
102
  request_resources = self._get_request_resources(item.request)
101
103
  limits_info = self._check_resource_limits(request_resources)
102
104
  aws: list[Awaitable[None]] = []
103
- completed_aw: Optional[asyncio.Task] = None
104
- wait_for_reset: Optional[asyncio.Task] = None
105
+ completed_aw: asyncio.Task | None = None
106
+ wait_for_reset: asyncio.Task | None = None
105
107
  if limits_info is not None:
106
108
  # limits_info's resource is depleted, wait for capacity to free up
107
109
 
@@ -115,21 +117,31 @@ class RateLimitsScheduler(Scheduler):
115
117
  reset_at = limits_info.reset_at
116
118
  if reset_at > now:
117
119
  # we're waiting for the rate limit to reset
118
- wait_for_reset = asyncio.create_task(asyncio.sleep((reset_at - now).total_seconds()))
120
+ wait_duration = (reset_at - now).total_seconds()
121
+ wait_for_reset = asyncio.create_task(asyncio.sleep(wait_duration))
119
122
  aws.append(wait_for_reset)
120
- _logger.debug(f'waiting for rate limit reset for {self.resource_pool}')
123
+ _logger.debug(
124
+ f'waiting {wait_duration:.2f}s for rate limit reset of '
125
+ f'{self.resource_pool}:{limits_info.resource} (remaining={limits_info.remaining})'
126
+ )
121
127
 
122
128
  if len(aws) > 0:
123
129
  # we have something to wait for
130
+ report_ts = limits_info.recorded_at
124
131
  done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
125
132
  for task in pending:
126
133
  task.cancel()
127
134
  if completed_aw in done:
128
135
  _logger.debug(f'wait(): completed request for {self.resource_pool}')
129
136
  if wait_for_reset in done:
130
- _logger.debug(f'wait(): rate limit reset for {self.resource_pool}')
131
- # force waiting for another rate limit report before making any scheduling decisions
132
- self.pool_info.reset()
137
+ _logger.debug(f'wait(): rate limit reset for {self.resource_pool}:{limits_info.resource}')
138
+ last_report_ts = self.pool_info.resource_limits[limits_info.resource].recorded_at
139
+ if report_ts == last_report_ts:
140
+ # if we haven't seen a new report since we started waiting, force waiting for another rate limit
141
+ # report before making any scheduling decisions
142
+ # TODO: is it a good idea to discard the information we have?
143
+ _logger.debug(f'resetting {self.resource_pool}: currently at {self.pool_info.debug_str()}')
144
+ self.pool_info.reset()
133
145
  # re-evaluate current capacity for current item
134
146
  continue
135
147
 
@@ -155,18 +167,24 @@ class RateLimitsScheduler(Scheduler):
155
167
  constant_kwargs, batch_kwargs = request.pxt_fn.create_batch_kwargs(batch_kwargs)
156
168
  return self.pool_info.get_request_resources(**constant_kwargs, **batch_kwargs)
157
169
 
158
- def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
170
+ def _check_resource_limits(self, request_resources: dict[str, int]) -> env.RateLimitInfo | None:
159
171
  """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
160
- candidates: list[tuple[env.RateLimitInfo, float]] = [] # (info, relative usage)
172
+ candidates: list[tuple[env.RateLimitInfo, float]] = [] # (info, relative remaining)
161
173
  for resource, usage in request_resources.items():
162
- # 0.05: leave some headroom, we don't have perfect information
163
174
  info = self.pool_info.resource_limits[resource]
164
175
  est_remaining = info.remaining - self.est_usage[resource] - usage
165
- if est_remaining < 0.05 * info.limit:
166
- candidates.append((info, est_remaining / info.limit))
167
- if len(candidates) == 0:
168
- return None
169
- return min(candidates, key=lambda x: x[1])[0]
176
+ candidates.append((info, est_remaining / info.limit))
177
+ assert len(candidates) > 0
178
+ candidates.sort(key=lambda x: x[1]) # most depleted first
179
+ most_depleted = candidates[0]
180
+ _logger.debug(
181
+ f'check_resource_limits({request_resources}): '
182
+ f'most_depleted={most_depleted[0].resource}, rel_remaining={most_depleted[1]}'
183
+ )
184
+ # 0.05: leave some headroom, we don't have perfect information
185
+ if most_depleted[1] < 0.05:
186
+ return most_depleted[0]
187
+ return None
170
188
 
171
189
  async def _exec(self, request: FnCallArgs, exec_ctx: ExecCtx, num_retries: int, is_task: bool) -> None:
172
190
  assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
@@ -187,7 +205,8 @@ class RateLimitsScheduler(Scheduler):
187
205
  for row, result in zip(request.rows, batch_result):
188
206
  row[request.fn_call.slot_idx] = result
189
207
  else:
190
- result = await pxt_fn.aexec(*request.args, **request.kwargs)
208
+ request_kwargs = {**request.kwargs, '_runtime_ctx': env.RuntimeCtx(is_retry=num_retries > 0)}
209
+ result = await pxt_fn.aexec(*request.args, **request_kwargs)
191
210
  request.row[request.fn_call.slot_idx] = result
192
211
  end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
193
212
  _logger.debug(
@@ -201,10 +220,14 @@ class RateLimitsScheduler(Scheduler):
201
220
  self.dispatcher.dispatch(request.rows, exec_ctx)
202
221
  except Exception as exc:
203
222
  _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
223
+ if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
224
+ _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
204
225
  if self.pool_info is None:
205
226
  # our pool info should be available at this point
206
227
  self._set_pool_info()
207
228
  assert self.pool_info is not None
229
+ self.pool_info.record_exc(exc)
230
+
208
231
  if num_retries < self.MAX_RETRIES:
209
232
  retry_delay = self.pool_info.get_retry_delay(exc)
210
233
  if retry_delay is not None:
@@ -213,7 +236,6 @@ class RateLimitsScheduler(Scheduler):
213
236
  await asyncio.sleep(retry_delay)
214
237
  self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
215
238
  return
216
- # TODO: update resource limits reported in exc.response.headers, if present
217
239
 
218
240
  # record the exception
219
241
  _, _, exc_tb = sys.exc_info()
@@ -248,10 +270,23 @@ class RequestRateScheduler(Scheduler):
248
270
  num_in_flight: int
249
271
  total_requests: int
250
272
  total_retried: int
273
+ total_errors: int
251
274
 
252
275
  TIME_FORMAT = '%H:%M.%S %f'
253
- MAX_RETRIES = 10
276
+ MAX_RETRIES = 3
254
277
  DEFAULT_RATE_LIMIT = 600 # requests per minute
278
+ RATE_LIMIT_INDICATORS = ('rate limit', 'too many requests', '429', 'quota exceeded', 'throttled', 'rate exceeded')
279
+ RETRY_AFTER_PATTERNS = (
280
+ r'retry after (\d+(?:\.\d+)?)\s*seconds?',
281
+ r'try again in (\d+(?:\.\d+)?)\s*seconds?',
282
+ r'wait (\d+(?:\.\d+)?)\s*seconds?',
283
+ r'retry-after:\s*(\d+(?:\.\d+)?)',
284
+ )
285
+
286
+ # Exponential backoff defaults
287
+ BASE_RETRY_DELAY = 1.0 # in seconds
288
+ MAX_RETRY_DELAY = 60.0 # in seconds
289
+ RETRY_BACKOFF_MULTIPLIER = 2.0
255
290
 
256
291
  def __init__(self, resource_pool: str, dispatcher: Dispatcher):
257
292
  super().__init__(resource_pool, dispatcher)
@@ -260,6 +295,7 @@ class RequestRateScheduler(Scheduler):
260
295
  self.num_in_flight = 0
261
296
  self.total_requests = 0
262
297
  self.total_retried = 0
298
+ self.total_errors = 0
263
299
 
264
300
  # try to get the rate limit from the config
265
301
  elems = resource_pool.split(':')
@@ -278,6 +314,7 @@ class RequestRateScheduler(Scheduler):
278
314
  key = model
279
315
  requests_per_min = Config.get().get_int_value(key, section=section)
280
316
  requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
317
+ _logger.debug(f'rate limit for {self.resource_pool}: {requests_per_min} RPM')
281
318
  self.secs_per_request = 1 / (requests_per_min / 60)
282
319
 
283
320
  @classmethod
@@ -291,8 +328,12 @@ class RequestRateScheduler(Scheduler):
291
328
  if item.num_retries > 0:
292
329
  self.total_retried += 1
293
330
  now = time.monotonic()
331
+ wait_duration = 0.0
332
+ if item.retry_after is not None:
333
+ wait_duration = item.retry_after - now
294
334
  if now - last_request_ts < self.secs_per_request:
295
- wait_duration = self.secs_per_request - (now - last_request_ts)
335
+ wait_duration = max(wait_duration, self.secs_per_request - (now - last_request_ts))
336
+ if wait_duration > 0:
296
337
  _logger.debug(f'waiting for {wait_duration} for {self.resource_pool}')
297
338
  await asyncio.sleep(wait_duration)
298
339
 
@@ -337,15 +378,21 @@ class RequestRateScheduler(Scheduler):
337
378
  self.dispatcher.dispatch(request.rows, exec_ctx)
338
379
 
339
380
  except Exception as exc:
340
- # TODO: which exception can be retried?
341
- _logger.debug(f'exception for {self.resource_pool}: {exc}')
342
- status = getattr(exc, 'status', None)
343
- _logger.debug(f'type={type(exc)} has_status={hasattr(exc, "status")} status={status}')
344
- if num_retries < self.MAX_RETRIES:
345
- self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
381
+ _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
382
+ if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
383
+ _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
384
+ is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
385
+ if is_rate_limit_error and num_retries < self.MAX_RETRIES:
386
+ retry_delay = self._compute_retry_delay(num_retries, retry_after)
387
+ _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
388
+ now = time.monotonic()
389
+ # put the request back in the queue right away, which prevents new requests from being generated until
390
+ # this one succeeds or exceeds its retry limit
391
+ self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx, retry_after=now + retry_delay))
346
392
  return
347
393
 
348
394
  # record the exception
395
+ self.total_errors += 1
349
396
  _, _, exc_tb = sys.exc_info()
350
397
  for row in request.rows:
351
398
  row.set_exc(request.fn_call.slot_idx, exc)
@@ -353,11 +400,124 @@ class RequestRateScheduler(Scheduler):
353
400
  finally:
354
401
  _logger.debug(
355
402
  f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, '
356
- f'#retried={self.total_retried}'
403
+ f'#retried={self.total_retried} #errors={self.total_errors}'
357
404
  )
358
405
  if is_task:
359
406
  self.num_in_flight -= 1
360
407
 
408
+ def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, float | None]:
409
+ """Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
410
+ from http import HTTPStatus
411
+
412
+ # Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
413
+ # We look for attributes that contain status codes, instead of checking the type of the exception,
414
+ # in order to handle a wider variety of exception classes.
415
+ is_rate_limit_error = False
416
+ retry_delay: float | None = None
417
+
418
+ # requests.HTTPError/httpx.HTTPStatusError
419
+ if (
420
+ hasattr(exc, 'response')
421
+ and hasattr(exc.response, 'status_code')
422
+ and exc.response.status_code == HTTPStatus.TOO_MANY_REQUESTS.value
423
+ ):
424
+ is_rate_limit_error = True
425
+ retry_delay = self._extract_retry_delay_from_headers(exc.response.headers)
426
+ elif (
427
+ # urllib.error.HTTPError
428
+ (hasattr(exc, 'code') and exc.code == HTTPStatus.TOO_MANY_REQUESTS.value)
429
+ # aiohttp.ClientResponseError
430
+ or (hasattr(exc, 'status') and exc.status == HTTPStatus.TOO_MANY_REQUESTS.value)
431
+ ) and hasattr(exc, 'headers'):
432
+ is_rate_limit_error = True
433
+ retry_delay = self._extract_retry_delay_from_headers(exc.headers)
434
+
435
+ if is_rate_limit_error:
436
+ return True, retry_delay
437
+
438
+ # Check common rate limit keywords in exception message
439
+ error_msg = str(exc).lower()
440
+ if any(indicator in error_msg for indicator in self.RATE_LIMIT_INDICATORS):
441
+ retry_delay = self._extract_retry_delay_from_message(error_msg)
442
+ return True, retry_delay
443
+
444
+ return False, None
445
+
446
+ def _extract_retry_delay_from_headers(self, headers: Any | None) -> float | None:
447
+ """Extract retry delay from HTTP headers."""
448
+ if headers is None:
449
+ return None
450
+
451
+ # convert headers to dict-like object for consistent access
452
+ header_dict: dict
453
+ if hasattr(headers, 'get'):
454
+ header_dict = headers
455
+ else:
456
+ # headers are a list of tuples or other format
457
+ try:
458
+ header_dict = dict(headers)
459
+ except (TypeError, ValueError):
460
+ return None
461
+ # normalize dict keys: lowercase and remove dashes
462
+ header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
463
+
464
+ # check Retry-After header
465
+ retry_after = header_dict.get('retryafter')
466
+ if retry_after is not None:
467
+ try:
468
+ return float(retry_after)
469
+ except (ValueError, TypeError):
470
+ pass
471
+
472
+ # check X-RateLimit-Reset (Unix timestamp)
473
+ reset_time = header_dict.get('xratelimitreset')
474
+ if reset_time is not None:
475
+ try:
476
+ reset_timestamp = float(reset_time)
477
+ delay = max(0, reset_timestamp - time.time())
478
+ return delay
479
+ except (ValueError, TypeError):
480
+ pass
481
+
482
+ # check X-RateLimit-Reset-After (seconds from now)
483
+ reset_after = header_dict.get('xratelimitresetafter')
484
+ if reset_after is not None:
485
+ try:
486
+ return float(reset_after)
487
+ except (ValueError, TypeError):
488
+ pass
489
+
490
+ return None
491
+
492
+ def _extract_retry_delay_from_message(self, msg: str) -> float | None:
493
+ msg_lower = msg.lower()
494
+ for pattern in self.RETRY_AFTER_PATTERNS:
495
+ match = re.search(pattern, msg_lower)
496
+ if match is not None:
497
+ try:
498
+ return float(match.group(1))
499
+ except (ValueError, TypeError):
500
+ continue
501
+ return None
502
+
503
+ def _compute_retry_delay(self, num_retries: int, retry_after: float | None = None) -> float:
504
+ """
505
+ Calculate exponential backoff delay for rate limit errors.
506
+
507
+ Args:
508
+ retry_count: Number of retries attempted (0-based)
509
+ retry_after: Suggested delay from Retry-After header
510
+
511
+ Returns:
512
+ Delay in seconds
513
+ """
514
+ if retry_after is not None and retry_after > 0:
515
+ # Use server-suggested delay, but cap it at max_delay
516
+ return max(min(retry_after, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
517
+ else:
518
+ delay = self.BASE_RETRY_DELAY * (self.RETRY_BACKOFF_MULTIPLIER**num_retries)
519
+ return max(min(delay, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
520
+
361
521
 
362
522
  # all concrete Scheduler subclasses that implement matches()
363
523
  SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+
5
+ from pixeltable.exprs import ArrayMd
6
+ from pixeltable.utils.misc import non_none_dict_factory
7
+
8
+ INLINED_OBJECT_MD_KEY = '__pxtinlinedobjmd__'
9
+
10
+
11
@dataclasses.dataclass
class InlinedObjectMd:
    """
    Metadata record for an inlined object, convertible to and from a plain dict.

    NOTE(review): the exact meaning of url_idx/img_start/img_end is not visible in this module; presumably they
    locate the object's data (an index into a list of urls and a byte range) -- confirm against the code that
    produces and consumes this metadata.
    """

    type: str  # corresponds to ts.ColumnType.Type
    url_idx: int
    img_start: int | None = None
    img_end: int | None = None
    array_md: ArrayMd | None = None

    @classmethod
    def from_dict(cls, d: dict) -> InlinedObjectMd:
        """
        Create an instance from a dict of field values, such as the one produced by as_dict().

        A nested 'array_md' dict is converted into an ArrayMd; a missing or None 'array_md' leaves the field
        unset. `d` itself is not modified.
        """
        # work on a shallow copy so the caller's dict keeps its 'array_md' entry
        field_vals = dict(d)
        array_md_dict = field_vals.pop('array_md', None)
        if array_md_dict is not None:
            field_vals['array_md'] = ArrayMd(**array_md_dict)
        return cls(**field_vals)

    def as_dict(self) -> dict:
        """
        Convert to a plain dict; the counterpart of from_dict().

        The dict is built with non_none_dict_factory (judging by its name, this omits fields that are None --
        confirm in pixeltable.utils.misc).
        """
        result = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
        if self.array_md is not None:
            # use ArrayMd's own serialization rather than the generic dataclass conversion
            result['array_md'] = self.array_md.as_dict()
        return result
@@ -1,8 +1,8 @@
1
1
  import logging
2
- from typing import Any, AsyncIterator, Optional
2
+ from typing import Any, AsyncIterator
3
3
 
4
4
  from pixeltable import catalog, exprs
5
- from pixeltable.utils.media_store import MediaStore
5
+ from pixeltable.utils.local_store import TempStore
6
6
 
7
7
  from .data_row_batch import DataRowBatch
8
8
  from .exec_node import ExecNode
@@ -23,7 +23,7 @@ class InMemoryDataNode(ExecNode):
23
23
 
24
24
  input_rows: list[dict[str, Any]]
25
25
  start_row_id: int
26
- output_rows: Optional[DataRowBatch]
26
+ output_batch: DataRowBatch | None
27
27
 
28
28
  # output_exprs is declared in the superclass, but we redeclare it here with a more specific type
29
29
  output_exprs: list[exprs.ColumnRef]
@@ -42,7 +42,7 @@ class InMemoryDataNode(ExecNode):
42
42
  self.tbl = tbl
43
43
  self.input_rows = rows
44
44
  self.start_row_id = start_row_id
45
- self.output_rows = None
45
+ self.output_batch = None
46
46
 
47
47
  def _open(self) -> None:
48
48
  """Create row batch and populate with self.input_rows"""
@@ -56,22 +56,21 @@ class InMemoryDataNode(ExecNode):
56
56
  }
57
57
  output_slot_idxs = {e.slot_idx for e in self.output_exprs}
58
58
 
59
- self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
60
- for row_idx, input_row in enumerate(self.input_rows):
59
+ self.output_batch = DataRowBatch(self.row_builder)
60
+ for input_row in self.input_rows:
61
+ output_row = self.row_builder.make_row()
61
62
  # populate the output row with the values provided in the input row
62
63
  input_slot_idxs: set[int] = set()
63
64
  for col_name, val in input_row.items():
64
65
  col_info = user_cols_by_name.get(col_name)
65
66
  assert col_info is not None
66
-
67
- if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
68
- # this is a literal image, ie, a sequence of bytes; we save this as a media file and store the path
69
- path = str(MediaStore.prepare_media_path(self.tbl.id, col_info.col.id, self.tbl.get().version))
70
- with open(path, 'wb') as fp:
71
- fp.write(val)
72
- self.output_rows[row_idx][col_info.slot_idx] = path
67
+ col = col_info.col
68
+ if col.col_type.is_image_type() and isinstance(val, bytes):
69
+ # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
70
+ filepath, _ = TempStore.save_media_object(val, col, format=None)
71
+ output_row[col_info.slot_idx] = str(filepath)
73
72
  else:
74
- self.output_rows[row_idx][col_info.slot_idx] = val
73
+ output_row[col_info.slot_idx] = val
75
74
 
76
75
  input_slot_idxs.add(col_info.slot_idx)
77
76
 
@@ -80,10 +79,11 @@ class InMemoryDataNode(ExecNode):
80
79
  for slot_idx in missing_slot_idxs:
81
80
  col_info = output_cols_by_idx.get(slot_idx)
82
81
  assert col_info is not None
83
- self.output_rows[row_idx][col_info.slot_idx] = None
82
+ output_row[col_info.slot_idx] = None
83
+ self.output_batch.add_row(output_row)
84
84
 
85
- self.ctx.num_rows = len(self.output_rows)
85
+ self.ctx.num_rows = len(self.output_batch)
86
86
 
87
87
  async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
88
- _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
89
- yield self.output_rows
88
+ _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_batch)} rows')
89
+ yield self.output_batch