pixeltable 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (147)
  1. pixeltable/__init__.py +64 -11
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/catalog.py +50 -27
  5. pixeltable/catalog/column.py +27 -11
  6. pixeltable/catalog/dir.py +6 -4
  7. pixeltable/catalog/globals.py +8 -1
  8. pixeltable/catalog/insertable_table.py +22 -12
  9. pixeltable/catalog/named_function.py +10 -6
  10. pixeltable/catalog/path.py +3 -2
  11. pixeltable/catalog/path_dict.py +8 -6
  12. pixeltable/catalog/schema_object.py +2 -1
  13. pixeltable/catalog/table.py +121 -101
  14. pixeltable/catalog/table_version.py +291 -142
  15. pixeltable/catalog/table_version_path.py +8 -5
  16. pixeltable/catalog/view.py +67 -26
  17. pixeltable/dataframe.py +102 -72
  18. pixeltable/env.py +20 -21
  19. pixeltable/exec/__init__.py +2 -2
  20. pixeltable/exec/aggregation_node.py +10 -4
  21. pixeltable/exec/cache_prefetch_node.py +5 -3
  22. pixeltable/exec/component_iteration_node.py +9 -8
  23. pixeltable/exec/data_row_batch.py +21 -10
  24. pixeltable/exec/exec_context.py +10 -3
  25. pixeltable/exec/exec_node.py +23 -12
  26. pixeltable/exec/expr_eval/evaluators.py +13 -7
  27. pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
  28. pixeltable/exec/expr_eval/globals.py +30 -7
  29. pixeltable/exec/expr_eval/row_buffer.py +5 -6
  30. pixeltable/exec/expr_eval/schedulers.py +151 -31
  31. pixeltable/exec/in_memory_data_node.py +8 -7
  32. pixeltable/exec/row_update_node.py +15 -5
  33. pixeltable/exec/sql_node.py +56 -27
  34. pixeltable/exprs/__init__.py +2 -2
  35. pixeltable/exprs/arithmetic_expr.py +57 -26
  36. pixeltable/exprs/array_slice.py +1 -1
  37. pixeltable/exprs/column_property_ref.py +2 -1
  38. pixeltable/exprs/column_ref.py +20 -15
  39. pixeltable/exprs/comparison.py +6 -2
  40. pixeltable/exprs/compound_predicate.py +1 -3
  41. pixeltable/exprs/data_row.py +2 -2
  42. pixeltable/exprs/expr.py +101 -72
  43. pixeltable/exprs/expr_dict.py +2 -1
  44. pixeltable/exprs/expr_set.py +3 -1
  45. pixeltable/exprs/function_call.py +39 -41
  46. pixeltable/exprs/globals.py +1 -0
  47. pixeltable/exprs/in_predicate.py +2 -2
  48. pixeltable/exprs/inline_expr.py +20 -17
  49. pixeltable/exprs/json_mapper.py +4 -2
  50. pixeltable/exprs/json_path.py +12 -18
  51. pixeltable/exprs/literal.py +5 -9
  52. pixeltable/exprs/method_ref.py +1 -0
  53. pixeltable/exprs/object_ref.py +1 -1
  54. pixeltable/exprs/row_builder.py +32 -17
  55. pixeltable/exprs/rowid_ref.py +14 -5
  56. pixeltable/exprs/similarity_expr.py +11 -6
  57. pixeltable/exprs/sql_element_cache.py +1 -1
  58. pixeltable/exprs/type_cast.py +24 -9
  59. pixeltable/ext/__init__.py +1 -0
  60. pixeltable/ext/functions/__init__.py +1 -0
  61. pixeltable/ext/functions/whisperx.py +2 -2
  62. pixeltable/ext/functions/yolox.py +11 -11
  63. pixeltable/func/aggregate_function.py +17 -13
  64. pixeltable/func/callable_function.py +6 -6
  65. pixeltable/func/expr_template_function.py +15 -14
  66. pixeltable/func/function.py +16 -16
  67. pixeltable/func/function_registry.py +11 -8
  68. pixeltable/func/globals.py +4 -2
  69. pixeltable/func/query_template_function.py +12 -13
  70. pixeltable/func/signature.py +18 -9
  71. pixeltable/func/tools.py +10 -17
  72. pixeltable/func/udf.py +106 -11
  73. pixeltable/functions/__init__.py +21 -2
  74. pixeltable/functions/anthropic.py +16 -12
  75. pixeltable/functions/fireworks.py +63 -5
  76. pixeltable/functions/gemini.py +13 -3
  77. pixeltable/functions/globals.py +18 -6
  78. pixeltable/functions/huggingface.py +20 -38
  79. pixeltable/functions/image.py +7 -3
  80. pixeltable/functions/json.py +1 -0
  81. pixeltable/functions/llama_cpp.py +1 -4
  82. pixeltable/functions/mistralai.py +31 -20
  83. pixeltable/functions/ollama.py +4 -18
  84. pixeltable/functions/openai.py +201 -108
  85. pixeltable/functions/replicate.py +11 -10
  86. pixeltable/functions/string.py +70 -7
  87. pixeltable/functions/timestamp.py +21 -8
  88. pixeltable/functions/together.py +66 -52
  89. pixeltable/functions/video.py +1 -0
  90. pixeltable/functions/vision.py +14 -11
  91. pixeltable/functions/whisper.py +2 -1
  92. pixeltable/globals.py +60 -26
  93. pixeltable/index/__init__.py +1 -1
  94. pixeltable/index/btree.py +5 -3
  95. pixeltable/index/embedding_index.py +15 -14
  96. pixeltable/io/__init__.py +1 -1
  97. pixeltable/io/external_store.py +30 -25
  98. pixeltable/io/fiftyone.py +6 -14
  99. pixeltable/io/globals.py +33 -27
  100. pixeltable/io/hf_datasets.py +2 -1
  101. pixeltable/io/label_studio.py +77 -68
  102. pixeltable/io/pandas.py +33 -9
  103. pixeltable/io/parquet.py +9 -12
  104. pixeltable/iterators/__init__.py +1 -0
  105. pixeltable/iterators/audio.py +205 -0
  106. pixeltable/iterators/document.py +19 -8
  107. pixeltable/iterators/image.py +6 -24
  108. pixeltable/iterators/string.py +3 -6
  109. pixeltable/iterators/video.py +1 -7
  110. pixeltable/metadata/__init__.py +7 -1
  111. pixeltable/metadata/converters/convert_10.py +2 -2
  112. pixeltable/metadata/converters/convert_15.py +1 -5
  113. pixeltable/metadata/converters/convert_16.py +2 -4
  114. pixeltable/metadata/converters/convert_17.py +2 -4
  115. pixeltable/metadata/converters/convert_18.py +2 -4
  116. pixeltable/metadata/converters/convert_19.py +2 -5
  117. pixeltable/metadata/converters/convert_20.py +1 -4
  118. pixeltable/metadata/converters/convert_21.py +4 -6
  119. pixeltable/metadata/converters/convert_22.py +1 -0
  120. pixeltable/metadata/converters/convert_23.py +5 -5
  121. pixeltable/metadata/converters/convert_24.py +12 -13
  122. pixeltable/metadata/converters/convert_26.py +23 -0
  123. pixeltable/metadata/converters/util.py +3 -4
  124. pixeltable/metadata/notes.py +1 -0
  125. pixeltable/metadata/schema.py +13 -2
  126. pixeltable/plan.py +173 -98
  127. pixeltable/store.py +42 -26
  128. pixeltable/type_system.py +62 -54
  129. pixeltable/utils/arrow.py +1 -2
  130. pixeltable/utils/coco.py +16 -17
  131. pixeltable/utils/code.py +1 -1
  132. pixeltable/utils/console_output.py +6 -3
  133. pixeltable/utils/description_helper.py +7 -7
  134. pixeltable/utils/documents.py +3 -1
  135. pixeltable/utils/filecache.py +12 -7
  136. pixeltable/utils/http_server.py +9 -8
  137. pixeltable/utils/media_store.py +2 -1
  138. pixeltable/utils/pytorch.py +11 -14
  139. pixeltable/utils/s3.py +1 -0
  140. pixeltable/utils/sql.py +1 -0
  141. pixeltable/utils/transactional_directory.py +2 -2
  142. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/METADATA +6 -8
  143. pixeltable-0.3.3.dist-info/RECORD +163 -0
  144. pixeltable-0.3.2.dist-info/RECORD +0 -161
  145. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
  146. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
  147. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/exec/expr_eval/globals.py

@@ -1,16 +1,18 @@
+from __future__ import annotations
+
 import abc
 import asyncio
 from dataclasses import dataclass
 from types import TracebackType
-from typing import Any, Protocol, Optional
+from typing import Any, Optional, Protocol
 
-from pixeltable import exprs
-from pixeltable import func
+from pixeltable import exprs, func
 
 
 @dataclass
 class FnCallArgs:
     """Container for everything needed to execute a FunctionCall against one or more DataRows"""
+
     fn_call: exprs.FunctionCall
     rows: list[exprs.DataRow]
     # single call
@@ -37,16 +39,36 @@ class FnCallArgs:
 
 class Scheduler(abc.ABC):
     """
-    Base class for schedulers. A scheduler executes FunctionCalls against a limited resource pool.
+    Base class for queueing schedulers. A scheduler executes FunctionCalls against a limited resource pool.
 
     Expected behavior:
     - all created tasks must be recorded in dispatcher.tasks
    - schedulers are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
      elsewhere (indicated by dispatcher.exc_event)
    """
-    @abc.abstractmethod
+
+    @dataclass(frozen=True)
+    class QueueItem:
+        """Container of work items for queueing schedulers"""
+
+        request: FnCallArgs
+        num_retries: int
+
+        def __lt__(self, other: Scheduler.QueueItem) -> bool:
+            # prioritize by number of retries (more retries = higher priority)
+            return self.num_retries > other.num_retries
+
+    resource_pool: str
+    queue: asyncio.PriorityQueue[QueueItem]  # prioritizes retries
+    dispatcher: Dispatcher
+
+    def __init__(self, resource_pool: str, dispatcher: Dispatcher):
+        self.resource_pool = resource_pool
+        self.queue = asyncio.PriorityQueue()
+        self.dispatcher = dispatcher
+
     def submit(self, item: FnCallArgs) -> None:
-        pass
+        self.queue.put_nowait(self.QueueItem(item, 0))
 
     @classmethod
     @abc.abstractmethod
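
Note on the new Scheduler.QueueItem: asyncio.PriorityQueue pops its smallest item first, so the inverted __lt__ is what lets retried requests jump ahead of fresh ones. A minimal standalone sketch (not part of the package) of that ordering:

import asyncio
from dataclasses import dataclass

@dataclass(frozen=True)
class QueueItem:
    request: str  # stand-in for FnCallArgs
    num_retries: int

    def __lt__(self, other: 'QueueItem') -> bool:
        # more retries == higher priority == "smaller" for the underlying heap
        return self.num_retries > other.num_retries

async def demo() -> None:
    q: asyncio.PriorityQueue[QueueItem] = asyncio.PriorityQueue()
    q.put_nowait(QueueItem('fresh', 0))
    q.put_nowait(QueueItem('retried twice', 2))
    q.put_nowait(QueueItem('retried once', 1))
    while not q.empty():
        print((await q.get()).request)

asyncio.run(demo())  # prints: retried twice, retried once, fresh
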
@@ -63,6 +85,7 @@ class Dispatcher(Protocol):
63
85
  Exceptions: evaluators/schedulers need to check exc_event prior to starting long-running (non-interruptible)
64
86
  computations
65
87
  """
88
+
66
89
  row_builder: exprs.RowBuilder
67
90
  exc_event: asyncio.Event
68
91
  schedulers: dict[str, Scheduler] # key: resource pool id
@@ -90,6 +113,7 @@ class Evaluator(abc.ABC):
90
113
  - evaluators are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
91
114
  elsewhere (indicated by dispatcher.exc_event)
92
115
  """
116
+
93
117
  dispatcher: Dispatcher
94
118
  is_closed: bool
95
119
 
@@ -110,4 +134,3 @@ class Evaluator(abc.ABC):
110
134
  """Indicates that there may not be any more rows getting scheduled"""
111
135
  self.is_closed = True
112
136
  self._close()
113
-
pixeltable/exec/expr_eval/row_buffer.py

@@ -62,15 +62,14 @@ class RowBuffer:
             return []
         rows: list[exprs.DataRow]
         if self.head_idx + n <= self.size:
-            rows = self.buffer[self.head_idx:self.head_idx + n].tolist()
-            self.buffer[self.head_idx:self.head_idx + n] = None
+            rows = self.buffer[self.head_idx : self.head_idx + n].tolist()
+            self.buffer[self.head_idx : self.head_idx + n] = None
         else:
-            rows = np.concatenate([self.buffer[self.head_idx:], self.buffer[:self.head_idx + n - self.size]]).tolist()
-            self.buffer[self.head_idx:] = None
-            self.buffer[:self.head_idx + n - self.size] = None
+            rows = np.concatenate([self.buffer[self.head_idx :], self.buffer[: self.head_idx + n - self.size]]).tolist()
+            self.buffer[self.head_idx :] = None
+            self.buffer[: self.head_idx + n - self.size] = None
         self.head_pos += n
         self.head_idx = (self.head_idx + n) % self.size
         self.num_rows -= n
         self.num_ready -= n
         return rows
-
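
Note: the slicing changes above are formatter-style only, but they touch RowBuffer's circular-buffer read: when head_idx + n runs past the end of the backing array, the take is the array's tail plus the wrapped-around prefix. A standalone numpy sketch (not part of the package) of that wraparound:

import numpy as np

size = 5
buffer = np.array(['r0', 'r1', 'r2', 'r3', 'r4'], dtype=object)
head_idx, n = 3, 4  # reading 4 items starting at index 3 wraps past the end

if head_idx + n <= size:
    rows = buffer[head_idx : head_idx + n].tolist()
else:
    # tail of the buffer, then the wrapped-around prefix
    rows = np.concatenate([buffer[head_idx:], buffer[: head_idx + n - size]]).tolist()

print(rows)  # ['r3', 'r4', 'r0', 'r1']
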
pixeltable/exec/expr_eval/schedulers.py

@@ -5,12 +5,12 @@ import datetime
 import inspect
 import logging
 import sys
-from dataclasses import dataclass
-from typing import Optional, Awaitable, Collection
+import time
+from typing import Awaitable, Collection, Optional
 
-from pixeltable import env
-from pixeltable import func
-from .globals import Scheduler, FnCallArgs, Dispatcher
+from pixeltable import env, func
+
+from .globals import Dispatcher, FnCallArgs, Scheduler
 
 _logger = logging.getLogger('pixeltable')
 
@@ -29,19 +29,7 @@ class RateLimitsScheduler(Scheduler):
     TODO:
     - limit the number of in-flight requests based on the open file limit
     """
-    @dataclass(frozen=True)
-    class QueueItem:
-        request: FnCallArgs
-        num_retries: int
-
-        def __lt__(self, other: RateLimitsScheduler.QueueItem) -> bool:
-            # prioritize by number of retries
-            return self.num_retries > other.num_retries
-
-    resource_pool: str
-    queue: asyncio.PriorityQueue[QueueItem]  # prioritizes retries
-    loop_task: asyncio.Task
-    dispatcher: Dispatcher
+
     get_request_resources_param_names: list[str]  # names of parameters of RateLimitsInfo.get_request_resources()
 
     # scheduling-related state
@@ -58,11 +46,9 @@ class RateLimitsScheduler(Scheduler):
     MAX_RETRIES = 10
 
     def __init__(self, resource_pool: str, dispatcher: Dispatcher):
-        self.resource_pool = resource_pool
-        self.queue = asyncio.PriorityQueue()
-        self.dispatcher = dispatcher
-        self.loop_task = asyncio.create_task(self._main_loop())
-        self.dispatcher.register_task(self.loop_task)
+        super().__init__(resource_pool, dispatcher)
+        loop_task = asyncio.create_task(self._main_loop())
+        self.dispatcher.register_task(loop_task)
         self.pool_info = None  # initialized in _main_loop by the first request
         self.est_usage = {}
         self.num_in_flight = 0
@@ -104,6 +90,7 @@ class RateLimitsScheduler(Scheduler):
            # wait for a single request to get rate limits
            _logger.debug(f'initializing rate limits for {self.resource_pool}')
            await self._exec(item.request, item.num_retries, is_task=False)
+           _logger.debug(f'initialized rate limits for {self.resource_pool}')
            item = None
            # if this was the first request, it created the pool_info
            if self.pool_info is None:
@@ -111,6 +98,7 @@ class RateLimitsScheduler(Scheduler):
                continue
 
            # check rate limits
+           _logger.debug(f'checking rate limits for {self.resource_pool}')
            request_resources = self._get_request_resources(item.request)
            limits_info = self._check_resource_limits(request_resources)
            aws: list[Awaitable[None]] = []
@@ -169,7 +157,6 @@ class RateLimitsScheduler(Scheduler):
        constant_kwargs, batch_kwargs = request.pxt_fn.create_batch_kwargs(batch_kwargs)
        return self.pool_info.get_request_resources(**constant_kwargs, **batch_kwargs)
 
-
    def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
        """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
        candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative usage)
@@ -191,7 +178,9 @@ class RateLimitsScheduler(Scheduler):
            start_ts = datetime.datetime.now(tz=datetime.timezone.utc)
            pxt_fn = request.fn_call.fn
            assert isinstance(pxt_fn, func.CallableFunction)
-           _logger.debug(f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}')
+           _logger.debug(
+               f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
+           )
            self.total_requests += 1
            if request.is_batched:
                batch_result = await pxt_fn.aexec_batch(*request.batch_args, **request.batch_kwargs)
@@ -202,7 +191,9 @@ class RateLimitsScheduler(Scheduler):
                result = await pxt_fn.aexec(*request.args, **request.kwargs)
                request.row[request.fn_call.slot_idx] = result
            end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
-           _logger.debug(f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}')
+           _logger.debug(
+               f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}'
+           )
 
            # purge accumulated usage estimate, now that we have a new report
            self.est_usage = {r: 0 for r in self._resources}
@@ -210,10 +201,11 @@ class RateLimitsScheduler(Scheduler):
            self.dispatcher.dispatch(request.rows)
        except Exception as exc:
            _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
-           if self.pool_info is None:
+           if self.pool_info is None:
                # our pool info should be available at this point
                self._set_pool_info()
-           if num_retries < self.MAX_RETRIES and self.pool_info is not None:
+           assert self.pool_info is not None
+           if num_retries < self.MAX_RETRIES:
                retry_delay = self.pool_info.get_retry_delay(exc)
                if retry_delay is not None:
                    self.total_retried += 1
@@ -229,12 +221,140 @@ class RateLimitsScheduler(Scheduler):
                row.set_exc(request.fn_call.slot_idx, exc)
            self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
        finally:
-           _logger.debug(
-               f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
+           _logger.debug(f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
            if is_task:
                self.num_in_flight -= 1
                self.request_completed.set()
 
 
+class RequestRateScheduler(Scheduler):
+    """
+    Scheduler for FunctionCalls with a fixed request rate limit and no runtime resource usage reports.
+
+    Rate limits are supplied in the config, in one of two ways:
+    - resource_pool='request-rate:<endpoint>':
+      * a single rate limit for all calls against that endpoint
+      * in the config: section '<endpoint>', key 'rate_limit'
+    - resource_pool='request-rate:<endpoint>:<model>':
+      * a single rate limit for all calls against that model
+      * in the config: section '<endpoint>.rate_limits', key '<model>'
+    - if no rate limit is found in the config, uses a default of 600 RPM
+
+    TODO:
+    - adaptive rate limiting based on 429 errors
+    """
+
+    secs_per_request: float  # inverted rate limit
+    num_in_flight: int
+    total_requests: int
+    total_retried: int
+
+    TIME_FORMAT = '%H:%M.%S %f'
+    MAX_RETRIES = 10
+    DEFAULT_RATE_LIMIT = 600  # requests per minute
+
+    def __init__(self, resource_pool: str, dispatcher: Dispatcher):
+        super().__init__(resource_pool, dispatcher)
+        loop_task = asyncio.create_task(self._main_loop())
+        self.dispatcher.register_task(loop_task)
+        self.num_in_flight = 0
+        self.total_requests = 0
+        self.total_retried = 0
+
+        # try to get the rate limit from the config
+        elems = resource_pool.split(':')
+        section: str
+        key: str
+        if len(elems) == 2:
+            # resource_pool: request-rate:endpoint
+            _, endpoint = elems
+            section = endpoint
+            key = 'rate_limit'
+        else:
+            # resource_pool: request-rate:endpoint:model
+            assert len(elems) == 3
+            _, endpoint, model = elems
+            section = f'{endpoint}.rate_limits'
+            key = model
+        requests_per_min = env.Env.get().config.get_int_value(key, section=section)
+        requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
+        self.secs_per_request = 1 / (requests_per_min / 60)
+
+    @classmethod
+    def matches(cls, resource_pool: str) -> bool:
+        return resource_pool.startswith('request-rate:')
+
+    async def _main_loop(self) -> None:
+        last_request_ts = 0.0
+        while True:
+            item = await self.queue.get()
+            if item.num_retries > 0:
+                self.total_retried += 1
+            now = time.monotonic()
+            if now - last_request_ts < self.secs_per_request:
+                wait_duration = self.secs_per_request - (now - last_request_ts)
+                _logger.debug(f'waiting for {wait_duration} for {self.resource_pool}')
+                await asyncio.sleep(wait_duration)
+
+            last_request_ts = time.monotonic()
+            if item.num_retries > 0:
+                # the last request encountered some problem: retry it synchronously, to wait for the problem to pass
+                _logger.debug(f'retrying request for {self.resource_pool}: #retries={item.num_retries}')
+                await self._exec(item.request, item.num_retries, is_task=False)
+                _logger.debug(f'retried request for {self.resource_pool}: #retries={item.num_retries}')
+            else:
+                _logger.debug(f'creating task for {self.resource_pool}')
+                self.num_in_flight += 1
+                task = asyncio.create_task(self._exec(item.request, item.num_retries, is_task=True))
+                self.dispatcher.register_task(task)
+
+    async def _exec(self, request: FnCallArgs, num_retries: int, is_task: bool) -> None:
+        assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
+        assert all(not row.has_exc(request.fn_call.slot_idx) for row in request.rows)
+
+        try:
+            start_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            pxt_fn = request.fn_call.fn
+            assert isinstance(pxt_fn, func.CallableFunction)
+            _logger.debug(
+                f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
+            )
+            self.total_requests += 1
+            if request.is_batched:
+                batch_result = await pxt_fn.aexec_batch(*request.batch_args, **request.batch_kwargs)
+                assert len(batch_result) == len(request.rows)
+                for row, result in zip(request.rows, batch_result):
+                    row[request.fn_call.slot_idx] = result
+            else:
+                result = await pxt_fn.aexec(*request.args, **request.kwargs)
+                request.row[request.fn_call.slot_idx] = result
+            end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            _logger.debug(
+                f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}'
+            )
+            self.dispatcher.dispatch(request.rows)
+
+        except Exception as exc:
+            # TODO: which exception can be retried?
+            _logger.debug(f'exception for {self.resource_pool}: {exc}')
+            status = getattr(exc, 'status', None)
+            _logger.debug(f'type={type(exc)} has_status={hasattr(exc, "status")} status={status}')
+            if num_retries < self.MAX_RETRIES:
+                self.queue.put_nowait(self.QueueItem(request, num_retries + 1))
+                return
+
+            # record the exception
+            _, _, exc_tb = sys.exc_info()
+            for row in request.rows:
+                row.set_exc(request.fn_call.slot_idx, exc)
+            self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
+        finally:
+            _logger.debug(
+                f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, #retried={self.total_retried}'
+            )
+            if is_task:
+                self.num_in_flight -= 1
+
+
 # all concrete Scheduler subclasses that implement matches()
-SCHEDULERS = [RateLimitsScheduler]
+SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
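
Note on the new RequestRateScheduler: its pacing comes entirely from a requests-per-minute value looked up by resource-pool name; the inverted rate (secs_per_request) is the minimum spacing the main loop enforces between requests. A standalone sketch (not part of the package; the plain config dict, the pacing_for helper, and the model name are illustrative stand-ins for Pixeltable's config accessor) of the name parsing and spacing arithmetic described in the docstring above:

DEFAULT_RATE_LIMIT = 600  # requests per minute, matching the class default

def pacing_for(resource_pool: str, config: dict[str, dict[str, int]]) -> float:
    elems = resource_pool.split(':')
    if len(elems) == 2:  # 'request-rate:<endpoint>'
        section, key = elems[1], 'rate_limit'
    else:                # 'request-rate:<endpoint>:<model>'
        section, key = f'{elems[1]}.rate_limits', elems[2]
    rpm = config.get(section, {}).get(key) or DEFAULT_RATE_LIMIT
    return 60 / rpm  # minimum seconds between consecutive requests

cfg = {'openai.rate_limits': {'gpt-4o-mini': 300}}
print(pacing_for('request-rate:openai:gpt-4o-mini', cfg))  # 0.2
print(pacing_for('request-rate:openai', {}))               # 0.1 (600 RPM default)
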
pixeltable/exec/in_memory_data_node.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Iterator, Optional, AsyncIterator
+from typing import Any, AsyncIterator, Iterator, Optional
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -10,6 +10,7 @@ from .exec_node import ExecNode
 
 _logger = logging.getLogger('pixeltable')
 
+
 class InMemoryDataNode(ExecNode):
     """
     Outputs in-memory data as a DataRowBatch of a particular table.
@@ -18,6 +19,7 @@ class InMemoryDataNode(ExecNode):
    - with the values provided in the input rows
    - if an input row doesn't provide a value, sets the slot to the column default
    """
+
    tbl: catalog.TableVersion
    input_rows: list[dict[str, Any]]
    start_row_id: int
@@ -27,8 +29,7 @@ class InMemoryDataNode(ExecNode):
    output_exprs: list[exprs.ColumnRef]
 
    def __init__(
-        self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
-        row_builder: exprs.RowBuilder, start_row_id: int,
+        self, tbl: catalog.TableVersion, rows: list[dict[str, Any]], row_builder: exprs.RowBuilder, start_row_id: int
    ):
        # we materialize the input slots
        output_exprs = list(row_builder.input_exprs)
@@ -43,11 +44,11 @@ class InMemoryDataNode(ExecNode):
        """Create row batch and populate with self.input_rows"""
        user_cols_by_name = {
            col_ref.col.name: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
-            for col_ref in self.output_exprs if col_ref.col.name is not None
+            for col_ref in self.output_exprs
+            if col_ref.col.name is not None
        }
        output_cols_by_idx = {
-            col_ref.slot_idx: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
-            for col_ref in self.output_exprs
+            col_ref.slot_idx: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx) for col_ref in self.output_exprs
        }
        output_slot_idxs = {e.slot_idx for e in self.output_exprs}
 
@@ -68,7 +69,7 @@ class InMemoryDataNode(ExecNode):
            input_slot_idxs.add(col_info.slot_idx)
 
        # set the remaining output slots to their default values (presently None)
-        missing_slot_idxs = output_slot_idxs - input_slot_idxs 
+        missing_slot_idxs = output_slot_idxs - input_slot_idxs
        for slot_idx in missing_slot_idxs:
            col_info = output_cols_by_idx.get(slot_idx)
            assert col_info is not None
pixeltable/exec/row_update_node.py

@@ -4,11 +4,13 @@ from typing import Any, AsyncIterator
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
 from pixeltable.utils.media_store import MediaStore
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
 
 _logger = logging.getLogger('pixeltable')
 
+
 class RowUpdateNode(ExecNode):
     """
     Update individual rows in the input batches, identified by key columns.
@@ -17,9 +19,15 @@ class RowUpdateNode(ExecNode):
    The node assumes that all update dicts contain the same keys, and it populates the slots of the columns present in
    the update list.
    """
+
    def __init__(
-        self, tbl: catalog.TableVersionPath, key_vals_batch: list[tuple], is_rowid_key: bool,
-        col_vals_batch: list[dict[catalog.Column, Any]], row_builder: exprs.RowBuilder, input: ExecNode,
+        self,
+        tbl: catalog.TableVersionPath,
+        key_vals_batch: list[tuple],
+        is_rowid_key: bool,
+        col_vals_batch: list[dict[catalog.Column, Any]],
+        row_builder: exprs.RowBuilder,
+        input: ExecNode,
    ):
        super().__init__(row_builder, [], [], input)
        self.updates = {key_vals: col_vals for key_vals, col_vals in zip(key_vals_batch, col_vals_batch)}
@@ -28,7 +36,8 @@ class RowUpdateNode(ExecNode):
        # retrieve ColumnRefs from the RowBuilder (has slot_idx set)
        all_col_slot_idxs = {
            col_ref.col: col_ref.slot_idx
-            for col_ref in row_builder.unique_exprs if isinstance(col_ref, exprs.ColumnRef)
+            for col_ref in row_builder.unique_exprs
+            if isinstance(col_ref, exprs.ColumnRef)
        }
        self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0].keys()}
        self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.primary_key_columns()}
@@ -37,8 +46,9 @@ class RowUpdateNode(ExecNode):
    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
        async for batch in self.input:
            for row in batch:
-                key_vals = row.rowid if self.is_rowid_key else \
-                    tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
+                key_vals = (
+                    row.rowid if self.is_rowid_key else tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
+                )
                if key_vals not in self.updates:
                    continue
                self.matched_key_vals.add(key_vals)
pixeltable/exec/sql_node.py

@@ -1,13 +1,14 @@
 import logging
 import warnings
 from decimal import Decimal
-from typing import Iterable, Iterator, NamedTuple, Optional, TYPE_CHECKING, Sequence, AsyncIterator
+from typing import TYPE_CHECKING, AsyncIterator, Iterable, Iterator, NamedTuple, Optional, Sequence
 from uuid import UUID
 
 import sqlalchemy as sql
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
 
@@ -53,10 +54,12 @@ def combine_order_by_clauses(clauses: Iterable[OrderByClause]) -> Optional[OrderByClause]:
 
 
 def print_order_by_clause(clause: OrderByClause) -> str:
-    return ', '.join([
-        f'({item.expr}{", asc=True" if item.asc is True else ""}{", asc=False" if item.asc is False else ""})'
-        for item in clause
-    ])
+    return ', '.join(
+        [
+            f'({item.expr}{", asc=True" if item.asc is True else ""}{", asc=False" if item.asc is False else ""})'
+            for item in clause
+        ]
+    )
 
 
 class SqlNode(ExecNode):
@@ -82,8 +85,12 @@ class SqlNode(ExecNode):
    limit: Optional[int]
 
    def __init__(
-        self, tbl: Optional[catalog.TableVersionPath], row_builder: exprs.RowBuilder,
-        select_list: Iterable[exprs.Expr], sql_elements: exprs.SqlElementCache, set_pk: bool = False
+        self,
+        tbl: Optional[catalog.TableVersionPath],
+        row_builder: exprs.RowBuilder,
+        select_list: Iterable[exprs.Expr],
+        sql_elements: exprs.SqlElementCache,
+        set_pk: bool = False,
    ):
        """
        If row_builder contains references to unstored iter columns, expands the select list to include their
@@ -186,8 +193,11 @@ class SqlNode(ExecNode):
 
    @classmethod
    def create_from_clause(
-        cls, tbl: catalog.TableVersionPath, stmt: sql.Select, refd_tbl_ids: Optional[set[UUID]] = None,
-        exact_version_only: Optional[set[UUID]] = None
+        cls,
+        tbl: catalog.TableVersionPath,
+        stmt: sql.Select,
+        refd_tbl_ids: Optional[set[UUID]] = None,
+        exact_version_only: Optional[set[UUID]] = None,
    ) -> sql.Select:
        """Add From clause to stmt for tables/views referenced by materialized_exprs
        Args:
@@ -220,15 +230,14 @@ class SqlNode(ExecNode):
                # join tbl to prev_tbl on prev_tbl's rowid cols
                prev_tbl_rowid_cols = prev_tbl.store_tbl.rowid_columns()
                tbl_rowid_cols = tbl.store_tbl.rowid_columns()
-                rowid_clauses = \
-                    [c1 == c2 for c1, c2 in zip(prev_tbl_rowid_cols, tbl_rowid_cols[:len(prev_tbl_rowid_cols)])]
+                rowid_clauses = [
+                    c1 == c2 for c1, c2 in zip(prev_tbl_rowid_cols, tbl_rowid_cols[: len(prev_tbl_rowid_cols)])
+                ]
                stmt = stmt.join(tbl.store_tbl.sa_tbl, sql.and_(*rowid_clauses))
                if tbl.id in exact_version_only:
                    stmt = stmt.where(tbl.store_tbl.v_min_col == tbl.version)
                else:
-                    stmt = stmt \
-                        .where(tbl.store_tbl.v_min_col <= tbl.version) \
-                        .where(tbl.store_tbl.v_max_col > tbl.version)
+                    stmt = stmt.where(tbl.store_tbl.v_min_col <= tbl.version).where(tbl.store_tbl.v_max_col > tbl.version)
            prev_tbl = tbl
        return stmt
 
@@ -291,7 +300,7 @@ class SqlNode(ExecNode):
 
            # populate output_row
            if self.num_pk_cols > 0:
-                output_row.set_pk(tuple(sql_row[-self.num_pk_cols:]))
+                output_row.set_pk(tuple(sql_row[-self.num_pk_cols :]))
            # copy the output of the SQL query into the output row
            for i, e in enumerate(self.select_list):
                slot_idx = e.slot_idx
@@ -341,12 +350,16 @@ class SqlScanNode(SqlNode):
 
    Supports filtering and ordering.
    """
+
    exact_version_only: list[catalog.TableVersion]
 
    def __init__(
-        self, tbl: catalog.TableVersionPath, row_builder: exprs.RowBuilder,
+        self,
+        tbl: catalog.TableVersionPath,
+        row_builder: exprs.RowBuilder,
        select_list: Iterable[exprs.Expr],
-        set_pk: bool = False, exact_version_only: Optional[list[catalog.TableVersion]] = None
+        set_pk: bool = False,
+        exact_version_only: Optional[list[catalog.TableVersion]] = None,
    ):
        """
        Args:
@@ -367,7 +380,8 @@ class SqlScanNode(SqlNode):
        where_clause_tbl_ids = self.where_clause.tbl_ids() if self.where_clause is not None else set()
        refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
        stmt = self.create_from_clause(
-            self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only})
+            self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only}
+        )
        return stmt
 
 
@@ -377,8 +391,12 @@ class SqlLookupNode(SqlNode):
    """
 
    def __init__(
-        self, tbl: catalog.TableVersionPath, row_builder: exprs.RowBuilder,
-        select_list: Iterable[exprs.Expr], sa_key_cols: list[sql.Column], key_vals: list[tuple],
+        self,
+        tbl: catalog.TableVersionPath,
+        row_builder: exprs.RowBuilder,
+        select_list: Iterable[exprs.Expr],
+        sa_key_cols: list[sql.Column],
+        key_vals: list[tuple],
    ):
        """
        Args:
@@ -406,11 +424,13 @@ class SqlAggregationNode(SqlNode):
    group_by_items: Optional[list[exprs.Expr]]
 
    def __init__(
-        self, row_builder: exprs.RowBuilder,
+        self,
+        row_builder: exprs.RowBuilder,
        input: SqlNode,
        select_list: Iterable[exprs.Expr],
        group_by_items: Optional[list[exprs.Expr]] = None,
-        limit: Optional[int] = None, exact_version_only: Optional[list[catalog.TableVersion]] = None
+        limit: Optional[int] = None,
+        exact_version_only: Optional[list[catalog.TableVersion]] = None,
    ):
        """
        Args:
@@ -436,12 +456,16 @@ class SqlJoinNode(SqlNode):
    """
    Materializes data from the store via a Select ... From ... that contains joins
    """
+
    input_ctes: list[sql.CTE]
    join_clauses: list['pixeltable.plan.JoinClause']
 
    def __init__(
-        self, row_builder: exprs.RowBuilder,
-        inputs: Sequence[SqlNode], join_clauses: list['pixeltable.plan.JoinClause'], select_list: Iterable[exprs.Expr]
+        self,
+        row_builder: exprs.RowBuilder,
+        inputs: Sequence[SqlNode],
+        join_clauses: list['pixeltable.plan.JoinClause'],
+        select_list: Iterable[exprs.Expr],
    ):
        assert len(inputs) > 1
        assert len(inputs) == len(join_clauses) + 1
@@ -456,16 +480,21 @@ class SqlJoinNode(SqlNode):
 
    def _create_stmt(self) -> sql.Select:
        from pixeltable import plan
+
        stmt = super()._create_stmt()
        stmt = stmt.select_from(self.input_ctes[0])
        for i in range(len(self.join_clauses)):
            join_clause = self.join_clauses[i]
            on_clause = (
-                self.sql_elements.get(join_clause.join_predicate) if join_clause.join_type != plan.JoinType.CROSS
+                self.sql_elements.get(join_clause.join_predicate)
+                if join_clause.join_type != plan.JoinType.CROSS
                else sql.sql.expression.literal(True)
            )
            is_outer = join_clause.join_type == plan.JoinType.LEFT or join_clause.join_type == plan.JoinType.FULL_OUTER
            stmt = stmt.join(
-                self.input_ctes[i + 1], onclause=on_clause, isouter=is_outer,
-                full=join_clause == plan.JoinType.FULL_OUTER)
+                self.input_ctes[i + 1],
+                onclause=on_clause,
+                isouter=is_outer,
+                full=join_clause == plan.JoinType.FULL_OUTER,
+            )
        return stmt