pixeltable 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (39)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +106 -71
  3. pixeltable/catalog/path.py +59 -20
  4. pixeltable/catalog/schema_object.py +1 -0
  5. pixeltable/catalog/table.py +6 -0
  6. pixeltable/catalog/table_version.py +2 -1
  7. pixeltable/catalog/view.py +21 -10
  8. pixeltable/config.py +12 -4
  9. pixeltable/dataframe.py +57 -1
  10. pixeltable/env.py +25 -13
  11. pixeltable/exec/aggregation_node.py +1 -1
  12. pixeltable/exec/cache_prefetch_node.py +2 -6
  13. pixeltable/exec/component_iteration_node.py +4 -3
  14. pixeltable/exec/data_row_batch.py +10 -53
  15. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  16. pixeltable/exec/in_memory_data_node.py +13 -11
  17. pixeltable/exec/sql_node.py +6 -7
  18. pixeltable/exprs/data_row.py +13 -13
  19. pixeltable/exprs/row_builder.py +16 -4
  20. pixeltable/exprs/string_op.py +1 -1
  21. pixeltable/func/expr_template_function.py +1 -4
  22. pixeltable/functions/date.py +1 -1
  23. pixeltable/functions/math.py +1 -1
  24. pixeltable/functions/openai.py +8 -4
  25. pixeltable/functions/timestamp.py +6 -6
  26. pixeltable/globals.py +14 -10
  27. pixeltable/metadata/schema.py +1 -1
  28. pixeltable/plan.py +5 -14
  29. pixeltable/share/packager.py +13 -13
  30. pixeltable/store.py +9 -6
  31. pixeltable/type_system.py +2 -1
  32. pixeltable/utils/filecache.py +1 -1
  33. pixeltable/utils/http_server.py +2 -3
  34. pixeltable/utils/media_store.py +84 -39
  35. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
  36. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/RECORD +39 -39
  37. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
  38. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
  39. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
pixeltable/dataframe.py CHANGED
@@ -8,9 +8,22 @@ import json
 import logging
 import traceback
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Optional, Sequence, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncIterator,
+    Callable,
+    Hashable,
+    Iterator,
+    NoReturn,
+    Optional,
+    Sequence,
+    TypeVar,
+    Union,
+)
 
 import pandas as pd
+import pydantic
 import sqlalchemy as sql
 
 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
@@ -32,6 +45,11 @@ _logger = logging.getLogger('pixeltable')
 
 
 class DataFrameResultSet:
+    _rows: list[list[Any]]
+    _col_names: list[str]
+    __schema: dict[str, ColumnType]
+    __formatter: Formatter
+
     def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
         self._rows = rows
         self._col_names = list(schema.keys())
@@ -66,6 +84,44 @@ class DataFrameResultSet:
     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame.from_records(self._rows, columns=self._col_names)
 
+    BaseModelT = TypeVar('BaseModelT', bound=pydantic.BaseModel)
+
+    def to_pydantic(self, model: type[BaseModelT]) -> Iterator[BaseModelT]:
+        """
+        Convert the DataFrameResultSet to a list of Pydantic model instances.
+
+        Args:
+            model: A Pydantic model class.
+
+        Returns:
+            An iterator over Pydantic model instances, one for each row in the result set.
+
+        Raises:
+            Error: If the row data doesn't match the model schema.
+        """
+        model_fields = model.model_fields
+        model_config = getattr(model, 'model_config', {})
+        forbid_extra_fields = model_config.get('extra') == 'forbid'
+
+        # schema validation
+        required_fields = {name for name, field in model_fields.items() if field.is_required()}
+        col_names = set(self._col_names)
+        missing_fields = required_fields - col_names
+        if len(missing_fields) > 0:
+            raise excs.Error(
+                f'Required model fields {missing_fields} are missing from result set columns {self._col_names}'
+            )
+        if forbid_extra_fields:
+            extra_fields = col_names - set(model_fields.keys())
+            if len(extra_fields) > 0:
+                raise excs.Error(f"Extra fields {extra_fields} are not allowed in model with extra='forbid'")
+
+        for row in self:
+            try:
+                yield model(**row)
+            except pydantic.ValidationError as e:
+                raise excs.Error(str(e)) from e
+
     def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
         return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
 
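The new to_pydantic() validates the model's fields against the result-set schema up front, then yields one model instance per row. A minimal usage sketch against the public pixeltable API (the table name and schema below are made up for illustration):

    import pydantic
    import pixeltable as pxt

    class Film(pydantic.BaseModel):
        title: str
        year: int

    t = pxt.create_table('films', {'title': pxt.String, 'year': pxt.Int})
    t.insert([{'title': 'Jaws', 'year': 1975}, {'title': 'Alien', 'year': 1979}])

    result = t.select(t.title, t.year).collect()  # DataFrameResultSet
    for film in result.to_pydantic(Film):
        print(film.title, film.year)

Because to_pydantic() is a generator, a required model field missing from the columns fails immediately, while per-row validation errors surface only as rows are consumed.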
pixeltable/env.py CHANGED
@@ -13,6 +13,8 @@ import platform
 import shutil
 import sys
 import threading
+import types
+import typing
 import uuid
 import warnings
 from abc import abstractmethod
@@ -604,16 +606,26 @@ class Env:
 
         # Construct a client, retrieving each parameter from config.
 
-        init_kwargs: dict[str, str] = {}
-        for param in cl.param_names:
-            arg = Config.get().get_string_value(param, section=name)
-            if arg is not None and len(arg) > 0:
-                init_kwargs[param] = arg
-            else:
+        init_kwargs: dict[str, Any] = {}
+        for param in cl.params.values():
+            # Determine the type of the parameter for proper config parsing.
+            t = param.annotation
+            # Deference Optional[T]
+            if typing.get_origin(t) in (typing.Union, types.UnionType):
+                args = typing.get_args(t)
+                if args[0] is type(None):
+                    t = args[1]
+                elif args[1] is type(None):
+                    t = args[0]
+            assert isinstance(t, type), t
+            arg: Any = Config.get().get_value(param.name, t, section=name)
+            if arg is not None:
+                init_kwargs[param.name] = arg
+            elif param.default is inspect.Parameter.empty:
                 raise excs.Error(
-                    f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
-                    f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, '
-                    f'or put `{param.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
+                    f'`{name}` client not initialized: parameter `{param.name}` is not configured.\n'
+                    f'To fix this, specify the `{name.upper()}_{param.name.upper()}` environment variable, '
+                    f'or put `{param.name.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
                 )
 
         cl.client_obj = cl.init_fn(**init_kwargs)
@@ -624,7 +636,7 @@ class Env:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
-        in windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
+        On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
         This arrangement enables serving media hosted within _home,
         as well as external media inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
@@ -832,8 +844,8 @@ def register_client(name: str) -> Callable:
 
     def decorator(fn: Callable) -> None:
         sig = inspect.signature(fn)
-        param_names = list(sig.parameters.keys())
-        _registered_clients[name] = ApiClient(init_fn=fn, param_names=param_names)
+        params = dict(sig.parameters)
+        _registered_clients[name] = ApiClient(init_fn=fn, params=params)
 
     return decorator
 
@@ -844,7 +856,7 @@ _registered_clients: dict[str, ApiClient] = {}
 @dataclass
 class ApiClient:
     init_fn: Callable
-    param_names: list[str]
+    params: dict[str, inspect.Parameter]
     client_obj: Optional[Any] = None
 
 
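The rewritten client-initialization loop asks the config for a value of each parameter's annotated type, unwrapping Optional[T] first, and only treats a parameter as required when it has no default. A self-contained sketch of that unwrapping logic (standalone code, not pixeltable's; client_init is a stand-in signature; types.UnionType requires Python 3.10+):

    import inspect
    import types
    import typing
    from typing import Optional

    def unwrap_optional(t: object) -> object:
        # Optional[T] is Union[T, None]; 'T | None' is a types.UnionType on 3.10+
        if typing.get_origin(t) in (typing.Union, types.UnionType):
            args = typing.get_args(t)
            if args[0] is type(None):
                return args[1]
            if args[1] is type(None):
                return args[0]
        return t

    def client_init(api_key: str, base_url: Optional[str] = None, timeout: float = 30.0) -> None: ...

    for param in inspect.signature(client_init).parameters.values():
        t = unwrap_optional(param.annotation)
        required = param.default is inspect.Parameter.empty
        print(param.name, t, 'required' if required else 'optional')
    # api_key <class 'str'> required
    # base_url <class 'str'> optional
    # timeout <class 'float'> optional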
pixeltable/exec/aggregation_node.py CHANGED
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
         # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
-        self.output_batch = DataRowBatch(tbl, row_builder, 0)
+        self.output_batch = DataRowBatch(row_builder)
         self.limit = None
 
     def set_limit(self, limit: int) -> None:
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -12,7 +12,7 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Iterator, Optional
 from uuid import UUID
 
-from pixeltable import catalog, env, exceptions as excs, exprs
+from pixeltable import env, exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
 
 from .data_row_batch import DataRowBatch
@@ -37,7 +37,6 @@ class CachePrefetchNode(ExecNode):
     boto_client_lock: threading.Lock
 
     # execution state
-    batch_tbl_version: Optional[catalog.TableVersionHandle]  # needed to construct output batches
     num_returned_rows: int
 
     # ready_rows: rows that are ready to be returned, ordered by row idx;
@@ -68,7 +67,6 @@ class CachePrefetchNode(ExecNode):
         self.boto_client = None
         self.boto_client_lock = threading.Lock()
 
-        self.batch_tbl_version = None
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -95,7 +93,7 @@ class CachePrefetchNode(ExecNode):
 
         if len(self.ready_rows) > 0:
             # create DataRowBatch from the first BATCH_SIZE ready rows
-            batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
+            batch = DataRowBatch(self.row_builder)
             rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
             for row in rows:
                 assert row is not None
@@ -173,8 +171,6 @@ class CachePrefetchNode(ExecNode):
         if input_batch is None:
            self.input_finished = True
            return
-        if self.batch_tbl_version is None:
-            self.batch_tbl_version = input_batch.tbl
 
        file_cache = FileCache.get()
 
pixeltable/exec/component_iteration_node.py CHANGED
@@ -40,7 +40,7 @@ class ComponentIterationNode(ExecNode):
     }
 
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        output_batch = DataRowBatch(self.view, self.row_builder)
+        output_batch = DataRowBatch(self.row_builder)
         async for input_batch in self.input:
             for input_row in input_batch:
                 self.row_builder.eval(input_row, self.iterator_args_ctx)
@@ -52,13 +52,14 @@ class ComponentIterationNode(ExecNode):
                 if self.__non_nullable_args_specified(iterator_args):
                     iterator = self.view.get().iterator_cls(**iterator_args)
                     for pos, component_dict in enumerate(iterator):
-                        output_row = output_batch.add_row()
+                        output_row = self.row_builder.make_row()
                         input_row.copy(output_row)
                         # we're expanding the input and need to add the iterator position to the pk
                         self.__populate_output_row(output_row, pos, component_dict)
+                        output_batch.add_row(output_row)
                         if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
                             yield output_batch
-                            output_batch = DataRowBatch(self.view, self.row_builder)
+                            output_batch = DataRowBatch(self.row_builder)
 
         if len(output_batch) > 0:
             yield output_batch
pixeltable/exec/data_row_batch.py CHANGED
@@ -3,8 +3,7 @@ from __future__ import annotations
 import logging
 from typing import Iterator, Optional
 
-from pixeltable import catalog, exprs
-from pixeltable.utils.media_store import MediaStore
+from pixeltable import exprs
 
 _logger = logging.getLogger('pixeltable')
 
@@ -15,51 +14,19 @@ class DataRowBatch:
     Contains the metadata needed to initialize DataRows.
     """
 
-    tbl: Optional[catalog.TableVersionHandle]
     row_builder: exprs.RowBuilder
-    img_slot_idxs: list[int]
-    media_slot_idxs: list[int]  # non-image media slots
-    array_slot_idxs: list[int]
     rows: list[exprs.DataRow]
 
-    def __init__(
-        self,
-        tbl: Optional[catalog.TableVersionHandle],
-        row_builder: exprs.RowBuilder,
-        num_rows: Optional[int] = None,
-        rows: Optional[list[exprs.DataRow]] = None,
-    ):
+    def __init__(self, row_builder: exprs.RowBuilder, rows: Optional[list[exprs.DataRow]] = None):
         """
         Requires either num_rows or rows to be specified, but not both.
         """
-        assert num_rows is None or rows is None
-        self.tbl = tbl
         self.row_builder = row_builder
-        self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
-        # non-image media slots
-        self.media_slot_idxs = [
-            e.slot_idx
-            for e in row_builder.unique_exprs
-            if e.col_type.is_media_type() and not e.col_type.is_image_type()
-        ]
-        self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
-        if rows is not None:
-            self.rows = rows
-        else:
-            if num_rows is None:
-                num_rows = 0
-            self.rows = [
-                exprs.DataRow(
-                    row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-                )
-                for _ in range(num_rows)
-            ]
+        self.rows = [] if rows is None else rows
 
-    def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
+    def add_row(self, row: Optional[exprs.DataRow]) -> exprs.DataRow:
         if row is None:
-            row = exprs.DataRow(
-                self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-            )
+            row = self.row_builder.make_row()
         self.rows.append(row)
         return row
 
@@ -73,28 +40,18 @@ class DataRowBatch:
 
         return self.rows[index]
 
     def flush_imgs(
-        self,
-        idx_range: Optional[slice] = None,
-        stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
-        flushed_slot_idxs: Optional[list[int]] = None,
+        self, idx_range: Optional[slice], stored_img_info: list[exprs.ColumnSlotIdx], flushed_img_slots: list[int]
     ) -> None:
         """Flushes images in the given range of rows."""
-        assert self.tbl is not None
-        if stored_img_info is None:
-            stored_img_info = []
-        if flushed_slot_idxs is None:
-            flushed_slot_idxs = []
-        if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
+        if len(stored_img_info) == 0 and len(flushed_img_slots) == 0:
            return
+
         if idx_range is None:
             idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-                col = info.col
-                assert col.tbl.id == self.tbl.id
-                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                row.flush_img(info.slot_idx, filepath)
-            for slot_idx in flushed_slot_idxs:
+                row.flush_img(info.slot_idx, info.col)
+            for slot_idx in flushed_img_slots:
                 row.flush_img(slot_idx)
 
     def __iter__(self) -> Iterator[exprs.DataRow]:
pixeltable/exec/expr_eval/expr_eval_node.py CHANGED
@@ -240,7 +240,7 @@ class ExprEvalNode(ExecNode):
                 # make sure we top up our in-flight rows before yielding
                 self._dispatch_input_rows()
                 self._log_state(f'yielding {len(batch_rows)} rows')
-                yield DataRowBatch(tbl=None, row_builder=self.row_builder, rows=batch_rows)
+                yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
                 # at this point, we may have more completed rows
 
         assert self.completed_rows.empty()  # all completed rows should be sitting in output_buffer
@@ -254,7 +254,7 @@ class ExprEvalNode(ExecNode):
             batch_rows = self.output_buffer.get_rows(self.output_buffer.num_ready)
             self.num_output_rows += len(batch_rows)
             self._log_state(f'yielding {len(batch_rows)} rows')
-            yield DataRowBatch(tbl=None, row_builder=self.row_builder, rows=batch_rows)
+            yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
 
         assert self.output_buffer.num_rows == 0
         return
pixeltable/exec/in_memory_data_node.py CHANGED
@@ -23,7 +23,7 @@ class InMemoryDataNode(ExecNode):
 
     input_rows: list[dict[str, Any]]
     start_row_id: int
-    output_rows: Optional[DataRowBatch]
+    output_batch: Optional[DataRowBatch]
 
     # output_exprs is declared in the superclass, but we redeclare it here with a more specific type
     output_exprs: list[exprs.ColumnRef]
@@ -42,7 +42,7 @@ class InMemoryDataNode(ExecNode):
         self.tbl = tbl
         self.input_rows = rows
         self.start_row_id = start_row_id
-        self.output_rows = None
+        self.output_batch = None
 
     def _open(self) -> None:
         """Create row batch and populate with self.input_rows"""
@@ -56,8 +56,9 @@ class InMemoryDataNode(ExecNode):
         }
         output_slot_idxs = {e.slot_idx for e in self.output_exprs}
 
-        self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
-        for row_idx, input_row in enumerate(self.input_rows):
+        self.output_batch = DataRowBatch(self.row_builder)
+        for input_row in self.input_rows:
+            output_row = self.row_builder.make_row()
             # populate the output row with the values provided in the input row
             input_slot_idxs: set[int] = set()
             for col_name, val in input_row.items():
@@ -67,10 +68,10 @@ class InMemoryDataNode(ExecNode):
                 if col.col_type.is_image_type() and isinstance(val, bytes):
                     # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
                     assert col.tbl.id == self.tbl.id
-                    path = MediaStore.save_media_file(val, col.tbl.id, col.id, col.tbl.version)
-                    self.output_rows[row_idx][col_info.slot_idx] = str(path)
+                    filepath, _ = MediaStore.save_media_object(val, col, format=None)
+                    output_row[col_info.slot_idx] = str(filepath)
                 else:
-                    self.output_rows[row_idx][col_info.slot_idx] = val
+                    output_row[col_info.slot_idx] = val
 
                 input_slot_idxs.add(col_info.slot_idx)
 
@@ -79,10 +80,11 @@ class InMemoryDataNode(ExecNode):
             for slot_idx in missing_slot_idxs:
                 col_info = output_cols_by_idx.get(slot_idx)
                 assert col_info is not None
-                self.output_rows[row_idx][col_info.slot_idx] = None
+                output_row[col_info.slot_idx] = None
+            self.output_batch.add_row(output_row)
 
-        self.ctx.num_rows = len(self.output_rows)
+        self.ctx.num_rows = len(self.output_batch)
 
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
-        yield self.output_rows
+        _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_batch)} rows')
+        yield self.output_batch
pixeltable/exec/sql_node.py CHANGED
@@ -316,8 +316,7 @@ class SqlNode(ExecNode):
             for _ in w:
                 pass
 
-        tbl_version = self.tbl.tbl_version if self.tbl is not None else None
-        output_batch = DataRowBatch(tbl_version, self.row_builder)
+        output_batch = DataRowBatch(self.row_builder)
         output_row: Optional[exprs.DataRow] = None
         num_rows_returned = 0
 
@@ -359,7 +358,7 @@ class SqlNode(ExecNode):
             if self.ctx.batch_size > 0 and len(output_batch) == self.ctx.batch_size:
                 _logger.debug(f'SqlScanNode: returning {len(output_batch)} rows')
                 yield output_batch
-                output_batch = DataRowBatch(tbl_version, self.row_builder)
+                output_batch = DataRowBatch(self.row_builder)
 
         if len(output_batch) > 0:
             _logger.debug(f'SqlScanNode: returning {len(output_batch)} rows')
@@ -569,10 +568,10 @@ class SqlSampleNode(SqlNode):
         General SQL form is:
         - MD5(<seed::text> [ + '___' + <rowid_col_val>::text]+
         """
-        sql_expr: sql.ColumnElement = sql.cast(seed, sql.Text)
+        sql_expr: sql.ColumnElement = seed.cast(sql.String)
         for e in sql_cols:
             # Quotes are required below to guarantee that the string is properly presented in SQL
-            sql_expr = sql_expr + sql.literal_column("'___'", sql.Text) + sql.cast(e, sql.Text)
+            sql_expr = sql_expr + sql.literal_column("'___'", sql.Text) + e.cast(sql.String)
         sql_expr = sql.func.md5(sql_expr)
         return sql_expr
 
@@ -591,9 +590,9 @@ class SqlSampleNode(SqlNode):
             s_key = self._create_key_sql(self.input_cte)
 
             # Construct a suitable where clause
-            fraction_sql = sql.cast(SampleClause.fraction_to_md5_hex(float(self.sample_clause.fraction)), sql.Text)
+            fraction_md5 = SampleClause.fraction_to_md5_hex(self.sample_clause.fraction)
             order_by = self._create_key_sql(self.input_cte)
-            return sql.select(*self.input_cte.c).where(s_key < fraction_sql).order_by(order_by)
+            return sql.select(*self.input_cte.c).where(s_key < fraction_md5).order_by(order_by)
 
             return self._create_stmt_stratified_fraction(self.sample_clause.fraction)
         else:
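For intuition on SqlSampleNode's key construction: each row gets a deterministic MD5 key built from the seed and its rowid columns, and a row is selected when that key sorts below a threshold derived from the sampling fraction. A plausible pure-Python model of the mechanism (an illustrative assumption, not pixeltable's actual fraction_to_md5_hex implementation):

    import hashlib

    def sample_key(seed: int, rowid: int) -> str:
        # deterministic per-row key, mirroring MD5(seed || '___' || rowid)
        return hashlib.md5(f'{seed}___{rowid}'.encode()).hexdigest()

    def fraction_to_md5_hex(fraction: float) -> str:
        # the same fraction of the way through the 128-bit key space, as 32 hex digits
        return format(int(fraction * (1 << 128)), '032x')

    # equal-length lowercase hex compares lexicographically like a number,
    # so roughly `fraction` of all keys fall below the threshold
    selected = [i for i in range(10_000) if sample_key(42, i) < fraction_to_md5_hex(0.1)]
    print(len(selected))  # ~1000

Because the key depends only on the seed and the row ids, the same seed always reproduces the same sample.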
pixeltable/exprs/data_row.py CHANGED
@@ -13,7 +13,8 @@ import PIL
 import PIL.Image
 import sqlalchemy as sql
 
-from pixeltable import env
+from pixeltable import catalog, env
+from pixeltable.utils.media_store import MediaStore
 
 
 class DataRow:
@@ -256,23 +257,22 @@ class DataRow:
         self.vals[idx] = val
         self.has_val[idx] = True
 
-    def flush_img(self, index: int, filepath: Optional[str] = None) -> None:
-        """Discard the in-memory value and save it to a local file, if filepath is not None"""
+    def flush_img(self, index: int, col: Optional[catalog.Column] = None) -> None:
+        """Save or discard the in-memory value (required to be a PIL.Image.Image)"""
         if self.vals[index] is None:
             return
         assert self.excs[index] is None
         if self.file_paths[index] is None:
-            if filepath is not None:
+            if col is not None:
                 image = self.vals[index]
-                assert isinstance(image, PIL.Image.Image)
-                # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
-                # In that case, use WebP instead.
-                format = 'webp' if image.has_transparency_data else 'jpeg'
-                if not filepath.endswith(f'.{format}'):
-                    filepath += f'.{format}'
-                self.file_paths[index] = filepath
-                self.file_urls[index] = urllib.parse.urljoin('file:', urllib.request.pathname2url(filepath))
-                image.save(filepath, format=format)
+                format = None
+                if isinstance(image, PIL.Image.Image):
+                    # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+                    # In that case, use WebP instead.
+                    format = 'webp' if image.has_transparency_data else 'jpeg'
+                filepath, url = MediaStore.save_media_object(image, col, format=format)
+                self.file_paths[index] = str(filepath)
+                self.file_urls[index] = url
             else:
                 # we discard the content of this cell
                 self.has_val[index] = False
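flush_img() now hands the column to MediaStore.save_media_object and only picks a format when the value really is a PIL image, keeping the existing JPEG-vs-WebP rule. That rule in isolation (assumes a Pillow version with has_transparency_data, added in 10.1):

    import PIL.Image

    opaque = PIL.Image.new('RGB', (8, 8), (255, 0, 0))
    translucent = PIL.Image.new('RGBA', (8, 8), (255, 0, 0, 128))

    for img in (opaque, translucent):
        # JPEG can't store an alpha channel, so transparent images go to WebP
        fmt = 'webp' if img.has_transparency_data else 'jpeg'
        print(img.mode, '->', fmt)  # RGB -> jpeg, RGBA -> webp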
pixeltable/exprs/row_builder.py CHANGED
@@ -8,9 +8,8 @@ from uuid import UUID
 
 import numpy as np
 
-from pixeltable import catalog, exceptions as excs, utils
+from pixeltable import catalog, exceptions as excs, exprs, utils
 from pixeltable.env import Env
-from pixeltable.utils.media_store import MediaStore
 
 from .data_row import DataRow
 from .expr import Expr, ExprScope
@@ -85,6 +84,10 @@ class RowBuilder:
     # (a subexpr can be shared across multiple output exprs)
     output_expr_ids: list[set[int]]
 
+    img_slot_idxs: list[int]  # Indices of image slots
+    media_slot_idxs: list[int]  # Indices of non-image media slots
+    array_slot_idxs: list[int]  # Indices of array slots
+
     @dataclass
     class EvalCtx:
         """Context for evaluating a set of target exprs"""
@@ -235,6 +238,12 @@ class RowBuilder:
         for e in self.output_exprs:
             self._record_output_expr_id(e, e.slot_idx)
 
+        self.img_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_image_type()]
+        self.media_slot_idxs = [
+            e.slot_idx for e in self.unique_exprs if e.col_type.is_media_type() and not e.col_type.is_image_type()
+        ]
+        self.array_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_array_type()]
+
     def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
         """Record a column that is part of the table row"""
         assert self.tbl is not None
@@ -462,8 +471,7 @@ class RowBuilder:
             else:
                 if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
                     # we have yet to store this image
-                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                    data_row.flush_img(slot_idx, filepath)
+                    data_row.flush_img(slot_idx, col)
                 val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
                 table_row.append(val)
                 if col.stores_cellmd:
@@ -489,3 +497,7 @@ class RowBuilder:
                 store_col_names.append(col.col.cellmd_store_name())
 
         return store_col_names, media_cols
+
+    def make_row(self) -> exprs.DataRow:
+        """Creates a new DataRow with the current row_builder's configuration."""
+        return exprs.DataRow(self.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
pixeltable/exprs/string_op.py CHANGED
@@ -68,7 +68,7 @@ class StringOp(Expr):
         if self.operator == StringOperator.CONCAT:
             return left.concat(right)
         if self.operator == StringOperator.REPEAT:
-            return sql.func.repeat(sql.cast(left, sql.String), sql.cast(right, sql.Integer))
+            return sql.func.repeat(left.cast(sql.String), right.cast(sql.Integer))
         return None
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
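This hunk, like several above and below, switches from the free function sql.cast(x, T) to SQLAlchemy's equivalent ColumnElement.cast() method. Both compile to the same CAST expression, which a quick standalone check confirms:

    import sqlalchemy as sql

    col = sql.column('n')
    print(sql.cast(col, sql.String))  # CAST(n AS VARCHAR)
    print(col.cast(sql.String))       # CAST(n AS VARCHAR)

The method form reads left to right; SQL behavior is unchanged.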
pixeltable/func/expr_template_function.py CHANGED
@@ -101,13 +101,10 @@ class ExprTemplateFunction(Function):
         return None
 
     def exec(self, args: Sequence[Any], kwargs: dict[str, Any]) -> Any:
-        from pixeltable import exec
-
         assert not self.is_polymorphic
         expr = self.instantiate(args, kwargs)
         row_builder = exprs.RowBuilder(output_exprs=[expr], columns=[], input_exprs=[])
-        row_batch = exec.DataRowBatch(tbl=None, row_builder=row_builder, num_rows=1)
-        row = row_batch[0]
+        row = row_builder.make_row()
         row_builder.eval(row, ctx=row_builder.default_eval_ctx)
         return row[row_builder.get_output_exprs()[0].slot_idx]
 
pixeltable/functions/date.py CHANGED
@@ -83,7 +83,7 @@ def make_date(year: int, month: int, day: int) -> date:
 
 @make_date.to_sql
 def _(year: sql.ColumnElement, month: sql.ColumnElement, day: sql.ColumnElement) -> sql.ColumnElement:
-    return sql.func.make_date(sql.cast(year, sql.Integer), sql.cast(month, sql.Integer), sql.cast(day, sql.Integer))
+    return sql.func.make_date(year.cast(sql.Integer), month.cast(sql.Integer), day.cast(sql.Integer))
 
 
 @pxt.udf(is_method=True)
pixeltable/functions/math.py CHANGED
@@ -97,7 +97,7 @@ def _(self: sql.ColumnElement, digits: Optional[sql.ColumnElement] = None) -> sq
     if digits is None:
         return sql.func.round(self)
     else:
-        return sql.func.round(sql.cast(self, sql.Numeric), sql.cast(digits, sql.Integer))
+        return sql.func.round(self.cast(sql.Numeric), digits.cast(sql.Integer))
 
 
 @pxt.udf(is_method=True)
pixeltable/functions/openai.py CHANGED
@@ -31,11 +31,15 @@ _logger = logging.getLogger('pixeltable')
 
 
 @env.register_client('openai')
-def _(api_key: str) -> 'openai.AsyncOpenAI':
+def _(api_key: str, base_url: Optional[str] = None, api_version: Optional[str] = None) -> 'openai.AsyncOpenAI':
     import openai
 
+    default_query = None if api_version is None else {'api-version': api_version}
+
     return openai.AsyncOpenAI(
         api_key=api_key,
+        base_url=base_url,
+        default_query=default_query,
         # recommended to increase limits for async client to avoid connection errors
         http_client=httpx.AsyncClient(limits=httpx.Limits(max_keepalive_connections=100, max_connections=500)),
     )
@@ -124,7 +128,7 @@ _header_duration_pattern = re.compile(r'(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)ms)|(?:(\d
 def _parse_header_duration(duration_str: str) -> datetime.timedelta:
     match = _header_duration_pattern.match(duration_str)
     if not match:
-        raise ValueError('Invalid duration format')
+        raise ValueError(f'Invalid duration format: {duration_str}')
 
     days = int(match.group(1) or 0)
     hours = int(match.group(2) or 0)
@@ -147,7 +151,7 @@ def _get_header_info(
     requests_limit = int(requests_limit_str) if requests_limit_str is not None else None
     requests_remaining_str = headers.get('x-ratelimit-remaining-requests')
     requests_remaining = int(requests_remaining_str) if requests_remaining_str is not None else None
-    requests_reset_str = headers.get('x-ratelimit-reset-requests')
+    requests_reset_str = headers.get('x-ratelimit-reset-requests', '5s')  # Default to 5 seconds
     requests_reset_ts = now + _parse_header_duration(requests_reset_str)
     requests_info = (requests_limit, requests_remaining, requests_reset_ts)
 
@@ -157,7 +161,7 @@ def _get_header_info(
     tokens_limit = int(tokens_limit_str) if tokens_limit_str is not None else None
     tokens_remaining_str = headers.get('x-ratelimit-remaining-tokens')
     tokens_remaining = int(tokens_remaining_str) if tokens_remaining_str is not None else None
-    tokens_reset_str = headers.get('x-ratelimit-reset-tokens')
+    tokens_reset_str = headers.get('x-ratelimit-reset-tokens', '5s')  # Default to 5 seconds
     tokens_reset_ts = now + _parse_header_duration(tokens_reset_str)
     tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts)
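Combined with the env.py changes above, both new OpenAI client parameters default to None, so existing setups keep working; when set, base_url points the client at a custom endpoint and api_version is sent as a default 'api-version' query parameter (the convention used by Azure OpenAI-style gateways). Following the lookup convention spelled out in Env's error message, they should be configurable like any other client parameter (values below are placeholders):

    import os

    # via environment variables ...
    os.environ['OPENAI_API_KEY'] = 'sk-...'
    os.environ['OPENAI_BASE_URL'] = 'https://my-gateway.example.com/v1'
    os.environ['OPENAI_API_VERSION'] = '2024-06-01'
    # ... or via api_key/base_url/api_version keys in the [openai] section
    # of $PIXELTABLE_HOME/config.toml

    import pixeltable as pxt  # the client is constructed lazily, on first use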