pixeltable 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +106 -71
- pixeltable/catalog/path.py +59 -20
- pixeltable/catalog/schema_object.py +1 -0
- pixeltable/catalog/table.py +6 -0
- pixeltable/catalog/table_version.py +2 -1
- pixeltable/catalog/view.py +21 -10
- pixeltable/config.py +12 -4
- pixeltable/dataframe.py +57 -1
- pixeltable/env.py +25 -13
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +2 -6
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +10 -53
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/in_memory_data_node.py +13 -11
- pixeltable/exec/sql_node.py +6 -7
- pixeltable/exprs/data_row.py +13 -13
- pixeltable/exprs/row_builder.py +16 -4
- pixeltable/exprs/string_op.py +1 -1
- pixeltable/func/expr_template_function.py +1 -4
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/openai.py +8 -4
- pixeltable/functions/timestamp.py +6 -6
- pixeltable/globals.py +14 -10
- pixeltable/metadata/schema.py +1 -1
- pixeltable/plan.py +5 -14
- pixeltable/share/packager.py +13 -13
- pixeltable/store.py +9 -6
- pixeltable/type_system.py +2 -1
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/media_store.py +84 -39
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.6.dist-info}/METADATA +40 -41
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.6.dist-info}/RECORD +44 -44
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.6.dist-info}/WHEEL +1 -1
- pixeltable-0.4.6.dist-info/entry_points.txt +2 -0
- pixeltable-0.4.4.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.6.dist-info/licenses}/LICENSE +0 -0
pixeltable/dataframe.py
CHANGED
@@ -8,9 +8,22 @@ import json
 import logging
 import traceback
 from pathlib import Path
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncIterator,
+    Callable,
+    Hashable,
+    Iterator,
+    NoReturn,
+    Optional,
+    Sequence,
+    TypeVar,
+    Union,
+)

 import pandas as pd
+import pydantic
 import sqlalchemy as sql

 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
@@ -32,6 +45,11 @@ _logger = logging.getLogger('pixeltable')


 class DataFrameResultSet:
+    _rows: list[list[Any]]
+    _col_names: list[str]
+    __schema: dict[str, ColumnType]
+    __formatter: Formatter
+
     def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
         self._rows = rows
         self._col_names = list(schema.keys())
@@ -66,6 +84,44 @@ class DataFrameResultSet:
     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame.from_records(self._rows, columns=self._col_names)

+    BaseModelT = TypeVar('BaseModelT', bound=pydantic.BaseModel)
+
+    def to_pydantic(self, model: type[BaseModelT]) -> Iterator[BaseModelT]:
+        """
+        Convert the DataFrameResultSet to a list of Pydantic model instances.
+
+        Args:
+            model: A Pydantic model class.
+
+        Returns:
+            An iterator over Pydantic model instances, one for each row in the result set.
+
+        Raises:
+            Error: If the row data doesn't match the model schema.
+        """
+        model_fields = model.model_fields
+        model_config = getattr(model, 'model_config', {})
+        forbid_extra_fields = model_config.get('extra') == 'forbid'
+
+        # schema validation
+        required_fields = {name for name, field in model_fields.items() if field.is_required()}
+        col_names = set(self._col_names)
+        missing_fields = required_fields - col_names
+        if len(missing_fields) > 0:
+            raise excs.Error(
+                f'Required model fields {missing_fields} are missing from result set columns {self._col_names}'
+            )
+        if forbid_extra_fields:
+            extra_fields = col_names - set(model_fields.keys())
+            if len(extra_fields) > 0:
+                raise excs.Error(f"Extra fields {extra_fields} are not allowed in model with extra='forbid'")
+
+        for row in self:
+            try:
+                yield model(**row)
+            except pydantic.ValidationError as e:
+                raise excs.Error(str(e)) from e
+
     def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
         return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}

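For orientation, here is a minimal sketch of how the new to_pydantic API might be used; the table name, schema, and data are hypothetical, but the call sequence follows the method added above:

import pydantic
import pixeltable as pxt

class Film(pydantic.BaseModel):
    title: str
    year: int

t = pxt.create_table('films', {'title': pxt.String, 'year': pxt.Int})
t.insert([{'title': 'Alien', 'year': 1979}])

# collect() returns a DataFrameResultSet; to_pydantic() yields one model instance per row
for film in t.select(t.title, t.year).collect().to_pydantic(Film):
    print(film.title, film.year)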
pixeltable/env.py
CHANGED
@@ -13,6 +13,8 @@ import platform
 import shutil
 import sys
 import threading
+import types
+import typing
 import uuid
 import warnings
 from abc import abstractmethod
@@ -604,16 +606,26 @@ class Env:

         # Construct a client, retrieving each parameter from config.

-        init_kwargs: dict[str,
-        for param in cl.
-
-
-
-
+        init_kwargs: dict[str, Any] = {}
+        for param in cl.params.values():
+            # Determine the type of the parameter for proper config parsing.
+            t = param.annotation
+            # Deference Optional[T]
+            if typing.get_origin(t) in (typing.Union, types.UnionType):
+                args = typing.get_args(t)
+                if args[0] is type(None):
+                    t = args[1]
+                elif args[1] is type(None):
+                    t = args[0]
+            assert isinstance(t, type), t
+            arg: Any = Config.get().get_value(param.name, t, section=name)
+            if arg is not None:
+                init_kwargs[param.name] = arg
+            elif param.default is inspect.Parameter.empty:
                 raise excs.Error(
-                    f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
-                    f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, '
-                    f'or put `{param.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
+                    f'`{name}` client not initialized: parameter `{param.name}` is not configured.\n'
+                    f'To fix this, specify the `{name.upper()}_{param.name.upper()}` environment variable, '
+                    f'or put `{param.name.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
                 )

         cl.client_obj = cl.init_fn(**init_kwargs)
@@ -624,7 +636,7 @@ class Env:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
-
+        On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
         This arrangement enables serving media hosted within _home,
         as well as external media inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
@@ -832,8 +844,8 @@ def register_client(name: str) -> Callable:

     def decorator(fn: Callable) -> None:
         sig = inspect.signature(fn)
-
-        _registered_clients[name] = ApiClient(init_fn=fn,
+        params = dict(sig.parameters)
+        _registered_clients[name] = ApiClient(init_fn=fn, params=params)

     return decorator

@@ -844,7 +856,7 @@ _registered_clients: dict[str, ApiClient] = {}
 @dataclass
 class ApiClient:
     init_fn: Callable
-
+    params: dict[str, inspect.Parameter]
     client_obj: Optional[Any] = None

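The Optional-unwrapping logic in the new config-lookup loop is easy to get wrong, so here is a standalone sketch of the same pattern (plain Python, not pixeltable code; requires Python 3.10+ for the T | None spelling):

import types
import typing
from typing import Optional

def unwrap_optional(t: object) -> object:
    # typing.get_origin() returns typing.Union for Optional[T] / Union[T, None],
    # and types.UnionType for the PEP 604 spelling T | None
    if typing.get_origin(t) in (typing.Union, types.UnionType):
        args = typing.get_args(t)
        if args[0] is type(None):
            return args[1]
        elif args[1] is type(None):
            return args[0]
    return t

assert unwrap_optional(Optional[str]) is str
assert unwrap_optional(str | None) is str
assert unwrap_optional(int) is int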
pixeltable/exec/aggregation_node.py
CHANGED
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
         # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
-        self.output_batch = DataRowBatch(
+        self.output_batch = DataRowBatch(row_builder)
         self.limit = None

     def set_limit(self, limit: int) -> None:
pixeltable/exec/cache_prefetch_node.py
CHANGED
@@ -12,7 +12,7 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Iterator, Optional
 from uuid import UUID

-from pixeltable import
+from pixeltable import env, exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache

 from .data_row_batch import DataRowBatch
@@ -37,7 +37,6 @@ class CachePrefetchNode(ExecNode):
     boto_client_lock: threading.Lock

     # execution state
-    batch_tbl_version: Optional[catalog.TableVersionHandle]  # needed to construct output batches
     num_returned_rows: int

     # ready_rows: rows that are ready to be returned, ordered by row idx;
@@ -68,7 +67,6 @@ class CachePrefetchNode(ExecNode):
         self.boto_client = None
         self.boto_client_lock = threading.Lock()

-        self.batch_tbl_version = None
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -95,7 +93,7 @@ class CachePrefetchNode(ExecNode):

         if len(self.ready_rows) > 0:
             # create DataRowBatch from the first BATCH_SIZE ready rows
-            batch = DataRowBatch(self.
+            batch = DataRowBatch(self.row_builder)
             rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
             for row in rows:
                 assert row is not None
@@ -173,8 +171,6 @@ class CachePrefetchNode(ExecNode):
         if input_batch is None:
             self.input_finished = True
             return
-        if self.batch_tbl_version is None:
-            self.batch_tbl_version = input_batch.tbl

         file_cache = FileCache.get()

pixeltable/exec/component_iteration_node.py
CHANGED
@@ -40,7 +40,7 @@ class ComponentIterationNode(ExecNode):
         }

     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        output_batch = DataRowBatch(self.
+        output_batch = DataRowBatch(self.row_builder)
         async for input_batch in self.input:
             for input_row in input_batch:
                 self.row_builder.eval(input_row, self.iterator_args_ctx)
@@ -52,13 +52,14 @@ class ComponentIterationNode(ExecNode):
             if self.__non_nullable_args_specified(iterator_args):
                 iterator = self.view.get().iterator_cls(**iterator_args)
                 for pos, component_dict in enumerate(iterator):
-                    output_row =
+                    output_row = self.row_builder.make_row()
                     input_row.copy(output_row)
                     # we're expanding the input and need to add the iterator position to the pk
                     self.__populate_output_row(output_row, pos, component_dict)
+                    output_batch.add_row(output_row)
                     if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
                         yield output_batch
-                        output_batch = DataRowBatch(self.
+                        output_batch = DataRowBatch(self.row_builder)

         if len(output_batch) > 0:
             yield output_batch
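The add_row/yield/reset sequence above is the usual fixed-size batching idiom for generators; a generic, self-contained sketch (hypothetical names, not pixeltable code):

from typing import Iterable, Iterator, TypeVar

T = TypeVar('T')

def batched(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    # Yield lists of up to batch_size items, flushing any remainder at the end,
    # mirroring how ComponentIterationNode flushes a partial DataRowBatch.
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if len(batch) > 0:
        yield batch

assert list(batched(range(5), 2)) == [[0, 1], [2, 3], [4]]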
pixeltable/exec/data_row_batch.py
CHANGED
@@ -3,8 +3,7 @@ from __future__ import annotations
 import logging
 from typing import Iterator, Optional

-from pixeltable import
-from pixeltable.utils.media_store import MediaStore
+from pixeltable import exprs

 _logger = logging.getLogger('pixeltable')

@@ -15,51 +14,19 @@ class DataRowBatch:
     Contains the metadata needed to initialize DataRows.
     """

-    tbl: Optional[catalog.TableVersionHandle]
     row_builder: exprs.RowBuilder
-    img_slot_idxs: list[int]
-    media_slot_idxs: list[int]  # non-image media slots
-    array_slot_idxs: list[int]
     rows: list[exprs.DataRow]

-    def __init__(
-        self,
-        tbl: Optional[catalog.TableVersionHandle],
-        row_builder: exprs.RowBuilder,
-        num_rows: Optional[int] = None,
-        rows: Optional[list[exprs.DataRow]] = None,
-    ):
+    def __init__(self, row_builder: exprs.RowBuilder, rows: Optional[list[exprs.DataRow]] = None):
         """
         Requires either num_rows or rows to be specified, but not both.
         """
-        assert num_rows is None or rows is None
-        self.tbl = tbl
         self.row_builder = row_builder
-        self.
-        # non-image media slots
-        self.media_slot_idxs = [
-            e.slot_idx
-            for e in row_builder.unique_exprs
-            if e.col_type.is_media_type() and not e.col_type.is_image_type()
-        ]
-        self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
-        if rows is not None:
-            self.rows = rows
-        else:
-            if num_rows is None:
-                num_rows = 0
-            self.rows = [
-                exprs.DataRow(
-                    row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-                )
-                for _ in range(num_rows)
-            ]
+        self.rows = [] if rows is None else rows

-    def add_row(self, row: Optional[exprs.DataRow]
+    def add_row(self, row: Optional[exprs.DataRow]) -> exprs.DataRow:
         if row is None:
-            row =
-                self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-            )
+            row = self.row_builder.make_row()
         self.rows.append(row)
         return row

@@ -73,28 +40,18 @@ class DataRowBatch:
         return self.rows[index]

     def flush_imgs(
-        self,
-        idx_range: Optional[slice] = None,
-        stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
-        flushed_slot_idxs: Optional[list[int]] = None,
+        self, idx_range: Optional[slice], stored_img_info: list[exprs.ColumnSlotIdx], flushed_img_slots: list[int]
     ) -> None:
         """Flushes images in the given range of rows."""
-
-        if stored_img_info is None:
-            stored_img_info = []
-        if flushed_slot_idxs is None:
-            flushed_slot_idxs = []
-        if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
+        if len(stored_img_info) == 0 and len(flushed_img_slots) == 0:
             return
+
         if idx_range is None:
             idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-
-
-                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                row.flush_img(info.slot_idx, filepath)
-            for slot_idx in flushed_slot_idxs:
+                row.flush_img(info.slot_idx, info.col)
+            for slot_idx in flushed_img_slots:
                 row.flush_img(slot_idx)

     def __iter__(self) -> Iterator[exprs.DataRow]:
pixeltable/exec/expr_eval/expr_eval_node.py
CHANGED
@@ -240,7 +240,7 @@ class ExprEvalNode(ExecNode):
                 # make sure we top up our in-flight rows before yielding
                 self._dispatch_input_rows()
                 self._log_state(f'yielding {len(batch_rows)} rows')
-                yield DataRowBatch(
+                yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
         # at this point, we may have more completed rows

         assert self.completed_rows.empty()  # all completed rows should be sitting in output_buffer
@@ -254,7 +254,7 @@ class ExprEvalNode(ExecNode):
             batch_rows = self.output_buffer.get_rows(self.output_buffer.num_ready)
             self.num_output_rows += len(batch_rows)
             self._log_state(f'yielding {len(batch_rows)} rows')
-            yield DataRowBatch(
+            yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)

         assert self.output_buffer.num_rows == 0
         return
pixeltable/exec/in_memory_data_node.py
CHANGED
@@ -23,7 +23,7 @@ class InMemoryDataNode(ExecNode):

     input_rows: list[dict[str, Any]]
     start_row_id: int
-
+    output_batch: Optional[DataRowBatch]

     # output_exprs is declared in the superclass, but we redeclare it here with a more specific type
     output_exprs: list[exprs.ColumnRef]
@@ -42,7 +42,7 @@ class InMemoryDataNode(ExecNode):
         self.tbl = tbl
         self.input_rows = rows
         self.start_row_id = start_row_id
-        self.
+        self.output_batch = None

     def _open(self) -> None:
         """Create row batch and populate with self.input_rows"""
@@ -56,8 +56,9 @@ class InMemoryDataNode(ExecNode):
         }
         output_slot_idxs = {e.slot_idx for e in self.output_exprs}

-        self.
-        for
+        self.output_batch = DataRowBatch(self.row_builder)
+        for input_row in self.input_rows:
+            output_row = self.row_builder.make_row()
             # populate the output row with the values provided in the input row
             input_slot_idxs: set[int] = set()
             for col_name, val in input_row.items():
@@ -67,10 +68,10 @@ class InMemoryDataNode(ExecNode):
                 if col.col_type.is_image_type() and isinstance(val, bytes):
                     # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
                     assert col.tbl.id == self.tbl.id
-
-
+                    filepath, _ = MediaStore.save_media_object(val, col, format=None)
+                    output_row[col_info.slot_idx] = str(filepath)
                 else:
-
+                    output_row[col_info.slot_idx] = val

                 input_slot_idxs.add(col_info.slot_idx)

@@ -79,10 +80,11 @@ class InMemoryDataNode(ExecNode):
             for slot_idx in missing_slot_idxs:
                 col_info = output_cols_by_idx.get(slot_idx)
                 assert col_info is not None
-
+                output_row[col_info.slot_idx] = None
+            self.output_batch.add_row(output_row)

-        self.ctx.num_rows = len(self.
+        self.ctx.num_rows = len(self.output_batch)

     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        _logger.debug(f'InMemoryDataNode: created row batch with {len(self.
-        yield self.
+        _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_batch)} rows')
+        yield self.output_batch
pixeltable/exec/sql_node.py
CHANGED
@@ -316,8 +316,7 @@ class SqlNode(ExecNode):
             for _ in w:
                 pass

-
-        output_batch = DataRowBatch(tbl_version, self.row_builder)
+        output_batch = DataRowBatch(self.row_builder)
        output_row: Optional[exprs.DataRow] = None
        num_rows_returned = 0

@@ -359,7 +358,7 @@ class SqlNode(ExecNode):
            if self.ctx.batch_size > 0 and len(output_batch) == self.ctx.batch_size:
                _logger.debug(f'SqlScanNode: returning {len(output_batch)} rows')
                yield output_batch
-                output_batch = DataRowBatch(
+                output_batch = DataRowBatch(self.row_builder)

        if len(output_batch) > 0:
            _logger.debug(f'SqlScanNode: returning {len(output_batch)} rows')
@@ -569,10 +568,10 @@ class SqlSampleNode(SqlNode):
        General SQL form is:
        - MD5(<seed::text> [ + '___' + <rowid_col_val>::text]+
        """
-        sql_expr: sql.ColumnElement =
+        sql_expr: sql.ColumnElement = seed.cast(sql.String)
        for e in sql_cols:
            # Quotes are required below to guarantee that the string is properly presented in SQL
-            sql_expr = sql_expr + sql.literal_column("'___'", sql.Text) +
+            sql_expr = sql_expr + sql.literal_column("'___'", sql.Text) + e.cast(sql.String)
        sql_expr = sql.func.md5(sql_expr)
        return sql_expr

@@ -591,9 +590,9 @@ class SqlSampleNode(SqlNode):
            s_key = self._create_key_sql(self.input_cte)

            # Construct a suitable where clause
-
+            fraction_md5 = SampleClause.fraction_to_md5_hex(self.sample_clause.fraction)
            order_by = self._create_key_sql(self.input_cte)
-            return sql.select(*self.input_cte.c).where(s_key <
+            return sql.select(*self.input_cte.c).where(s_key < fraction_md5).order_by(order_by)

            return self._create_stmt_stratified_fraction(self.sample_clause.fraction)
        else:
pixeltable/exprs/data_row.py
CHANGED
@@ -13,7 +13,8 @@ import PIL
 import PIL.Image
 import sqlalchemy as sql

-from pixeltable import env
+from pixeltable import catalog, env
+from pixeltable.utils.media_store import MediaStore


 class DataRow:
@@ -256,23 +257,22 @@ class DataRow:
         self.vals[idx] = val
         self.has_val[idx] = True

-    def flush_img(self, index: int,
-        """
+    def flush_img(self, index: int, col: Optional[catalog.Column] = None) -> None:
+        """Save or discard the in-memory value (required to be a PIL.Image.Image)"""
         if self.vals[index] is None:
             return
         assert self.excs[index] is None
         if self.file_paths[index] is None:
-            if
+            if col is not None:
                 image = self.vals[index]
-
-
-
-
-
-
-                self.file_paths[index] = filepath
-                self.file_urls[index] =
-                image.save(filepath, format=format)
+                format = None
+                if isinstance(image, PIL.Image.Image):
+                    # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+                    # In that case, use WebP instead.
+                    format = 'webp' if image.has_transparency_data else 'jpeg'
+                filepath, url = MediaStore.save_media_object(image, col, format=format)
+                self.file_paths[index] = str(filepath)
+                self.file_urls[index] = url
             else:
                 # we discard the content of this cell
                 self.has_val[index] = False
pixeltable/exprs/row_builder.py
CHANGED
@@ -8,9 +8,8 @@ from uuid import UUID

 import numpy as np

-from pixeltable import catalog, exceptions as excs, utils
+from pixeltable import catalog, exceptions as excs, exprs, utils
 from pixeltable.env import Env
-from pixeltable.utils.media_store import MediaStore

 from .data_row import DataRow
 from .expr import Expr, ExprScope
@@ -85,6 +84,10 @@ class RowBuilder:
     # (a subexpr can be shared across multiple output exprs)
     output_expr_ids: list[set[int]]

+    img_slot_idxs: list[int]  # Indices of image slots
+    media_slot_idxs: list[int]  # Indices of non-image media slots
+    array_slot_idxs: list[int]  # Indices of array slots
+
     @dataclass
     class EvalCtx:
         """Context for evaluating a set of target exprs"""
@@ -235,6 +238,12 @@ class RowBuilder:
         for e in self.output_exprs:
             self._record_output_expr_id(e, e.slot_idx)

+        self.img_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_image_type()]
+        self.media_slot_idxs = [
+            e.slot_idx for e in self.unique_exprs if e.col_type.is_media_type() and not e.col_type.is_image_type()
+        ]
+        self.array_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_array_type()]
+
     def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
         """Record a column that is part of the table row"""
         assert self.tbl is not None
@@ -462,8 +471,7 @@ class RowBuilder:
             else:
                 if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
                     # we have yet to store this image
-
-                    data_row.flush_img(slot_idx, filepath)
+                    data_row.flush_img(slot_idx, col)
                 val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
                 table_row.append(val)
                 if col.stores_cellmd:
@@ -489,3 +497,7 @@ class RowBuilder:
             store_col_names.append(col.col.cellmd_store_name())

         return store_col_names, media_cols
+
+    def make_row(self) -> exprs.DataRow:
+        """Creates a new DataRow with the current row_builder's configuration."""
+        return exprs.DataRow(self.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
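make_row() is a small factory method: the builder computes the slot-index configuration once (in __init__) and every call site gets identically shaped rows without re-deriving it. A generic, self-contained sketch of that pattern (names are illustrative, not pixeltable's):

from dataclasses import dataclass, field

@dataclass
class RowFactory:
    # configuration computed once, up front
    num_slots: int
    img_slots: list[int] = field(default_factory=list)

    def make_row(self) -> list[object]:
        # all callers construct rows through this single code path
        return [None] * self.num_slots

factory = RowFactory(num_slots=4, img_slots=[1, 3])
rows = [factory.make_row() for _ in range(3)]
assert all(len(r) == 4 for r in rows)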
pixeltable/exprs/string_op.py
CHANGED
@@ -68,7 +68,7 @@ class StringOp(Expr):
         if self.operator == StringOperator.CONCAT:
             return left.concat(right)
         if self.operator == StringOperator.REPEAT:
-            return sql.func.repeat(
+            return sql.func.repeat(left.cast(sql.String), right.cast(sql.Integer))
         return None

     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
pixeltable/func/expr_template_function.py
CHANGED
@@ -101,13 +101,10 @@ class ExprTemplateFunction(Function):
             return None

     def exec(self, args: Sequence[Any], kwargs: dict[str, Any]) -> Any:
-        from pixeltable import exec
-
         assert not self.is_polymorphic
         expr = self.instantiate(args, kwargs)
         row_builder = exprs.RowBuilder(output_exprs=[expr], columns=[], input_exprs=[])
-
-        row = row_batch[0]
+        row = row_builder.make_row()
         row_builder.eval(row, ctx=row_builder.default_eval_ctx)
         return row[row_builder.get_output_exprs()[0].slot_idx]

pixeltable/functions/date.py
CHANGED
@@ -83,7 +83,7 @@ def make_date(year: int, month: int, day: int) -> date:

 @make_date.to_sql
 def _(year: sql.ColumnElement, month: sql.ColumnElement, day: sql.ColumnElement) -> sql.ColumnElement:
-    return sql.func.make_date(
+    return sql.func.make_date(year.cast(sql.Integer), month.cast(sql.Integer), day.cast(sql.Integer))


 @pxt.udf(is_method=True)
pixeltable/functions/math.py
CHANGED
@@ -97,7 +97,7 @@ def _(self: sql.ColumnElement, digits: Optional[sql.ColumnElement] = None) -> sq
     if digits is None:
         return sql.func.round(self)
     else:
-        return sql.func.round(
+        return sql.func.round(self.cast(sql.Numeric), digits.cast(sql.Integer))


 @pxt.udf(is_method=True)
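The .cast(...) calls added here and in string_op.py and date.py pin down argument types so Postgres can resolve overloaded functions like round(), repeat(), and make_date(). A standalone SQLAlchemy sketch showing what the cast renders to:

import sqlalchemy as sql

expr = sql.func.round(sql.literal(2.675).cast(sql.Numeric), sql.literal(2).cast(sql.Integer))
print(expr)  # round(CAST(:param_1 AS NUMERIC), CAST(:param_2 AS INTEGER))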
pixeltable/functions/openai.py
CHANGED
@@ -31,11 +31,15 @@ _logger = logging.getLogger('pixeltable')


 @env.register_client('openai')
-def _(api_key: str) -> 'openai.AsyncOpenAI':
+def _(api_key: str, base_url: Optional[str] = None, api_version: Optional[str] = None) -> 'openai.AsyncOpenAI':
     import openai

+    default_query = None if api_version is None else {'api-version': api_version}
+
     return openai.AsyncOpenAI(
         api_key=api_key,
+        base_url=base_url,
+        default_query=default_query,
         # recommended to increase limits for async client to avoid connection errors
         http_client=httpx.AsyncClient(limits=httpx.Limits(max_keepalive_connections=100, max_connections=500)),
     )
@@ -124,7 +128,7 @@ _header_duration_pattern = re.compile(r'(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)ms)|(?:(\d
 def _parse_header_duration(duration_str: str) -> datetime.timedelta:
     match = _header_duration_pattern.match(duration_str)
     if not match:
-        raise ValueError('Invalid duration format')
+        raise ValueError(f'Invalid duration format: {duration_str}')

     days = int(match.group(1) or 0)
     hours = int(match.group(2) or 0)
@@ -147,7 +151,7 @@ def _get_header_info(
     requests_limit = int(requests_limit_str) if requests_limit_str is not None else None
     requests_remaining_str = headers.get('x-ratelimit-remaining-requests')
     requests_remaining = int(requests_remaining_str) if requests_remaining_str is not None else None
-    requests_reset_str = headers.get('x-ratelimit-reset-requests')
+    requests_reset_str = headers.get('x-ratelimit-reset-requests', '5s')  # Default to 5 seconds
     requests_reset_ts = now + _parse_header_duration(requests_reset_str)
     requests_info = (requests_limit, requests_remaining, requests_reset_ts)

@@ -157,7 +161,7 @@ def _get_header_info(
     tokens_limit = int(tokens_limit_str) if tokens_limit_str is not None else None
     tokens_remaining_str = headers.get('x-ratelimit-remaining-tokens')
     tokens_remaining = int(tokens_remaining_str) if tokens_remaining_str is not None else None
-    tokens_reset_str = headers.get('x-ratelimit-reset-tokens')
+    tokens_reset_str = headers.get('x-ratelimit-reset-tokens', '5s')  # Default to 5 seconds
     tokens_reset_ts = now + _parse_header_duration(tokens_reset_str)
     tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts)

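With base_url and api_version now registered as client parameters, they flow through the same configuration mechanism as api_key. Going by the error message in env.py, they would be supplied like this (URL and version strings are placeholders):

import os

# either set environment variables before pixeltable initializes the client ...
os.environ['OPENAI_BASE_URL'] = 'https://my-proxy.example.com/v1'
os.environ['OPENAI_API_VERSION'] = '2024-06-01'

# ... or add the equivalent keys to $PIXELTABLE_HOME/config.toml:
# [openai]
# base_url = 'https://my-proxy.example.com/v1'
# api_version = '2024-06-01'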