pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/catalog/column.py +26 -49
- pixeltable/catalog/insertable_table.py +7 -4
- pixeltable/catalog/table.py +163 -57
- pixeltable/catalog/table_version.py +416 -140
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/client.py +72 -6
- pixeltable/dataframe.py +65 -21
- pixeltable/env.py +52 -53
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/in_memory_data_node.py +11 -7
- pixeltable/exprs/comparison.py +3 -3
- pixeltable/exprs/data_row.py +5 -1
- pixeltable/exprs/literal.py +16 -4
- pixeltable/exprs/row_builder.py +8 -40
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/aggregate_function.py +15 -15
- pixeltable/func/expr_template_function.py +9 -1
- pixeltable/func/globals.py +24 -14
- pixeltable/func/signature.py +18 -12
- pixeltable/func/udf.py +7 -2
- pixeltable/functions/__init__.py +9 -9
- pixeltable/functions/eval.py +7 -8
- pixeltable/functions/fireworks.py +10 -37
- pixeltable/functions/huggingface.py +47 -19
- pixeltable/functions/openai.py +192 -24
- pixeltable/functions/together.py +104 -9
- pixeltable/functions/util.py +11 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +49 -0
- pixeltable/index/embedding_index.py +95 -0
- pixeltable/metadata/schema.py +45 -22
- pixeltable/plan.py +15 -34
- pixeltable/store.py +38 -41
- pixeltable/tests/conftest.py +8 -14
- pixeltable/tests/ext/test_yolox.py +21 -0
- pixeltable/tests/functions/test_fireworks.py +43 -0
- pixeltable/tests/functions/test_functions.py +60 -0
- pixeltable/tests/{test_functions.py → functions/test_huggingface.py} +7 -143
- pixeltable/tests/functions/test_openai.py +162 -0
- pixeltable/tests/functions/test_together.py +112 -0
- pixeltable/tests/test_component_view.py +14 -5
- pixeltable/tests/test_dataframe.py +23 -22
- pixeltable/tests/test_exprs.py +99 -102
- pixeltable/tests/test_function.py +51 -43
- pixeltable/tests/test_index.py +138 -0
- pixeltable/tests/test_migration.py +2 -1
- pixeltable/tests/test_snapshot.py +24 -1
- pixeltable/tests/test_table.py +205 -26
- pixeltable/tests/test_types.py +30 -0
- pixeltable/tests/test_video.py +16 -16
- pixeltable/tests/test_view.py +5 -0
- pixeltable/tests/utils.py +171 -14
- pixeltable/tool/create_test_db_dump.py +16 -0
- pixeltable/type_system.py +77 -128
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/hf_datasets.py +157 -0
- pixeltable/utils/parquet.py +68 -27
- pixeltable/utils/pytorch.py +16 -97
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/METADATA +35 -28
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/RECORD +63 -50
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
|
@@ -101,8 +101,8 @@ class TableVersionPath:
|
|
|
101
101
|
return DataFrame(self).__getitem__(index)
|
|
102
102
|
|
|
103
103
|
def columns(self) -> List[Column]:
|
|
104
|
-
"""Return all columns visible in this tbl version path, including columns from bases"""
|
|
105
|
-
result = self.tbl_version.
|
|
104
|
+
"""Return all user columns visible in this tbl version path, including columns from bases"""
|
|
105
|
+
result = list(self.tbl_version.cols_by_name.values())
|
|
106
106
|
if self.base is not None:
|
|
107
107
|
base_cols = self.base.columns()
|
|
108
108
|
# we only include base columns that don't conflict with one of our column names
|
pixeltable/client.py
CHANGED
|
@@ -2,12 +2,11 @@ from typing import List, Optional, Dict, Type, Any, Union
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
import logging
|
|
4
4
|
import dataclasses
|
|
5
|
-
from uuid import UUID
|
|
6
|
-
from collections import defaultdict
|
|
7
5
|
|
|
8
6
|
import sqlalchemy as sql
|
|
9
7
|
import sqlalchemy.orm as orm
|
|
10
8
|
|
|
9
|
+
import pixeltable
|
|
11
10
|
from pixeltable.metadata import schema
|
|
12
11
|
from pixeltable.env import Env
|
|
13
12
|
import pixeltable.func as func
|
|
@@ -16,6 +15,10 @@ from pixeltable import exceptions as excs
|
|
|
16
15
|
from pixeltable.exprs import Predicate
|
|
17
16
|
from pixeltable.iterators import ComponentIterator
|
|
18
17
|
|
|
18
|
+
from typing import TYPE_CHECKING
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
import datasets
|
|
21
|
+
|
|
19
22
|
__all__ = [
|
|
20
23
|
'Client',
|
|
21
24
|
]
|
|
@@ -129,10 +132,6 @@ class Client:
|
|
|
129
132
|
Create a table with an int and a string column:
|
|
130
133
|
|
|
131
134
|
>>> table = cl.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
|
|
132
|
-
|
|
133
|
-
Create a table with a single indexed image column:
|
|
134
|
-
|
|
135
|
-
>>> table = cl.create_table('my_table', schema={'col1': {'type': ImageType(), 'indexed': True}})
|
|
136
135
|
"""
|
|
137
136
|
path = catalog.Path(path_str)
|
|
138
137
|
self.catalog.paths.check_is_valid(path, expected=None)
|
|
@@ -155,6 +154,73 @@ class Client:
|
|
|
155
154
|
_logger.info(f'Created table `{path_str}`.')
|
|
156
155
|
return tbl
|
|
157
156
|
|
|
157
|
+
def import_parquet(
|
|
158
|
+
self,
|
|
159
|
+
table_path: str,
|
|
160
|
+
*,
|
|
161
|
+
parquet_path: str,
|
|
162
|
+
schema_override: Optional[Dict[str, Any]] = None,
|
|
163
|
+
**kwargs,
|
|
164
|
+
) -> catalog.InsertableTable:
|
|
165
|
+
"""Create a new `InsertableTable` from a Parquet file or set of files. Requires pyarrow to be installed.
|
|
166
|
+
Args:
|
|
167
|
+
table_path: Path to the table within pixeltable.
|
|
168
|
+
parquet_path: Path to an individual Parquet file or directory of Parquet files.
|
|
169
|
+
schema_override: Optional dictionary mapping column names to column type to override the default
|
|
170
|
+
schema inferred from the Parquet file. The column type should be a pixeltable ColumnType.
|
|
171
|
+
For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
|
|
172
|
+
Any fields not provided explicitly will map to types with `pixeltable.utils.parquet.parquet_schema_to_pixeltable_schema`
|
|
173
|
+
kwargs: Additional arguments to pass to `Client.create_table`.
|
|
174
|
+
|
|
175
|
+
Returns:
|
|
176
|
+
The newly created table. The table will have loaded the data from the Parquet file(s).
|
|
177
|
+
"""
|
|
178
|
+
from pixeltable.utils import parquet
|
|
179
|
+
|
|
180
|
+
return parquet.import_parquet(
|
|
181
|
+
self,
|
|
182
|
+
table_path=table_path,
|
|
183
|
+
parquet_path=parquet_path,
|
|
184
|
+
schema_override=schema_override,
|
|
185
|
+
**kwargs,
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
def import_huggingface_dataset(
|
|
189
|
+
self,
|
|
190
|
+
table_path: str,
|
|
191
|
+
dataset: Union['datasets.Dataset', 'datasets.DatasetDict'],
|
|
192
|
+
*,
|
|
193
|
+
column_name_for_split: Optional[str] = 'split',
|
|
194
|
+
schema_override: Optional[Dict[str, Any]] = None,
|
|
195
|
+
**kwargs
|
|
196
|
+
) -> catalog.InsertableTable:
|
|
197
|
+
"""Create a new `InsertableTable` from a Huggingface dataset, or dataset dict with multiple splits.
|
|
198
|
+
Requires datasets library to be installed.
|
|
199
|
+
|
|
200
|
+
Args:
|
|
201
|
+
table_path: Path to the table.
|
|
202
|
+
dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
|
|
203
|
+
column_name_for_split: column name to use for split information. If None, no split information will be stored.
|
|
204
|
+
schema_override: Optional dictionary mapping column names to column type to override the corresponding defaults from
|
|
205
|
+
`pixeltable.utils.hf_datasets.huggingface_schema_to_pixeltable_schema`. The column type should be a pixeltable ColumnType.
|
|
206
|
+
For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
|
|
207
|
+
|
|
208
|
+
kwargs: Additional arguments to pass to `create_table`.
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
The newly created table. The table will have loaded the data from the dataset.
|
|
212
|
+
"""
|
|
213
|
+
from pixeltable.utils import hf_datasets
|
|
214
|
+
|
|
215
|
+
return hf_datasets.import_huggingface_dataset(
|
|
216
|
+
self,
|
|
217
|
+
table_path,
|
|
218
|
+
dataset,
|
|
219
|
+
column_name_for_split=column_name_for_split,
|
|
220
|
+
schema_override=schema_override,
|
|
221
|
+
**kwargs,
|
|
222
|
+
)
|
|
223
|
+
|
|
158
224
|
def create_view(
|
|
159
225
|
self, path_str: str, base: catalog.Table, *, schema: Optional[Dict[str, Any]] = None,
|
|
160
226
|
filter: Optional[Predicate] = None,
|
pixeltable/dataframe.py
CHANGED
|
@@ -11,6 +11,8 @@ import traceback
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import List, Optional, Any, Dict, Generator, Tuple, Set
|
|
13
13
|
|
|
14
|
+
import PIL.Image
|
|
15
|
+
import cv2
|
|
14
16
|
import pandas as pd
|
|
15
17
|
import pandas.io.formats.style
|
|
16
18
|
import sqlalchemy as sql
|
|
@@ -31,15 +33,6 @@ __all__ = [
|
|
|
31
33
|
|
|
32
34
|
_logger = logging.getLogger('pixeltable')
|
|
33
35
|
|
|
34
|
-
def _format_img(img: object) -> str:
|
|
35
|
-
"""
|
|
36
|
-
Create <img> tag for Image object.
|
|
37
|
-
"""
|
|
38
|
-
assert isinstance(img, Image.Image), f'Wrong type: {type(img)}'
|
|
39
|
-
with io.BytesIO() as buffer:
|
|
40
|
-
img.save(buffer, 'jpeg')
|
|
41
|
-
img_base64 = base64.b64encode(buffer.getvalue()).decode()
|
|
42
|
-
return f'<div style="width:200px;"><img src="data:image/jpeg;base64,{img_base64}" width="200" /></div>'
|
|
43
36
|
|
|
44
37
|
def _create_source_tag(file_path: str) -> str:
|
|
45
38
|
abs_path = Path(file_path)
|
|
@@ -50,21 +43,17 @@ def _create_source_tag(file_path: str) -> str:
|
|
|
50
43
|
mime_attr = f'type="{mime}"' if mime is not None else ''
|
|
51
44
|
return f'<source src="{src_url}" {mime_attr} />'
|
|
52
45
|
|
|
53
|
-
def _format_video(file_path: str) -> str:
|
|
54
|
-
return f'<video controls>{_create_source_tag(file_path)}</video>'
|
|
55
|
-
|
|
56
|
-
def _format_audio(file_path: str) -> str:
|
|
57
|
-
return f'<audio controls>{_create_source_tag(file_path)}</audio>'
|
|
58
46
|
|
|
59
47
|
class DataFrameResultSet:
|
|
48
|
+
|
|
60
49
|
def __init__(self, rows: List[List[Any]], col_names: List[str], col_types: List[ColumnType]):
|
|
61
50
|
self._rows = rows
|
|
62
51
|
self._col_names = col_names
|
|
63
52
|
self._col_types = col_types
|
|
64
53
|
self._formatters = {
|
|
65
|
-
ts.ImageType: _format_img,
|
|
66
|
-
ts.VideoType: _format_video,
|
|
67
|
-
ts.AudioType: _format_audio,
|
|
54
|
+
ts.ImageType: self._format_img,
|
|
55
|
+
ts.VideoType: self._format_video,
|
|
56
|
+
ts.AudioType: self._format_audio,
|
|
68
57
|
}
|
|
69
58
|
|
|
70
59
|
def __len__(self) -> int:
|
|
@@ -85,9 +74,7 @@ class DataFrameResultSet:
|
|
|
85
74
|
for col_name, col_type in zip(self._col_names, self._col_types)
|
|
86
75
|
if col_type.__class__ in self._formatters
|
|
87
76
|
}
|
|
88
|
-
|
|
89
|
-
# TODO: why does mypy complain about formatters having an incorrect type?
|
|
90
|
-
return self.to_pandas().to_html(formatters=formatters, escape=False, index=False) # type: ignore[arg-type]
|
|
77
|
+
return self.to_pandas().to_html(formatters=formatters, escape=False, index=False)
|
|
91
78
|
|
|
92
79
|
def __str__(self) -> str:
|
|
93
80
|
return self.to_pandas().to_string()
|
|
@@ -102,6 +89,64 @@ class DataFrameResultSet:
|
|
|
102
89
|
def _row_to_dict(self, row_idx: int) -> Dict[str, Any]:
|
|
103
90
|
return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
|
|
104
91
|
|
|
92
|
+
# Formatters
|
|
93
|
+
|
|
94
|
+
def _format_img(self, img: Image.Image) -> str:
|
|
95
|
+
"""
|
|
96
|
+
Create <img> tag for Image object.
|
|
97
|
+
"""
|
|
98
|
+
assert isinstance(img, Image.Image), f'Wrong type: {type(img)}'
|
|
99
|
+
# Try to make it look decent in a variety of display scenarios
|
|
100
|
+
if len(self._rows) > 1:
|
|
101
|
+
width = 240 # Multiple rows: display small images
|
|
102
|
+
elif len(self._col_names) > 1:
|
|
103
|
+
width = 480 # Multiple columns: display medium images
|
|
104
|
+
else:
|
|
105
|
+
width = 640 # A single image: larger display
|
|
106
|
+
with io.BytesIO() as buffer:
|
|
107
|
+
img.save(buffer, 'jpeg')
|
|
108
|
+
img_base64 = base64.b64encode(buffer.getvalue()).decode()
|
|
109
|
+
return f'''
|
|
110
|
+
<div style="width:{width}px;">
|
|
111
|
+
<img src="data:image/jpeg;base64,{img_base64}" width="{width}" />
|
|
112
|
+
</div>
|
|
113
|
+
'''
|
|
114
|
+
|
|
115
|
+
def _format_video(self, file_path: str) -> str:
|
|
116
|
+
thumb_tag = ""
|
|
117
|
+
# Attempt to extract the first frame of the video to use as a thumbnail,
|
|
118
|
+
# so that the notebook can be exported as HTML and viewed in contexts where
|
|
119
|
+
# the video itself is not accessible.
|
|
120
|
+
# TODO(aaron-siegel): If the video is backed by a concrete external URL,
|
|
121
|
+
# should we link to that instead?
|
|
122
|
+
video_reader = cv2.VideoCapture(str(file_path))
|
|
123
|
+
if video_reader.isOpened():
|
|
124
|
+
status, img_array = video_reader.read()
|
|
125
|
+
if status:
|
|
126
|
+
img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
|
|
127
|
+
thumb = PIL.Image.fromarray(img_array)
|
|
128
|
+
with io.BytesIO() as buffer:
|
|
129
|
+
thumb.save(buffer, 'jpeg')
|
|
130
|
+
thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
|
|
131
|
+
thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
|
|
132
|
+
video_reader.release()
|
|
133
|
+
if len(self._rows) > 1:
|
|
134
|
+
width = 320
|
|
135
|
+
elif len(self._col_names) > 1:
|
|
136
|
+
width = 480
|
|
137
|
+
else:
|
|
138
|
+
width = 800
|
|
139
|
+
return f'''
|
|
140
|
+
<div style="width:{width}px;">
|
|
141
|
+
<video controls width="{width}" {thumb_tag}>
|
|
142
|
+
{_create_source_tag(file_path)}
|
|
143
|
+
</video>
|
|
144
|
+
</div>
|
|
145
|
+
'''
|
|
146
|
+
|
|
147
|
+
def _format_audio(self, file_path: str) -> str:
|
|
148
|
+
return f'<audio controls>{_create_source_tag(file_path)}</audio>'
|
|
149
|
+
|
|
105
150
|
def __getitem__(self, index: Any) -> Any:
|
|
106
151
|
if isinstance(index, str):
|
|
107
152
|
if index not in self._col_names:
|
|
@@ -173,7 +218,6 @@ class AnalysisInfo:
|
|
|
173
218
|
self.filter.release()
|
|
174
219
|
|
|
175
220
|
|
|
176
|
-
|
|
177
221
|
class DataFrame:
|
|
178
222
|
def __init__(
|
|
179
223
|
self, tbl: catalog.TableVersionPath,
|
pixeltable/env.py
CHANGED
|
@@ -1,33 +1,29 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
+
|
|
2
3
|
import datetime
|
|
3
|
-
import
|
|
4
|
-
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import sqlalchemy as sql
|
|
7
|
-
import uuid
|
|
4
|
+
import glob
|
|
5
|
+
import http.server
|
|
8
6
|
import importlib
|
|
9
7
|
import importlib.util
|
|
10
|
-
|
|
11
|
-
import
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
12
10
|
import socketserver
|
|
11
|
+
import sys
|
|
13
12
|
import threading
|
|
14
|
-
import typing
|
|
15
13
|
import uuid
|
|
14
|
+
import warnings
|
|
16
15
|
from pathlib import Path
|
|
17
|
-
from typing import Optional, Dict, Any, List
|
|
16
|
+
from typing import Callable, Optional, Dict, Any, List
|
|
18
17
|
|
|
18
|
+
import pgserver
|
|
19
|
+
import sqlalchemy as sql
|
|
19
20
|
import yaml
|
|
20
21
|
from sqlalchemy_utils.functions import database_exists, create_database, drop_database
|
|
21
|
-
import
|
|
22
|
-
import logging
|
|
23
|
-
import sys
|
|
24
|
-
import glob
|
|
22
|
+
from tqdm import TqdmWarning
|
|
25
23
|
|
|
26
|
-
from pixeltable import metadata
|
|
27
24
|
import pixeltable.exceptions as excs
|
|
25
|
+
from pixeltable import metadata
|
|
28
26
|
|
|
29
|
-
if typing.TYPE_CHECKING:
|
|
30
|
-
import openai
|
|
31
27
|
|
|
32
28
|
class Env:
|
|
33
29
|
"""
|
|
@@ -59,12 +55,12 @@ class Env:
|
|
|
59
55
|
# package name -> version; version == []: package is installed, but we haven't determined the version yet
|
|
60
56
|
self._installed_packages: Dict[str, Optional[List[int]]] = {}
|
|
61
57
|
self._nos_client: Optional[Any] = None
|
|
62
|
-
self._openai_client: Optional['openai.OpenAI'] = None
|
|
63
|
-
self._has_together_client: bool = False
|
|
64
58
|
self._spacy_nlp: Optional[Any] = None # spacy.Language
|
|
65
59
|
self._httpd: Optional[socketserver.TCPServer] = None
|
|
66
60
|
self._http_address: Optional[str] = None
|
|
67
61
|
|
|
62
|
+
self._registered_clients: dict[str, Any] = {}
|
|
63
|
+
|
|
68
64
|
# logging-related state
|
|
69
65
|
self._logger = logging.getLogger('pixeltable')
|
|
70
66
|
self._logger.setLevel(logging.DEBUG) # allow everything to pass, we filter in _log_filter()
|
|
@@ -193,11 +189,21 @@ class Env:
|
|
|
193
189
|
fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
|
|
194
190
|
fh.setFormatter(logging.Formatter(self._log_fmt_str))
|
|
195
191
|
self._logger.addHandler(fh)
|
|
192
|
+
|
|
193
|
+
# configure sqlalchemy logging
|
|
196
194
|
sql_logger = logging.getLogger('sqlalchemy.engine')
|
|
197
195
|
sql_logger.setLevel(logging.INFO)
|
|
198
196
|
sql_logger.addHandler(fh)
|
|
199
197
|
sql_logger.propagate = False
|
|
200
198
|
|
|
199
|
+
# configure pyav logging
|
|
200
|
+
av_logfilename = self._logfilename.replace('.log', '_av.log')
|
|
201
|
+
av_fh = logging.FileHandler(self._log_dir / av_logfilename, mode='w')
|
|
202
|
+
av_fh.setFormatter(logging.Formatter(self._log_fmt_str))
|
|
203
|
+
av_logger = logging.getLogger('libav')
|
|
204
|
+
av_logger.addHandler(av_fh)
|
|
205
|
+
av_logger.propagate = False
|
|
206
|
+
|
|
201
207
|
# empty tmp dir
|
|
202
208
|
for path in glob.glob(f'{self._tmp_dir}/*'):
|
|
203
209
|
os.remove(path)
|
|
@@ -234,6 +240,9 @@ class Env:
|
|
|
234
240
|
self._set_up_runtime()
|
|
235
241
|
self.log_to_stdout(False)
|
|
236
242
|
|
|
243
|
+
# Disable spurious warnings
|
|
244
|
+
warnings.simplefilter("ignore", category=TqdmWarning)
|
|
245
|
+
|
|
237
246
|
def upgrade_metadata(self) -> None:
|
|
238
247
|
metadata.upgrade_md(self._sa_engine)
|
|
239
248
|
|
|
@@ -256,31 +265,32 @@ class Env:
|
|
|
256
265
|
from pixeltable.functions.util import create_nos_modules
|
|
257
266
|
_ = create_nos_modules()
|
|
258
267
|
|
|
259
|
-
def
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
import openai
|
|
263
|
-
if 'openai' in self._config and 'api_key' in self._config['openai']:
|
|
264
|
-
api_key = self._config['openai']['api_key']
|
|
265
|
-
else:
|
|
266
|
-
api_key = os.environ.get('OPENAI_API_KEY')
|
|
267
|
-
if api_key is None or api_key == '':
|
|
268
|
-
raise excs.Error('OpenAI client not initialized (no API key configured).')
|
|
269
|
-
self._openai_client = openai.OpenAI(api_key=api_key)
|
|
270
|
-
self._logger.info('Initialized OpenAI client.')
|
|
268
|
+
def get_client(self, name: str, init: Callable, environ: Optional[str] = None) -> Any:
|
|
269
|
+
"""
|
|
270
|
+
Gets the client with the specified name, using `init` to construct one if necessary.
|
|
271
271
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
272
|
+
- name: The name of the client
|
|
273
|
+
- init: A `Callable` with signature `fn(api_key: str) -> Any` that constructs a client object
|
|
274
|
+
- environ: The name of the environment variable to use for the API key, if no API key is found in config
|
|
275
|
+
(defaults to f'{name.upper()}_API_KEY')
|
|
276
|
+
"""
|
|
277
|
+
if name in self._registered_clients:
|
|
278
|
+
return self._registered_clients[name]
|
|
279
|
+
|
|
280
|
+
if environ is None:
|
|
281
|
+
environ = f'{name.upper()}_API_KEY'
|
|
282
|
+
|
|
283
|
+
if name in self._config and 'api_key' in self._config[name]:
|
|
284
|
+
api_key = self._config[name]['api_key']
|
|
275
285
|
else:
|
|
276
|
-
api_key = os.environ.get(
|
|
286
|
+
api_key = os.environ.get(environ)
|
|
277
287
|
if api_key is None or api_key == '':
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
self.
|
|
282
|
-
|
|
283
|
-
|
|
288
|
+
raise excs.Error(f'`{name}` client not initialized (no API key configured).')
|
|
289
|
+
|
|
290
|
+
client = init(api_key)
|
|
291
|
+
self._registered_clients[name] = client
|
|
292
|
+
self._logger.info(f'Initialized `{name}` client.')
|
|
293
|
+
return client
|
|
284
294
|
|
|
285
295
|
def _start_web_server(self) -> None:
|
|
286
296
|
"""
|
|
@@ -319,10 +329,12 @@ class Env:
|
|
|
319
329
|
else:
|
|
320
330
|
self._installed_packages[package] = None
|
|
321
331
|
|
|
332
|
+
check('datasets')
|
|
322
333
|
check('torch')
|
|
323
334
|
check('torchvision')
|
|
324
335
|
check('transformers')
|
|
325
336
|
check('sentence_transformers')
|
|
337
|
+
check('yolox')
|
|
326
338
|
check('boto3')
|
|
327
339
|
check('pyarrow')
|
|
328
340
|
check('spacy') # TODO: deal with en-core-web-sm
|
|
@@ -332,8 +344,6 @@ class Env:
|
|
|
332
344
|
check('tiktoken')
|
|
333
345
|
check('openai')
|
|
334
346
|
check('together')
|
|
335
|
-
if self.is_installed_package('together'):
|
|
336
|
-
self._create_together_client()
|
|
337
347
|
check('fireworks')
|
|
338
348
|
check('nos')
|
|
339
349
|
if self.is_installed_package('nos'):
|
|
@@ -399,17 +409,6 @@ class Env:
|
|
|
399
409
|
def nos_client(self) -> Any:
|
|
400
410
|
return self._nos_client
|
|
401
411
|
|
|
402
|
-
@property
|
|
403
|
-
def openai_client(self) -> 'openai.OpenAI':
|
|
404
|
-
if self._openai_client is None:
|
|
405
|
-
self._create_openai_client()
|
|
406
|
-
assert self._openai_client is not None
|
|
407
|
-
return self._openai_client
|
|
408
|
-
|
|
409
|
-
@property
|
|
410
|
-
def has_together_client(self) -> bool:
|
|
411
|
-
return self._has_together_client
|
|
412
|
-
|
|
413
412
|
@property
|
|
414
413
|
def spacy_nlp(self) -> Any:
|
|
415
414
|
assert self._spacy_nlp is not None
|
|
@@ -89,7 +89,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
89
89
|
# preserve the file extension, if there is one
|
|
90
90
|
extension = ''
|
|
91
91
|
if parsed.path != '':
|
|
92
|
-
p = Path(urllib.parse.unquote(parsed.path))
|
|
92
|
+
p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
|
|
93
93
|
extension = p.suffix
|
|
94
94
|
tmp_path = env.Env.get().create_tmp_path(extension=extension)
|
|
95
95
|
try:
|
|
@@ -29,18 +29,21 @@ class InMemoryDataNode(ExecNode):
|
|
|
29
29
|
|
|
30
30
|
def _open(self) -> None:
|
|
31
31
|
"""Create row batch and populate with self.input_rows"""
|
|
32
|
-
column_info = {info.col.
|
|
32
|
+
column_info = {info.col.id: info for info in self.row_builder.output_slot_idxs()}
|
|
33
|
+
# exclude system columns
|
|
34
|
+
user_column_info = {info.col.name: info for _, info in column_info.items() if info.col.name is not None}
|
|
33
35
|
# stored columns that are not computed
|
|
34
|
-
|
|
35
|
-
info.col.
|
|
36
|
+
inserted_col_ids = set([
|
|
37
|
+
info.col.id for info in self.row_builder.output_slot_idxs()
|
|
36
38
|
if info.col.is_stored and not info.col.is_computed
|
|
37
39
|
])
|
|
38
40
|
|
|
39
41
|
self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
|
|
40
42
|
for row_idx, input_row in enumerate(self.input_rows):
|
|
41
43
|
# populate the output row with the values provided in the input row
|
|
44
|
+
input_col_ids: List[int] = []
|
|
42
45
|
for col_name, val in input_row.items():
|
|
43
|
-
col_info =
|
|
46
|
+
col_info = user_column_info.get(col_name)
|
|
44
47
|
assert col_info is not None
|
|
45
48
|
|
|
46
49
|
if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
|
|
@@ -49,11 +52,12 @@ class InMemoryDataNode(ExecNode):
|
|
|
49
52
|
open(path, 'wb').write(val)
|
|
50
53
|
val = path
|
|
51
54
|
self.output_rows[row_idx][col_info.slot_idx] = val
|
|
55
|
+
input_col_ids.append(col_info.col.id)
|
|
52
56
|
|
|
53
57
|
# set the remaining stored non-computed columns to null
|
|
54
|
-
|
|
55
|
-
for
|
|
56
|
-
col_info = column_info.get(
|
|
58
|
+
null_col_ids = inserted_col_ids - set(input_col_ids)
|
|
59
|
+
for col_id in null_col_ids:
|
|
60
|
+
col_info = column_info.get(col_id)
|
|
57
61
|
assert col_info is not None
|
|
58
62
|
self.output_rows[row_idx][col_info.slot_idx] = None
|
|
59
63
|
|
pixeltable/exprs/comparison.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
+
|
|
2
3
|
from typing import Optional, List, Any, Dict, Tuple
|
|
3
4
|
|
|
4
5
|
import sqlalchemy as sql
|
|
5
6
|
|
|
6
|
-
from .
|
|
7
|
+
from .data_row import DataRow
|
|
7
8
|
from .expr import Expr
|
|
9
|
+
from .globals import ComparisonOperator
|
|
8
10
|
from .predicate import Predicate
|
|
9
|
-
from .data_row import DataRow
|
|
10
11
|
from .row_builder import RowBuilder
|
|
11
|
-
import pixeltable.catalog as catalog
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class Comparison(Predicate):
|
pixeltable/exprs/data_row.py
CHANGED
|
@@ -5,6 +5,8 @@ import urllib.parse
|
|
|
5
5
|
import urllib.request
|
|
6
6
|
from typing import Optional, List, Any, Tuple
|
|
7
7
|
|
|
8
|
+
import sqlalchemy as sql
|
|
9
|
+
import pgvector.sqlalchemy
|
|
8
10
|
import PIL
|
|
9
11
|
import numpy as np
|
|
10
12
|
|
|
@@ -110,7 +112,7 @@ class DataRow:
|
|
|
110
112
|
|
|
111
113
|
return self.vals[index]
|
|
112
114
|
|
|
113
|
-
def get_stored_val(self, index: object) -> Any:
|
|
115
|
+
def get_stored_val(self, index: object, sa_col_type: Optional[sql.types.TypeEngine] = None) -> Any:
|
|
114
116
|
"""Return the value that gets stored in the db"""
|
|
115
117
|
assert self.excs[index] is None
|
|
116
118
|
if not self.has_val[index]:
|
|
@@ -125,6 +127,8 @@ class DataRow:
|
|
|
125
127
|
if self.vals[index] is not None and index in self.array_slot_idxs:
|
|
126
128
|
assert isinstance(self.vals[index], np.ndarray)
|
|
127
129
|
np_array = self.vals[index]
|
|
130
|
+
if sa_col_type is not None and isinstance(sa_col_type, pgvector.sqlalchemy.Vector):
|
|
131
|
+
return np_array
|
|
128
132
|
buffer = io.BytesIO()
|
|
129
133
|
np.save(buffer, np_array)
|
|
130
134
|
return buffer.getvalue()
|
pixeltable/exprs/literal.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
2
4
|
from typing import Optional, List, Any, Dict, Tuple
|
|
3
5
|
|
|
4
6
|
import sqlalchemy as sql
|
|
5
7
|
|
|
6
|
-
|
|
8
|
+
import pixeltable.exceptions as excs
|
|
9
|
+
import pixeltable.type_system as ts
|
|
7
10
|
from .data_row import DataRow
|
|
11
|
+
from .expr import Expr
|
|
8
12
|
from .row_builder import RowBuilder
|
|
9
|
-
|
|
10
|
-
import pixeltable.type_system as ts
|
|
13
|
+
|
|
11
14
|
|
|
12
15
|
class Literal(Expr):
|
|
13
16
|
def __init__(self, val: Any, col_type: Optional[ts.ColumnType] = None):
|
|
@@ -46,9 +49,18 @@ class Literal(Expr):
|
|
|
46
49
|
data_row[self.slot_idx] = self.val
|
|
47
50
|
|
|
48
51
|
def _as_dict(self) -> Dict:
|
|
49
|
-
|
|
52
|
+
# For some types, we need to explictly record their type, because JSON does not know
|
|
53
|
+
# how to interpret them unambiguously
|
|
54
|
+
if self.col_type.is_timestamp_type():
|
|
55
|
+
return {'val': self.val.isoformat(), 'val_t': self.col_type._type.name, **super()._as_dict()}
|
|
56
|
+
else:
|
|
57
|
+
return {'val': self.val, **super()._as_dict()}
|
|
50
58
|
|
|
51
59
|
@classmethod
|
|
52
60
|
def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
|
|
53
61
|
assert 'val' in d
|
|
62
|
+
if 'val_t' in d:
|
|
63
|
+
val_t = d['val_t']
|
|
64
|
+
assert val_t == ts.ColumnType.Type.TIMESTAMP.name
|
|
65
|
+
return cls(datetime.datetime.fromisoformat(d['val']))
|
|
54
66
|
return cls(d['val'])
|
pixeltable/exprs/row_builder.py
CHANGED
|
@@ -54,14 +54,12 @@ class RowBuilder:
|
|
|
54
54
|
target_exprs: List[Expr] # exprs corresponding to target_slot_idxs
|
|
55
55
|
|
|
56
56
|
def __init__(
|
|
57
|
-
self, output_exprs: List[Expr], columns: List[catalog.Column],
|
|
58
|
-
indices: List[Tuple[catalog.Column, func.Function]], input_exprs: List[Expr]
|
|
57
|
+
self, output_exprs: List[Expr], columns: List[catalog.Column], input_exprs: List[Expr]
|
|
59
58
|
):
|
|
60
59
|
"""
|
|
61
60
|
Args:
|
|
62
61
|
output_exprs: list of Exprs to be evaluated
|
|
63
62
|
columns: list of columns to be materialized
|
|
64
|
-
indices: list of embeddings to be materialized (Tuple[indexed column, embedding function])
|
|
65
63
|
"""
|
|
66
64
|
self.unique_exprs = ExprSet() # dependencies precede their dependents
|
|
67
65
|
self.next_slot_idx = 0
|
|
@@ -73,7 +71,6 @@ class RowBuilder:
|
|
|
73
71
|
# output exprs: all exprs the caller wants to materialize
|
|
74
72
|
# - explicitly requested output_exprs
|
|
75
73
|
# - values for computed columns
|
|
76
|
-
# - embedding values for indices
|
|
77
74
|
resolve_cols = set(columns)
|
|
78
75
|
self.output_exprs = [
|
|
79
76
|
self._record_unique_expr(e.copy().resolve_computed_cols(resolve_cols=resolve_cols), recursive=True)
|
|
@@ -97,21 +94,6 @@ class RowBuilder:
|
|
|
97
94
|
ref = self._record_unique_expr(ref, recursive=False)
|
|
98
95
|
self.add_table_column(col, ref.slot_idx)
|
|
99
96
|
|
|
100
|
-
# record indices; indexed by slot_idx
|
|
101
|
-
self.index_columns: List[catalog.Column] = []
|
|
102
|
-
for col, embedding_fn in indices:
|
|
103
|
-
# we assume that the parameter of the embedding function is a ref to an image column
|
|
104
|
-
assert col.col_type.is_image_type()
|
|
105
|
-
# construct expr to compute embedding; explicitly resize images to the required size
|
|
106
|
-
target_img_type = next(iter(embedding_fn.signature.parameters.values())).col_type
|
|
107
|
-
expr = embedding_fn(ColumnRef(col).resize(target_img_type.size))
|
|
108
|
-
expr = self._record_unique_expr(expr, recursive=True)
|
|
109
|
-
self.output_exprs.append(expr)
|
|
110
|
-
if len(self.index_columns) <= expr.slot_idx:
|
|
111
|
-
# pad to slot_idx
|
|
112
|
-
self.index_columns.extend([None] * (expr.slot_idx - len(self.index_columns) + 1))
|
|
113
|
-
self.index_columns[expr.slot_idx] = col
|
|
114
|
-
|
|
115
97
|
# default eval ctx: all output exprs
|
|
116
98
|
self.default_eval_ctx = self.create_eval_ctx(self.output_exprs, exclude=unique_input_exprs)
|
|
117
99
|
|
|
@@ -170,13 +152,6 @@ class RowBuilder:
|
|
|
170
152
|
"""Return ColumnSlotIdx for output columns"""
|
|
171
153
|
return self.table_columns
|
|
172
154
|
|
|
173
|
-
def index_slot_idxs(self) -> List[ColumnSlotIdx]:
|
|
174
|
-
"""Return ColumnSlotIdx for index columns"""
|
|
175
|
-
return [
|
|
176
|
-
ColumnSlotIdx(self.output_columns[i], i) for i in range(len(self.index_columns))
|
|
177
|
-
if self.output_columns[i] is not None
|
|
178
|
-
]
|
|
179
|
-
|
|
180
155
|
@property
|
|
181
156
|
def num_materialized(self) -> int:
|
|
182
157
|
return self.next_slot_idx
|
|
@@ -334,22 +309,15 @@ class RowBuilder:
|
|
|
334
309
|
exc = data_row.get_exc(slot_idx)
|
|
335
310
|
num_excs += 1
|
|
336
311
|
exc_col_ids.add(col.id)
|
|
337
|
-
table_row[col.
|
|
338
|
-
table_row[col.
|
|
339
|
-
table_row[col.
|
|
312
|
+
table_row[col.store_name()] = None
|
|
313
|
+
table_row[col.errortype_store_name()] = type(exc).__name__
|
|
314
|
+
table_row[col.errormsg_store_name()] = str(exc)
|
|
340
315
|
else:
|
|
341
|
-
val = data_row.get_stored_val(slot_idx)
|
|
342
|
-
table_row[col.
|
|
316
|
+
val = data_row.get_stored_val(slot_idx, col.sa_col.type)
|
|
317
|
+
table_row[col.store_name()] = val
|
|
343
318
|
# we unfortunately need to set these, even if there are no errors
|
|
344
|
-
table_row[col.
|
|
345
|
-
table_row[col.
|
|
346
|
-
|
|
347
|
-
for slot_idx, col in enumerate(self.index_columns):
|
|
348
|
-
if col is None:
|
|
349
|
-
continue
|
|
350
|
-
# don't use get_stored_val() here, we need to pass in the ndarray
|
|
351
|
-
val = data_row[slot_idx]
|
|
352
|
-
table_row[col.index_storage_name()] = val
|
|
319
|
+
table_row[col.errortype_store_name()] = None
|
|
320
|
+
table_row[col.errormsg_store_name()] = None
|
|
353
321
|
|
|
354
322
|
return table_row, num_excs
|
|
355
323
|
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extended integrations for Pixeltable. This package contains experimental or demonstration features that
|
|
3
|
+
are not intended for production use. Long-term support cannot be guaranteed, usually because the features
|
|
4
|
+
have dependencies whose future support is unclear.
|
|
5
|
+
"""
|