pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (63) hide show
  1. pixeltable/catalog/column.py +26 -49
  2. pixeltable/catalog/insertable_table.py +7 -4
  3. pixeltable/catalog/table.py +163 -57
  4. pixeltable/catalog/table_version.py +416 -140
  5. pixeltable/catalog/table_version_path.py +2 -2
  6. pixeltable/client.py +72 -6
  7. pixeltable/dataframe.py +65 -21
  8. pixeltable/env.py +52 -53
  9. pixeltable/exec/cache_prefetch_node.py +1 -1
  10. pixeltable/exec/in_memory_data_node.py +11 -7
  11. pixeltable/exprs/comparison.py +3 -3
  12. pixeltable/exprs/data_row.py +5 -1
  13. pixeltable/exprs/literal.py +16 -4
  14. pixeltable/exprs/row_builder.py +8 -40
  15. pixeltable/ext/__init__.py +5 -0
  16. pixeltable/ext/functions/yolox.py +92 -0
  17. pixeltable/func/aggregate_function.py +15 -15
  18. pixeltable/func/expr_template_function.py +9 -1
  19. pixeltable/func/globals.py +24 -14
  20. pixeltable/func/signature.py +18 -12
  21. pixeltable/func/udf.py +7 -2
  22. pixeltable/functions/__init__.py +9 -9
  23. pixeltable/functions/eval.py +7 -8
  24. pixeltable/functions/fireworks.py +10 -37
  25. pixeltable/functions/huggingface.py +47 -19
  26. pixeltable/functions/openai.py +192 -24
  27. pixeltable/functions/together.py +104 -9
  28. pixeltable/functions/util.py +11 -0
  29. pixeltable/index/__init__.py +2 -0
  30. pixeltable/index/base.py +49 -0
  31. pixeltable/index/embedding_index.py +95 -0
  32. pixeltable/metadata/schema.py +45 -22
  33. pixeltable/plan.py +15 -34
  34. pixeltable/store.py +38 -41
  35. pixeltable/tests/conftest.py +8 -14
  36. pixeltable/tests/ext/test_yolox.py +21 -0
  37. pixeltable/tests/functions/test_fireworks.py +43 -0
  38. pixeltable/tests/functions/test_functions.py +60 -0
  39. pixeltable/tests/{test_functions.py → functions/test_huggingface.py} +7 -143
  40. pixeltable/tests/functions/test_openai.py +162 -0
  41. pixeltable/tests/functions/test_together.py +112 -0
  42. pixeltable/tests/test_component_view.py +14 -5
  43. pixeltable/tests/test_dataframe.py +23 -22
  44. pixeltable/tests/test_exprs.py +99 -102
  45. pixeltable/tests/test_function.py +51 -43
  46. pixeltable/tests/test_index.py +138 -0
  47. pixeltable/tests/test_migration.py +2 -1
  48. pixeltable/tests/test_snapshot.py +24 -1
  49. pixeltable/tests/test_table.py +205 -26
  50. pixeltable/tests/test_types.py +30 -0
  51. pixeltable/tests/test_video.py +16 -16
  52. pixeltable/tests/test_view.py +5 -0
  53. pixeltable/tests/utils.py +171 -14
  54. pixeltable/tool/create_test_db_dump.py +16 -0
  55. pixeltable/type_system.py +77 -128
  56. pixeltable/utils/arrow.py +98 -0
  57. pixeltable/utils/hf_datasets.py +157 -0
  58. pixeltable/utils/parquet.py +68 -27
  59. pixeltable/utils/pytorch.py +16 -97
  60. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/METADATA +35 -28
  61. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/RECORD +63 -50
  62. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
  63. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
@@ -101,8 +101,8 @@ class TableVersionPath:
101
101
  return DataFrame(self).__getitem__(index)
102
102
 
103
103
  def columns(self) -> List[Column]:
104
- """Return all columns visible in this tbl version path, including columns from bases"""
105
- result = self.tbl_version.cols.copy()
104
+ """Return all user columns visible in this tbl version path, including columns from bases"""
105
+ result = list(self.tbl_version.cols_by_name.values())
106
106
  if self.base is not None:
107
107
  base_cols = self.base.columns()
108
108
  # we only include base columns that don't conflict with one of our column names
pixeltable/client.py CHANGED
@@ -2,12 +2,11 @@ from typing import List, Optional, Dict, Type, Any, Union
2
2
  import pandas as pd
3
3
  import logging
4
4
  import dataclasses
5
- from uuid import UUID
6
- from collections import defaultdict
7
5
 
8
6
  import sqlalchemy as sql
9
7
  import sqlalchemy.orm as orm
10
8
 
9
+ import pixeltable
11
10
  from pixeltable.metadata import schema
12
11
  from pixeltable.env import Env
13
12
  import pixeltable.func as func
@@ -16,6 +15,10 @@ from pixeltable import exceptions as excs
16
15
  from pixeltable.exprs import Predicate
17
16
  from pixeltable.iterators import ComponentIterator
18
17
 
18
+ from typing import TYPE_CHECKING
19
+ if TYPE_CHECKING:
20
+ import datasets
21
+
19
22
  __all__ = [
20
23
  'Client',
21
24
  ]
@@ -129,10 +132,6 @@ class Client:
129
132
  Create a table with an int and a string column:
130
133
 
131
134
  >>> table = cl.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
132
-
133
- Create a table with a single indexed image column:
134
-
135
- >>> table = cl.create_table('my_table', schema={'col1': {'type': ImageType(), 'indexed': True}})
136
135
  """
137
136
  path = catalog.Path(path_str)
138
137
  self.catalog.paths.check_is_valid(path, expected=None)
@@ -155,6 +154,73 @@ class Client:
155
154
  _logger.info(f'Created table `{path_str}`.')
156
155
  return tbl
157
156
 
157
+ def import_parquet(
158
+ self,
159
+ table_path: str,
160
+ *,
161
+ parquet_path: str,
162
+ schema_override: Optional[Dict[str, Any]] = None,
163
+ **kwargs,
164
+ ) -> catalog.InsertableTable:
165
+ """Create a new `InsertableTable` from a Parquet file or set of files. Requires pyarrow to be installed.
166
+ Args:
167
+ table_path: Path to the table within pixeltable.
168
+ parquet_path: Path to an individual Parquet file or directory of Parquet files.
169
+ schema_override: Optional dictionary mapping column names to column type to override the default
170
+ schema inferred from the Parquet file. The column type should be a pixeltable ColumnType.
171
+ For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
172
+ Any fields not provided explicitly will map to types with `pixeltable.utils.parquet.parquet_schema_to_pixeltable_schema`
173
+ kwargs: Additional arguments to pass to `Client.create_table`.
174
+
175
+ Returns:
176
+ The newly created table. The table will have loaded the data from the Parquet file(s).
177
+ """
178
+ from pixeltable.utils import parquet
179
+
180
+ return parquet.import_parquet(
181
+ self,
182
+ table_path=table_path,
183
+ parquet_path=parquet_path,
184
+ schema_override=schema_override,
185
+ **kwargs,
186
+ )
187
+
188
+ def import_huggingface_dataset(
189
+ self,
190
+ table_path: str,
191
+ dataset: Union['datasets.Dataset', 'datasets.DatasetDict'],
192
+ *,
193
+ column_name_for_split: Optional[str] = 'split',
194
+ schema_override: Optional[Dict[str, Any]] = None,
195
+ **kwargs
196
+ ) -> catalog.InsertableTable:
197
+ """Create a new `InsertableTable` from a Huggingface dataset, or dataset dict with multiple splits.
198
+ Requires datasets library to be installed.
199
+
200
+ Args:
201
+ table_path: Path to the table.
202
+ dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
203
+ column_name_for_split: column name to use for split information. If None, no split information will be stored.
204
+ schema_override: Optional dictionary mapping column names to column type to override the corresponding defaults from
205
+ `pixeltable.utils.hf_datasets.huggingface_schema_to_pixeltable_schema`. The column type should be a pixeltable ColumnType.
206
+ For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
207
+
208
+ kwargs: Additional arguments to pass to `create_table`.
209
+
210
+ Returns:
211
+ The newly created table. The table will have loaded the data from the dataset.
212
+ """
213
+ from pixeltable.utils import hf_datasets
214
+
215
+ return hf_datasets.import_huggingface_dataset(
216
+ self,
217
+ table_path,
218
+ dataset,
219
+ column_name_for_split=column_name_for_split,
220
+ schema_override=schema_override,
221
+ **kwargs,
222
+ )
223
+
158
224
  def create_view(
159
225
  self, path_str: str, base: catalog.Table, *, schema: Optional[Dict[str, Any]] = None,
160
226
  filter: Optional[Predicate] = None,
pixeltable/dataframe.py CHANGED
@@ -11,6 +11,8 @@ import traceback
11
11
  from pathlib import Path
12
12
  from typing import List, Optional, Any, Dict, Generator, Tuple, Set
13
13
 
14
+ import PIL.Image
15
+ import cv2
14
16
  import pandas as pd
15
17
  import pandas.io.formats.style
16
18
  import sqlalchemy as sql
@@ -31,15 +33,6 @@ __all__ = [
31
33
 
32
34
  _logger = logging.getLogger('pixeltable')
33
35
 
34
- def _format_img(img: object) -> str:
35
- """
36
- Create <img> tag for Image object.
37
- """
38
- assert isinstance(img, Image.Image), f'Wrong type: {type(img)}'
39
- with io.BytesIO() as buffer:
40
- img.save(buffer, 'jpeg')
41
- img_base64 = base64.b64encode(buffer.getvalue()).decode()
42
- return f'<div style="width:200px;"><img src="data:image/jpeg;base64,{img_base64}" width="200" /></div>'
43
36
 
44
37
  def _create_source_tag(file_path: str) -> str:
45
38
  abs_path = Path(file_path)
@@ -50,21 +43,17 @@ def _create_source_tag(file_path: str) -> str:
50
43
  mime_attr = f'type="{mime}"' if mime is not None else ''
51
44
  return f'<source src="{src_url}" {mime_attr} />'
52
45
 
53
- def _format_video(file_path: str) -> str:
54
- return f'<video controls>{_create_source_tag(file_path)}</video>'
55
-
56
- def _format_audio(file_path: str) -> str:
57
- return f'<audio controls>{_create_source_tag(file_path)}</audio>'
58
46
 
59
47
  class DataFrameResultSet:
48
+
60
49
  def __init__(self, rows: List[List[Any]], col_names: List[str], col_types: List[ColumnType]):
61
50
  self._rows = rows
62
51
  self._col_names = col_names
63
52
  self._col_types = col_types
64
53
  self._formatters = {
65
- ts.ImageType: _format_img,
66
- ts.VideoType: _format_video,
67
- ts.AudioType: _format_audio,
54
+ ts.ImageType: self._format_img,
55
+ ts.VideoType: self._format_video,
56
+ ts.AudioType: self._format_audio,
68
57
  }
69
58
 
70
59
  def __len__(self) -> int:
@@ -85,9 +74,7 @@ class DataFrameResultSet:
85
74
  for col_name, col_type in zip(self._col_names, self._col_types)
86
75
  if col_type.__class__ in self._formatters
87
76
  }
88
-
89
- # TODO: why does mypy complain about formatters having an incorrect type?
90
- return self.to_pandas().to_html(formatters=formatters, escape=False, index=False) # type: ignore[arg-type]
77
+ return self.to_pandas().to_html(formatters=formatters, escape=False, index=False)
91
78
 
92
79
  def __str__(self) -> str:
93
80
  return self.to_pandas().to_string()
@@ -102,6 +89,64 @@ class DataFrameResultSet:
102
89
  def _row_to_dict(self, row_idx: int) -> Dict[str, Any]:
103
90
  return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
104
91
 
92
+ # Formatters
93
+
94
+ def _format_img(self, img: Image.Image) -> str:
95
+ """
96
+ Create <img> tag for Image object.
97
+ """
98
+ assert isinstance(img, Image.Image), f'Wrong type: {type(img)}'
99
+ # Try to make it look decent in a variety of display scenarios
100
+ if len(self._rows) > 1:
101
+ width = 240 # Multiple rows: display small images
102
+ elif len(self._col_names) > 1:
103
+ width = 480 # Multiple columns: display medium images
104
+ else:
105
+ width = 640 # A single image: larger display
106
+ with io.BytesIO() as buffer:
107
+ img.save(buffer, 'jpeg')
108
+ img_base64 = base64.b64encode(buffer.getvalue()).decode()
109
+ return f'''
110
+ <div style="width:{width}px;">
111
+ <img src="data:image/jpeg;base64,{img_base64}" width="{width}" />
112
+ </div>
113
+ '''
114
+
115
+ def _format_video(self, file_path: str) -> str:
116
+ thumb_tag = ""
117
+ # Attempt to extract the first frame of the video to use as a thumbnail,
118
+ # so that the notebook can be exported as HTML and viewed in contexts where
119
+ # the video itself is not accessible.
120
+ # TODO(aaron-siegel): If the video is backed by a concrete external URL,
121
+ # should we link to that instead?
122
+ video_reader = cv2.VideoCapture(str(file_path))
123
+ if video_reader.isOpened():
124
+ status, img_array = video_reader.read()
125
+ if status:
126
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
127
+ thumb = PIL.Image.fromarray(img_array)
128
+ with io.BytesIO() as buffer:
129
+ thumb.save(buffer, 'jpeg')
130
+ thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
131
+ thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
132
+ video_reader.release()
133
+ if len(self._rows) > 1:
134
+ width = 320
135
+ elif len(self._col_names) > 1:
136
+ width = 480
137
+ else:
138
+ width = 800
139
+ return f'''
140
+ <div style="width:{width}px;">
141
+ <video controls width="{width}" {thumb_tag}>
142
+ {_create_source_tag(file_path)}
143
+ </video>
144
+ </div>
145
+ '''
146
+
147
+ def _format_audio(self, file_path: str) -> str:
148
+ return f'<audio controls>{_create_source_tag(file_path)}</audio>'
149
+
105
150
  def __getitem__(self, index: Any) -> Any:
106
151
  if isinstance(index, str):
107
152
  if index not in self._col_names:
@@ -173,7 +218,6 @@ class AnalysisInfo:
173
218
  self.filter.release()
174
219
 
175
220
 
176
-
177
221
  class DataFrame:
178
222
  def __init__(
179
223
  self, tbl: catalog.TableVersionPath,
pixeltable/env.py CHANGED
@@ -1,33 +1,29 @@
1
1
  from __future__ import annotations
2
+
2
3
  import datetime
3
- import os
4
- from typing import Optional, Dict, Any, List
5
- from pathlib import Path
6
- import sqlalchemy as sql
7
- import uuid
4
+ import glob
5
+ import http.server
8
6
  import importlib
9
7
  import importlib.util
10
-
11
- import http.server
8
+ import logging
9
+ import os
12
10
  import socketserver
11
+ import sys
13
12
  import threading
14
- import typing
15
13
  import uuid
14
+ import warnings
16
15
  from pathlib import Path
17
- from typing import Optional, Dict, Any, List
16
+ from typing import Callable, Optional, Dict, Any, List
18
17
 
18
+ import pgserver
19
+ import sqlalchemy as sql
19
20
  import yaml
20
21
  from sqlalchemy_utils.functions import database_exists, create_database, drop_database
21
- import pgserver
22
- import logging
23
- import sys
24
- import glob
22
+ from tqdm import TqdmWarning
25
23
 
26
- from pixeltable import metadata
27
24
  import pixeltable.exceptions as excs
25
+ from pixeltable import metadata
28
26
 
29
- if typing.TYPE_CHECKING:
30
- import openai
31
27
 
32
28
  class Env:
33
29
  """
@@ -59,12 +55,12 @@ class Env:
59
55
  # package name -> version; version == []: package is installed, but we haven't determined the version yet
60
56
  self._installed_packages: Dict[str, Optional[List[int]]] = {}
61
57
  self._nos_client: Optional[Any] = None
62
- self._openai_client: Optional['openai.OpenAI'] = None
63
- self._has_together_client: bool = False
64
58
  self._spacy_nlp: Optional[Any] = None # spacy.Language
65
59
  self._httpd: Optional[socketserver.TCPServer] = None
66
60
  self._http_address: Optional[str] = None
67
61
 
62
+ self._registered_clients: dict[str, Any] = {}
63
+
68
64
  # logging-related state
69
65
  self._logger = logging.getLogger('pixeltable')
70
66
  self._logger.setLevel(logging.DEBUG) # allow everything to pass, we filter in _log_filter()
@@ -193,11 +189,21 @@ class Env:
193
189
  fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
194
190
  fh.setFormatter(logging.Formatter(self._log_fmt_str))
195
191
  self._logger.addHandler(fh)
192
+
193
+ # configure sqlalchemy logging
196
194
  sql_logger = logging.getLogger('sqlalchemy.engine')
197
195
  sql_logger.setLevel(logging.INFO)
198
196
  sql_logger.addHandler(fh)
199
197
  sql_logger.propagate = False
200
198
 
199
+ # configure pyav logging
200
+ av_logfilename = self._logfilename.replace('.log', '_av.log')
201
+ av_fh = logging.FileHandler(self._log_dir / av_logfilename, mode='w')
202
+ av_fh.setFormatter(logging.Formatter(self._log_fmt_str))
203
+ av_logger = logging.getLogger('libav')
204
+ av_logger.addHandler(av_fh)
205
+ av_logger.propagate = False
206
+
201
207
  # empty tmp dir
202
208
  for path in glob.glob(f'{self._tmp_dir}/*'):
203
209
  os.remove(path)
@@ -234,6 +240,9 @@ class Env:
234
240
  self._set_up_runtime()
235
241
  self.log_to_stdout(False)
236
242
 
243
+ # Disable spurious warnings
244
+ warnings.simplefilter("ignore", category=TqdmWarning)
245
+
237
246
  def upgrade_metadata(self) -> None:
238
247
  metadata.upgrade_md(self._sa_engine)
239
248
 
@@ -256,31 +265,32 @@ class Env:
256
265
  from pixeltable.functions.util import create_nos_modules
257
266
  _ = create_nos_modules()
258
267
 
259
- def _create_openai_client(self) -> None:
260
- if not self.is_installed_package('openai'):
261
- raise excs.Error('OpenAI client not initialized (cannot find package `openai`: `pip install openai`?)')
262
- import openai
263
- if 'openai' in self._config and 'api_key' in self._config['openai']:
264
- api_key = self._config['openai']['api_key']
265
- else:
266
- api_key = os.environ.get('OPENAI_API_KEY')
267
- if api_key is None or api_key == '':
268
- raise excs.Error('OpenAI client not initialized (no API key configured).')
269
- self._openai_client = openai.OpenAI(api_key=api_key)
270
- self._logger.info('Initialized OpenAI client.')
268
+ def get_client(self, name: str, init: Callable, environ: Optional[str] = None) -> Any:
269
+ """
270
+ Gets the client with the specified name, using `init` to construct one if necessary.
271
271
 
272
- def _create_together_client(self) -> None:
273
- if 'together' in self._config and 'api_key' in self._config['together']:
274
- api_key = self._config['together']['api_key']
272
+ - name: The name of the client
273
+ - init: A `Callable` with signature `fn(api_key: str) -> Any` that constructs a client object
274
+ - environ: The name of the environment variable to use for the API key, if no API key is found in config
275
+ (defaults to f'{name.upper()}_API_KEY')
276
+ """
277
+ if name in self._registered_clients:
278
+ return self._registered_clients[name]
279
+
280
+ if environ is None:
281
+ environ = f'{name.upper()}_API_KEY'
282
+
283
+ if name in self._config and 'api_key' in self._config[name]:
284
+ api_key = self._config[name]['api_key']
275
285
  else:
276
- api_key = os.environ.get('TOGETHER_API_KEY')
286
+ api_key = os.environ.get(environ)
277
287
  if api_key is None or api_key == '':
278
- self._logger.info('Together client not initialized (no API key configured).')
279
- return
280
- import together
281
- self._logger.info('Initializing Together client.')
282
- together.api_key = api_key
283
- self._has_together_client = True
288
+ raise excs.Error(f'`{name}` client not initialized (no API key configured).')
289
+
290
+ client = init(api_key)
291
+ self._registered_clients[name] = client
292
+ self._logger.info(f'Initialized `{name}` client.')
293
+ return client
284
294
 
285
295
  def _start_web_server(self) -> None:
286
296
  """
@@ -319,10 +329,12 @@ class Env:
319
329
  else:
320
330
  self._installed_packages[package] = None
321
331
 
332
+ check('datasets')
322
333
  check('torch')
323
334
  check('torchvision')
324
335
  check('transformers')
325
336
  check('sentence_transformers')
337
+ check('yolox')
326
338
  check('boto3')
327
339
  check('pyarrow')
328
340
  check('spacy') # TODO: deal with en-core-web-sm
@@ -332,8 +344,6 @@ class Env:
332
344
  check('tiktoken')
333
345
  check('openai')
334
346
  check('together')
335
- if self.is_installed_package('together'):
336
- self._create_together_client()
337
347
  check('fireworks')
338
348
  check('nos')
339
349
  if self.is_installed_package('nos'):
@@ -399,17 +409,6 @@ class Env:
399
409
  def nos_client(self) -> Any:
400
410
  return self._nos_client
401
411
 
402
- @property
403
- def openai_client(self) -> 'openai.OpenAI':
404
- if self._openai_client is None:
405
- self._create_openai_client()
406
- assert self._openai_client is not None
407
- return self._openai_client
408
-
409
- @property
410
- def has_together_client(self) -> bool:
411
- return self._has_together_client
412
-
413
412
  @property
414
413
  def spacy_nlp(self) -> Any:
415
414
  assert self._spacy_nlp is not None
@@ -89,7 +89,7 @@ class CachePrefetchNode(ExecNode):
89
89
  # preserve the file extension, if there is one
90
90
  extension = ''
91
91
  if parsed.path != '':
92
- p = Path(urllib.parse.unquote(parsed.path))
92
+ p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
93
93
  extension = p.suffix
94
94
  tmp_path = env.Env.get().create_tmp_path(extension=extension)
95
95
  try:
@@ -29,18 +29,21 @@ class InMemoryDataNode(ExecNode):
29
29
 
30
30
  def _open(self) -> None:
31
31
  """Create row batch and populate with self.input_rows"""
32
- column_info = {info.col.name: info for info in self.row_builder.output_slot_idxs()}
32
+ column_info = {info.col.id: info for info in self.row_builder.output_slot_idxs()}
33
+ # exclude system columns
34
+ user_column_info = {info.col.name: info for _, info in column_info.items() if info.col.name is not None}
33
35
  # stored columns that are not computed
34
- inserted_column_names = set([
35
- info.col.name for info in self.row_builder.output_slot_idxs()
36
+ inserted_col_ids = set([
37
+ info.col.id for info in self.row_builder.output_slot_idxs()
36
38
  if info.col.is_stored and not info.col.is_computed
37
39
  ])
38
40
 
39
41
  self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
40
42
  for row_idx, input_row in enumerate(self.input_rows):
41
43
  # populate the output row with the values provided in the input row
44
+ input_col_ids: List[int] = []
42
45
  for col_name, val in input_row.items():
43
- col_info = column_info.get(col_name)
46
+ col_info = user_column_info.get(col_name)
44
47
  assert col_info is not None
45
48
 
46
49
  if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
@@ -49,11 +52,12 @@ class InMemoryDataNode(ExecNode):
49
52
  open(path, 'wb').write(val)
50
53
  val = path
51
54
  self.output_rows[row_idx][col_info.slot_idx] = val
55
+ input_col_ids.append(col_info.col.id)
52
56
 
53
57
  # set the remaining stored non-computed columns to null
54
- null_col_names = inserted_column_names - set(input_row.keys())
55
- for col_name in null_col_names:
56
- col_info = column_info.get(col_name)
58
+ null_col_ids = inserted_col_ids - set(input_col_ids)
59
+ for col_id in null_col_ids:
60
+ col_info = column_info.get(col_id)
57
61
  assert col_info is not None
58
62
  self.output_rows[row_idx][col_info.slot_idx] = None
59
63
 
@@ -1,14 +1,14 @@
1
1
  from __future__ import annotations
2
+
2
3
  from typing import Optional, List, Any, Dict, Tuple
3
4
 
4
5
  import sqlalchemy as sql
5
6
 
6
- from .globals import ComparisonOperator
7
+ from .data_row import DataRow
7
8
  from .expr import Expr
9
+ from .globals import ComparisonOperator
8
10
  from .predicate import Predicate
9
- from .data_row import DataRow
10
11
  from .row_builder import RowBuilder
11
- import pixeltable.catalog as catalog
12
12
 
13
13
 
14
14
  class Comparison(Predicate):
@@ -5,6 +5,8 @@ import urllib.parse
5
5
  import urllib.request
6
6
  from typing import Optional, List, Any, Tuple
7
7
 
8
+ import sqlalchemy as sql
9
+ import pgvector.sqlalchemy
8
10
  import PIL
9
11
  import numpy as np
10
12
 
@@ -110,7 +112,7 @@ class DataRow:
110
112
 
111
113
  return self.vals[index]
112
114
 
113
- def get_stored_val(self, index: object) -> Any:
115
+ def get_stored_val(self, index: object, sa_col_type: Optional[sql.types.TypeEngine] = None) -> Any:
114
116
  """Return the value that gets stored in the db"""
115
117
  assert self.excs[index] is None
116
118
  if not self.has_val[index]:
@@ -125,6 +127,8 @@ class DataRow:
125
127
  if self.vals[index] is not None and index in self.array_slot_idxs:
126
128
  assert isinstance(self.vals[index], np.ndarray)
127
129
  np_array = self.vals[index]
130
+ if sa_col_type is not None and isinstance(sa_col_type, pgvector.sqlalchemy.Vector):
131
+ return np_array
128
132
  buffer = io.BytesIO()
129
133
  np.save(buffer, np_array)
130
134
  return buffer.getvalue()
@@ -1,13 +1,16 @@
1
1
  from __future__ import annotations
2
+
3
+ import datetime
2
4
  from typing import Optional, List, Any, Dict, Tuple
3
5
 
4
6
  import sqlalchemy as sql
5
7
 
6
- from .expr import Expr
8
+ import pixeltable.exceptions as excs
9
+ import pixeltable.type_system as ts
7
10
  from .data_row import DataRow
11
+ from .expr import Expr
8
12
  from .row_builder import RowBuilder
9
- import pixeltable.catalog as catalog
10
- import pixeltable.type_system as ts
13
+
11
14
 
12
15
  class Literal(Expr):
13
16
  def __init__(self, val: Any, col_type: Optional[ts.ColumnType] = None):
@@ -46,9 +49,18 @@ class Literal(Expr):
46
49
  data_row[self.slot_idx] = self.val
47
50
 
48
51
  def _as_dict(self) -> Dict:
49
- return {'val': self.val, **super()._as_dict()}
52
+ # For some types, we need to explicitly record their type, because JSON does not know
53
+ # how to interpret them unambiguously
54
+ if self.col_type.is_timestamp_type():
55
+ return {'val': self.val.isoformat(), 'val_t': self.col_type._type.name, **super()._as_dict()}
56
+ else:
57
+ return {'val': self.val, **super()._as_dict()}
50
58
 
51
59
  @classmethod
52
60
  def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
53
61
  assert 'val' in d
62
+ if 'val_t' in d:
63
+ val_t = d['val_t']
64
+ assert val_t == ts.ColumnType.Type.TIMESTAMP.name
65
+ return cls(datetime.datetime.fromisoformat(d['val']))
54
66
  return cls(d['val'])
@@ -54,14 +54,12 @@ class RowBuilder:
54
54
  target_exprs: List[Expr] # exprs corresponding to target_slot_idxs
55
55
 
56
56
  def __init__(
57
- self, output_exprs: List[Expr], columns: List[catalog.Column],
58
- indices: List[Tuple[catalog.Column, func.Function]], input_exprs: List[Expr]
57
+ self, output_exprs: List[Expr], columns: List[catalog.Column], input_exprs: List[Expr]
59
58
  ):
60
59
  """
61
60
  Args:
62
61
  output_exprs: list of Exprs to be evaluated
63
62
  columns: list of columns to be materialized
64
- indices: list of embeddings to be materialized (Tuple[indexed column, embedding function])
65
63
  """
66
64
  self.unique_exprs = ExprSet() # dependencies precede their dependents
67
65
  self.next_slot_idx = 0
@@ -73,7 +71,6 @@ class RowBuilder:
73
71
  # output exprs: all exprs the caller wants to materialize
74
72
  # - explicitly requested output_exprs
75
73
  # - values for computed columns
76
- # - embedding values for indices
77
74
  resolve_cols = set(columns)
78
75
  self.output_exprs = [
79
76
  self._record_unique_expr(e.copy().resolve_computed_cols(resolve_cols=resolve_cols), recursive=True)
@@ -97,21 +94,6 @@ class RowBuilder:
97
94
  ref = self._record_unique_expr(ref, recursive=False)
98
95
  self.add_table_column(col, ref.slot_idx)
99
96
 
100
- # record indices; indexed by slot_idx
101
- self.index_columns: List[catalog.Column] = []
102
- for col, embedding_fn in indices:
103
- # we assume that the parameter of the embedding function is a ref to an image column
104
- assert col.col_type.is_image_type()
105
- # construct expr to compute embedding; explicitly resize images to the required size
106
- target_img_type = next(iter(embedding_fn.signature.parameters.values())).col_type
107
- expr = embedding_fn(ColumnRef(col).resize(target_img_type.size))
108
- expr = self._record_unique_expr(expr, recursive=True)
109
- self.output_exprs.append(expr)
110
- if len(self.index_columns) <= expr.slot_idx:
111
- # pad to slot_idx
112
- self.index_columns.extend([None] * (expr.slot_idx - len(self.index_columns) + 1))
113
- self.index_columns[expr.slot_idx] = col
114
-
115
97
  # default eval ctx: all output exprs
116
98
  self.default_eval_ctx = self.create_eval_ctx(self.output_exprs, exclude=unique_input_exprs)
117
99
 
@@ -170,13 +152,6 @@ class RowBuilder:
170
152
  """Return ColumnSlotIdx for output columns"""
171
153
  return self.table_columns
172
154
 
173
- def index_slot_idxs(self) -> List[ColumnSlotIdx]:
174
- """Return ColumnSlotIdx for index columns"""
175
- return [
176
- ColumnSlotIdx(self.output_columns[i], i) for i in range(len(self.index_columns))
177
- if self.output_columns[i] is not None
178
- ]
179
-
180
155
  @property
181
156
  def num_materialized(self) -> int:
182
157
  return self.next_slot_idx
@@ -334,22 +309,15 @@ class RowBuilder:
334
309
  exc = data_row.get_exc(slot_idx)
335
310
  num_excs += 1
336
311
  exc_col_ids.add(col.id)
337
- table_row[col.storage_name()] = None
338
- table_row[col.errortype_storage_name()] = type(exc).__name__
339
- table_row[col.errormsg_storage_name()] = str(exc)
312
+ table_row[col.store_name()] = None
313
+ table_row[col.errortype_store_name()] = type(exc).__name__
314
+ table_row[col.errormsg_store_name()] = str(exc)
340
315
  else:
341
- val = data_row.get_stored_val(slot_idx)
342
- table_row[col.storage_name()] = val
316
+ val = data_row.get_stored_val(slot_idx, col.sa_col.type)
317
+ table_row[col.store_name()] = val
343
318
  # we unfortunately need to set these, even if there are no errors
344
- table_row[col.errortype_storage_name()] = None
345
- table_row[col.errormsg_storage_name()] = None
346
-
347
- for slot_idx, col in enumerate(self.index_columns):
348
- if col is None:
349
- continue
350
- # don't use get_stored_val() here, we need to pass in the ndarray
351
- val = data_row[slot_idx]
352
- table_row[col.index_storage_name()] = val
319
+ table_row[col.errortype_store_name()] = None
320
+ table_row[col.errormsg_store_name()] = None
353
321
 
354
322
  return table_row, num_excs
355
323
 
@@ -0,0 +1,5 @@
1
+ """
2
+ Extended integrations for Pixeltable. This package contains experimental or demonstration features that
3
+ are not intended for production use. Long-term support cannot be guaranteed, usually because the features
4
+ have dependencies whose future support is unclear.
5
+ """