pixeltable 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (76):
  1. pixeltable/__init__.py +15 -33
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +1 -1
  4. pixeltable/catalog/column.py +28 -16
  5. pixeltable/catalog/dir.py +2 -2
  6. pixeltable/catalog/insertable_table.py +5 -55
  7. pixeltable/catalog/named_function.py +2 -2
  8. pixeltable/catalog/schema_object.py +2 -7
  9. pixeltable/catalog/table.py +298 -204
  10. pixeltable/catalog/table_version.py +104 -139
  11. pixeltable/catalog/table_version_path.py +22 -4
  12. pixeltable/catalog/view.py +20 -10
  13. pixeltable/dataframe.py +128 -25
  14. pixeltable/env.py +21 -14
  15. pixeltable/exec/exec_context.py +5 -0
  16. pixeltable/exec/exec_node.py +1 -0
  17. pixeltable/exec/in_memory_data_node.py +29 -24
  18. pixeltable/exec/sql_scan_node.py +1 -1
  19. pixeltable/exprs/column_ref.py +13 -8
  20. pixeltable/exprs/data_row.py +4 -0
  21. pixeltable/exprs/expr.py +16 -1
  22. pixeltable/exprs/function_call.py +4 -4
  23. pixeltable/exprs/row_builder.py +29 -20
  24. pixeltable/exprs/similarity_expr.py +4 -3
  25. pixeltable/ext/functions/yolox.py +2 -1
  26. pixeltable/func/__init__.py +1 -0
  27. pixeltable/func/aggregate_function.py +14 -12
  28. pixeltable/func/callable_function.py +8 -6
  29. pixeltable/func/expr_template_function.py +13 -19
  30. pixeltable/func/function.py +3 -6
  31. pixeltable/func/query_template_function.py +84 -0
  32. pixeltable/func/signature.py +68 -23
  33. pixeltable/func/udf.py +13 -10
  34. pixeltable/functions/__init__.py +6 -91
  35. pixeltable/functions/eval.py +26 -14
  36. pixeltable/functions/fireworks.py +25 -23
  37. pixeltable/functions/globals.py +62 -0
  38. pixeltable/functions/huggingface.py +20 -16
  39. pixeltable/functions/image.py +170 -1
  40. pixeltable/functions/openai.py +95 -128
  41. pixeltable/functions/string.py +10 -2
  42. pixeltable/functions/together.py +95 -84
  43. pixeltable/functions/util.py +16 -0
  44. pixeltable/functions/video.py +94 -16
  45. pixeltable/functions/whisper.py +78 -0
  46. pixeltable/globals.py +1 -1
  47. pixeltable/io/__init__.py +10 -0
  48. pixeltable/io/external_store.py +370 -0
  49. pixeltable/io/globals.py +50 -22
  50. pixeltable/{datatransfer → io}/label_studio.py +279 -166
  51. pixeltable/io/parquet.py +1 -1
  52. pixeltable/iterators/__init__.py +9 -0
  53. pixeltable/iterators/string.py +40 -0
  54. pixeltable/metadata/__init__.py +6 -8
  55. pixeltable/metadata/converters/convert_10.py +2 -4
  56. pixeltable/metadata/converters/convert_12.py +7 -2
  57. pixeltable/metadata/converters/convert_13.py +6 -8
  58. pixeltable/metadata/converters/convert_14.py +2 -4
  59. pixeltable/metadata/converters/convert_15.py +40 -25
  60. pixeltable/metadata/converters/convert_16.py +18 -0
  61. pixeltable/metadata/converters/util.py +11 -8
  62. pixeltable/metadata/schema.py +3 -6
  63. pixeltable/plan.py +8 -7
  64. pixeltable/store.py +1 -1
  65. pixeltable/tool/create_test_db_dump.py +145 -54
  66. pixeltable/tool/embed_udf.py +9 -0
  67. pixeltable/type_system.py +1 -2
  68. pixeltable/utils/code.py +34 -0
  69. {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/METADATA +2 -2
  70. pixeltable-0.2.9.dist-info/RECORD +131 -0
  71. pixeltable/datatransfer/__init__.py +0 -1
  72. pixeltable/datatransfer/remote.py +0 -113
  73. pixeltable/functions/pil/image.py +0 -147
  74. pixeltable-0.2.7.dist-info/RECORD +0 -126
  75. {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/LICENSE +0 -0
  76. {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/WHEEL +0 -0
pixeltable/iterators/__init__.py CHANGED
@@ -1,3 +1,12 @@
  from .base import ComponentIterator
  from .document import DocumentSplitter
+ from .string import StringSplitter
  from .video import FrameIterator
+
+ __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
+ __removed_symbols = {'base', 'document', 'video'}
+ __all__ = sorted(list(__default_dir - __removed_symbols))
+
+
+ def __dir__():
+     return __all__
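
The added boilerplate is the PEP 562 module-level `__dir__` idiom: it prunes the submodule names (`base`, `document`, `video`) that the re-export imports leave in the package namespace, so `dir()` and tab completion show only the iterator classes. A minimal sketch of the same idiom, using a hypothetical package:

# mypkg/__init__.py -- hypothetical package using the same dir()-pruning idiom
from .core import Widget  # re-export: should appear in dir(mypkg)

# importing from .core also binds the submodule name 'core' on the package,
# so it would otherwise show up in dir() and autocompletion
__default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
__removed_symbols = {'core'}  # submodule names to hide
__all__ = sorted(list(__default_dir - __removed_symbols))


def __dir__():
    # PEP 562: a module-level __dir__ controls what dir(mypkg) reports
    return __all__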
pixeltable/iterators/string.py ADDED
@@ -0,0 +1,40 @@
+ from typing import Iterator, Any
+
+ import pixeltable.exceptions as excs
+ import pixeltable.type_system as ts
+ from pixeltable.env import Env
+ from pixeltable.iterators.base import ComponentIterator
+
+
+ class StringSplitter(ComponentIterator):
+     # TODO(aaron-siegel): Merge this with `DocumentSplitter` in order to provide additional capabilities.
+     def __init__(self, text: str, *, separators: str):
+         if separators != 'sentence':
+             raise excs.Error('Only `sentence` separators are currently supported.')
+         self._text = text
+         self.doc = Env.get().spacy_nlp(self._text)
+         self.iter = self._iter()
+
+     def _iter(self) -> Iterator[dict[str, Any]]:
+         for sentence in self.doc.sents:
+             yield {'text': sentence.text}
+
+     def __next__(self) -> dict[str, Any]:
+         return next(self.iter)
+
+     def close(self) -> None:
+         pass
+
+     def set_pos(self, pos: int) -> None:
+         pass
+
+     @classmethod
+     def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
+         return {
+             'text': ts.StringType(),
+             'separators': ts.StringType(),
+         }
+
+     @classmethod
+     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+         return {'text': ts.StringType()}, []
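
For context, a component iterator like this is normally consumed through an iterator view, which emits one output row per component. A hedged usage sketch, assuming the `create()`-based view API that `FrameIterator` and `DocumentSplitter` use and a spaCy model available via `Env`; the table and column names are illustrative:

import pixeltable as pxt
from pixeltable.iterators import StringSplitter

docs = pxt.create_table('docs', {'contents': pxt.StringType()})
docs.insert([{'contents': 'First sentence. Second sentence.'}])

# each sentence becomes one row of the view, exposed as a `text` column
sentences = pxt.create_view(
    'sentences', docs,
    iterator=StringSplitter.create(text=docs.contents, separators='sentence'))
print(sentences.select(sentences.text).collect())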
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
  from .schema import SystemInfo, SystemInfoMd

  # current version of the metadata; this is incremented whenever the metadata schema changes
- VERSION = 16
+ VERSION = 17


  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -26,13 +26,11 @@ def create_system_info(engine: sql.engine.Engine) -> None:
  # key: old schema version
  converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}

- def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
-     global converter_cbs
-     converter_cbs[version] = cb
-
- def noop_converter(engine: sql.engine.Engine) -> None:
-     # Converter to use when incrementing the schema version, but without any functional changes
-     pass
+ def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
+     def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
+         global converter_cbs
+         converter_cbs[version] = fn
+     return decorator

  # load all converter modules
  for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
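
Registration thus moves from an explicit call to a parameterized decorator; converter modules now register themselves as a side effect of the `pkgutil.iter_modules()` import loop above. The two styles side by side:

# before (0.2.7): define the converter, then register it explicitly
def convert_10(engine: sql.engine.Engine) -> None: ...
register_converter(10, convert_10)

# after (0.2.9): the decorator factory captures the version and stores the
# function in converter_cbs; the function name no longer matters, so `_` works
@register_converter(version=10)
def _(engine: sql.engine.Engine) -> None: ...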
pixeltable/metadata/converters/convert_10.py CHANGED
@@ -4,7 +4,8 @@ from pixeltable.metadata.schema import Table, TableSchemaVersion
  from pixeltable.metadata import register_converter


- def convert_10(engine: sql.engine.Engine) -> None:
+ @register_converter(version=10)
+ def _(engine: sql.engine.Engine) -> None:
      default_table_attrs = {"comment": None, "num_retained_versions": 10}
      with engine.begin() as conn:
          # Because `parameters` wasn't actually used for anything,
@@ -13,6 +14,3 @@ def convert_10(engine: sql.engine.Engine) -> None:
          # Add `table_attrs` to all instances of tableschemaversions.md.
          conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
          return
-
-
- register_converter(10, convert_10)
pixeltable/metadata/converters/convert_12.py CHANGED
@@ -1,3 +1,8 @@
- from pixeltable.metadata import register_converter, noop_converter
+ import sqlalchemy as sql

- register_converter(12, noop_converter)
+ from pixeltable.metadata import register_converter
+
+
+ @register_converter(version=12)
+ def _(engine: sql.engine.Engine) -> None:
+     pass
pixeltable/metadata/converters/convert_13.py CHANGED
@@ -9,12 +9,13 @@ from pixeltable.metadata.schema import Table
  _logger = logging.getLogger('pixeltable')


- def convert_13(engine: sql.engine.Engine) -> None:
+ @register_converter(version=13)
+ def _(engine: sql.engine.Engine) -> None:
      with engine.begin() as conn:
          for row in conn.execute(sql.select(Table)):
              id = row[0]
              md = row[2]
-             updated_md = _update_md(md)
+             updated_md = __update_md(md)
              if updated_md != md:
                  _logger.info(f'Updating schema for table: {id}')
                  conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
@@ -23,19 +24,16 @@ def convert_13(engine: sql.engine.Engine) -> None:
  # Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
  # `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
  # so this is all we need to do.
- def _update_md(md: Any) -> Any:
+ def __update_md(md: Any) -> Any:
      if isinstance(md, dict):
          updated_md = {}
          for k, v in md.items():
              if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
                  updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
              else:
-                 updated_md[k] = _update_md(v)
+                 updated_md[k] = __update_md(v)
          return updated_md
      elif isinstance(md, list):
-         return [_update_md(v) for v in md]
+         return [__update_md(v) for v in md]
      else:
          return md
-
-
- register_converter(13, convert_13)
pixeltable/metadata/converters/convert_14.py CHANGED
@@ -4,10 +4,8 @@ from pixeltable.metadata.schema import Table
  from pixeltable.metadata import register_converter


- def convert_14(engine: sql.engine.Engine) -> None:
+ @register_converter(version=14)
+ def _(engine: sql.engine.Engine) -> None:
      default_remotes = {'remotes': []}
      with engine.begin() as conn:
          conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
-
-
- register_converter(14, convert_14)
pixeltable/metadata/converters/convert_15.py CHANGED
@@ -1,29 +1,44 @@
- import uuid

+ import inspect
+ import logging
+ from typing import Any
+
+ import cloudpickle
  import sqlalchemy as sql

+ import pixeltable.func as func
+ import pixeltable.type_system as ts
  from pixeltable.metadata import register_converter
- from pixeltable.metadata.converters.util import convert_table_md
-
-
- def convert_15(engine: sql.engine.Engine) -> None:
-     convert_table_md(engine, column_md_updater=update_column_md, remote_md_updater=update_remote_md)
-
-
- def update_column_md(column_md: dict) -> None:
-     column_md['proxy_base'] = None
-
-
- def update_remote_md(remote_md: dict) -> None:
-     remote_md['class'] = f'{remote_md["module"]}.{remote_md["class"]}'
-     del remote_md['module']
-     if remote_md['class'] == 'pixeltable.datatransfer.remote.MockRemote':
-         remote_md['remote_md']['name'] = f'remote_{uuid.uuid4()}'
-     elif remote_md['class'] == 'pixeltable.datatransfer.label_studio.LabelStudioProject':
-         # 'post' is the media_import_method for legacy LabelStudioProject remotes
-         remote_md['remote_md']['media_import_method'] = 'post'
-     else:
-         assert False, remote_md['class']
-
-
- register_converter(15, convert_15)
+ from pixeltable.metadata.schema import Function
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ @register_converter(version=15)
+ def _(engine: sql.engine.Engine) -> None:
+     with engine.begin() as conn:
+         for row in conn.execute(sql.select(Function)):
+             id, dir_id, md, binary_obj = row
+             md['md'] = __update_md(md['md'], binary_obj)
+             _logger.info(f'Updating function: {id}')
+             conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
+
+
+ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
+     # construct dict produced by CallableFunction.to_store()
+     py_fn = cloudpickle.loads(binary_obj)
+     py_params = inspect.signature(py_fn).parameters
+     return_type = ts.ColumnType.from_dict(orig_d['return_type'])
+     params: list[func.Parameter] = []
+     for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
+         col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
+         default = py_params[name].default
+         kind = inspect._ParameterKind(kind_int)  # is there a way to avoid referencing a private type?
+         params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
+     is_batched = 'batch_size' in orig_d
+     sig = func.Signature(return_type, params, is_batched=is_batched)
+     d = {
+         'signature': sig.as_dict(),
+         'batch_size': orig_d['batch_size'] if is_batched else None,
+     }
+     return d
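
On the `inspect._ParameterKind` TODO in the converter above: the enum's members are re-exported as public constants on `inspect.Parameter`, so the int-to-kind lookup can be written without naming the private class. A small sketch:

import inspect

# _ParameterKind is an IntEnum whose members are public Parameter.* constants
_KIND_BY_INT = {
    int(k): k
    for k in (
        inspect.Parameter.POSITIONAL_ONLY,        # 0
        inspect.Parameter.POSITIONAL_OR_KEYWORD,  # 1
        inspect.Parameter.VAR_POSITIONAL,         # 2
        inspect.Parameter.KEYWORD_ONLY,           # 3
        inspect.Parameter.VAR_KEYWORD,            # 4
    )
}

assert _KIND_BY_INT[2] is inspect._ParameterKind(2)  # same member, no private name needed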
pixeltable/metadata/converters/convert_16.py ADDED
@@ -0,0 +1,18 @@
+ import sqlalchemy as sql
+
+ from pixeltable.metadata import register_converter
+ from pixeltable.metadata.converters.util import convert_table_md
+
+
+ @register_converter(version=16)
+ def _(engine: sql.engine.Engine) -> None:
+     convert_table_md(
+         engine,
+         table_md_updater=__update_table_md
+     )
+
+
+ def __update_table_md(table_md: dict) -> None:
+     # External stores are not migratable; just drop them
+     del table_md['remotes']
+     table_md['external_stores'] = {}
pixeltable/metadata/converters/util.py CHANGED
@@ -11,8 +11,9 @@ __logger = logging.getLogger('pixeltable')

  def convert_table_md(
      engine: sql.engine.Engine,
+     table_md_updater: Optional[Callable[[dict], None]] = None,
      column_md_updater: Optional[Callable[[dict], None]] = None,
-     remote_md_updater: Optional[Callable[[dict], None]] = None,
+     external_store_md_updater: Optional[Callable[[dict], None]] = None,
      substitution_fn: Optional[Callable[[Any, Any], Optional[tuple[Any, Any]]]] = None
  ) -> None:
      with engine.begin() as conn:
@@ -21,10 +22,12 @@ def convert_table_md(
          table_md = row[2]
          assert isinstance(table_md, dict)
          updated_table_md = copy.deepcopy(table_md)
+         if table_md_updater is not None:
+             table_md_updater(updated_table_md)
          if column_md_updater is not None:
              __update_column_md(updated_table_md, column_md_updater)
-         if remote_md_updater is not None:
-             __update_remote_md(updated_table_md, remote_md_updater)
+         if external_store_md_updater is not None:
+             __update_external_store_md(updated_table_md, external_store_md_updater)
          if substitution_fn is not None:
              updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
          if updated_table_md != table_md:
@@ -39,11 +42,11 @@ def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]
          column_md_updater(column_md)


- def __update_remote_md(table_md: dict, remote_md_updater: Callable[[dict], None]) -> None:
-     remotes_md = table_md['remotes']
-     assert isinstance(remotes_md, list)
-     for remote_md in remotes_md:
-         remote_md_updater(remote_md)
+ def __update_external_store_md(table_md: dict, external_store_md_updater: Callable[[dict], None]) -> None:
+     stores_md = table_md['external_stores']
+     assert isinstance(stores_md, list)
+     for store_md in stores_md:
+         external_store_md_updater(store_md)


  def __substitute_md_rec(md: Any, substitution_fn: Callable[[Any, Any], Optional[tuple[Any, Any]]]) -> Any:
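
For illustration, a hypothetical future converter (registered for schema version 17, i.e. a 17-to-18 migration) would use the renamed hook like this; both the converter and the field it touches are invented for the example:

import sqlalchemy as sql

from pixeltable.metadata import register_converter
from pixeltable.metadata.converters.util import convert_table_md


@register_converter(version=17)  # hypothetical: would migrate schema version 17 -> 18
def _(engine: sql.engine.Engine) -> None:
    # the updater is invoked once per entry in table_md['external_stores']
    convert_table_md(engine, external_store_md_updater=_update_store_md)


def _update_store_md(store_md: dict) -> None:
    store_md.setdefault('some_new_field', None)  # made-up field, for illustration only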
pixeltable/metadata/schema.py CHANGED
@@ -92,9 +92,6 @@ class ColumnMd:
      # if True, the column is present in the stored table
      stored: Optional[bool]

-     # if specified, the column is a stored proxy of another column
-     proxy_base: Optional[int]
-

  @dataclasses.dataclass
  class IndexMd:
@@ -145,9 +142,9 @@ class TableMd:
      # - every row is assigned a unique and immutable rowid on insertion
      next_row_id: int

-     # Metadata format for remotes:
-     # {'class': 'pixeltable.datatransfer.LabelStudioProject', 'md': {'project_id': 3}}
-     remotes: list[dict[str, Any]]
+     # Metadata format for external stores:
+     # {'class': 'pixeltable.io.label_studio.LabelStudioProject', 'md': {'project_id': 3}}
+     external_stores: list[dict[str, Any]]

      column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
      index_md: dict[int, IndexMd]  # index_id -> IndexMd
pixeltable/plan.py CHANGED
@@ -217,15 +217,15 @@ class Planner:
          plan = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)

          media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
+         if len(media_input_cols) > 0:
+             # prefetch external files for all input column refs for validation
+             plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
+             plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)

-         # prefetch external files for all input column refs for validation
-         plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
-         plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
-
-         computed_exprs = row_builder.default_eval_ctx.target_exprs
+         computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
          if len(computed_exprs) > 0:
              # add an ExprEvalNode when there are exprs to compute
-             plan = exec.ExprEvalNode(row_builder, computed_exprs, [], input=plan)
+             plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)

          plan.set_stored_img_cols(stored_img_col_info)
          plan.set_ctx(
@@ -355,7 +355,8 @@ class Planner:
          # - we can ignore stored non-computed columns because they have a default value that is supplied directly by
          #   the store
          target = view.tbl_version  # the one we need to populate
-         stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
+         #stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
+         stored_cols = [c for c in target.cols if c.is_stored]
          # 2. for component views: iterator args
          iterator_args = [target.iterator_args] if target.iterator_args is not None else []
pixeltable/store.py CHANGED
@@ -263,7 +263,7 @@ class StoreBase:
          number of inserted rows, number of exceptions, set of column ids that have exceptions
          """
          assert v_min is not None
-         exec_plan.ctx.conn = conn
+         exec_plan.ctx.set_conn(conn)
          batch_size = 16  # TODO: is this a good batch size?
          # TODO: total?
          num_excs = 0
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -4,6 +4,7 @@ import logging
  import os
  import pathlib
  import subprocess
+ from typing import Any

  import pgserver
  import toml
@@ -12,8 +13,10 @@ import pixeltable as pxt
  import pixeltable.metadata as metadata
  from pixeltable.env import Env
  from pixeltable.func import Batch
+ from pixeltable.io.external_store import Project
+ from pixeltable.tool import embed_udf
  from pixeltable.type_system import \
-     StringType, IntType, FloatType, BoolType, TimestampType, JsonType
+     StringType, IntType, FloatType, BoolType, TimestampType, JsonType, ImageType

  _logger = logging.getLogger('pixeltable')

@@ -64,8 +67,7 @@ class Dumper:
          with open(info_file, 'w') as info:
              toml.dump(info_dict, info)

-     # TODO: Add additional features to the test DB dump (ideally it should exercise
-     # every major pixeltable DB feature)
+     # Expression types, predicate types, embedding indices, views on views
      def create_tables(self) -> None:
          schema = {
              'c1': StringType(nullable=False),
@@ -76,29 +78,11 @@ class Dumper:
              'c5': TimestampType(nullable=False),
              'c6': JsonType(nullable=False),
              'c7': JsonType(nullable=False),
+             'c8': ImageType(nullable=True)
          }
-         t = pxt.create_table('sample_table', schema, primary_key='c2')
+         t = pxt.create_table('base_table', schema, primary_key='c2')

-         # Add columns for InlineArray and InlineDict
-         t.add_column(c8=[[1, 2, 3], [4, 5, 6]])
-         t.add_column(c9=[['a', 'b', 'c'], ['d', 'e', 'f']])
-         t.add_column(c10=[t.c1, [t.c1n, t.c2]])
-         t.add_column(c11={'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
-
-         # InPredicate
-         t.add_column(isin_1=t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
-         t.add_column(isin_2=t.c2.isin([1, 2, 3, 4, 5]))
-         t.add_column(isin_3=t.c2.isin(t.c6.f5))
-
-         # Add columns for .astype converters to ensure they're persisted properly
-         t.add_column(c2_as_float=t.c2.astype(FloatType()))
-
-         # Add columns for .apply
-         t.add_column(c2_to_string=t.c2.apply(str))
-         t.add_column(c6_to_string=t.c6.apply(json.dumps))
-         t.add_column(c6_back_to_json=t.c6_to_string.apply(json.loads))
-
-         num_rows = 100
+         num_rows = 20
          d1 = {
              'f1': 'test string 1',
              'f2': 1,
@@ -117,9 +101,8 @@ class Dumper:
          c3_data = [float(i) for i in range(num_rows)]
          c4_data = [bool(i % 2) for i in range(num_rows)]
          c5_data = [datetime.datetime.now()] * num_rows
-         c6_data = []
-         for i in range(num_rows):
-             d = {
+         c6_data = [
+             {
                  'f1': f'test string {i}',
                  'f2': i,
                  'f3': float(i),
@@ -130,8 +113,8 @@ class Dumper:
                      'f8': [1.0, 2.0, 3.0, 4.0],
                  },
              }
-             c6_data.append(d)
-
+             for i in range(num_rows)
+         ]
          c7_data = [d2] * num_rows
          rows = [
              {
@@ -143,40 +126,148 @@ class Dumper:
                  'c5': c5_data[i],
                  'c6': c6_data[i],
                  'c7': c7_data[i],
+                 'c8': None
              }
              for i in range(num_rows)
          ]
+
+         self.__add_expr_columns(t, 'base_table')
          t.insert(rows)
+
          pxt.create_dir('views')
-         v = pxt.create_view('views.sample_view', t, filter=(t.c2 < 50))
-         _ = pxt.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+
+         # simple view
+         v = pxt.create_view('views.view', t, filter=(t.c2 < 50))
+         self.__add_expr_columns(v, 'view')
+
+         # snapshot
+         _ = pxt.create_view('views.snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+
+         # view of views
+         vv = pxt.create_view('views.view_of_views', v, filter=(t.c2 >= 25))
+         self.__add_expr_columns(vv, 'view_of_views')
+
+         # empty view
          e = pxt.create_view('views.empty_view', t, filter=t.c2 == 4171780)
          assert e.count() == 0
-         # Computed column using a library function
-         v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
-         # Computed column using a bespoke stored udf
-         v['test_udf'] = test_udf_stored(t.c2)
-         # Computed column using a batched function
-         # (apply this to the empty view, since it's a "heavyweight" function)
-         e['batched'] = pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32')
-         # computed column using a stored batched function
-         v['test_udf_batched'] = test_udf_stored_batched(t.c1, upper=False)
-         # astype
-         v['astype'] = t.c1.astype(pxt.FloatType())
-
-         # Add remotes
-         from pixeltable.datatransfer.remote import MockRemote
-         v.link(
-             MockRemote('remote', {'int_field': pxt.IntType()}, {'str_field': pxt.StringType()}),
-             col_mapping={'test_udf': 'int_field', 'c1': 'str_field'}
+         self.__add_expr_columns(e, 'empty_view', include_expensive_functions=True)
+
+         # Add external stores
+         from pixeltable.io.external_store import MockProject
+         v._link_external_store(
+             MockProject.create(
+                 v,
+                 'project',
+                 {'int_field': pxt.IntType()},
+                 {'str_field': pxt.StringType()},
+                 {'view_test_udf': 'int_field', 'c1': 'str_field'}
+             )
          )
-         # We're just trying to test metadata here, so reach "under the covers" and link a fake
-         # Label Studio project without validation (so we don't need a real Label Studio server)
-         from pixeltable.datatransfer.label_studio import LabelStudioProject
-         v.tbl_version_path.tbl_version.link(
-             LabelStudioProject(4171780, media_import_method='file'),
-             col_mapping={'str_format': 'str_format'}
+         # We're just trying to test metadata here, so it's ok to link a false Label Studio project.
+         # We include a computed image column in order to ensure the creation of a stored proxy.
+         from pixeltable.io.label_studio import LabelStudioProject
+         col_mapping = Project.validate_columns(
+             v, {'str_field': pxt.StringType(), 'img_field': pxt.ImageType()}, {},
+             {'view_function_call': 'str_field', 'base_table_image_rot': 'img_field'}
          )
+         project = LabelStudioProject('ls_project_0', 4171780, media_import_method='file', col_mapping=col_mapping)
+         v._link_external_store(project)
+         # Sanity check that the stored proxy column did get created
+         assert len(project.stored_proxies) == 1
+         assert t.base_table_image_rot.col in project.stored_proxies
+
+     def __add_expr_columns(self, t: pxt.Table, col_prefix: str, include_expensive_functions=False) -> None:
+         def add_column(col_name: str, col_expr: Any) -> None:
+             t.add_column(**{f'{col_prefix}_{col_name}': col_expr})
+
+         # arithmetic_expr
+         add_column('plus', t.c2 + 6)
+         add_column('minus', t.c2 - 5)
+         add_column('times', t.c3 * 1.2)
+         add_column('div', t.c3 / 1.7)
+         add_column('mod', t.c2 % 11)
+
+         # array_slice
+         add_column('array_slice_1', t.c6[5])
+
+         # column_property_ref
+         add_column('fileurl', t.c8.fileurl)
+         add_column('localpath', t.c8.localpath)
+
+         # comparison
+         add_column('lt', t.c2 < t.c3)
+         add_column('le', t.c2 <= t.c3)
+         add_column('gt', t.c2 > t.c3)
+         add_column('ge', t.c2 >= t.c3)
+         add_column('ne', t.c2 != t.c3)
+         add_column('eq', t.c2 == t.c3)
+
+         # compound_predicate
+         add_column('and', (t.c2 >= 5) & (t.c2 < 8))
+         add_column('or', (t.c2 > 1) | t.c4)
+         add_column('not', ~(t.c2 > 20))
+
+         # function_call
+         add_column('function_call', pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1))  # library function
+         add_column('test_udf', test_udf_stored(t.c2))  # stored udf
+         add_column('test_udf_batched', test_udf_stored_batched(t.c1, upper=False))  # batched stored udf
+         if include_expensive_functions:
+             # batched library function
+             add_column('batched', pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32'))
+
+         # image_member_access
+         add_column('image_mode', t.c8.mode)
+         add_column('image_rot', t.c8.rotate(180))
+
+         # in_predicate
+         add_column('isin_1', t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
+         add_column('isin_2', t.c2.isin([1, 2, 3, 4, 5]))
+         add_column('isin_3', t.c2.isin(t.c6.f5))
+
+         # inline_array and inline_dict
+         add_column('inline_array_1', [[1, 2, 3], [4, 5, 6]])
+         add_column('inline_array_2', [['a', 'b', 'c'], ['d', 'e', 'f']])
+         add_column('inline_list_exprs', [t.c1, [t.c1n, t.c2]])
+         add_column('inline_list_mixed', [1, 'a', t.c1, [1, 'a', t.c1n], 1, 'a'])
+         add_column('inline_dict', {'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
+
+         # is_null
+         add_column('isnull', t.c1 == None)
+
+         # json_mapper and json_path
+         add_column('json_mapper', t.c6[3])
+         add_column('json_path', t.c6.f1)
+
+         # literal
+         add_column('str_const', 'str')
+         add_column('int_const', 5)
+         add_column('float_const', 5.0)
+         add_column('timestamp_const_1', datetime.datetime.utcnow())
+         add_column('timestamp_const_2', datetime.date.today())
+
+         # type_cast
+         add_column('astype', t.c2.astype(FloatType()))
+
+         # .apply
+         add_column('c2_to_string', t.c2.apply(str))
+         add_column('c6_to_string', t.c6.apply(json.dumps))
+         add_column('c6_back_to_json', t[f'{col_prefix}_c6_to_string'].apply(json.loads))
+
+         t.add_embedding_index(f'{col_prefix}_function_call', text_embed=embed_udf.clip_text_embed)
+
+         # query()
+         @t.query
+         def q1(i: int):
+             # this breaks; TODO: why?
+             #return t.where(t.c2 < i)
+             return t.where(t.c2 < i).select(t.c1, t.c2)
+         add_column('query_output', t.q1(t.c2))
+
+         @t.query
+         def q2(s: str):
+             sim = t[f'{col_prefix}_function_call'].similarity(s)
+             return t.order_by(sim, asc=False).select(t[f'{col_prefix}_function_call']).limit(5)
+         add_column('sim_output', t.q2(t.c1))


  @pxt.udf(_force_stored=True)
pixeltable/tool/embed_udf.py ADDED
@@ -0,0 +1,9 @@
+ import numpy as np
+
+ import pixeltable as pxt
+
+
+ # TODO This can go away once we have the ability to inline expr_udf's
+ @pxt.expr_udf
+ def clip_text_embed(txt: str) -> np.ndarray:
+     return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32')
pixeltable/type_system.py CHANGED
@@ -160,7 +160,7 @@ class ColumnType:
          if t == cls.Type.AUDIO:
              return AudioType()
          if t == cls.Type.DOCUMENT:
-             return AudioType()
+             return DocumentType()

      def __str__(self) -> str:
          return self._type.name.lower()
@@ -250,7 +250,6 @@ class ColumnType:
              return None
          return None

-
      @classmethod
      def from_python_type(cls, t: type) -> Optional[ColumnType]:
          if typing.get_origin(t) is typing.Union:
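
This fixes a copy-paste bug: deserializing a `DocumentType` silently produced an `AudioType()`. A minimal round-trip that exercises the patched branch, assuming the `as_dict()`/`from_dict()` serialization pair used elsewhere in this diff:

import pixeltable.type_system as ts

t = ts.DocumentType()
restored = ts.ColumnType.from_dict(t.as_dict())  # hits the Type.DOCUMENT branch above
assert isinstance(restored, ts.DocumentType)     # before 0.2.9 this came back as AudioType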