pixeltable 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of pixeltable has been flagged as potentially problematic.

Files changed (77)
  1. pixeltable/__init__.py +15 -33
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +1 -1
  4. pixeltable/catalog/column.py +29 -11
  5. pixeltable/catalog/dir.py +2 -2
  6. pixeltable/catalog/insertable_table.py +5 -55
  7. pixeltable/catalog/named_function.py +2 -2
  8. pixeltable/catalog/schema_object.py +2 -7
  9. pixeltable/catalog/table.py +307 -186
  10. pixeltable/catalog/table_version.py +109 -63
  11. pixeltable/catalog/table_version_path.py +28 -5
  12. pixeltable/catalog/view.py +20 -10
  13. pixeltable/dataframe.py +128 -25
  14. pixeltable/env.py +29 -18
  15. pixeltable/exec/exec_context.py +5 -0
  16. pixeltable/exec/exec_node.py +1 -0
  17. pixeltable/exec/in_memory_data_node.py +29 -24
  18. pixeltable/exec/sql_scan_node.py +1 -1
  19. pixeltable/exprs/column_ref.py +13 -8
  20. pixeltable/exprs/data_row.py +4 -0
  21. pixeltable/exprs/expr.py +16 -1
  22. pixeltable/exprs/function_call.py +4 -4
  23. pixeltable/exprs/row_builder.py +29 -20
  24. pixeltable/exprs/similarity_expr.py +4 -3
  25. pixeltable/ext/functions/yolox.py +2 -1
  26. pixeltable/func/__init__.py +1 -0
  27. pixeltable/func/aggregate_function.py +14 -12
  28. pixeltable/func/callable_function.py +8 -6
  29. pixeltable/func/expr_template_function.py +13 -19
  30. pixeltable/func/function.py +3 -6
  31. pixeltable/func/query_template_function.py +84 -0
  32. pixeltable/func/signature.py +68 -23
  33. pixeltable/func/udf.py +13 -10
  34. pixeltable/functions/__init__.py +6 -91
  35. pixeltable/functions/eval.py +26 -14
  36. pixeltable/functions/fireworks.py +25 -23
  37. pixeltable/functions/globals.py +62 -0
  38. pixeltable/functions/huggingface.py +20 -16
  39. pixeltable/functions/image.py +170 -1
  40. pixeltable/functions/openai.py +95 -128
  41. pixeltable/functions/string.py +10 -2
  42. pixeltable/functions/together.py +95 -84
  43. pixeltable/functions/util.py +16 -0
  44. pixeltable/functions/video.py +94 -16
  45. pixeltable/functions/whisper.py +78 -0
  46. pixeltable/globals.py +1 -1
  47. pixeltable/io/__init__.py +10 -0
  48. pixeltable/io/external_store.py +370 -0
  49. pixeltable/io/globals.py +51 -22
  50. pixeltable/io/label_studio.py +639 -0
  51. pixeltable/io/parquet.py +1 -1
  52. pixeltable/iterators/__init__.py +9 -0
  53. pixeltable/iterators/string.py +40 -0
  54. pixeltable/metadata/__init__.py +6 -8
  55. pixeltable/metadata/converters/convert_10.py +2 -4
  56. pixeltable/metadata/converters/convert_12.py +7 -2
  57. pixeltable/metadata/converters/convert_13.py +6 -8
  58. pixeltable/metadata/converters/convert_14.py +2 -4
  59. pixeltable/metadata/converters/convert_15.py +44 -0
  60. pixeltable/metadata/converters/convert_16.py +18 -0
  61. pixeltable/metadata/converters/util.py +66 -0
  62. pixeltable/metadata/schema.py +3 -3
  63. pixeltable/plan.py +8 -7
  64. pixeltable/store.py +1 -1
  65. pixeltable/tool/create_test_db_dump.py +147 -54
  66. pixeltable/tool/embed_udf.py +9 -0
  67. pixeltable/type_system.py +1 -2
  68. pixeltable/utils/code.py +34 -0
  69. {pixeltable-0.2.8.dist-info → pixeltable-0.2.9.dist-info}/METADATA +1 -1
  70. pixeltable-0.2.9.dist-info/RECORD +131 -0
  71. pixeltable/datatransfer/__init__.py +0 -1
  72. pixeltable/datatransfer/label_studio.py +0 -452
  73. pixeltable/datatransfer/remote.py +0 -85
  74. pixeltable/functions/pil/image.py +0 -147
  75. pixeltable-0.2.8.dist-info/RECORD +0 -124
  76. {pixeltable-0.2.8.dist-info → pixeltable-0.2.9.dist-info}/LICENSE +0 -0
  77. {pixeltable-0.2.8.dist-info → pixeltable-0.2.9.dist-info}/WHEEL +0 -0
pixeltable/iterators/string.py ADDED
@@ -0,0 +1,40 @@
+from typing import Iterator, Any
+
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+from pixeltable.env import Env
+from pixeltable.iterators.base import ComponentIterator
+
+
+class StringSplitter(ComponentIterator):
+    # TODO(aaron-siegel): Merge this with `DocumentSplitter` in order to provide additional capabilities.
+    def __init__(self, text: str, *, separators: str):
+        if separators != 'sentence':
+            raise excs.Error('Only `sentence` separators are currently supported.')
+        self._text = text
+        self.doc = Env.get().spacy_nlp(self._text)
+        self.iter = self._iter()
+
+    def _iter(self) -> Iterator[dict[str, Any]]:
+        for sentence in self.doc.sents:
+            yield {'text': sentence.text}
+
+    def __next__(self) -> dict[str, Any]:
+        return next(self.iter)
+
+    def close(self) -> None:
+        pass
+
+    def set_pos(self, pos: int) -> None:
+        pass
+
+    @classmethod
+    def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
+        return {
+            'text': ts.StringType(),
+            'separators': ts.StringType(),
+        }
+
+    @classmethod
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        return {'text': ts.StringType()}, []
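
A quick usage sketch, not part of the diff: StringSplitter implements the ComponentIterator protocol, so it should plug into a component view the same way the existing DocumentSplitter does. The table and column names below are hypothetical, and the import assumes the new pixeltable/iterators/__init__.py (+9 above) re-exports the class.

import pixeltable as pxt
from pixeltable.iterators import StringSplitter  # assumes the re-export added in __init__.py

docs = pxt.get_table('my_docs')  # hypothetical table with a string column `text`
# one output row per sentence, each with a single column `text`
sentences = pxt.create_view(
    'my_doc_sentences', docs,
    iterator=StringSplitter.create(text=docs.text, separators='sentence'))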
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 15
+VERSION = 17
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
@@ -26,13 +26,11 @@ def create_system_info(engine: sql.engine.Engine) -> None:
 # key: old schema version
 converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}
 
-def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
-    global converter_cbs
-    converter_cbs[version] = cb
-
-def noop_converter(engine: sql.engine.Engine) -> None:
-    # Converter to use when incrementing the schema version, but without any functional changes
-    pass
+def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
+    def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
+        global converter_cbs
+        converter_cbs[version] = fn
+    return decorator
 
 # load all converter modules
 for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
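
Registration is now decorator-based, as the converter diffs below show. For orientation, here is a minimal sketch of how such a registry is presumably consumed when a database with older metadata is opened; the function name upgrade_md and the error handling are illustrative, not pixeltable's actual upgrade code:

import sqlalchemy as sql

def upgrade_md(engine: sql.engine.Engine, md_version: int) -> None:
    # Run each registered converter in sequence until the stored metadata
    # reaches the current schema version (VERSION = 17 as of this release).
    while md_version < VERSION:
        if md_version not in converter_cbs:
            raise RuntimeError(f'No converter registered for metadata version {md_version}')
        converter_cbs[md_version](engine)  # converts md_version -> md_version + 1
        md_version += 1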
pixeltable/metadata/converters/convert_10.py CHANGED
@@ -4,7 +4,8 @@ from pixeltable.metadata.schema import Table, TableSchemaVersion
 from pixeltable.metadata import register_converter
 
 
-def convert_10(engine: sql.engine.Engine) -> None:
+@register_converter(version=10)
+def _(engine: sql.engine.Engine) -> None:
     default_table_attrs = {"comment": None, "num_retained_versions": 10}
     with engine.begin() as conn:
         # Because `parameters` wasn't actually used for anything,
@@ -13,6 +14,3 @@ def convert_10(engine: sql.engine.Engine) -> None:
         # Add `table_attrs` to all instances of tableschemaversions.md.
         conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
     return
-
-
-register_converter(10, convert_10)
pixeltable/metadata/converters/convert_12.py CHANGED
@@ -1,3 +1,8 @@
-from pixeltable.metadata import register_converter, noop_converter
+import sqlalchemy as sql
 
-register_converter(12, noop_converter)
+from pixeltable.metadata import register_converter
+
+
+@register_converter(version=12)
+def _(engine: sql.engine.Engine) -> None:
+    pass
pixeltable/metadata/converters/convert_13.py CHANGED
@@ -9,12 +9,13 @@ from pixeltable.metadata.schema import Table
 _logger = logging.getLogger('pixeltable')
 
 
-def convert_13(engine: sql.engine.Engine) -> None:
+@register_converter(version=13)
+def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Table)):
             id = row[0]
             md = row[2]
-            updated_md = _update_md(md)
+            updated_md = __update_md(md)
             if updated_md != md:
                 _logger.info(f'Updating schema for table: {id}')
                 conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
@@ -23,19 +24,16 @@ def convert_13(engine: sql.engine.Engine) -> None:
 # Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
 # `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
 # so this is all we need to do.
-def _update_md(md: Any) -> Any:
+def __update_md(md: Any) -> Any:
     if isinstance(md, dict):
         updated_md = {}
         for k, v in md.items():
             if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
                 updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
             else:
-                updated_md[k] = _update_md(v)
+                updated_md[k] = __update_md(v)
         return updated_md
     elif isinstance(md, list):
-        return [_update_md(v) for v in md]
+        return [__update_md(v) for v in md]
     else:
         return md
-
-
-register_converter(13, convert_13)
pixeltable/metadata/converters/convert_14.py CHANGED
@@ -4,10 +4,8 @@ from pixeltable.metadata.schema import Table
 from pixeltable.metadata import register_converter
 
 
-def convert_14(engine: sql.engine.Engine) -> None:
+@register_converter(version=14)
+def _(engine: sql.engine.Engine) -> None:
     default_remotes = {'remotes': []}
     with engine.begin() as conn:
         conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
-
-
-register_converter(14, convert_14)
pixeltable/metadata/converters/convert_15.py ADDED
@@ -0,0 +1,44 @@
+
+import inspect
+import logging
+from typing import Any
+
+import cloudpickle
+import sqlalchemy as sql
+
+import pixeltable.func as func
+import pixeltable.type_system as ts
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Function
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=15)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Function)):
+            id, dir_id, md, binary_obj = row
+            md['md'] = __update_md(md['md'], binary_obj)
+            _logger.info(f'Updating function: {id}')
+            conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
+
+
+def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
+    # construct dict produced by CallableFunction.to_store()
+    py_fn = cloudpickle.loads(binary_obj)
+    py_params = inspect.signature(py_fn).parameters
+    return_type = ts.ColumnType.from_dict(orig_d['return_type'])
+    params: list[func.Parameter] = []
+    for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
+        col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
+        default = py_params[name].default
+        kind = inspect._ParameterKind(kind_int)  # is there a way to avoid referencing a private type?
+        params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
+    is_batched = 'batch_size' in orig_d
+    sig = func.Signature(return_type, params, is_batched=is_batched)
+    d = {
+        'signature': sig.as_dict(),
+        'batch_size': orig_d['batch_size'] if is_batched else None,
+    }
+    return d
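
On the question in the comment above (avoiding the private inspect._ParameterKind): the five parameter kinds are also exposed as public attributes of inspect.Parameter, so a small lookup table recovers a kind from its serialized int without naming the private enum. A stdlib-only sketch, not part of the diff:

import inspect

# Public aliases for the five parameter kinds, keyed by their stable int values.
_KINDS = {
    k.value: k
    for k in (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.VAR_POSITIONAL,
        inspect.Parameter.KEYWORD_ONLY,
        inspect.Parameter.VAR_KEYWORD,
    )
}

def kind_from_int(kind_int: int):
    # Map a serialized kind int back to an inspect parameter kind.
    return _KINDS[kind_int]

assert kind_from_int(1) is inspect.Parameter.POSITIONAL_OR_KEYWORD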
pixeltable/metadata/converters/convert_16.py ADDED
@@ -0,0 +1,18 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=16)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(
+        engine,
+        table_md_updater=__update_table_md
+    )
+
+
+def __update_table_md(table_md: dict) -> None:
+    # External stores are not migratable; just drop them
+    del table_md['remotes']
+    table_md['external_stores'] = {}
pixeltable/metadata/converters/util.py ADDED
@@ -0,0 +1,66 @@
+import copy
+import logging
+from typing import Any, Callable, Optional
+
+import sqlalchemy as sql
+
+from pixeltable.metadata.schema import Table
+
+__logger = logging.getLogger('pixeltable')
+
+
+def convert_table_md(
+    engine: sql.engine.Engine,
+    table_md_updater: Optional[Callable[[dict], None]] = None,
+    column_md_updater: Optional[Callable[[dict], None]] = None,
+    external_store_md_updater: Optional[Callable[[dict], None]] = None,
+    substitution_fn: Optional[Callable[[Any, Any], Optional[tuple[Any, Any]]]] = None
+) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Table)):
+            id = row[0]
+            table_md = row[2]
+            assert isinstance(table_md, dict)
+            updated_table_md = copy.deepcopy(table_md)
+            if table_md_updater is not None:
+                table_md_updater(updated_table_md)
+            if column_md_updater is not None:
+                __update_column_md(updated_table_md, column_md_updater)
+            if external_store_md_updater is not None:
+                __update_external_store_md(updated_table_md, external_store_md_updater)
+            if substitution_fn is not None:
+                updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
+            if updated_table_md != table_md:
+                __logger.info(f'Updating schema for table: {id}')
+                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
+
+
+def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
+    columns_md = table_md['column_md']
+    assert isinstance(columns_md, dict)
+    for column_md in columns_md.values():
+        column_md_updater(column_md)
+
+
+def __update_external_store_md(table_md: dict, external_store_md_updater: Callable[[dict], None]) -> None:
+    stores_md = table_md['external_stores']
+    assert isinstance(stores_md, list)
+    for store_md in stores_md:
+        external_store_md_updater(store_md)
+
+
+def __substitute_md_rec(md: Any, substitution_fn: Callable[[Any, Any], Optional[tuple[Any, Any]]]) -> Any:
+    if isinstance(md, dict):
+        updated_md = {}
+        for k, v in md.items():
+            substitute = substitution_fn(k, v)
+            if substitute is not None:
+                updated_k, updated_v = substitute
+                updated_md[updated_k] = updated_v
+            else:
+                updated_md[k] = __substitute_md_rec(v, substitution_fn)
+        return updated_md
+    elif isinstance(md, list):
+        return [__substitute_md_rec(v, substitution_fn) for v in md]
+    else:
+        return md
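
An illustrative converter built on this helper; the version number and classpaths are hypothetical. substitution_fn returns None to keep recursing into a value, or a (key, value) pair to substitute it and stop recursing:

import sqlalchemy as sql

from pixeltable.metadata import register_converter
from pixeltable.metadata.converters.util import convert_table_md


@register_converter(version=99)  # hypothetical version
def _(engine: sql.engine.Engine) -> None:
    convert_table_md(engine, substitution_fn=_rename_classpath)


def _rename_classpath(k, v):
    # Rename a serialized classpath wherever it appears in the table md
    # (both names below are made up for the example).
    if k == '_classpath' and v == 'pixeltable.old_module.OldClass':
        return k, 'pixeltable.new_module.NewClass'
    return None  # no substitution; convert_table_md recurses into v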
pixeltable/metadata/schema.py CHANGED
@@ -142,9 +142,9 @@ class TableMd:
     # - every row is assigned a unique and immutable rowid on insertion
     next_row_id: int
 
-    # Metadata format for remotes:
-    # {'class': 'pixeltable.datatransfer.LabelStudioProject', 'md': {'project_id': 3}}
-    remotes: list[dict[str, Any]]
+    # Metadata format for external stores:
+    # {'class': 'pixeltable.io.label_studio.LabelStudioProject', 'md': {'project_id': 3}}
+    external_stores: list[dict[str, Any]]
 
     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
pixeltable/plan.py CHANGED
@@ -217,15 +217,15 @@ class Planner:
         plan = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
 
         media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
+        if len(media_input_cols) > 0:
+            # prefetch external files for all input column refs for validation
+            plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
+            plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
 
-        # prefetch external files for all input column refs for validation
-        plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
-        plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
-
-        computed_exprs = row_builder.default_eval_ctx.target_exprs
+        computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
         if len(computed_exprs) > 0:
             # add an ExprEvalNode when there are exprs to compute
-            plan = exec.ExprEvalNode(row_builder, computed_exprs, [], input=plan)
+            plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)
 
         plan.set_stored_img_cols(stored_img_col_info)
         plan.set_ctx(
@@ -355,7 +355,8 @@ class Planner:
         # - we can ignore stored non-computed columns because they have a default value that is supplied directly by
         #   the store
         target = view.tbl_version  # the one we need to populate
-        stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
+        # stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
+        stored_cols = [c for c in target.cols if c.is_stored]
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
 
pixeltable/store.py CHANGED
@@ -263,7 +263,7 @@ class StoreBase:
         number of inserted rows, number of exceptions, set of column ids that have exceptions
         """
         assert v_min is not None
-        exec_plan.ctx.conn = conn
+        exec_plan.ctx.set_conn(conn)
         batch_size = 16  # TODO: is this a good batch size?
         # TODO: total?
         num_excs = 0
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import pathlib
 import subprocess
+from typing import Any
 
 import pgserver
 import toml
@@ -12,8 +13,10 @@ import pixeltable as pxt
 import pixeltable.metadata as metadata
 from pixeltable.env import Env
 from pixeltable.func import Batch
+from pixeltable.io.external_store import Project
+from pixeltable.tool import embed_udf
 from pixeltable.type_system import \
-    StringType, IntType, FloatType, BoolType, TimestampType, JsonType
+    StringType, IntType, FloatType, BoolType, TimestampType, JsonType, ImageType
 
 _logger = logging.getLogger('pixeltable')
 
@@ -30,6 +33,8 @@ class Dumper:
         os.environ['PIXELTABLE_DB'] = db_name
         os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')
 
+        Env._init_env(reinit_db=True)
+
         Env.get().configure_logging(level=logging.DEBUG, to_stdout=True)
 
     def dump_db(self) -> None:
@@ -62,8 +67,7 @@ class Dumper:
         with open(info_file, 'w') as info:
             toml.dump(info_dict, info)
 
-    # TODO: Add additional features to the test DB dump (ideally it should exercise
-    # every major pixeltable DB feature)
+    # Expression types, predicate types, embedding indices, views on views
     def create_tables(self) -> None:
         schema = {
             'c1': StringType(nullable=False),
@@ -74,29 +78,11 @@ class Dumper:
             'c5': TimestampType(nullable=False),
             'c6': JsonType(nullable=False),
             'c7': JsonType(nullable=False),
+            'c8': ImageType(nullable=True)
         }
-        t = pxt.create_table('sample_table', schema, primary_key='c2')
-
-        # Add columns for InlineArray and InlineDict
-        t.add_column(c8=[[1, 2, 3], [4, 5, 6]])
-        t.add_column(c9=[['a', 'b', 'c'], ['d', 'e', 'f']])
-        t.add_column(c10=[t.c1, [t.c1n, t.c2]])
-        t.add_column(c11={'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
-
-        # InPredicate
-        t.add_column(isin_1=t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
-        t.add_column(isin_2=t.c2.isin([1, 2, 3, 4, 5]))
-        t.add_column(isin_3=t.c2.isin(t.c6.f5))
-
-        # Add columns for .astype converters to ensure they're persisted properly
-        t.add_column(c2_as_float=t.c2.astype(FloatType()))
-
-        # Add columns for .apply
-        t.add_column(c2_to_string=t.c2.apply(str))
-        t.add_column(c6_to_string=t.c6.apply(json.dumps))
-        t.add_column(c6_back_to_json=t.c6_to_string.apply(json.loads))
+        t = pxt.create_table('base_table', schema, primary_key='c2')
 
-        num_rows = 100
+        num_rows = 20
         d1 = {
             'f1': 'test string 1',
             'f2': 1,
@@ -115,9 +101,8 @@ class Dumper:
         c3_data = [float(i) for i in range(num_rows)]
        c4_data = [bool(i % 2) for i in range(num_rows)]
        c5_data = [datetime.datetime.now()] * num_rows
-        c6_data = []
-        for i in range(num_rows):
-            d = {
+        c6_data = [
+            {
                'f1': f'test string {i}',
                'f2': i,
                'f3': float(i),
@@ -128,8 +113,8 @@ class Dumper:
                    'f8': [1.0, 2.0, 3.0, 4.0],
                },
            }
-            c6_data.append(d)
-
+            for i in range(num_rows)
+        ]
         c7_data = [d2] * num_rows
         rows = [
             {
@@ -141,40 +126,148 @@ class Dumper:
                 'c5': c5_data[i],
                 'c6': c6_data[i],
                 'c7': c7_data[i],
+                'c8': None
             }
             for i in range(num_rows)
         ]
+
+        self.__add_expr_columns(t, 'base_table')
         t.insert(rows)
+
         pxt.create_dir('views')
-        v = pxt.create_view('views.sample_view', t, filter=(t.c2 < 50))
-        _ = pxt.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+
+        # simple view
+        v = pxt.create_view('views.view', t, filter=(t.c2 < 50))
+        self.__add_expr_columns(v, 'view')
+
+        # snapshot
+        _ = pxt.create_view('views.snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+
+        # view of views
+        vv = pxt.create_view('views.view_of_views', v, filter=(t.c2 >= 25))
+        self.__add_expr_columns(vv, 'view_of_views')
+
+        # empty view
         e = pxt.create_view('views.empty_view', t, filter=t.c2 == 4171780)
         assert e.count() == 0
-        # Computed column using a library function
-        v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
-        # Computed column using a bespoke stored udf
-        v['test_udf'] = test_udf_stored(t.c2)
-        # Computed column using a batched function
-        # (apply this to the empty view, since it's a "heavyweight" function)
-        e['batched'] = pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32')
-        # computed column using a stored batched function
-        v['test_udf_batched'] = test_udf_stored_batched(t.c1, upper=False)
-        # astype
-        v['astype'] = t.c1.astype(pxt.FloatType())
-
-        # Add remotes
-        from pixeltable.datatransfer.remote import MockRemote
-        v.link_remote(
-            MockRemote({'int_field': pxt.IntType()}, {'str_field': pxt.StringType()}),
-            col_mapping={'test_udf': 'int_field', 'c1': 'str_field'}
+        self.__add_expr_columns(e, 'empty_view', include_expensive_functions=True)
+
+        # Add external stores
+        from pixeltable.io.external_store import MockProject
+        v._link_external_store(
+            MockProject.create(
+                v,
+                'project',
+                {'int_field': pxt.IntType()},
+                {'str_field': pxt.StringType()},
+                {'view_test_udf': 'int_field', 'c1': 'str_field'}
+            )
         )
-        # We're just trying to test metadata here, so reach "under the covers" and link a fake
-        # Label Studio project without validation (so we don't need a real Label Studio server)
-        from pixeltable.datatransfer.label_studio import LabelStudioProject
-        v.tbl_version_path.tbl_version.link(
-            LabelStudioProject(4171780),
-            col_mapping={'str_format': 'str_format'}
+        # We're just trying to test metadata here, so it's ok to link a false Label Studio project.
+        # We include a computed image column in order to ensure the creation of a stored proxy.
+        from pixeltable.io.label_studio import LabelStudioProject
+        col_mapping = Project.validate_columns(
+            v, {'str_field': pxt.StringType(), 'img_field': pxt.ImageType()}, {},
+            {'view_function_call': 'str_field', 'base_table_image_rot': 'img_field'}
         )
+        project = LabelStudioProject('ls_project_0', 4171780, media_import_method='file', col_mapping=col_mapping)
+        v._link_external_store(project)
+        # Sanity check that the stored proxy column did get created
+        assert len(project.stored_proxies) == 1
+        assert t.base_table_image_rot.col in project.stored_proxies
+
+    def __add_expr_columns(self, t: pxt.Table, col_prefix: str, include_expensive_functions=False) -> None:
+        def add_column(col_name: str, col_expr: Any) -> None:
+            t.add_column(**{f'{col_prefix}_{col_name}': col_expr})
+
+        # arithmetic_expr
+        add_column('plus', t.c2 + 6)
+        add_column('minus', t.c2 - 5)
+        add_column('times', t.c3 * 1.2)
+        add_column('div', t.c3 / 1.7)
+        add_column('mod', t.c2 % 11)
+
+        # array_slice
+        add_column('array_slice_1', t.c6[5])
+
+        # column_property_ref
+        add_column('fileurl', t.c8.fileurl)
+        add_column('localpath', t.c8.localpath)
+
+        # comparison
+        add_column('lt', t.c2 < t.c3)
+        add_column('le', t.c2 <= t.c3)
+        add_column('gt', t.c2 > t.c3)
+        add_column('ge', t.c2 >= t.c3)
+        add_column('ne', t.c2 != t.c3)
+        add_column('eq', t.c2 == t.c3)
+
+        # compound_predicate
+        add_column('and', (t.c2 >= 5) & (t.c2 < 8))
+        add_column('or', (t.c2 > 1) | t.c4)
+        add_column('not', ~(t.c2 > 20))
+
+        # function_call
+        add_column('function_call', pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1))  # library function
+        add_column('test_udf', test_udf_stored(t.c2))  # stored udf
+        add_column('test_udf_batched', test_udf_stored_batched(t.c1, upper=False))  # batched stored udf
+        if include_expensive_functions:
+            # batched library function
+            add_column('batched', pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32'))
+
+        # image_member_access
+        add_column('image_mode', t.c8.mode)
+        add_column('image_rot', t.c8.rotate(180))
+
+        # in_predicate
+        add_column('isin_1', t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
+        add_column('isin_2', t.c2.isin([1, 2, 3, 4, 5]))
+        add_column('isin_3', t.c2.isin(t.c6.f5))
+
+        # inline_array and inline_dict
+        add_column('inline_array_1', [[1, 2, 3], [4, 5, 6]])
+        add_column('inline_array_2', [['a', 'b', 'c'], ['d', 'e', 'f']])
+        add_column('inline_list_exprs', [t.c1, [t.c1n, t.c2]])
+        add_column('inline_list_mixed', [1, 'a', t.c1, [1, 'a', t.c1n], 1, 'a'])
+        add_column('inline_dict', {'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
+
+        # is_null
+        add_column('isnull', t.c1 == None)
+
+        # json_mapper and json_path
+        add_column('json_mapper', t.c6[3])
+        add_column('json_path', t.c6.f1)
+
+        # literal
+        add_column('str_const', 'str')
+        add_column('int_const', 5)
+        add_column('float_const', 5.0)
+        add_column('timestamp_const_1', datetime.datetime.utcnow())
+        add_column('timestamp_const_2', datetime.date.today())
+
+        # type_cast
+        add_column('astype', t.c2.astype(FloatType()))
+
+        # .apply
+        add_column('c2_to_string', t.c2.apply(str))
+        add_column('c6_to_string', t.c6.apply(json.dumps))
+        add_column('c6_back_to_json', t[f'{col_prefix}_c6_to_string'].apply(json.loads))
+
+        t.add_embedding_index(f'{col_prefix}_function_call', text_embed=embed_udf.clip_text_embed)
+
+        # query()
+        @t.query
+        def q1(i: int):
+            # this breaks; TODO: why?
+            # return t.where(t.c2 < i)
+            return t.where(t.c2 < i).select(t.c1, t.c2)
+        add_column('query_output', t.q1(t.c2))
+
+        @t.query
+        def q2(s: str):
+            sim = t[f'{col_prefix}_function_call'].similarity(s)
+            return t.order_by(sim, asc=False).select(t[f'{col_prefix}_function_call']).limit(5)
+        add_column('sim_output', t.q2(t.c1))
 
 
 @pxt.udf(_force_stored=True)
pixeltable/tool/embed_udf.py ADDED
@@ -0,0 +1,9 @@
+import numpy as np
+
+import pixeltable as pxt
+
+
+# TODO This can go away once we have the ability to inline expr_udf's
+@pxt.expr_udf
+def clip_text_embed(txt: str) -> np.ndarray:
+    return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32')
pixeltable/type_system.py CHANGED
@@ -160,7 +160,7 @@ class ColumnType:
         if t == cls.Type.AUDIO:
             return AudioType()
         if t == cls.Type.DOCUMENT:
-            return AudioType()
+            return DocumentType()
 
     def __str__(self) -> str:
         return self._type.name.lower()
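
The one-line change above fixes a copy-paste bug: the DOCUMENT branch of the type factory returned an AudioType. A minimal check of the fix; the classmethod name make_type is assumed here, since the hunk doesn't show the enclosing def:

import pixeltable.type_system as ts

t = ts.ColumnType.make_type(ts.ColumnType.Type.DOCUMENT)  # factory name assumed
assert isinstance(t, ts.DocumentType)  # was AudioType prior to 0.2.9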
@@ -250,7 +250,6 @@ class ColumnType:
             return None
         return None
 
-
     @classmethod
     def from_python_type(cls, t: type) -> Optional[ColumnType]:
         if typing.get_origin(t) is typing.Union: