pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (110)
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/metadata/converters/util.py ADDED
@@ -0,0 +1,63 @@
+ import copy
+ import logging
+ from typing import Any, Callable, Optional
+
+ import sqlalchemy as sql
+
+ from pixeltable.metadata.schema import Table
+
+ __logger = logging.getLogger('pixeltable')
+
+
+ def convert_table_md(
+         engine: sql.engine.Engine,
+         column_md_updater: Optional[Callable[[dict], None]] = None,
+         remote_md_updater: Optional[Callable[[dict], None]] = None,
+         substitution_fn: Optional[Callable[[Any, Any], Optional[tuple[Any, Any]]]] = None
+ ) -> None:
+     with engine.begin() as conn:
+         for row in conn.execute(sql.select(Table)):
+             id = row[0]
+             table_md = row[2]
+             assert isinstance(table_md, dict)
+             updated_table_md = copy.deepcopy(table_md)
+             if column_md_updater is not None:
+                 __update_column_md(updated_table_md, column_md_updater)
+             if remote_md_updater is not None:
+                 __update_remote_md(updated_table_md, remote_md_updater)
+             if substitution_fn is not None:
+                 updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
+             if updated_table_md != table_md:
+                 __logger.info(f'Updating schema for table: {id}')
+                 conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
+
+
+ def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
+     columns_md = table_md['column_md']
+     assert isinstance(columns_md, dict)
+     for column_md in columns_md.values():
+         column_md_updater(column_md)
+
+
+ def __update_remote_md(table_md: dict, remote_md_updater: Callable[[dict], None]) -> None:
+     remotes_md = table_md['remotes']
+     assert isinstance(remotes_md, list)
+     for remote_md in remotes_md:
+         remote_md_updater(remote_md)
+
+
+ def __substitute_md_rec(md: Any, substitution_fn: Callable[[Any, Any], Optional[tuple[Any, Any]]]) -> Any:
+     if isinstance(md, dict):
+         updated_md = {}
+         for k, v in md.items():
+             substitute = substitution_fn(k, v)
+             if substitute is not None:
+                 updated_k, updated_v = substitute
+                 updated_md[updated_k] = updated_v
+             else:
+                 updated_md[k] = __substitute_md_rec(v, substitution_fn)
+         return updated_md
+     elif isinstance(md, list):
+         return [__substitute_md_rec(v, substitution_fn) for v in md]
+     else:
+         return md
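
The new util.py gives future schema-version converters one shared entry point for rewriting stored table metadata. A hypothetical sketch of a converter built on it follows; the register_converter hook and the specific update applied are assumptions for illustration, not code taken from this diff.

import sqlalchemy as sql

from pixeltable.metadata import register_converter  # assumed registration decorator
from pixeltable.metadata.converters.util import convert_table_md


def _ensure_md_field(remote_md: dict) -> None:
    # illustrative update: guarantee every remote entry carries an 'md' sub-dict
    remote_md.setdefault('md', {})


@register_converter(version=15)  # hypothetical version number
def _(engine: sql.engine.Engine) -> None:
    # convert_table_md() deep-copies each Table.md, applies the updaters, and
    # only writes back rows whose metadata actually changed
    convert_table_md(engine, remote_md_updater=_ensure_md_field)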
pixeltable/metadata/schema.py CHANGED
@@ -1,12 +1,11 @@
- from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
- import platform
- import uuid
  import dataclasses
+ import uuid
+ from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union

  import sqlalchemy as sql
- from sqlalchemy import Integer, String, Boolean, BigInteger, LargeBinary
+ from sqlalchemy import ForeignKey
+ from sqlalchemy import Integer, BigInteger, LargeBinary
  from sqlalchemy.dialects.postgresql import UUID, JSONB
- from sqlalchemy import ForeignKey, UniqueConstraint, ForeignKeyConstraint
  from sqlalchemy.orm import declarative_base

  Base = declarative_base()
@@ -93,6 +92,9 @@ class ColumnMd:
      # if True, the column is present in the stored table
      stored: Optional[bool]

+     # if specified, the column is a stored proxy of another column
+     proxy_base: Optional[int]
+

  @dataclasses.dataclass
  class IndexMd:
@@ -143,6 +145,10 @@ class TableMd:
      # - every row is assigned a unique and immutable rowid on insertion
      next_row_id: int

+     # Metadata format for remotes:
+     # {'class': 'pixeltable.datatransfer.LabelStudioProject', 'md': {'project_id': 3}}
+     remotes: list[dict[str, Any]]
+
      column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
      index_md: dict[int, IndexMd]  # index_id -> IndexMd
      view_md: Optional[ViewMd]
@@ -160,7 +166,7 @@ class Table(Base):

      MAX_VERSION = 9223372036854775807  # 2^63 - 1

-     id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
+     id = sql.Column(UUID(as_uuid=True), primary_key=True, nullable=False)
      dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
      md = sql.Column(JSONB, nullable=False)  # TableMd
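
For reference, a single entry in the new TableMd.remotes list deserializes to a plain dict shaped like the sketch below (the format is taken from the comment in the dataclass; how the named class is reconstituted from 'md' is not shown in this diff).

remote_md = {
    'class': 'pixeltable.datatransfer.LabelStudioProject',
    'md': {'project_id': 3},
}
# a consumer would plausibly import the class named in 'class' and rebuild it
# from the 'md' payload; convert_15.py presumably back-fills this field for
# tables created before 0.2.7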
 
pixeltable/plan.py CHANGED
@@ -60,25 +60,10 @@ class Analyzer:
          # filter predicate applied to output rows of the SQL scan
          self.filter: Optional[exprs.Predicate] = None
          # not executable
-         self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
+         #self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
          if where_clause is not None:
              where_clause_conjuncts, self.filter = where_clause.split_conjuncts(lambda e: e.sql_expr() is not None)
              self.sql_where_clause = exprs.CompoundPredicate.make_conjunction(where_clause_conjuncts)
-             if self.filter is not None:
-                 similarity_clauses, self.filter = self.filter.split_conjuncts(
-                     lambda e: isinstance(e, exprs.ImageSimilarityPredicate))
-                 if len(similarity_clauses) > 1:
-                     raise excs.Error(f'More than one nearest() not supported')
-                 if len(similarity_clauses) == 1:
-                     if len(self.order_by_clause) > 0:
-                         raise excs.Error((
-                             f'nearest() returns results in order of proximity and cannot be used in conjunction with '
-                             f'order_by()'))
-                     self.similarity_clause = similarity_clauses[0]
-                     img_col = self.similarity_clause.img_col_ref.col
-                     indexed_col_ids = {info.col.id for info in tbl.tbl_version.idxs_by_name.values()}
-                     if img_col.id not in indexed_col_ids:
-                         raise excs.Error(f'nearest() not available for unindexed column {img_col.name}')

          # all exprs that are evaluated in Python; not executable
          self.all_exprs = self.select_list.copy()
@@ -204,8 +189,6 @@
          refd_tbl_ids: Set[UUID] = set()
          if where_clause is not None:
              analyzer = cls.analyze(tbl, where_clause)
-             if analyzer.similarity_clause is not None:
-                 raise excs.Error('nearest() cannot be used with count()')
              if analyzer.filter is not None:
                  raise excs.Error(f'Filter {analyzer.filter} not expressible in SQL')
              clause_element = analyzer.sql_where_clause.sql_expr()
@@ -268,7 +251,7 @@
          Returns:
              - root node of the plan
              - list of qualified column names that are getting updated
-             - list of columns that are being recomputed
+             - list of user-visible columns that are being recomputed
          """
          # retrieve all stored cols and all target exprs
          assert isinstance(tbl, catalog.TableVersionPath)
@@ -277,7 +260,10 @@
          if len(recompute_targets) > 0:
              recomputed_cols = recompute_targets.copy()
          else:
-             recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else {}
+             recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
+             # regardless of cascade, we need to update all indices on any updated column
+             idx_val_cols = target.get_idx_val_columns(updated_cols)
+             recomputed_cols.update(idx_val_cols)
          # we only need to recompute stored columns (unstored ones are substituted away)
          recomputed_cols = {c for c in recomputed_cols if c.is_stored}
          recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
@@ -290,8 +276,8 @@
          recomputed_exprs = \
              [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
          # recomputed cols reference the new values of the updated cols
-         for col, e in update_targets.items():
-             exprs.Expr.list_substitute(recomputed_exprs, exprs.ColumnRef(col), e)
+         spec = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
+         exprs.Expr.list_substitute(recomputed_exprs, spec)
          select_list.extend(recomputed_exprs)

          # we need to retrieve the PK columns of the existing rows
@@ -299,7 +285,8 @@
          all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols)  # same order as select_list
          # update row builder with column information
          [plan.row_builder.add_table_column(col, select_list[i].slot_idx) for i, col in enumerate(all_base_cols)]
-         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + list(recomputed_cols)], list(recomputed_cols)
+         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
+         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols

      @classmethod
      def create_view_update_plan(
@@ -570,7 +557,7 @@
          sql_select_list = analyzer.sql_exprs.copy()
          plan = exec.SqlScanNode(
              tbl, row_builder, select_list=sql_select_list, where_clause=analyzer.sql_where_clause,
-             filter=analyzer.filter, similarity_clause=analyzer.similarity_clause, order_by_items=order_by_items,
+             filter=analyzer.filter, order_by_items=order_by_items,
              limit=sql_limit, set_pk=with_pk, exact_version_only=exact_version_only)
          plan = cls._insert_prefetch_node(tbl.tbl_version.id, analyzer.select_list, row_builder, plan)
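Taken together, these hunks strip the old nearest() machinery out of query planning; per the file list, its replacement lives in the new exprs/similarity_expr.py and the expanded index/embedding_index.py, so the planner no longer special-cases similarity at all. A sketch of what the replacement usage might look like; the method names here are inferred from those file names and are not shown in this diff.

import pixeltable as pxt

t = pxt.create_table('imgs', {'img': pxt.ImageType()})
# an embedding index is still required, as it was for nearest():
# t.add_embedding_index('img', ...)              # assumed name
# sim = t.img.similarity('a red bicycle')        # SimilarityExpr, not a Predicate
# t.order_by(sim, asc=False).limit(5).collect()  # ordering is now explicit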
 
pixeltable/store.py CHANGED
@@ -66,7 +66,6 @@ class StoreBase:
          """Create self.sa_tbl from self.tbl_version."""
          system_cols = self._create_system_columns()
          all_cols = system_cols.copy()
-         idxs: List[sql.Index] = []
          for col in [c for c in self.tbl_version.cols if c.is_stored]:
              # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
              # to the last sql.Table version we created and cannot be reused
@@ -76,26 +75,18 @@
                  all_cols.append(col.sa_errormsg_col)
                  all_cols.append(col.sa_errortype_col)

-             # we create an index for:
-             # - scalar columns (except for strings, because long strings can't be used for B-tree indices)
-             # - non-computed video and image columns (they will contain external paths/urls that users might want to
-             #   filter on)
-             if (col.col_type.is_scalar_type() and not col.col_type.is_string_type()) \
-                     or (col.col_type.is_media_type() and not col.is_computed):
-                 # index names need to be unique within the Postgres instance
-                 idx_name = f'idx_{col.id}_{self.tbl_version.id.hex}'
-                 idxs.append(sql.Index(idx_name, col.sa_col))
-
          if self.sa_tbl is not None:
              # if we're called in response to a schema change, we need to remove the old table first
              self.sa_md.remove(self.sa_tbl)

+         idxs: List[sql.Index] = []
          # index for all system columns:
          # - base x view joins can be executed as merge joins
          # - speeds up ORDER BY rowid DESC
          # - allows filtering for a particular table version in index scan
          idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, *system_cols))
+
          # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
          idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using='brin'))
@@ -201,10 +192,10 @@
              if col.records_errors:
                  # we also need to create the errormsg and errortype storage cols
                  stmt = (f'ALTER TABLE {self._storage_name()} '
-                         f'ADD COLUMN {col.errormsg_store_name()} {StringType().to_sql()} DEFAULT NULL')
+                         f'ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL')
                  conn.execute(sql.text(stmt))
                  stmt = (f'ALTER TABLE {self._storage_name()} '
-                         f'ADD COLUMN {col.errortype_store_name()} {StringType().to_sql()} DEFAULT NULL')
+                         f'ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL')
                  conn.execute(sql.text(stmt))
                  added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
          self.create_sa_tbl()
@@ -264,7 +255,8 @@
          return num_excs

      def insert_rows(
-             self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None
+             self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
+             show_progress: bool = True
      ) -> Tuple[int, int, Set[int]]:
          """Insert rows into the store table and update the catalog table's md
          Returns:
@@ -293,15 +285,16 @@
                          self._create_table_row(row, row_builder, media_cols, cols_with_excs, v_min=v_min)
                      num_excs += num_row_exc
                      table_rows.append(table_row)
-                     if progress_bar is None:
-                         warnings.simplefilter("ignore", category=TqdmWarning)
-                         progress_bar = tqdm(
-                             desc=f'Inserting rows into `{self.tbl_version.name}`',
-                             unit=' rows',
-                             ncols=100,
-                             file=sys.stdout
-                         )
-                     progress_bar.update(1)
+                     if show_progress:
+                         if progress_bar is None:
+                             warnings.simplefilter("ignore", category=TqdmWarning)
+                             progress_bar = tqdm(
+                                 desc=f'Inserting rows into `{self.tbl_version.name}`',
+                                 unit=' rows',
+                                 ncols=100,
+                                 file=sys.stdout
+                             )
+                         progress_bar.update(1)
              self._move_tmp_media_files(table_rows, media_cols, v_min)
              conn.execute(sql.insert(self.sa_tbl), table_rows)
              if progress_bar is not None:
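
Two details worth noting: per-user-column B-tree indexes are no longer created implicitly (indexing is now explicit via the new index/ package, including btree.py), and the error-column DDL inlines VARCHAR now that ColumnType.to_sql() is gone (see type_system.py below). A sketch of the statement this code emits, with hypothetical values standing in for self._storage_name() and col.errormsg_store_name():

storage_name = 'tbl_0a1b2c3d'    # hypothetical
errormsg_col = 'col_7_errormsg'  # hypothetical
stmt = f'ALTER TABLE {storage_name} ADD COLUMN {errormsg_col} VARCHAR DEFAULT NULL'
# executed as: conn.execute(sql.text(stmt))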
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -11,6 +11,7 @@ import toml
  import pixeltable as pxt
  import pixeltable.metadata as metadata
  from pixeltable.env import Env
+ from pixeltable.func import Batch
  from pixeltable.type_system import \
      StringType, IntType, FloatType, BoolType, TimestampType, JsonType

@@ -29,9 +30,9 @@ class Dumper:
          os.environ['PIXELTABLE_DB'] = db_name
          os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')

-         Env.get().set_up(reinit_db=True)
-         self.cl = pxt.Client()
-         self.cl.logging(level=logging.DEBUG, to_stdout=True)
+         Env._init_env(reinit_db=True)
+
+         Env.get().configure_logging(level=logging.DEBUG, to_stdout=True)

      def dump_db(self) -> None:
          md_version = metadata.VERSION
@@ -76,8 +77,18 @@
              'c6': JsonType(nullable=False),
              'c7': JsonType(nullable=False),
          }
-         t = self.cl.create_table('sample_table', schema, primary_key='c2')
+         t = pxt.create_table('sample_table', schema, primary_key='c2')
+
+         # Add columns for InlineArray and InlineDict
          t.add_column(c8=[[1, 2, 3], [4, 5, 6]])
+         t.add_column(c9=[['a', 'b', 'c'], ['d', 'e', 'f']])
+         t.add_column(c10=[t.c1, [t.c1n, t.c2]])
+         t.add_column(c11={'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
+
+         # InPredicate
+         t.add_column(isin_1=t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
+         t.add_column(isin_2=t.c2.isin([1, 2, 3, 4, 5]))
+         t.add_column(isin_3=t.c2.isin(t.c6.f5))

          # Add columns for .astype converters to ensure they're persisted properly
          t.add_column(c2_as_float=t.c2.astype(FloatType()))
@@ -136,24 +147,48 @@
              for i in range(num_rows)
          ]
          t.insert(rows)
-         self.cl.create_dir('views')
-         v = self.cl.create_view('views.sample_view', t, filter=(t.c2 < 50))
-         _ = self.cl.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+         pxt.create_dir('views')
+         v = pxt.create_view('views.sample_view', t, filter=(t.c2 < 50))
+         _ = pxt.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+         e = pxt.create_view('views.empty_view', t, filter=t.c2 == 4171780)
+         assert e.count() == 0
          # Computed column using a library function
          v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
-         # Computed column using a bespoke udf
-         v['test_udf'] = test_udf(t.c2)
+         # Computed column using a bespoke stored udf
+         v['test_udf'] = test_udf_stored(t.c2)
+         # Computed column using a batched function
+         # (apply this to the empty view, since it's a "heavyweight" function)
+         e['batched'] = pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32')
+         # computed column using a stored batched function
+         v['test_udf_batched'] = test_udf_stored_batched(t.c1, upper=False)
          # astype
          v['astype'] = t.c1.astype(pxt.FloatType())
-         # computed column using a stored function
-         v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())

-
- @pxt.udf
- def test_udf(n: int) -> int:
+         # Add remotes
+         from pixeltable.datatransfer.remote import MockRemote
+         v.link(
+             MockRemote('remote', {'int_field': pxt.IntType()}, {'str_field': pxt.StringType()}),
+             col_mapping={'test_udf': 'int_field', 'c1': 'str_field'}
+         )
+         # We're just trying to test metadata here, so reach "under the covers" and link a fake
+         # Label Studio project without validation (so we don't need a real Label Studio server)
+         from pixeltable.datatransfer.label_studio import LabelStudioProject
+         v.tbl_version_path.tbl_version.link(
+             LabelStudioProject(4171780, media_import_method='file'),
+             col_mapping={'str_format': 'str_format'}
+         )
+
+
+ @pxt.udf(_force_stored=True)
+ def test_udf_stored(n: int) -> int:
      return n + 1


+ @pxt.udf(batch_size=4, _force_stored=True)
+ def test_udf_stored_batched(strings: Batch[str], *, upper: bool = True) -> Batch[str]:
+     return [string.upper() if upper else string.lower() for string in strings]
+
+
  def main() -> None:
      _logger.info("Creating pixeltable test artifact.")
      dumper = Dumper()
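
Beyond the test fixtures, this file records the headline API change of 0.2.7: the pxt.Client class (client.py, deleted in this release) is replaced by module-level functions (the new globals.py, +443 lines). The migration, side by side:

import pixeltable as pxt

schema = {'c1': pxt.StringType(), 'c2': pxt.IntType()}

# 0.2.5: operations went through a Client instance
# cl = pxt.Client()
# t = cl.create_table('sample_table', schema, primary_key='c2')
# v = cl.create_view('views.sample_view', t, filter=(t.c2 < 50))

# 0.2.7: the same operations are module-level functions
t = pxt.create_table('sample_table', schema, primary_key='c2')
pxt.create_dir('views')
v = pxt.create_view('views.sample_view', t, filter=(t.c2 < 50))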
pixeltable/type_system.py CHANGED
@@ -7,7 +7,7 @@ import json
  import typing
  import urllib.parse
  import urllib.request
- from copy import copy
+ from copy import deepcopy
  from pathlib import Path
  from typing import Any, Optional, Tuple, Dict, Callable, List, Union, Sequence, Mapping
@@ -82,7 +82,11 @@ class ColumnType:

      def __init__(self, t: Type, nullable: bool = False):
          self._type = t
-         self.nullable = nullable
+         self._nullable = nullable
+
+     @property
+     def nullable(self) -> bool:
+         return self._nullable

      @property
      def type_enum(self) -> Type:
@@ -91,6 +95,12 @@
      def serialize(self) -> str:
          return json.dumps(self.as_dict())

+     def copy(self, nullable: Optional[bool] = None) -> ColumnType:
+         result = deepcopy(self)
+         if nullable is not None:
+             result._nullable = nullable
+         return result
+
      @classmethod
      def serialize_list(cls, type_list: List[ColumnType]) -> str:
          return json.dumps([t.as_dict() for t in type_list])
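
nullable is now a read-only property; callers that previously mutated it go through copy() instead (internally, as in the Optional[...] handling further down, the private _nullable is still set directly). A quick sketch of the new usage:

from pixeltable.type_system import IntType

t = IntType()                # _nullable=False
nt = t.copy(nullable=True)   # deepcopy with _nullable overridden
assert not t.nullable and nt.nullable
# t.nullable = True          # would now raise AttributeError (no setter)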
@@ -177,7 +187,7 @@
          if type(self) != type(other):
              return False
          for member_var in vars(self).keys():
-             if member_var == 'nullable':
+             if member_var == '_nullable':
                  continue
              if getattr(self, member_var) != getattr(other, member_var):
                  return False
@@ -225,6 +235,8 @@
              return BoolType()
          if isinstance(val, datetime.datetime) or isinstance(val, datetime.date):
              return TimestampType()
+         if isinstance(val, PIL.Image.Image):
+             return ImageType(width=val.width, height=val.height)
          if isinstance(val, np.ndarray):
              col_type = ArrayType.from_literal(val)
              if col_type is not None:
@@ -248,7 +260,7 @@
              # We treat it as the underlying type but with nullable=True.
              underlying = cls.from_python_type(union_args[0])
              if underlying is not None:
-                 underlying.nullable = True
+                 underlying._nullable = True
                  return underlying
          else:
              # Discard type parameters to ensure that parameterized types such as `list[T]`
@@ -370,13 +382,6 @@
          # types that refer to external media files
          return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()

-     @abc.abstractmethod
-     def to_sql(self) -> str:
-         """
-         Return corresponding Postgres type.
-         """
-         pass
-
      @abc.abstractmethod
      def to_sa_type(self) -> sql.types.TypeEngine:
          """
@@ -404,9 +409,6 @@ class InvalidType(ColumnType):
      def __init__(self, nullable: bool = False):
          super().__init__(self.Type.INVALID, nullable=nullable)

-     def to_sql(self) -> str:
-         assert False
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          assert False

@@ -432,9 +434,6 @@ class StringType(ColumnType):
                  return None
              return convert

-     def to_sql(self) -> str:
-         return 'VARCHAR'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.String()

@@ -458,9 +457,6 @@ class IntType(ColumnType):
      def __init__(self, nullable: bool = False):
          super().__init__(self.Type.INT, nullable=nullable)

-     def to_sql(self) -> str:
-         return 'BIGINT'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.BigInteger()

@@ -473,9 +469,6 @@ class FloatType(ColumnType):
      def __init__(self, nullable: bool = False):
          super().__init__(self.Type.FLOAT, nullable=nullable)

-     def to_sql(self) -> str:
-         return 'FLOAT'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.Float()

@@ -493,9 +486,6 @@ class BoolType(ColumnType):
      def __init__(self, nullable: bool = False):
          super().__init__(self.Type.BOOL, nullable=nullable)

-     def to_sql(self) -> str:
-         return 'BOOLEAN'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.Boolean()

@@ -513,9 +503,6 @@ class TimestampType(ColumnType):
      def __init__(self, nullable: bool = False):
          super().__init__(self.Type.TIMESTAMP, nullable=nullable)

-     def to_sql(self) -> str:
-         return 'INTEGER'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.TIMESTAMP()

@@ -551,14 +538,13 @@ class JsonType(ColumnType):
          }
          return cls(type_spec, nullable=d['nullable'])

-     def to_sql(self) -> str:
-         return 'JSONB'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.dialects.postgresql.JSONB()

      def print_value(self, val: Any) -> str:
          val_type = self.infer_literal_type(val)
+         if val_type is None:
+             return super().print_value(val)
          if val_type == self:
              return str(val)
          return val_type.print_value(val)
@@ -657,9 +643,6 @@ class ArrayType(ColumnType):
              return np.array(val, dtype=self.numpy_dtype())
          return val

-     def to_sql(self) -> str:
-         return 'BYTEA'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.LargeBinary()

@@ -762,9 +745,6 @@ class ImageType(ColumnType):
              return img
          return convert

-     def to_sql(self) -> str:
-         return 'VARCHAR'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.String()

@@ -785,11 +765,8 @@ class VideoType(ColumnType):
      def __init__(self, nullable: bool = False):
          super().__init__(self.Type.VIDEO, nullable=nullable)

-     def to_sql(self) -> str:
-         # stored as a file path
-         return 'VARCHAR'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
+         # stored as a file path
          return sql.String()

      def _validate_literal(self, val: Any) -> None:
@@ -820,11 +797,8 @@ class AudioType(ColumnType):
      def __init__(self, nullable: bool = False):
          super().__init__(self.Type.AUDIO, nullable=nullable)

-     def to_sql(self) -> str:
-         # stored as a file path
-         return 'VARCHAR'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
+         # stored as a file path
          return sql.String()

      def _validate_literal(self, val: Any) -> None:
@@ -864,11 +838,8 @@ class DocumentType(ColumnType):
          else:
              self._doc_formats = [t for t in self.DocumentFormat]

-     def to_sql(self) -> str:
-         # stored as a file path
-         return 'VARCHAR'
-
      def to_sa_type(self) -> sql.types.TypeEngine:
+         # stored as a file path
          return sql.String()

      def _validate_literal(self, val: Any) -> None:
@@ -877,11 +848,9 @@
      def validate_media(self, val: Any) -> None:
          assert isinstance(val, str)
          from pixeltable.utils.documents import get_document_handle
-         with open(val, 'r', encoding='utf8') as fh:
-             try:
-                 s = fh.read()
-                 dh = get_document_handle(s)
-                 if dh is None:
-                     raise excs.Error(f'Not a recognized document format: {val}')
-             except Exception as e:
-                 raise excs.Error(f'Not a recognized document format: {val}') from None
+         try:
+             dh = get_document_handle(val)
+             if dh is None:
+                 raise excs.Error(f'Not a recognized document format: {val}')
+         except Exception as e:
+             raise excs.Error(f'Not a recognized document format: {val}') from None
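
With every to_sql() override deleted, the type-to-storage mapping flows exclusively through to_sa_type() and SQLAlchemy's Postgres dialect, removing the duplicated hand-written DDL strings (including the stray 'INTEGER' that TimestampType.to_sql() used to return). A minimal sketch of the single remaining path:

import sqlalchemy as sql
from pixeltable.type_system import IntType, StringType

# to_sa_type() returns a sqlalchemy TypeEngine; the dialect renders the DDL
cols = [
    sql.Column('c1', StringType().to_sa_type()),  # sql.String()     -> VARCHAR
    sql.Column('c2', IntType().to_sa_type()),     # sql.BigInteger() -> BIGINT
]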