pixeltable 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/column.py +41 -29
  5. pixeltable/catalog/globals.py +18 -0
  6. pixeltable/catalog/insertable_table.py +30 -10
  7. pixeltable/catalog/table.py +198 -86
  8. pixeltable/catalog/table_version.py +47 -53
  9. pixeltable/catalog/table_version_path.py +2 -2
  10. pixeltable/catalog/view.py +17 -18
  11. pixeltable/dataframe.py +27 -36
  12. pixeltable/env.py +7 -0
  13. pixeltable/exec/__init__.py +0 -1
  14. pixeltable/exec/aggregation_node.py +6 -3
  15. pixeltable/exec/cache_prefetch_node.py +189 -43
  16. pixeltable/exec/data_row_batch.py +5 -22
  17. pixeltable/exec/exec_context.py +2 -2
  18. pixeltable/exec/exec_node.py +3 -2
  19. pixeltable/exec/expr_eval_node.py +23 -16
  20. pixeltable/exec/in_memory_data_node.py +6 -3
  21. pixeltable/exec/sql_node.py +24 -25
  22. pixeltable/exprs/arithmetic_expr.py +12 -5
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +97 -14
  26. pixeltable/exprs/comparison.py +10 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +27 -18
  29. pixeltable/exprs/expr.py +53 -52
  30. pixeltable/exprs/expr_set.py +5 -0
  31. pixeltable/exprs/function_call.py +32 -16
  32. pixeltable/exprs/globals.py +4 -1
  33. pixeltable/exprs/in_predicate.py +8 -7
  34. pixeltable/exprs/inline_expr.py +4 -4
  35. pixeltable/exprs/is_null.py +4 -4
  36. pixeltable/exprs/json_mapper.py +11 -12
  37. pixeltable/exprs/json_path.py +6 -11
  38. pixeltable/exprs/literal.py +5 -5
  39. pixeltable/exprs/method_ref.py +5 -4
  40. pixeltable/exprs/object_ref.py +2 -1
  41. pixeltable/exprs/row_builder.py +88 -36
  42. pixeltable/exprs/rowid_ref.py +12 -11
  43. pixeltable/exprs/similarity_expr.py +12 -7
  44. pixeltable/exprs/sql_element_cache.py +7 -5
  45. pixeltable/exprs/type_cast.py +8 -6
  46. pixeltable/exprs/variable.py +5 -4
  47. pixeltable/func/aggregate_function.py +9 -9
  48. pixeltable/func/expr_template_function.py +6 -5
  49. pixeltable/func/function.py +11 -10
  50. pixeltable/func/udf.py +6 -11
  51. pixeltable/functions/__init__.py +2 -2
  52. pixeltable/functions/globals.py +5 -7
  53. pixeltable/functions/huggingface.py +155 -45
  54. pixeltable/functions/llama_cpp.py +107 -0
  55. pixeltable/functions/mistralai.py +1 -1
  56. pixeltable/functions/ollama.py +147 -0
  57. pixeltable/functions/openai.py +1 -1
  58. pixeltable/functions/replicate.py +72 -0
  59. pixeltable/functions/string.py +9 -0
  60. pixeltable/functions/together.py +1 -1
  61. pixeltable/functions/util.py +5 -2
  62. pixeltable/globals.py +67 -26
  63. pixeltable/index/btree.py +16 -3
  64. pixeltable/index/embedding_index.py +4 -4
  65. pixeltable/io/__init__.py +1 -2
  66. pixeltable/io/fiftyone.py +178 -0
  67. pixeltable/io/globals.py +96 -2
  68. pixeltable/iterators/base.py +3 -2
  69. pixeltable/iterators/document.py +1 -1
  70. pixeltable/iterators/video.py +120 -63
  71. pixeltable/metadata/__init__.py +1 -1
  72. pixeltable/metadata/converters/convert_21.py +34 -0
  73. pixeltable/metadata/converters/util.py +45 -4
  74. pixeltable/metadata/notes.py +1 -0
  75. pixeltable/metadata/schema.py +8 -0
  76. pixeltable/plan.py +17 -15
  77. pixeltable/py.typed +0 -0
  78. pixeltable/store.py +7 -2
  79. pixeltable/tool/create_test_db_dump.py +1 -1
  80. pixeltable/tool/create_test_video.py +1 -1
  81. pixeltable/tool/embed_udf.py +1 -1
  82. pixeltable/tool/mypy_plugin.py +28 -5
  83. pixeltable/type_system.py +100 -36
  84. pixeltable/utils/coco.py +5 -5
  85. pixeltable/utils/documents.py +15 -1
  86. pixeltable/utils/formatter.py +12 -13
  87. pixeltable/utils/s3.py +6 -3
  88. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/METADATA +158 -49
  89. pixeltable-0.2.23.dist-info/RECORD +153 -0
  90. pixeltable/exec/media_validation_node.py +0 -43
  91. pixeltable-0.2.21.dist-info/RECORD +0 -148
  92. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/LICENSE +0 -0
  93. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/WHEEL +0 -0
  94. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/util.py CHANGED
@@ -4,7 +4,7 @@ from typing import Any, Callable, Optional
  
  import sqlalchemy as sql
  
- from pixeltable.metadata.schema import Table
+ from pixeltable.metadata.schema import Table, TableSchemaVersion
  
  __logger = logging.getLogger('pixeltable')
  
@@ -17,12 +17,12 @@ def convert_table_md(
      substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
  ) -> None:
      """
-     Converts table metadata based on the specified conversion functions.
+     Converts schema.TableMd dicts based on the specified conversion functions.
  
      Args:
          engine: The SQLAlchemy engine.
-         table_md_updater: A function that updates the table metadata in place.
-         column_md_updater: A function that updates the column metadata in place.
+         table_md_updater: A function that updates schema.TableMd dicts in place.
+         column_md_updater: A function that updates schema.ColumnMd dicts in place.
          external_store_md_updater: A function that updates the external store metadata in place.
          substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
              recursively, and `substitution_fn` will be called once for each metadata entry. If the entry appears in
@@ -90,3 +90,44 @@ def __substitute_md_rec(
          return updated_list
      else:
          return md
+ 
+ 
+ def convert_table_schema_version_md(
+     engine: sql.engine.Engine,
+     table_schema_version_md_updater: Optional[Callable[[dict], None]] = None,
+     schema_column_updater: Optional[Callable[[dict], None]] = None
+ ) -> None:
+     """
+     Converts schema.TableSchemaVersionMd dicts based on the specified conversion functions.
+ 
+     Args:
+         engine: The SQLAlchemy engine.
+         table_schema_version_md_updater: A function that updates schema.TableSchemaVersionMd dicts in place.
+         schema_column_updater: A function that updates schema.SchemaColumn dicts in place.
+     """
+     with engine.begin() as conn:
+         stmt = sql.select(TableSchemaVersion.tbl_id, TableSchemaVersion.schema_version, TableSchemaVersion.md)
+         for row in conn.execute(stmt):
+             tbl_id, schema_version, md = row[0], row[1], row[2]
+             assert isinstance(md, dict)
+             updated_md = copy.deepcopy(md)
+             if table_schema_version_md_updater is not None:
+                 table_schema_version_md_updater(updated_md)
+             if schema_column_updater is not None:
+                 __update_schema_column(updated_md, schema_column_updater)
+             if updated_md != md:
+                 __logger.info(f'Updating TableSchemaVersion(tbl_id={tbl_id}, schema_version={schema_version})')
+                 update_stmt = (
+                     sql.update(TableSchemaVersion)
+                     .where(TableSchemaVersion.tbl_id == tbl_id)
+                     .where(TableSchemaVersion.schema_version == schema_version)
+                     .values(md=updated_md)
+                 )
+                 conn.execute(update_stmt)
+ 
+ 
+ def __update_schema_column(table_schema_version_md: dict, schema_column_updater: Callable[[dict], None]) -> None:
+     cols = table_schema_version_md['columns']
+     assert isinstance(cols, dict)
+     for schema_col in cols.values():
+         schema_column_updater(schema_col)
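The new helper mirrors convert_table_md but walks TableSchemaVersion records instead. A hypothetical converter built on it might look like the sketch below; the register_converter import path and the default values are assumptions for illustration, not the contents of the actual v22 converter shipped in this release.

# Hypothetical metadata-converter sketch (not the real convert_22 module).
import sqlalchemy as sql

from pixeltable.metadata import register_converter  # assumed registration hook
from pixeltable.metadata.converters.util import convert_table_schema_version_md


@register_converter(version=22)
def _(engine: sql.engine.Engine) -> None:
    convert_table_schema_version_md(
        engine,
        # table-wide default; 'on_write' is an illustrative value
        table_schema_version_md_updater=lambda md: md.setdefault('media_validation', 'on_write'),
        # per-column override starts out unset and falls back to the table default
        schema_column_updater=lambda col_md: col_md.setdefault('media_validation', None),
    )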
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
  # rather than as a comment, so that the existence of a description can be enforced by
  # the unit tests when new versions are added.
  VERSION_NOTES = {
+     22: 'TableMd/ColumnMd.media_validation',
      21: 'Separate InlineArray and InlineList',
      20: 'Store DB timestamps in UTC',
      19: 'UDF renames; ImageMemberAccess removal',
pixeltable/metadata/schema.py CHANGED
@@ -202,6 +202,10 @@ class SchemaColumn:
      pos: int
      name: str
  
+     # media validation strategy of this particular media column; if not set, TableMd.media_validation applies
+     # stores column.MediaValiation.name.lower()
+     media_validation: Optional[str]
+ 
  
  @dataclasses.dataclass
  class TableSchemaVersionMd:
@@ -214,6 +218,10 @@ class TableSchemaVersionMd:
      num_retained_versions: int
      comment: str
  
+     # default validation strategy for any media column of this table
+     # stores column.MediaValiation.name.lower()
+     media_validation: str
+ 
  
  # versioning: each table schema change results in a new record
  class TableSchemaVersion(Base):
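Per the comments above, a column-level media_validation setting overrides the table-wide default when present. A minimal sketch of how that fallback could be resolved; the enum values and helper below are illustrative stand-ins, not pixeltable's actual catalog code.

import enum
from typing import Optional


class MediaValidation(enum.Enum):
    # illustrative values; the real enum lives in pixeltable's catalog module
    ON_READ = 0
    ON_WRITE = 1


def effective_media_validation(column_md: dict, schema_version_md: dict) -> MediaValidation:
    """Column-level setting wins; otherwise fall back to the table-wide default."""
    name: Optional[str] = column_md.get('media_validation')
    if name is None:
        name = schema_version_md['media_validation']
    return MediaValidation[name.upper()]  # stored as the enum member name, lower-cased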
pixeltable/plan.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Any, Iterable, Optional, Sequence, cast
+ from typing import Any, Iterable, Optional, Sequence
  from uuid import UUID
  
  import sqlalchemy as sql
@@ -225,27 +225,28 @@ class Planner:
          assert not tbl.is_view()
          # stored_cols: all cols we need to store, incl computed cols (and indices)
          stored_cols = [c for c in tbl.cols if c.is_stored]
-         assert len(stored_cols) > 0
- 
+         assert len(stored_cols) > 0 # there needs to be something to store
          row_builder = exprs.RowBuilder([], stored_cols, [])
  
          # create InMemoryDataNode for 'rows'
-         stored_col_info = row_builder.output_slot_idxs()
-         stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
-         input_col_info = [info for info in stored_col_info if not info.col.is_computed]
          plan: exec.ExecNode = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
  
-         media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
-         if len(media_input_cols) > 0:
-             # prefetch external files for all input column refs for validation
-             plan = exec.CachePrefetchNode(tbl.id, media_input_cols, input=plan)
-             plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
+         media_input_col_info = [
+             exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
+             for col_ref in row_builder.input_exprs
+             if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
+         ]
+         if len(media_input_col_info) > 0:
+             # prefetch external files for all input column refs
+             plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
  
-         computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
+         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
          if len(computed_exprs) > 0:
              # add an ExprEvalNode when there are exprs to compute
              plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)
  
+         stored_col_info = row_builder.output_slot_idxs()
+         stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
          plan.set_stored_img_cols(stored_img_col_info)
          plan.set_ctx(
              exec.ExecContext(
@@ -621,7 +622,8 @@ class Planner:
          assert isinstance(tbl, catalog.TableVersionPath)
          sql_elements = analyzer.sql_elements
          is_python_agg = (
-             not sql_elements.contains(analyzer.agg_fn_calls) or not sql_elements.contains(analyzer.window_fn_calls)
+             not sql_elements.contains_all(analyzer.agg_fn_calls)
+             or not sql_elements.contains_all(analyzer.window_fn_calls)
          )
          ctx = exec.ExecContext(row_builder)
          cls._verify_ordering(analyzer, verify_agg=is_python_agg)
@@ -671,8 +673,8 @@ class Planner:
              ctx.batch_size = 16
  
          # do aggregation in SQL if all agg exprs can be translated
-         if (sql_elements.contains(analyzer.select_list)
-                 and sql_elements.contains(analyzer.grouping_exprs)
+         if (sql_elements.contains_all(analyzer.select_list)
+                 and sql_elements.contains_all(analyzer.grouping_exprs)
                  and isinstance(plan, exec.SqlNode)
                  and plan.to_cte() is not None):
              plan = exec.SqlAggregationNode(
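The rename from contains to contains_all makes the intent explicit: SQL aggregation is chosen only when every expression in the collection has a SQL translation. A toy stand-in for the element cache, just to illustrate that distinction (not pixeltable's real SqlElementCache):

from typing import Iterable, Optional


class ToySqlElementCache:
    def __init__(self, translations: dict[str, Optional[str]]) -> None:
        # maps an expression (here just a name) to its SQL translation, or None
        self._translations = translations

    def contains(self, expr: str) -> bool:
        """True if this single expression has a SQL translation."""
        return self._translations.get(expr) is not None

    def contains_all(self, exprs: Iterable[str]) -> bool:
        """True only if every expression in the collection has a SQL translation."""
        return all(self.contains(e) for e in exprs)


cache = ToySqlElementCache({'sum(c1)': 'SUM(c1)', 'my_udf(c2)': None})
assert cache.contains_all(['sum(c1)']) is True
assert cache.contains_all(['sum(c1)', 'my_udf(c2)']) is False  # falls back to Python aggregation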
pixeltable/py.typed ADDED
File without changes
pixeltable/store.py CHANGED
@@ -303,7 +303,7 @@ class StoreBase:
  
      def insert_rows(
          self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
-         show_progress: bool = True, rowids: Optional[Iterator[int]] = None
+         show_progress: bool = True, rowids: Optional[Iterator[int]] = None, abort_on_exc: bool = False
      ) -> tuple[int, int, set[int]]:
          """Insert rows into the store table and update the catalog table's md
          Returns:
@@ -325,8 +325,13 @@
          for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
              # compute batch of rows and convert them into table rows
              table_rows: list[dict[str, Any]] = []
-             for row_idx in range(batch_start_idx, min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))):
+             batch_stop_idx = min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))
+             for row_idx in range(batch_start_idx, batch_stop_idx):
                  row = row_batch[row_idx]
+                 # if abort_on_exc == True, we need to check for media validation exceptions
+                 if abort_on_exc and row.has_exc():
+                     exc = row.get_first_exc()
+                     raise exc
  
                  rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
                  pk = rowid + (v_min,)
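The new abort_on_exc flag lets insert_rows fail fast on the first row whose media validation raised, instead of writing the row and recording the error. A tiny illustration of that contract with a hypothetical row object (not pixeltable's DataRow):

from typing import Optional


class FakeRow:
    """Hypothetical stand-in for a row that may carry a validation error."""
    def __init__(self, exc: Optional[Exception] = None) -> None:
        self.exc = exc

    def has_exc(self) -> bool:
        return self.exc is not None

    def get_first_exc(self) -> Exception:
        assert self.exc is not None
        return self.exc


def store_batch(rows: list[FakeRow], abort_on_exc: bool = False) -> int:
    stored = 0
    for row in rows:
        if abort_on_exc and row.has_exc():
            raise row.get_first_exc()  # surface the first media-validation failure immediately
        stored += 1  # with abort_on_exc=False the row is still written; the error is reported elsewhere
    return stored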
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -153,7 +153,7 @@ class Dumper:
          self.__add_expr_columns(v, 'view')
  
          # snapshot
-         _ = pxt.create_view('views.snapshot', t.where(t.c2 >= 75), is_snapshot=True)
+         _ = pxt.create_snapshot('views.snapshot', t.where(t.c2 >= 75))
  
          # view of views
          vv = pxt.create_view('views.view_of_views', v.where(t.c2 >= 25))
pixeltable/tool/create_test_video.py CHANGED
@@ -1,4 +1,4 @@
- import av
+ import av # type: ignore[import-untyped]
  import PIL.Image
  import PIL.ImageDraw
  import PIL.ImageFont
pixeltable/tool/embed_udf.py CHANGED
@@ -6,4 +6,4 @@ import pixeltable as pxt
  # TODO This can go away once we have the ability to inline expr_udf's
  @pxt.expr_udf
  def clip_text_embed(txt: str) -> np.ndarray:
-     return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32')
+     return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32') # type: ignore[return-value]
pixeltable/tool/mypy_plugin.py CHANGED
@@ -1,12 +1,15 @@
  from typing import Callable, Optional
  
- from mypy.plugin import AnalyzeTypeContext, Plugin
- from mypy.types import Type
+ from mypy import nodes
+ from mypy.plugin import AnalyzeTypeContext, ClassDefContext, Plugin
+ from mypy.plugins.common import add_method_to_class
+ from mypy.types import AnyType, Type, TypeOfAny
  
  import pixeltable as pxt
  
  
  class PxtPlugin(Plugin):
+     __UDA_FULLNAME = f'{pxt.uda.__module__}.{pxt.uda.__name__}'
      __TYPE_MAP = {
          pxt.Json: 'typing.Any',
          pxt.Array: 'numpy.ndarray',
@@ -20,13 +23,33 @@ class PxtPlugin(Plugin):
          for k, v in __TYPE_MAP.items()
      }
  
-     def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], type]]:
+     def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], Type]]:
          if fullname in self.__FULLNAME_MAP:
              subst_name = self.__FULLNAME_MAP[fullname]
              return lambda ctx: pxt_hook(ctx, subst_name)
+         return None
  
- def plugin(version: str):
+     def get_class_decorator_hook_2(self, fullname: str) -> Optional[Callable[[ClassDefContext], bool]]:
+         if fullname == self.__UDA_FULLNAME:
+             return pxt_decorator_hook
+         return None
+ 
+ def plugin(version: str) -> type:
      return PxtPlugin
  
  def pxt_hook(ctx: AnalyzeTypeContext, subst_name: str) -> Type:
-     return ctx.api.named_type(subst_name)
+     if subst_name == 'typing.Any':
+         return AnyType(TypeOfAny.special_form)
+     return ctx.api.named_type(subst_name, [])
+ 
+ def pxt_decorator_hook(ctx: ClassDefContext) -> bool:
+     arg = nodes.Argument(nodes.Var('fn'), AnyType(TypeOfAny.special_form), None, nodes.ARG_POS)
+     add_method_to_class(
+         ctx.api,
+         ctx.cls,
+         "to_sql",
+         args=[arg],
+         return_type=AnyType(TypeOfAny.special_form),
+         is_staticmethod=True,
+     )
+     return True
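Beyond the existing type-substitution hook, the plugin now registers a class-decorator hook for @pxt.uda that adds a static to_sql stub so decorated aggregate classes type-check, and pxt.Json is resolved to Any rather than a named type. A hedged sketch of a UDF that benefits once the plugin is enabled in the mypy configuration; the module path pixeltable.tool.mypy_plugin is inferred from this wheel's layout, and the example itself is illustrative rather than taken from the package.

# Illustrative UDF whose annotations the plugin rewrites for mypy
# (pxt.Json -> Any, pxt.Array -> numpy.ndarray, per __TYPE_MAP above).
import pixeltable as pxt


@pxt.udf
def merge_meta(a: pxt.Json, b: pxt.Json) -> pxt.Json:
    # mypy sees `a` and `b` as Any, so dict-style manipulation type-checks cleanly
    return {**a, **b}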
pixeltable/type_system.py CHANGED
@@ -3,16 +3,18 @@ from __future__ import annotations
  import abc
  import datetime
  import enum
+ import io
  import json
+ import types
  import typing
  import urllib.parse
  import urllib.request
  from pathlib import Path
  from typing import Any, Iterable, Mapping, Optional, Sequence, Union
  
+ import PIL.Image
  import av # type: ignore
  import numpy as np
- import PIL.Image
  import sqlalchemy as sql
  from typing import _GenericAlias # type: ignore[attr-defined]
  from typing_extensions import _AnnotatedAlias
@@ -271,63 +273,110 @@ class ColumnType:
          return inferred_type
  
      @classmethod
-     def from_python_type(cls, t: Union[type, _GenericAlias], nullable_default: bool = False) -> Optional[ColumnType]:
-         if typing.get_origin(t) is typing.Union:
+     def from_python_type(
+         cls,
+         t: Union[type, _GenericAlias],
+         nullable_default: bool = False,
+         allow_builtin_types: bool = True
+     ) -> Optional[ColumnType]:
+         """
+         Convert a Python type into a Pixeltable `ColumnType` instance.
+ 
+         Args:
+             t: The Python type.
+             nullable_default: If True, then the returned `ColumnType` will be nullable unless it is marked as
+                 `Required`.
+             allow_builtin_types: If True, then built-in types such as `str`, `int`, `float`, etc., will be
+                 allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
+                 `pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
+                 designations will be allowed regardless.
+         """
+         origin = typing.get_origin(t)
+         if origin is typing.Union:
+             # Check if `t` has the form Optional[T].
              union_args = typing.get_args(t)
              if len(union_args) == 2 and type(None) in union_args:
                  # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
                  # We treat it as the underlying type but with nullable=True.
                  underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
-                 underlying = cls.from_python_type(underlying_py_type)
+                 underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
                  if underlying is not None:
                      return underlying.copy(nullable=True)
-         elif typing.get_origin(t) is typing.Annotated:
+         elif origin is Required:
+             required_args = typing.get_args(t)
+             assert len(required_args) == 1
+             return cls.from_python_type(
+                 required_args[0],
+                 nullable_default=False,
+                 allow_builtin_types=allow_builtin_types
+             )
+         elif origin is typing.Annotated:
              annotated_args = typing.get_args(t)
              origin = annotated_args[0]
              parameters = annotated_args[1]
              if isinstance(parameters, ColumnType):
                  return parameters.copy(nullable=nullable_default)
-         elif typing.get_origin(t) is Required:
-             required_args = typing.get_args(t)
-             assert len(required_args) == 1
-             return cls.from_python_type(required_args[0], nullable_default=False)
          else:
-             # Discard type parameters to ensure that parameterized types such as `list[T]`
-             # are correctly mapped to Pixeltable types.
-             origin = typing.get_origin(t)
-             if origin is None:
-                 # No type parameters; the origin type is just `t` itself
-                 origin = t
-             if issubclass(origin, _PxtType):
-                 return origin.as_col_type(nullable=nullable_default)
-             if origin is str:
-                 return StringType(nullable=nullable_default)
-             if origin is int:
-                 return IntType(nullable=nullable_default)
-             if origin is float:
-                 return FloatType(nullable=nullable_default)
-             if origin is bool:
-                 return BoolType(nullable=nullable_default)
-             if origin is datetime.datetime:
-                 return TimestampType(nullable=nullable_default)
-             if origin is PIL.Image.Image:
-                 return ImageType(nullable=nullable_default)
-             if issubclass(origin, Sequence) or issubclass(origin, Mapping):
-                 return JsonType(nullable=nullable_default)
+             # It's something other than Optional[T], Required[T], or an explicitly annotated type.
+             if origin is not None:
+                 # Discard type parameters to ensure that parameterized types such as `list[T]`
+                 # are correctly mapped to Pixeltable types.
+                 t = origin
+             if isinstance(t, type) and issubclass(t, _PxtType):
+                 return t.as_col_type(nullable=nullable_default)
+             elif allow_builtin_types:
+                 if t is str:
+                     return StringType(nullable=nullable_default)
+                 if t is int:
+                     return IntType(nullable=nullable_default)
+                 if t is float:
+                     return FloatType(nullable=nullable_default)
+                 if t is bool:
+                     return BoolType(nullable=nullable_default)
+                 if t is datetime.datetime:
+                     return TimestampType(nullable=nullable_default)
+                 if t is PIL.Image.Image:
+                     return ImageType(nullable=nullable_default)
+                 if issubclass(t, Sequence) or issubclass(t, Mapping):
+                     return JsonType(nullable=nullable_default)
          return None
  
      @classmethod
-     def normalize_type(cls, t: Union[ColumnType, type, _AnnotatedAlias], nullable_default: bool = False) -> ColumnType:
+     def normalize_type(
+         cls,
+         t: Union[ColumnType, type, _AnnotatedAlias],
+         nullable_default: bool = False,
+         allow_builtin_types: bool = True
+     ) -> ColumnType:
          """
          Convert any type recognizable by Pixeltable to its corresponding ColumnType.
          """
          if isinstance(t, ColumnType):
              return t
-         col_type = cls.from_python_type(t, nullable_default)
+         col_type = cls.from_python_type(t, nullable_default, allow_builtin_types)
          if col_type is None:
-             raise excs.Error(f'Unknown type: {t}')
+             cls.__raise_exc_for_invalid_type(t)
          return col_type
  
+     __TYPE_SUGGESTIONS: list[tuple[type, str]] = [
+         (str, 'pxt.String'),
+         (bool, 'pxt.Bool'),
+         (int, 'pxt.Int'),
+         (float, 'pxt.Float'),
+         (datetime.datetime, 'pxt.Timestamp'),
+         (PIL.Image.Image, 'pxt.Image'),
+         (Sequence, 'pxt.Json'),
+         (Mapping, 'pxt.Json'),
+     ]
+ 
+     @classmethod
+     def __raise_exc_for_invalid_type(cls, t: Union[type, _AnnotatedAlias]) -> None:
+         for builtin_type, suggestion in cls.__TYPE_SUGGESTIONS:
+             if t is builtin_type or (isinstance(t, type) and issubclass(t, builtin_type)):
+                 name = t.__name__ if t.__module__ == 'builtins' else f'{t.__module__}.{t.__name__}'
+                 raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
+         raise excs.Error(f'Unknown type: {t}')
+ 
      def validate_literal(self, val: Any) -> None:
          """Raise TypeError if val is not a valid literal for this type"""
          if val is None:
@@ -798,6 +847,20 @@ class ImageType(ColumnType):
      def to_sa_type(self) -> sql.types.TypeEngine:
          return sql.String()
  
+     def _create_literal(self, val: Any) -> Any:
+         if isinstance(val, str) and val.startswith('data:'):
+             # try parsing this as a `data:` URL, and if successful, decode the image immediately
+             try:
+                 with urllib.request.urlopen(val) as response:
+                     b = response.read()
+                     img = PIL.Image.open(io.BytesIO(b))
+                     img.load()
+                     return img
+             except Exception as exc:
+                 errormsg_val = val if len(val) < 50 else val[:50] + '...'
+                 raise excs.Error(f'data URL could not be decoded into a valid image: {errormsg_val}') from exc
+         return val
+ 
      def _validate_literal(self, val: Any) -> None:
          if isinstance(val, PIL.Image.Image):
              return
@@ -876,6 +939,7 @@ class DocumentType(ColumnType):
          HTML = 0
          MD = 1
          PDF = 2
+         XML = 3
  
      def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
          super().__init__(self.Type.DOCUMENT, nullable=nullable)
@@ -963,7 +1027,7 @@ class Array(np.ndarray, _PxtType):
          `item` (the type subscript) must be a tuple with exactly two elements (in any order):
          - A tuple of `Optional[int]`s, specifying the shape of the array
          - A type, specifying the dtype of the array
-         Example: Array[(3, None, 2), float]
+         Example: Array[(3, None, 2), pxt.Float]
          """
          params = item if isinstance(item, tuple) else (item,)
          shape: Optional[tuple] = None
@@ -978,7 +1042,7 @@
              elif isinstance(param, type) or isinstance(param, _AnnotatedAlias):
                  if dtype is not None:
                      raise TypeError(f'Duplicate Array type parameter: {param}')
-                 dtype = ColumnType.from_python_type(param)
+                 dtype = ColumnType.normalize_type(param, allow_builtin_types=False)
              else:
                  raise TypeError(f'Invalid Array type parameter: {param}')
          if shape is None:
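Taken together, these changes tighten type handling: Required/Optional designations are resolved before anything else, built-in Python types are only accepted where UDF-style signatures are expected, and schema definitions now get a targeted error message. A few illustrative calls against this internal API, with behavior as implied by the diff above:

import datetime
from typing import Optional

import pixeltable.type_system as ts

# Optional[T] still yields a nullable column type.
assert ts.ColumnType.from_python_type(Optional[str]).nullable is True

# Built-ins are accepted by default (UDF signatures) ...
assert isinstance(ts.ColumnType.normalize_type(datetime.datetime), ts.TimestampType)

# ... but rejected with a suggestion when allow_builtin_types=False (schema definitions).
try:
    ts.ColumnType.normalize_type(float, allow_builtin_types=False)
except Exception as exc:
    print(exc)  # e.g. "Standard Python type `float` cannot be used here; use `pxt.Float` instead"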
pixeltable/utils/coco.py CHANGED
@@ -1,6 +1,6 @@
  import json
  from pathlib import Path
- from typing import Any, Dict, List, Set
+ from typing import Any
  
  import PIL
  
@@ -22,7 +22,7 @@ Required format:
  }
  """
  
- def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
+ def _verify_input_dict(input_dict: dict[str, Any]) -> None:
      """Verify that input_dict is a valid input dict for write_coco_dataset()"""
      if not isinstance(input_dict, dict):
          raise excs.Error(f'Expected dict, got {input_dict}{format_msg}')
@@ -61,11 +61,11 @@ def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
      images_dir = dest_path / 'images'
      images_dir.mkdir()
  
-     images: List[Dict[str, Any]] = []
+     images: list[dict[str, Any]] = []
      img_id = -1
-     annotations: List[Dict[str, Any]] = []
+     annotations: list[dict[str, Any]] = []
      ann_id = -1
-     categories: Set[Any] = set()
+     categories: set[Any] = set()
      for input_row in df._exec():
          if input_dict_slot_idx == -1:
              input_dict_expr = df._select_list_exprs[0]
pixeltable/utils/documents.py CHANGED
@@ -35,6 +35,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
          if md_ast is not None:
              return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
  
+     if doc_format == '.xml':
+         bs_doc = get_xml_handle(path)
+         if bs_doc is not None:
+             return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
+ 
      return None
  
  
@@ -54,7 +59,16 @@ def get_pdf_handle(path: str) -> Optional[fitz.Document]:
  def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
      try:
          with open(path, 'r', encoding='utf8') as fp:
-             doc = bs4.BeautifulSoup(fp, 'html.parser')
+             doc = bs4.BeautifulSoup(fp, 'lxml')
+         return doc if doc.find() is not None else None
+     except Exception:
+         return None
+ 
+ 
+ def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
+     try:
+         with open(path, 'r', encoding='utf8') as fp:
+             doc = bs4.BeautifulSoup(fp, 'xml')
          return doc if doc.find() is not None else None
      except Exception:
          return None
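Both the 'lxml' and 'xml' BeautifulSoup features used here are backed by the lxml package, so it needs to be installed for HTML and XML documents to be recognized. A standalone sketch of the same sniffing check, written against bs4 directly rather than pixeltable's module:

# Minimal sketch of the parser choice above; requires beautifulsoup4 and lxml.
from typing import Optional

import bs4


def sniff_xml(path: str) -> Optional[bs4.BeautifulSoup]:
    try:
        with open(path, 'r', encoding='utf8') as fp:
            doc = bs4.BeautifulSoup(fp, 'xml')  # the 'xml' feature is backed by lxml
        # treat a document with no elements at all as "not XML"
        return doc if doc.find() is not None else None
    except Exception:
        return None

# sniff_xml('notes.xml') returns a parse tree for well-formed XML, else None ('notes.xml' is a placeholder).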
pixeltable/utils/formatter.py CHANGED
@@ -1,16 +1,16 @@
  import base64
  import html
+ import io
  import json
  import logging
  import mimetypes
  from typing import Any, Callable, Optional
  
+ import av # type: ignore[import-untyped]
+ import numpy as np
  import PIL
  import PIL.Image as Image
- import cv2
- import numpy as np
  
- import io
  import pixeltable.type_system as ts
  from pixeltable.utils.http_server import get_file_uri
  
@@ -138,11 +138,11 @@ class Formatter:
          assert isinstance(img, Image.Image), f'Wrong type: {type(img)}'
          # Try to make it look decent in a variety of display scenarios
          if self.__num_rows > 1:
-             width = 240 # Multiple rows: display small images
+             width = min(240, img.width) # Multiple rows: display small images
          elif self.__num_cols > 1:
-             width = 480 # Multiple columns: display medium images
+             width = min(480, img.width) # Multiple columns: display medium images
          else:
-             width = 640 # A single image: larger display
+             width = min(640, img.width) # A single image: larger display
          with io.BytesIO() as buffer:
              img.save(buffer, 'webp')
              img_base64 = base64.b64encode(buffer.getvalue()).decode()
@@ -159,17 +159,16 @@ class Formatter:
          # the video itself is not accessible.
          # TODO(aaron-siegel): If the video is backed by a concrete external URL,
          # should we link to that instead?
-         video_reader = cv2.VideoCapture(str(file_path))
-         if video_reader.isOpened():
-             status, img_array = video_reader.read()
-             if status:
-                 img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
-                 thumb = PIL.Image.fromarray(img_array)
+         with av.open(file_path) as container:
+             try:
+                 thumb = next(container.decode(video=0)).to_image()
+                 assert isinstance(thumb, Image.Image)
                  with io.BytesIO() as buffer:
                      thumb.save(buffer, 'jpeg')
                      thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
                      thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
-             video_reader.release()
+             except Exception:
+                 pass
          if self.__num_rows > 1:
              width = 320
          elif self.__num_cols > 1:
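The formatter now grabs the poster frame with PyAV instead of OpenCV, dropping the cv2 dependency. A standalone sketch of the same approach; the file paths are placeholders.

# Grab the first video frame as a PIL image with PyAV, mirroring the change above.
import av
import PIL.Image


def first_frame(path: str) -> PIL.Image.Image:
    with av.open(path) as container:
        frame = next(container.decode(video=0))  # first decoded frame of video stream 0
        return frame.to_image()


thumb = first_frame('example.mp4')
thumb.save('thumb.jpg', 'jpeg')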
pixeltable/utils/s3.py CHANGED
@@ -1,13 +1,16 @@
  from typing import Any
  
  
- def get_client() -> Any:
+ def get_client(**kwargs: Any) -> Any:
      import boto3
      import botocore
      try:
          boto3.Session().get_credentials().get_frozen_credentials()
-         return boto3.client('s3') # credentials are available
+         config = botocore.config.Config(**kwargs)
+         return boto3.client('s3', config=config) # credentials are available
      except AttributeError:
          # No credentials available, use unsigned mode
-         config = botocore.config.Config(signature_version=botocore.UNSIGNED)
+         config_args = kwargs.copy()
+         config_args['signature_version'] = botocore.UNSIGNED
+         config = botocore.config.Config(**config_args)
          return boto3.client('s3', config=config)
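get_client now forwards its keyword arguments to botocore.config.Config, so callers can tune retry behavior or connection-pool size. A hedged usage sketch; the option values and bucket/key names are illustrative, not taken from this release.

# Illustrative call; max_pool_connections and retries are standard botocore Config options.
from pixeltable.utils.s3 import get_client

client = get_client(max_pool_connections=25, retries={'max_attempts': 3, 'mode': 'standard'})
client.download_file('my-bucket', 'videos/clip.mp4', '/tmp/clip.mp4')  # placeholder bucket/key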