pixeltable 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/column.py +41 -29
- pixeltable/catalog/globals.py +18 -0
- pixeltable/catalog/insertable_table.py +30 -10
- pixeltable/catalog/table.py +198 -86
- pixeltable/catalog/table_version.py +47 -53
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +17 -18
- pixeltable/dataframe.py +27 -36
- pixeltable/env.py +7 -0
- pixeltable/exec/__init__.py +0 -1
- pixeltable/exec/aggregation_node.py +6 -3
- pixeltable/exec/cache_prefetch_node.py +189 -43
- pixeltable/exec/data_row_batch.py +5 -22
- pixeltable/exec/exec_context.py +2 -2
- pixeltable/exec/exec_node.py +3 -2
- pixeltable/exec/expr_eval_node.py +23 -16
- pixeltable/exec/in_memory_data_node.py +6 -3
- pixeltable/exec/sql_node.py +24 -25
- pixeltable/exprs/arithmetic_expr.py +12 -5
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +97 -14
- pixeltable/exprs/comparison.py +10 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +27 -18
- pixeltable/exprs/expr.py +53 -52
- pixeltable/exprs/expr_set.py +5 -0
- pixeltable/exprs/function_call.py +32 -16
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +6 -11
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +12 -11
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +7 -5
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/func/aggregate_function.py +9 -9
- pixeltable/func/expr_template_function.py +6 -5
- pixeltable/func/function.py +11 -10
- pixeltable/func/udf.py +6 -11
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/globals.py +5 -7
- pixeltable/functions/huggingface.py +155 -45
- pixeltable/functions/llama_cpp.py +107 -0
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +9 -0
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +5 -2
- pixeltable/globals.py +67 -26
- pixeltable/index/btree.py +16 -3
- pixeltable/index/embedding_index.py +4 -4
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +96 -2
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +1 -1
- pixeltable/iterators/video.py +120 -63
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +45 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/plan.py +17 -15
- pixeltable/py.typed +0 -0
- pixeltable/store.py +7 -2
- pixeltable/tool/create_test_db_dump.py +1 -1
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +28 -5
- pixeltable/type_system.py +100 -36
- pixeltable/utils/coco.py +5 -5
- pixeltable/utils/documents.py +15 -1
- pixeltable/utils/formatter.py +12 -13
- pixeltable/utils/s3.py +6 -3
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/METADATA +158 -49
- pixeltable-0.2.23.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable-0.2.21.dist-info/RECORD +0 -148
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/util.py  CHANGED

@@ -4,7 +4,7 @@ from typing import Any, Callable, Optional
 
 import sqlalchemy as sql
 
-from pixeltable.metadata.schema import Table
+from pixeltable.metadata.schema import Table, TableSchemaVersion
 
 __logger = logging.getLogger('pixeltable')
 
@@ -17,12 +17,12 @@ def convert_table_md(
     substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
 ) -> None:
     """
-    Converts
+    Converts schema.TableMd dicts based on the specified conversion functions.
 
     Args:
         engine: The SQLAlchemy engine.
-        table_md_updater: A function that updates
-        column_md_updater: A function that updates
+        table_md_updater: A function that updates schema.TableMd dicts in place.
+        column_md_updater: A function that updates schema.ColumnMd dicts in place.
         external_store_md_updater: A function that updates the external store metadata in place.
         substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
             recursively, and `substitution_fn` will be called once for each metadata entry. If the entry appears in
@@ -90,3 +90,44 @@ def __substitute_md_rec(
         return updated_list
     else:
         return md
+
+
+def convert_table_schema_version_md(
+    engine: sql.engine.Engine,
+    table_schema_version_md_updater: Optional[Callable[[dict], None]] = None,
+    schema_column_updater: Optional[Callable[[dict], None]] = None
+) -> None:
+    """
+    Converts schema.TableSchemaVersionMd dicts based on the specified conversion functions.
+
+    Args:
+        engine: The SQLAlchemy engine.
+        table_schema_version_md_updater: A function that updates schema.TableSchemaVersionMd dicts in place.
+        schema_column_updater: A function that updates schema.SchemaColumn dicts in place.
+    """
+    with engine.begin() as conn:
+        stmt = sql.select(TableSchemaVersion.tbl_id, TableSchemaVersion.schema_version, TableSchemaVersion.md)
+        for row in conn.execute(stmt):
+            tbl_id, schema_version, md = row[0], row[1], row[2]
+            assert isinstance(md, dict)
+            updated_md = copy.deepcopy(md)
+            if table_schema_version_md_updater is not None:
+                table_schema_version_md_updater(updated_md)
+            if schema_column_updater is not None:
+                __update_schema_column(updated_md, schema_column_updater)
+            if updated_md != md:
+                __logger.info(f'Updating TableSchemaVersion(tbl_id={tbl_id}, schema_version={schema_version})')
+                update_stmt = (
+                    sql.update(TableSchemaVersion)
+                    .where(TableSchemaVersion.tbl_id == tbl_id)
+                    .where(TableSchemaVersion.schema_version == schema_version)
+                    .values(md=updated_md)
+                )
+                conn.execute(update_stmt)
+
+
+def __update_schema_column(table_schema_version_md: dict, schema_column_updater: Callable[[dict], None]) -> None:
+    cols = table_schema_version_md['columns']
+    assert isinstance(cols, dict)
+    for schema_col in cols.values():
+        schema_column_updater(schema_col)
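The new `convert_table_schema_version_md` helper mirrors `convert_table_md` but rewrites the `tableschemaversions` rows. Below is a minimal sketch of how a metadata converter might use it; the updater functions and the default `media_validation` values are illustrative assumptions, not the actual `convert_21.py` implementation.

```python
# Hypothetical converter sketch (not the actual pixeltable/metadata/converters/convert_21.py):
# backfill the media_validation fields introduced in metadata version 22.
import sqlalchemy as sql

from pixeltable.metadata.converters.util import convert_table_schema_version_md


def _update_table_schema_version_md(md: dict) -> None:
    # assumed default: table-level validation strategy, stored lowercase
    md.setdefault('media_validation', 'on_write')


def _update_schema_column(schema_col: dict) -> None:
    # assumed default: per-column strategy unset, falling back to the table default
    schema_col.setdefault('media_validation', None)


def convert(engine: sql.engine.Engine) -> None:
    convert_table_schema_version_md(
        engine,
        table_schema_version_md_updater=_update_table_schema_version_md,
        schema_column_updater=_update_schema_column,
    )
```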
pixeltable/metadata/notes.py  CHANGED

@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    22: 'TableMd/ColumnMd.media_validation',
     21: 'Separate InlineArray and InlineList',
     20: 'Store DB timestamps in UTC',
     19: 'UDF renames; ImageMemberAccess removal',
pixeltable/metadata/schema.py  CHANGED

@@ -202,6 +202,10 @@ class SchemaColumn:
     pos: int
     name: str
 
+    # media validation strategy of this particular media column; if not set, TableMd.media_validation applies
+    # stores column.MediaValiation.name.lower()
+    media_validation: Optional[str]
+
 
 @dataclasses.dataclass
 class TableSchemaVersionMd:
@@ -214,6 +218,10 @@ class TableSchemaVersionMd:
     num_retained_versions: int
     comment: str
 
+    # default validation strategy for any media column of this table
+    # stores column.MediaValiation.name.lower()
+    media_validation: str
+
 
 # versioning: each table schema change results in a new record
 class TableSchemaVersion(Base):
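For orientation, here is an illustrative example of where the new fields sit in stored schema-version metadata. The values and the set of sibling keys are placeholders, not a dump of real metadata.

```python
# Illustrative only: placeholder values showing the new media_validation fields
# in a serialized TableSchemaVersionMd dict (other keys omitted).
table_schema_version_md = {
    'schema_version': 22,
    'media_validation': 'on_write',        # table-level default (MediaValidation.name.lower())
    'columns': {
        '0': {'pos': 0, 'name': 'video', 'media_validation': None},       # falls back to table default
        '1': {'pos': 1, 'name': 'frame', 'media_validation': 'on_read'},  # per-column override
    },
}
```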
pixeltable/plan.py  CHANGED

@@ -1,4 +1,4 @@
-from typing import Any, Iterable, Optional, Sequence
+from typing import Any, Iterable, Optional, Sequence
 from uuid import UUID
 
 import sqlalchemy as sql
@@ -225,27 +225,28 @@ class Planner:
         assert not tbl.is_view()
         # stored_cols: all cols we need to store, incl computed cols (and indices)
         stored_cols = [c for c in tbl.cols if c.is_stored]
-        assert len(stored_cols) > 0
-
+        assert len(stored_cols) > 0  # there needs to be something to store
         row_builder = exprs.RowBuilder([], stored_cols, [])
 
         # create InMemoryDataNode for 'rows'
-        stored_col_info = row_builder.output_slot_idxs()
-        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
-        input_col_info = [info for info in stored_col_info if not info.col.is_computed]
         plan: exec.ExecNode = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
 
-
-
-
-
-
+        media_input_col_info = [
+            exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
+            for col_ref in row_builder.input_exprs
+            if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
+        ]
+        if len(media_input_col_info) > 0:
+            # prefetch external files for all input column refs
+            plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
 
-        computed_exprs =
+        computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         if len(computed_exprs) > 0:
             # add an ExprEvalNode when there are exprs to compute
             plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)
 
+        stored_col_info = row_builder.output_slot_idxs()
+        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
         plan.set_stored_img_cols(stored_img_col_info)
         plan.set_ctx(
             exec.ExecContext(
@@ -621,7 +622,8 @@ class Planner:
         assert isinstance(tbl, catalog.TableVersionPath)
         sql_elements = analyzer.sql_elements
         is_python_agg = (
-            not sql_elements.
+            not sql_elements.contains_all(analyzer.agg_fn_calls)
+            or not sql_elements.contains_all(analyzer.window_fn_calls)
         )
         ctx = exec.ExecContext(row_builder)
         cls._verify_ordering(analyzer, verify_agg=is_python_agg)
@@ -671,8 +673,8 @@ class Planner:
         ctx.batch_size = 16
 
         # do aggregation in SQL if all agg exprs can be translated
-        if (sql_elements.
-            and sql_elements.
+        if (sql_elements.contains_all(analyzer.select_list)
+            and sql_elements.contains_all(analyzer.grouping_exprs)
             and isinstance(plan, exec.SqlNode)
             and plan.to_cte() is not None):
             plan = exec.SqlAggregationNode(
pixeltable/py.typed  ADDED (empty PEP 561 type-marker file; no content)
pixeltable/store.py  CHANGED

@@ -303,7 +303,7 @@ class StoreBase:
 
     def insert_rows(
         self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
-        show_progress: bool = True, rowids: Optional[Iterator[int]] = None
+        show_progress: bool = True, rowids: Optional[Iterator[int]] = None, abort_on_exc: bool = False
     ) -> tuple[int, int, set[int]]:
         """Insert rows into the store table and update the catalog table's md
         Returns:
@@ -325,8 +325,13 @@ class StoreBase:
             for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
                 # compute batch of rows and convert them into table rows
                 table_rows: list[dict[str, Any]] = []
-
+                batch_stop_idx = min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))
+                for row_idx in range(batch_start_idx, batch_stop_idx):
                     row = row_batch[row_idx]
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
 
                     rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
                     pk = rowid + (v_min,)
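Aside from the new `abort_on_exc` check, the rewritten loop clips the final batch to the end of the row list. A standalone sketch of that chunking arithmetic (batch size and data are placeholders):

```python
# Standalone illustration of the chunking logic used by insert_rows();
# INSERT_BATCH_SIZE and row_batch are placeholders.
INSERT_BATCH_SIZE = 1000
row_batch = list(range(2500))

for batch_start_idx in range(0, len(row_batch), INSERT_BATCH_SIZE):
    batch_stop_idx = min(batch_start_idx + INSERT_BATCH_SIZE, len(row_batch))
    batch = row_batch[batch_start_idx:batch_stop_idx]
    # yields chunks of 1000, 1000, and 500 rows
```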
pixeltable/tool/create_test_db_dump.py  CHANGED

@@ -153,7 +153,7 @@ class Dumper:
         self.__add_expr_columns(v, 'view')
 
         # snapshot
-        _ = pxt.
+        _ = pxt.create_snapshot('views.snapshot', t.where(t.c2 >= 75))
 
         # view of views
         vv = pxt.create_view('views.view_of_views', v.where(t.c2 >= 25))
pixeltable/tool/embed_udf.py  CHANGED

@@ -6,4 +6,4 @@ import pixeltable as pxt
 # TODO This can go away once we have the ability to inline expr_udf's
 @pxt.expr_udf
 def clip_text_embed(txt: str) -> np.ndarray:
-    return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32')
+    return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32')  # type: ignore[return-value]
pixeltable/tool/mypy_plugin.py  CHANGED

@@ -1,12 +1,15 @@
 from typing import Callable, Optional
 
-from mypy
-from mypy.
+from mypy import nodes
+from mypy.plugin import AnalyzeTypeContext, ClassDefContext, Plugin
+from mypy.plugins.common import add_method_to_class
+from mypy.types import AnyType, Type, TypeOfAny
 
 import pixeltable as pxt
 
 
 class PxtPlugin(Plugin):
+    __UDA_FULLNAME = f'{pxt.uda.__module__}.{pxt.uda.__name__}'
     __TYPE_MAP = {
         pxt.Json: 'typing.Any',
         pxt.Array: 'numpy.ndarray',
@@ -20,13 +23,33 @@ class PxtPlugin:
         for k, v in __TYPE_MAP.items()
     }
 
-    def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext],
+    def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], Type]]:
         if fullname in self.__FULLNAME_MAP:
             subst_name = self.__FULLNAME_MAP[fullname]
             return lambda ctx: pxt_hook(ctx, subst_name)
+        return None
 
-    def
+    def get_class_decorator_hook_2(self, fullname: str) -> Optional[Callable[[ClassDefContext], bool]]:
+        if fullname == self.__UDA_FULLNAME:
+            return pxt_decorator_hook
+        return None
+
+def plugin(version: str) -> type:
     return PxtPlugin
 
 def pxt_hook(ctx: AnalyzeTypeContext, subst_name: str) -> Type:
-
+    if subst_name == 'typing.Any':
+        return AnyType(TypeOfAny.special_form)
+    return ctx.api.named_type(subst_name, [])
+
+def pxt_decorator_hook(ctx: ClassDefContext) -> bool:
+    arg = nodes.Argument(nodes.Var('fn'), AnyType(TypeOfAny.special_form), None, nodes.ARG_POS)
+    add_method_to_class(
+        ctx.api,
+        ctx.cls,
+        "to_sql",
+        args=[arg],
+        return_type=AnyType(TypeOfAny.special_form),
+        is_staticmethod=True,
+    )
+    return True
pixeltable/type_system.py  CHANGED

@@ -3,16 +3,18 @@ from __future__ import annotations
 import abc
 import datetime
 import enum
+import io
 import json
+import types
 import typing
 import urllib.parse
 import urllib.request
 from pathlib import Path
 from typing import Any, Iterable, Mapping, Optional, Sequence, Union
 
+import PIL.Image
 import av  # type: ignore
 import numpy as np
-import PIL.Image
 import sqlalchemy as sql
 from typing import _GenericAlias  # type: ignore[attr-defined]
 from typing_extensions import _AnnotatedAlias
@@ -271,63 +273,110 @@ class ColumnType:
         return inferred_type
 
     @classmethod
-    def from_python_type(
-
+    def from_python_type(
+        cls,
+        t: Union[type, _GenericAlias],
+        nullable_default: bool = False,
+        allow_builtin_types: bool = True
+    ) -> Optional[ColumnType]:
+        """
+        Convert a Python type into a Pixeltable `ColumnType` instance.
+
+        Args:
+            t: The Python type.
+            nullable_default: If True, then the returned `ColumnType` will be nullable unless it is marked as
+                `Required`.
+            allow_builtin_types: If True, then built-in types such as `str`, `int`, `float`, etc., will be
+                allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
+                `pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
+                designations will be allowed regardless.
+        """
+        origin = typing.get_origin(t)
+        if origin is typing.Union:
+            # Check if `t` has the form Optional[T].
             union_args = typing.get_args(t)
             if len(union_args) == 2 and type(None) in union_args:
                 # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
                 # We treat it as the underlying type but with nullable=True.
                 underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
-                underlying = cls.from_python_type(underlying_py_type)
+                underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
                 if underlying is not None:
                     return underlying.copy(nullable=True)
-        elif
+        elif origin is Required:
+            required_args = typing.get_args(t)
+            assert len(required_args) == 1
+            return cls.from_python_type(
+                required_args[0],
+                nullable_default=False,
+                allow_builtin_types=allow_builtin_types
+            )
+        elif origin is typing.Annotated:
             annotated_args = typing.get_args(t)
             origin = annotated_args[0]
             parameters = annotated_args[1]
             if isinstance(parameters, ColumnType):
                 return parameters.copy(nullable=nullable_default)
-        elif typing.get_origin(t) is Required:
-            required_args = typing.get_args(t)
-            assert len(required_args) == 1
-            return cls.from_python_type(required_args[0], nullable_default=False)
         else:
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # It's something other than Optional[T], Required[T], or an explicitly annotated type.
+            if origin is not None:
+                # Discard type parameters to ensure that parameterized types such as `list[T]`
+                # are correctly mapped to Pixeltable types.
+                t = origin
+            if isinstance(t, type) and issubclass(t, _PxtType):
+                return t.as_col_type(nullable=nullable_default)
+            elif allow_builtin_types:
+                if t is str:
+                    return StringType(nullable=nullable_default)
+                if t is int:
+                    return IntType(nullable=nullable_default)
+                if t is float:
+                    return FloatType(nullable=nullable_default)
+                if t is bool:
+                    return BoolType(nullable=nullable_default)
+                if t is datetime.datetime:
+                    return TimestampType(nullable=nullable_default)
+                if t is PIL.Image.Image:
+                    return ImageType(nullable=nullable_default)
+                if issubclass(t, Sequence) or issubclass(t, Mapping):
+                    return JsonType(nullable=nullable_default)
         return None
 
     @classmethod
-    def normalize_type(
+    def normalize_type(
+        cls,
+        t: Union[ColumnType, type, _AnnotatedAlias],
+        nullable_default: bool = False,
+        allow_builtin_types: bool = True
+    ) -> ColumnType:
         """
         Convert any type recognizable by Pixeltable to its corresponding ColumnType.
         """
         if isinstance(t, ColumnType):
             return t
-        col_type = cls.from_python_type(t, nullable_default)
+        col_type = cls.from_python_type(t, nullable_default, allow_builtin_types)
         if col_type is None:
-
+            cls.__raise_exc_for_invalid_type(t)
        return col_type
 
+    __TYPE_SUGGESTIONS: list[tuple[type, str]] = [
+        (str, 'pxt.String'),
+        (bool, 'pxt.Bool'),
+        (int, 'pxt.Int'),
+        (float, 'pxt.Float'),
+        (datetime.datetime, 'pxt.Timestamp'),
+        (PIL.Image.Image, 'pxt.Image'),
+        (Sequence, 'pxt.Json'),
+        (Mapping, 'pxt.Json'),
+    ]
+
+    @classmethod
+    def __raise_exc_for_invalid_type(cls, t: Union[type, _AnnotatedAlias]) -> None:
+        for builtin_type, suggestion in cls.__TYPE_SUGGESTIONS:
+            if t is builtin_type or (isinstance(t, type) and issubclass(t, builtin_type)):
+                name = t.__name__ if t.__module__ == 'builtins' else f'{t.__module__}.{t.__name__}'
+                raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
+        raise excs.Error(f'Unknown type: {t}')
+
     def validate_literal(self, val: Any) -> None:
         """Raise TypeError if val is not a valid literal for this type"""
         if val is None:
@@ -798,6 +847,20 @@ class ImageType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, str) and val.startswith('data:'):
+            # try parsing this as a `data:` URL, and if successful, decode the image immediately
+            try:
+                with urllib.request.urlopen(val) as response:
+                    b = response.read()
+                img = PIL.Image.open(io.BytesIO(b))
+                img.load()
+                return img
+            except Exception as exc:
+                errormsg_val = val if len(val) < 50 else val[:50] + '...'
+                raise excs.Error(f'data URL could not be decoded into a valid image: {errormsg_val}') from exc
+        return val
+
     def _validate_literal(self, val: Any) -> None:
         if isinstance(val, PIL.Image.Image):
             return
@@ -876,6 +939,7 @@ class DocumentType(ColumnType):
         HTML = 0
         MD = 1
         PDF = 2
+        XML = 3
 
     def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
         super().__init__(self.Type.DOCUMENT, nullable=nullable)
@@ -963,7 +1027,7 @@ class Array(np.ndarray, _PxtType):
     `item` (the type subscript) must be a tuple with exactly two elements (in any order):
     - A tuple of `Optional[int]`s, specifying the shape of the array
    - A type, specifying the dtype of the array
-    Example: Array[(3, None, 2),
+    Example: Array[(3, None, 2), pxt.Float]
     """
     params = item if isinstance(item, tuple) else (item,)
     shape: Optional[tuple] = None
@@ -978,7 +1042,7 @@ class Array(np.ndarray, _PxtType):
         elif isinstance(param, type) or isinstance(param, _AnnotatedAlias):
             if dtype is not None:
                 raise TypeError(f'Duplicate Array type parameter: {param}')
-            dtype = ColumnType.
+            dtype = ColumnType.normalize_type(param, allow_builtin_types=False)
         else:
             raise TypeError(f'Invalid Array type parameter: {param}')
         if shape is None:
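The new `allow_builtin_types` flag separates UDF signatures (where `str`, `int`, `PIL.Image.Image`, etc. are accepted) from schema and `Array` dtype contexts (where only Pixeltable types are). A small sketch of the behavior; `ColumnType.normalize_type` is internal API and the error text in the comment paraphrases the diff above.

```python
# Sketch of the new allow_builtin_types behavior (internal API; illustrative only).
import pixeltable as pxt
import pixeltable.exceptions as excs
from pixeltable.type_system import ColumnType

print(ColumnType.normalize_type(pxt.Int))                        # Pixeltable type: always accepted
print(ColumnType.normalize_type(int, allow_builtin_types=True))  # builtin accepted (UDF-style context)

try:
    ColumnType.normalize_type(int, allow_builtin_types=False)    # schema-style context
except excs.Error as e:
    print(e)  # suggests using pxt.Int instead of the builtin `int`
```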
pixeltable/utils/coco.py  CHANGED

@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import PIL
 
@@ -22,7 +22,7 @@ Required format:
 }
 """
 
-def _verify_input_dict(input_dict:
+def _verify_input_dict(input_dict: dict[str, Any]) -> None:
     """Verify that input_dict is a valid input dict for write_coco_dataset()"""
     if not isinstance(input_dict, dict):
         raise excs.Error(f'Expected dict, got {input_dict}{format_msg}')
@@ -61,11 +61,11 @@ def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
     images_dir = dest_path / 'images'
     images_dir.mkdir()
 
-    images:
+    images: list[dict[str, Any]] = []
     img_id = -1
-    annotations:
+    annotations: list[dict[str, Any]] = []
     ann_id = -1
-    categories:
+    categories: set[Any] = set()
     for input_row in df._exec():
         if input_dict_slot_idx == -1:
             input_dict_expr = df._select_list_exprs[0]
pixeltable/utils/documents.py  CHANGED

@@ -35,6 +35,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
         if md_ast is not None:
             return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
 
+    if doc_format == '.xml':
+        bs_doc = get_xml_handle(path)
+        if bs_doc is not None:
+            return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
+
     return None
 
 
@@ -54,7 +59,16 @@ def get_pdf_handle(path: str) -> Optional[fitz.Document]:
 def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
     try:
         with open(path, 'r', encoding='utf8') as fp:
-            doc = bs4.BeautifulSoup(fp, '
+            doc = bs4.BeautifulSoup(fp, 'lxml')
+        return doc if doc.find() is not None else None
+    except Exception:
+        return None
+
+
+def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
+    try:
+        with open(path, 'r', encoding='utf8') as fp:
+            doc = bs4.BeautifulSoup(fp, 'xml')
         return doc if doc.find() is not None else None
     except Exception:
         return None
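The new XML path mirrors the HTML handler: parse with BeautifulSoup's `'xml'` parser (which requires `lxml`) and treat an empty parse as "not a valid XML document". A standalone sketch of the same detection logic; `sniff_xml` is a hypothetical helper name, not part of pixeltable.

```python
# Standalone sketch of the XML-detection logic added above.
from typing import Optional

import bs4


def sniff_xml(path: str) -> Optional[bs4.BeautifulSoup]:
    try:
        with open(path, 'r', encoding='utf8') as fp:
            doc = bs4.BeautifulSoup(fp, 'xml')  # the 'xml' parser needs lxml installed
        return doc if doc.find() is not None else None
    except Exception:
        return None
```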
pixeltable/utils/formatter.py  CHANGED

@@ -1,16 +1,16 @@
 import base64
 import html
+import io
 import json
 import logging
 import mimetypes
 from typing import Any, Callable, Optional
 
+import av  # type: ignore[import-untyped]
+import numpy as np
 import PIL
 import PIL.Image as Image
-import cv2
-import numpy as np
 
-import io
 import pixeltable.type_system as ts
 from pixeltable.utils.http_server import get_file_uri
 
@@ -138,11 +138,11 @@ class Formatter:
         assert isinstance(img, Image.Image), f'Wrong type: {type(img)}'
         # Try to make it look decent in a variety of display scenarios
         if self.__num_rows > 1:
-            width = 240  # Multiple rows: display small images
+            width = min(240, img.width)  # Multiple rows: display small images
         elif self.__num_cols > 1:
-            width = 480  # Multiple columns: display medium images
+            width = min(480, img.width)  # Multiple columns: display medium images
         else:
-            width = 640  # A single image: larger display
+            width = min(640, img.width)  # A single image: larger display
         with io.BytesIO() as buffer:
             img.save(buffer, 'webp')
             img_base64 = base64.b64encode(buffer.getvalue()).decode()
@@ -159,17 +159,16 @@ class Formatter:
         # the video itself is not accessible.
         # TODO(aaron-siegel): If the video is backed by a concrete external URL,
         # should we link to that instead?
-
-
-
-
-        img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
-        thumb = PIL.Image.fromarray(img_array)
+        with av.open(file_path) as container:
+            try:
+                thumb = next(container.decode(video=0)).to_image()
+                assert isinstance(thumb, Image.Image)
                 with io.BytesIO() as buffer:
                     thumb.save(buffer, 'jpeg')
                     thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
                     thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
-
+            except Exception:
+                pass
         if self.__num_rows > 1:
             width = 320
         elif self.__num_cols > 1:
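The video poster frame is now extracted with PyAV instead of OpenCV. A self-contained sketch of the same approach, where 'video.mp4' is a placeholder path:

```python
# Sketch of the PyAV-based poster-frame extraction that replaces cv2 here.
import base64
import io

import av
import PIL.Image

with av.open('video.mp4') as container:
    frame = next(container.decode(video=0))  # first decoded video frame
    thumb = frame.to_image()                 # av VideoFrame -> PIL.Image

assert isinstance(thumb, PIL.Image.Image)
with io.BytesIO() as buffer:
    thumb.save(buffer, 'jpeg')
    poster = f'data:image/jpeg;base64,{base64.b64encode(buffer.getvalue()).decode()}'
```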
pixeltable/utils/s3.py  CHANGED

@@ -1,13 +1,16 @@
 from typing import Any
 
 
-def get_client() -> Any:
+def get_client(**kwargs: Any) -> Any:
     import boto3
     import botocore
     try:
         boto3.Session().get_credentials().get_frozen_credentials()
-
+        config = botocore.config.Config(**kwargs)
+        return boto3.client('s3', config=config)  # credentials are available
     except AttributeError:
         # No credentials available, use unsigned mode
-
+        config_args = kwargs.copy()
+        config_args['signature_version'] = botocore.UNSIGNED
+        config = botocore.config.Config(**config_args)
         return boto3.client('s3', config=config)
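`get_client()` now forwards keyword arguments to `botocore.config.Config`, so callers can tune the client in both the signed and unsigned paths. An illustrative call; the specific options, bucket, and key are examples, not defaults used by pixeltable.

```python
# Example use of the updated helper; Config options shown are illustrative.
from pixeltable.utils.s3 import get_client

client = get_client(retries={'max_attempts': 5}, max_pool_connections=32)
client.download_file('some-bucket', 'videos/clip.mp4', '/tmp/clip.mp4')  # hypothetical bucket/key
```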