pixeltable 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +3 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +14 -2
- pixeltable/catalog/insertable_table.py +32 -17
- pixeltable/catalog/table.py +194 -12
- pixeltable/catalog/table_version.py +270 -110
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +156 -73
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +11 -12
- pixeltable/exprs/function_call.py +0 -3
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/similarity_expr.py +5 -3
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/aggregate_function.py +2 -2
- pixeltable/func/expr_template_function.py +3 -1
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +25 -1
- pixeltable/functions/openai.py +15 -10
- pixeltable/functions/together.py +11 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +20 -2
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +6 -1
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +4 -1
- pixeltable/io/__init__.py +1 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/iterators/base.py +4 -4
- pixeltable/iterators/document.py +26 -15
- pixeltable/iterators/video.py +9 -1
- pixeltable/metadata/__init__.py +2 -2
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +9 -5
- pixeltable/store.py +14 -21
- pixeltable/tool/create_test_db_dump.py +16 -0
- pixeltable/type_system.py +14 -4
- pixeltable/utils/coco.py +94 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/RECORD +53 -46
- pixeltable/func/nos_function.py +0 -202
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.6.dist-info/METADATA +0 -131
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +0 -0
pixeltable/iterators/video.py
CHANGED
|
@@ -6,14 +6,22 @@ from typing import Dict, Any, List, Tuple
|
|
|
6
6
|
import PIL.Image
|
|
7
7
|
import cv2
|
|
8
8
|
|
|
9
|
-
from pixeltable import exprs
|
|
10
9
|
from pixeltable.exceptions import Error
|
|
11
10
|
from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
|
|
12
11
|
from .base import ComponentIterator
|
|
13
12
|
|
|
14
13
|
_logger = logging.getLogger('pixeltable')
|
|
15
14
|
|
|
15
|
+
|
|
16
16
|
class FrameIterator(ComponentIterator):
|
|
17
|
+
"""Iterator over frames of a video.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
video: URL or file of the video to use for frame extraction
|
|
21
|
+
fps: number of frames to extract per second of video. This may be a fractional value, such as 0.5.
|
|
22
|
+
If set to 0.0, then the native framerate of the video will be used (all frames will be extracted).
|
|
23
|
+
Default: 0.0
|
|
24
|
+
"""
|
|
17
25
|
def __init__(self, video: str, *, fps: float = 0.0):
|
|
18
26
|
video_path = Path(video)
|
|
19
27
|
assert video_path.exists() and video_path.is_file()
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,11 +10,11 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 16
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
17
|
-
"""Create the
|
|
17
|
+
"""Create the system metadata record"""
|
|
18
18
|
system_md = SystemInfoMd(schema_version=VERSION)
|
|
19
19
|
record = SystemInfo(md=dataclasses.asdict(system_md))
|
|
20
20
|
with orm.Session(engine, future=True) as session:
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import sqlalchemy as sql
|
|
2
|
+
|
|
3
|
+
from pixeltable.metadata.schema import Table
|
|
4
|
+
from pixeltable.metadata import register_converter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def convert_14(engine: sql.engine.Engine) -> None:
|
|
8
|
+
default_remotes = {'remotes': []}
|
|
9
|
+
with engine.begin() as conn:
|
|
10
|
+
conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
register_converter(14, convert_14)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def convert_15(engine: sql.engine.Engine) -> None:
|
|
10
|
+
convert_table_md(engine, column_md_updater=update_column_md, remote_md_updater=update_remote_md)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def update_column_md(column_md: dict) -> None:
|
|
14
|
+
column_md['proxy_base'] = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def update_remote_md(remote_md: dict) -> None:
|
|
18
|
+
remote_md['class'] = f'{remote_md["module"]}.{remote_md["class"]}'
|
|
19
|
+
del remote_md['module']
|
|
20
|
+
if remote_md['class'] == 'pixeltable.datatransfer.remote.MockRemote':
|
|
21
|
+
remote_md['remote_md']['name'] = f'remote_{uuid.uuid4()}'
|
|
22
|
+
elif remote_md['class'] == 'pixeltable.datatransfer.label_studio.LabelStudioProject':
|
|
23
|
+
# 'post' is the media_import_method for legacy LabelStudioProject remotes
|
|
24
|
+
remote_md['remote_md']['media_import_method'] = 'post'
|
|
25
|
+
else:
|
|
26
|
+
assert False, remote_md['class']
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
register_converter(15, convert_15)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Callable, Optional
|
|
4
|
+
|
|
5
|
+
import sqlalchemy as sql
|
|
6
|
+
|
|
7
|
+
from pixeltable.metadata.schema import Table
|
|
8
|
+
|
|
9
|
+
__logger = logging.getLogger('pixeltable')
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def convert_table_md(
|
|
13
|
+
engine: sql.engine.Engine,
|
|
14
|
+
column_md_updater: Optional[Callable[[dict], None]] = None,
|
|
15
|
+
remote_md_updater: Optional[Callable[[dict], None]] = None,
|
|
16
|
+
substitution_fn: Optional[Callable[[Any, Any], Optional[tuple[Any, Any]]]] = None
|
|
17
|
+
) -> None:
|
|
18
|
+
with engine.begin() as conn:
|
|
19
|
+
for row in conn.execute(sql.select(Table)):
|
|
20
|
+
id = row[0]
|
|
21
|
+
table_md = row[2]
|
|
22
|
+
assert isinstance(table_md, dict)
|
|
23
|
+
updated_table_md = copy.deepcopy(table_md)
|
|
24
|
+
if column_md_updater is not None:
|
|
25
|
+
__update_column_md(updated_table_md, column_md_updater)
|
|
26
|
+
if remote_md_updater is not None:
|
|
27
|
+
__update_remote_md(updated_table_md, remote_md_updater)
|
|
28
|
+
if substitution_fn is not None:
|
|
29
|
+
updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
|
|
30
|
+
if updated_table_md != table_md:
|
|
31
|
+
__logger.info(f'Updating schema for table: {id}')
|
|
32
|
+
conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
|
|
36
|
+
columns_md = table_md['column_md']
|
|
37
|
+
assert isinstance(columns_md, dict)
|
|
38
|
+
for column_md in columns_md.values():
|
|
39
|
+
column_md_updater(column_md)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def __update_remote_md(table_md: dict, remote_md_updater: Callable[[dict], None]) -> None:
|
|
43
|
+
remotes_md = table_md['remotes']
|
|
44
|
+
assert isinstance(remotes_md, list)
|
|
45
|
+
for remote_md in remotes_md:
|
|
46
|
+
remote_md_updater(remote_md)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def __substitute_md_rec(md: Any, substitution_fn: Callable[[Any, Any], Optional[tuple[Any, Any]]]) -> Any:
|
|
50
|
+
if isinstance(md, dict):
|
|
51
|
+
updated_md = {}
|
|
52
|
+
for k, v in md.items():
|
|
53
|
+
substitute = substitution_fn(k, v)
|
|
54
|
+
if substitute is not None:
|
|
55
|
+
updated_k, updated_v = substitute
|
|
56
|
+
updated_md[updated_k] = updated_v
|
|
57
|
+
else:
|
|
58
|
+
updated_md[k] = __substitute_md_rec(v, substitution_fn)
|
|
59
|
+
return updated_md
|
|
60
|
+
elif isinstance(md, list):
|
|
61
|
+
return [__substitute_md_rec(v, substitution_fn) for v in md]
|
|
62
|
+
else:
|
|
63
|
+
return md
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
|
|
2
|
-
import platform
|
|
3
|
-
import uuid
|
|
4
1
|
import dataclasses
|
|
2
|
+
import uuid
|
|
3
|
+
from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
|
|
5
4
|
|
|
6
5
|
import sqlalchemy as sql
|
|
7
|
-
from sqlalchemy import
|
|
6
|
+
from sqlalchemy import ForeignKey
|
|
7
|
+
from sqlalchemy import Integer, BigInteger, LargeBinary
|
|
8
8
|
from sqlalchemy.dialects.postgresql import UUID, JSONB
|
|
9
|
-
from sqlalchemy import ForeignKey, UniqueConstraint, ForeignKeyConstraint
|
|
10
9
|
from sqlalchemy.orm import declarative_base
|
|
11
10
|
|
|
12
11
|
Base = declarative_base()
|
|
@@ -93,6 +92,9 @@ class ColumnMd:
|
|
|
93
92
|
# if True, the column is present in the stored table
|
|
94
93
|
stored: Optional[bool]
|
|
95
94
|
|
|
95
|
+
# if specified, the column is a stored proxy of another column
|
|
96
|
+
proxy_base: Optional[int]
|
|
97
|
+
|
|
96
98
|
|
|
97
99
|
@dataclasses.dataclass
|
|
98
100
|
class IndexMd:
|
|
@@ -143,6 +145,10 @@ class TableMd:
|
|
|
143
145
|
# - every row is assigned a unique and immutable rowid on insertion
|
|
144
146
|
next_row_id: int
|
|
145
147
|
|
|
148
|
+
# Metadata format for remotes:
|
|
149
|
+
# {'class': 'pixeltable.datatransfer.label_studio.LabelStudioProject', 'remote_md': {'project_id': 3}}
|
|
150
|
+
remotes: list[dict[str, Any]]
|
|
151
|
+
|
|
146
152
|
column_md: dict[int, ColumnMd] # col_id -> ColumnMd
|
|
147
153
|
index_md: dict[int, IndexMd] # index_id -> IndexMd
|
|
148
154
|
view_md: Optional[ViewMd]
|
|
@@ -160,7 +166,7 @@ class Table(Base):
|
|
|
160
166
|
|
|
161
167
|
MAX_VERSION = 9223372036854775807 # 2^63 - 1
|
|
162
168
|
|
|
163
|
-
id = sql.Column(UUID(as_uuid=True), primary_key=True,
|
|
169
|
+
id = sql.Column(UUID(as_uuid=True), primary_key=True, nullable=False)
|
|
164
170
|
dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
|
|
165
171
|
md = sql.Column(JSONB, nullable=False) # TableMd
|
|
166
172
|
|
pixeltable/plan.py
CHANGED
|
@@ -251,7 +251,7 @@ class Planner:
|
|
|
251
251
|
Returns:
|
|
252
252
|
- root node of the plan
|
|
253
253
|
- list of qualified column names that are getting updated
|
|
254
|
-
- list of columns that are being recomputed
|
|
254
|
+
- list of user-visible columns that are being recomputed
|
|
255
255
|
"""
|
|
256
256
|
# retrieve all stored cols and all target exprs
|
|
257
257
|
assert isinstance(tbl, catalog.TableVersionPath)
|
|
@@ -260,7 +260,10 @@ class Planner:
|
|
|
260
260
|
if len(recompute_targets) > 0:
|
|
261
261
|
recomputed_cols = recompute_targets.copy()
|
|
262
262
|
else:
|
|
263
|
-
recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else
|
|
263
|
+
recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
|
|
264
|
+
# regardless of cascade, we need to update all indices on any updated column
|
|
265
|
+
idx_val_cols = target.get_idx_val_columns(updated_cols)
|
|
266
|
+
recomputed_cols.update(idx_val_cols)
|
|
264
267
|
# we only need to recompute stored columns (unstored ones are substituted away)
|
|
265
268
|
recomputed_cols = {c for c in recomputed_cols if c.is_stored}
|
|
266
269
|
recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
|
|
@@ -273,8 +276,8 @@ class Planner:
|
|
|
273
276
|
recomputed_exprs = \
|
|
274
277
|
[c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
|
|
275
278
|
# recomputed cols reference the new values of the updated cols
|
|
276
|
-
for col, e in update_targets.items()
|
|
277
|
-
|
|
279
|
+
spec = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
|
|
280
|
+
exprs.Expr.list_substitute(recomputed_exprs, spec)
|
|
278
281
|
select_list.extend(recomputed_exprs)
|
|
279
282
|
|
|
280
283
|
# we need to retrieve the PK columns of the existing rows
|
|
@@ -282,7 +285,8 @@ class Planner:
|
|
|
282
285
|
all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols) # same order as select_list
|
|
283
286
|
# update row builder with column information
|
|
284
287
|
[plan.row_builder.add_table_column(col, select_list[i].slot_idx) for i, col in enumerate(all_base_cols)]
|
|
285
|
-
|
|
288
|
+
recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
|
|
289
|
+
return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
|
|
286
290
|
|
|
287
291
|
@classmethod
|
|
288
292
|
def create_view_update_plan(
|
pixeltable/store.py
CHANGED
|
@@ -66,7 +66,6 @@ class StoreBase:
|
|
|
66
66
|
"""Create self.sa_tbl from self.tbl_version."""
|
|
67
67
|
system_cols = self._create_system_columns()
|
|
68
68
|
all_cols = system_cols.copy()
|
|
69
|
-
idxs: List[sql.Index] = []
|
|
70
69
|
for col in [c for c in self.tbl_version.cols if c.is_stored]:
|
|
71
70
|
# re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
|
|
72
71
|
# to the last sql.Table version we created and cannot be reused
|
|
@@ -76,26 +75,18 @@ class StoreBase:
|
|
|
76
75
|
all_cols.append(col.sa_errormsg_col)
|
|
77
76
|
all_cols.append(col.sa_errortype_col)
|
|
78
77
|
|
|
79
|
-
# we create an index for:
|
|
80
|
-
# - scalar columns (except for strings, because long strings can't be used for B-tree indices)
|
|
81
|
-
# - non-computed video and image columns (they will contain external paths/urls that users might want to
|
|
82
|
-
# filter on)
|
|
83
|
-
if (col.col_type.is_scalar_type() and not col.col_type.is_string_type()) \
|
|
84
|
-
or (col.col_type.is_media_type() and not col.is_computed):
|
|
85
|
-
# index names need to be unique within the Postgres instance
|
|
86
|
-
idx_name = f'idx_{col.id}_{self.tbl_version.id.hex}'
|
|
87
|
-
idxs.append(sql.Index(idx_name, col.sa_col))
|
|
88
|
-
|
|
89
78
|
if self.sa_tbl is not None:
|
|
90
79
|
# if we're called in response to a schema change, we need to remove the old table first
|
|
91
80
|
self.sa_md.remove(self.sa_tbl)
|
|
92
81
|
|
|
82
|
+
idxs: List[sql.Index] = []
|
|
93
83
|
# index for all system columns:
|
|
94
84
|
# - base x view joins can be executed as merge joins
|
|
95
85
|
# - speeds up ORDER BY rowid DESC
|
|
96
86
|
# - allows filtering for a particular table version in index scan
|
|
97
87
|
idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
|
|
98
88
|
idxs.append(sql.Index(idx_name, *system_cols))
|
|
89
|
+
|
|
99
90
|
# v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
|
|
100
91
|
idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
|
|
101
92
|
idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using='brin'))
|
|
@@ -264,7 +255,8 @@ class StoreBase:
|
|
|
264
255
|
return num_excs
|
|
265
256
|
|
|
266
257
|
def insert_rows(
|
|
267
|
-
self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None
|
|
258
|
+
self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
|
|
259
|
+
show_progress: bool = True
|
|
268
260
|
) -> Tuple[int, int, Set[int]]:
|
|
269
261
|
"""Insert rows into the store table and update the catalog table's md
|
|
270
262
|
Returns:
|
|
@@ -293,15 +285,16 @@ class StoreBase:
|
|
|
293
285
|
self._create_table_row(row, row_builder, media_cols, cols_with_excs, v_min=v_min)
|
|
294
286
|
num_excs += num_row_exc
|
|
295
287
|
table_rows.append(table_row)
|
|
296
|
-
if
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
288
|
+
if show_progress:
|
|
289
|
+
if progress_bar is None:
|
|
290
|
+
warnings.simplefilter("ignore", category=TqdmWarning)
|
|
291
|
+
progress_bar = tqdm(
|
|
292
|
+
desc=f'Inserting rows into `{self.tbl_version.name}`',
|
|
293
|
+
unit=' rows',
|
|
294
|
+
ncols=100,
|
|
295
|
+
file=sys.stdout
|
|
296
|
+
)
|
|
297
|
+
progress_bar.update(1)
|
|
305
298
|
self._move_tmp_media_files(table_rows, media_cols, v_min)
|
|
306
299
|
conn.execute(sql.insert(self.sa_tbl), table_rows)
|
|
307
300
|
if progress_bar is not None:
|
|
@@ -30,6 +30,8 @@ class Dumper:
|
|
|
30
30
|
os.environ['PIXELTABLE_DB'] = db_name
|
|
31
31
|
os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')
|
|
32
32
|
|
|
33
|
+
Env._init_env(reinit_db=True)
|
|
34
|
+
|
|
33
35
|
Env.get().configure_logging(level=logging.DEBUG, to_stdout=True)
|
|
34
36
|
|
|
35
37
|
def dump_db(self) -> None:
|
|
@@ -162,6 +164,20 @@ class Dumper:
|
|
|
162
164
|
# astype
|
|
163
165
|
v['astype'] = t.c1.astype(pxt.FloatType())
|
|
164
166
|
|
|
167
|
+
# Add remotes
|
|
168
|
+
from pixeltable.datatransfer.remote import MockRemote
|
|
169
|
+
v.link(
|
|
170
|
+
MockRemote('remote', {'int_field': pxt.IntType()}, {'str_field': pxt.StringType()}),
|
|
171
|
+
col_mapping={'test_udf': 'int_field', 'c1': 'str_field'}
|
|
172
|
+
)
|
|
173
|
+
# We're just trying to test metadata here, so reach "under the covers" and link a fake
|
|
174
|
+
# Label Studio project without validation (so we don't need a real Label Studio server)
|
|
175
|
+
from pixeltable.datatransfer.label_studio import LabelStudioProject
|
|
176
|
+
v.tbl_version_path.tbl_version.link(
|
|
177
|
+
LabelStudioProject(4171780, media_import_method='file'),
|
|
178
|
+
col_mapping={'str_format': 'str_format'}
|
|
179
|
+
)
|
|
180
|
+
|
|
165
181
|
|
|
166
182
|
@pxt.udf(_force_stored=True)
|
|
167
183
|
def test_udf_stored(n: int) -> int:
|
pixeltable/type_system.py
CHANGED
|
@@ -7,7 +7,7 @@ import json
|
|
|
7
7
|
import typing
|
|
8
8
|
import urllib.parse
|
|
9
9
|
import urllib.request
|
|
10
|
-
from copy import
|
|
10
|
+
from copy import deepcopy
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Any, Optional, Tuple, Dict, Callable, List, Union, Sequence, Mapping
|
|
13
13
|
|
|
@@ -82,7 +82,11 @@ class ColumnType:
|
|
|
82
82
|
|
|
83
83
|
def __init__(self, t: Type, nullable: bool = False):
|
|
84
84
|
self._type = t
|
|
85
|
-
self.
|
|
85
|
+
self._nullable = nullable
|
|
86
|
+
|
|
87
|
+
@property
|
|
88
|
+
def nullable(self) -> bool:
|
|
89
|
+
return self._nullable
|
|
86
90
|
|
|
87
91
|
@property
|
|
88
92
|
def type_enum(self) -> Type:
|
|
@@ -91,6 +95,12 @@ class ColumnType:
|
|
|
91
95
|
def serialize(self) -> str:
|
|
92
96
|
return json.dumps(self.as_dict())
|
|
93
97
|
|
|
98
|
+
def copy(self, nullable: Optional[bool] = None) -> ColumnType:
|
|
99
|
+
result = deepcopy(self)
|
|
100
|
+
if nullable is not None:
|
|
101
|
+
result._nullable = nullable
|
|
102
|
+
return result
|
|
103
|
+
|
|
94
104
|
@classmethod
|
|
95
105
|
def serialize_list(cls, type_list: List[ColumnType]) -> str:
|
|
96
106
|
return json.dumps([t.as_dict() for t in type_list])
|
|
@@ -177,7 +187,7 @@ class ColumnType:
|
|
|
177
187
|
if type(self) != type(other):
|
|
178
188
|
return False
|
|
179
189
|
for member_var in vars(self).keys():
|
|
180
|
-
if member_var == '
|
|
190
|
+
if member_var == '_nullable':
|
|
181
191
|
continue
|
|
182
192
|
if getattr(self, member_var) != getattr(other, member_var):
|
|
183
193
|
return False
|
|
@@ -250,7 +260,7 @@ class ColumnType:
|
|
|
250
260
|
# We treat it as the underlying type but with nullable=True.
|
|
251
261
|
underlying = cls.from_python_type(union_args[0])
|
|
252
262
|
if underlying is not None:
|
|
253
|
-
underlying.
|
|
263
|
+
underlying._nullable = True
|
|
254
264
|
return underlying
|
|
255
265
|
else:
|
|
256
266
|
# Discard type parameters to ensure that parameterized types such as `list[T]`
|
pixeltable/utils/coco.py
CHANGED
|
@@ -134,3 +134,97 @@ def write_coco_dataset(df: 'pixeltable.DataFrame', dest_path: Path) -> Path:
|
|
|
134
134
|
json.dump(result, f)
|
|
135
135
|
return output_path
|
|
136
136
|
|
|
137
|
+
|
|
138
|
+
COCO_2017_CATEGORIES = {
|
|
139
|
+
0: 'N/A',
|
|
140
|
+
1: 'person',
|
|
141
|
+
2: 'bicycle',
|
|
142
|
+
3: 'car',
|
|
143
|
+
4: 'motorcycle',
|
|
144
|
+
5: 'airplane',
|
|
145
|
+
6: 'bus',
|
|
146
|
+
7: 'train',
|
|
147
|
+
8: 'truck',
|
|
148
|
+
9: 'boat',
|
|
149
|
+
10: 'traffic light',
|
|
150
|
+
11: 'fire hydrant',
|
|
151
|
+
12: 'N/A',
|
|
152
|
+
13: 'stop sign',
|
|
153
|
+
14: 'parking meter',
|
|
154
|
+
15: 'bench',
|
|
155
|
+
16: 'bird',
|
|
156
|
+
17: 'cat',
|
|
157
|
+
18: 'dog',
|
|
158
|
+
19: 'horse',
|
|
159
|
+
20: 'sheep',
|
|
160
|
+
21: 'cow',
|
|
161
|
+
22: 'elephant',
|
|
162
|
+
23: 'bear',
|
|
163
|
+
24: 'zebra',
|
|
164
|
+
25: 'giraffe',
|
|
165
|
+
26: 'N/A',
|
|
166
|
+
27: 'backpack',
|
|
167
|
+
28: 'umbrella',
|
|
168
|
+
29: 'N/A',
|
|
169
|
+
30: 'N/A',
|
|
170
|
+
31: 'handbag',
|
|
171
|
+
32: 'tie',
|
|
172
|
+
33: 'suitcase',
|
|
173
|
+
34: 'frisbee',
|
|
174
|
+
35: 'skis',
|
|
175
|
+
36: 'snowboard',
|
|
176
|
+
37: 'sports ball',
|
|
177
|
+
38: 'kite',
|
|
178
|
+
39: 'baseball bat',
|
|
179
|
+
40: 'baseball glove',
|
|
180
|
+
41: 'skateboard',
|
|
181
|
+
42: 'surfboard',
|
|
182
|
+
43: 'tennis racket',
|
|
183
|
+
44: 'bottle',
|
|
184
|
+
45: 'N/A',
|
|
185
|
+
46: 'wine glass',
|
|
186
|
+
47: 'cup',
|
|
187
|
+
48: 'fork',
|
|
188
|
+
49: 'knife',
|
|
189
|
+
50: 'spoon',
|
|
190
|
+
51: 'bowl',
|
|
191
|
+
52: 'banana',
|
|
192
|
+
53: 'apple',
|
|
193
|
+
54: 'sandwich',
|
|
194
|
+
55: 'orange',
|
|
195
|
+
56: 'broccoli',
|
|
196
|
+
57: 'carrot',
|
|
197
|
+
58: 'hot dog',
|
|
198
|
+
59: 'pizza',
|
|
199
|
+
60: 'donut',
|
|
200
|
+
61: 'cake',
|
|
201
|
+
62: 'chair',
|
|
202
|
+
63: 'couch',
|
|
203
|
+
64: 'potted plant',
|
|
204
|
+
65: 'bed',
|
|
205
|
+
66: 'N/A',
|
|
206
|
+
67: 'dining table',
|
|
207
|
+
68: 'N/A',
|
|
208
|
+
69: 'N/A',
|
|
209
|
+
70: 'toilet',
|
|
210
|
+
71: 'N/A',
|
|
211
|
+
72: 'tv',
|
|
212
|
+
73: 'laptop',
|
|
213
|
+
74: 'mouse',
|
|
214
|
+
75: 'remote',
|
|
215
|
+
76: 'keyboard',
|
|
216
|
+
77: 'cell phone',
|
|
217
|
+
78: 'microwave',
|
|
218
|
+
79: 'oven',
|
|
219
|
+
80: 'toaster',
|
|
220
|
+
81: 'sink',
|
|
221
|
+
82: 'refrigerator',
|
|
222
|
+
83: 'N/A',
|
|
223
|
+
84: 'book',
|
|
224
|
+
85: 'clock',
|
|
225
|
+
86: 'vase',
|
|
226
|
+
87: 'scissors',
|
|
227
|
+
88: 'teddy bear',
|
|
228
|
+
89: 'hair drier',
|
|
229
|
+
90: 'toothbrush'
|
|
230
|
+
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pixeltable
|
|
3
|
+
Version: 0.2.7
|
|
4
|
+
Summary: Pixeltable: The Multimodal AI Data Plane
|
|
5
|
+
Author: Marcel Kornacker
|
|
6
|
+
Author-email: marcelk@gmail.com
|
|
7
|
+
Requires-Python: >=3.9,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Dist: av (>=10.0.0)
|
|
14
|
+
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
15
|
+
Requires-Dist: cloudpickle (>=2.2.1,<3.0.0)
|
|
16
|
+
Requires-Dist: ftfy (>=6.2.0,<7.0.0)
|
|
17
|
+
Requires-Dist: jinja2 (>=3.1.3,<4.0.0)
|
|
18
|
+
Requires-Dist: jmespath (>=1.0.1,<2.0.0)
|
|
19
|
+
Requires-Dist: mistune (>=3.0.2,<4.0.0)
|
|
20
|
+
Requires-Dist: more-itertools (>=10.2,<11.0)
|
|
21
|
+
Requires-Dist: numpy (>=1.25)
|
|
22
|
+
Requires-Dist: opencv-python-headless (>=4.7.0.68,<5.0.0.0)
|
|
23
|
+
Requires-Dist: pandas (>=2.0,<3.0)
|
|
24
|
+
Requires-Dist: pgserver (==0.1.3)
|
|
25
|
+
Requires-Dist: pgvector (>=0.2.1,<0.3.0)
|
|
26
|
+
Requires-Dist: pillow (>=9.3.0)
|
|
27
|
+
Requires-Dist: psutil (>=5.9.5,<6.0.0)
|
|
28
|
+
Requires-Dist: psycopg2-binary (>=2.9.5,<3.0.0)
|
|
29
|
+
Requires-Dist: pymupdf (>=1.24.1,<2.0.0)
|
|
30
|
+
Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
|
|
31
|
+
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
|
32
|
+
Requires-Dist: setuptools (==69.1.1)
|
|
33
|
+
Requires-Dist: sqlalchemy[mypy] (>=2.0.23,<3.0.0)
|
|
34
|
+
Requires-Dist: tenacity (>=8.2,<9.0)
|
|
35
|
+
Requires-Dist: tqdm (>=4.64)
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
<div align="center">
|
|
39
|
+
<img src="https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/release/pixeltable-banner.png" alt="Pixeltable" width="45%" />
|
|
40
|
+
|
|
41
|
+
# Unifying Data, Models, and Orchestration for AI Products
|
|
42
|
+
|
|
43
|
+
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
|
44
|
+

|
|
45
|
+
[]()
|
|
46
|
+
[GitHub Actions](https://github.com/pixeltable/pixeltable/actions)
|
|
47
|
+
[PyPI package](https://pypi.org/project/pixeltable/)
|
|
48
|
+
|
|
49
|
+
[Installation](https://pixeltable.github.io/pixeltable/getting-started/) | [Documentation](https://pixeltable.readme.io/) | [API Reference](https://pixeltable.github.io/pixeltable/) | [Code Samples](https://pixeltable.readme.io/recipes) | [Examples](https://github.com/pixeltable/pixeltable/tree/master/docs/release/tutorials)
|
|
50
|
+
</div>
|
|
51
|
+
|
|
52
|
+
Pixeltable is a Python library that lets AI engineers and data scientists focus on exploration, modeling, and app development without dealing with the customary data plumbing.
|
|
53
|
+
|
|
54
|
+
## What problems does Pixeltable solve?
|
|
55
|
+
|
|
56
|
+
Today’s solutions for AI app development require extensive custom coding and infrastructure plumbing. Tracking lineage and versions between and across data transformations, models, and deployment is cumbersome. With Pixeltable you can store, transform, index, and iterate on your data within the same table interface, whether it's text, images, embeddings, or even video. Built-in lineage and versioning ensure transparency and reproducibility, while the development-to-production mirror streamlines deployment.
|
|
57
|
+
|
|
58
|
+
## 💾 Installation
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
%pip install pixeltable
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
To verify that it's working:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import pixeltable as pxt
|
|
68
|
+
pxt.init()
|
|
69
|
+
```
|
|
70
|
+
> [!NOTE]
|
|
71
|
+
> Check out the [Pixeltable Basics](https://pixeltable.readme.io/docs/pixeltable-basics) tutorial for a tour of its most important features.
|
|
72
|
+
|
|
73
|
+
## 💡 Get Started
|
|
74
|
+
Learn how to create tables, populate them with data, and enhance them with built-in or user-defined transformations and AI operations.
|
|
75
|
+
|
|
76
|
+
| Topic | Notebook | API |
|
|
77
|
+
|:--------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------:|
|
|
78
|
+
| Get Started | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/tutorials/pixeltable-basics.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [API reference](https://pixeltable.github.io/pixeltable/api/pixeltable/) |
|
|
79
|
+
| User-Defined Functions (UDFs) | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/howto/udfs-in-pixeltable.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [API reference](https://pixeltable.github.io/pixeltable/api/iterators/document-splitter/) |
|
|
80
|
+
| Comparing Object Detection Models | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/tutorials/object-detection-in-videos.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [API cheat sheet](https://pixeltable.github.io/pixeltable/api-cheat-sheet/#frame-extraction-for-video-data) |
|
|
81
|
+
| Experimenting with Chunking (RAG) | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/tutorials/rag-operations.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [API reference](https://pixeltable.github.io/pixeltable/api/iterators/document-splitter/) |
|
|
82
|
+
| Working with External Files | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/howto/working-with-external-files.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [API cheat sheet](https://pixeltable.github.io/pixeltable/api-cheat-sheet/#inserting-data-into-a-table) |
|
|
83
|
+
|
|
84
|
+
## ❓ FAQ
|
|
85
|
+
|
|
86
|
+
### What does Pixeltable provide?

Pixeltable provides:
|
|
87
|
+
|
|
88
|
+
- Data storage and versioning
|
|
89
|
+
- Combined Data and Model Lineage
|
|
90
|
+
- Indexing (e.g. embedding vectors) and Data Retrieval
|
|
91
|
+
- Orchestration of multimodal workloads
|
|
92
|
+
- Incremental updates
|
|
93
|
+
- Code is automatically production-ready
|
|
94
|
+
|
|
95
|
+
### Why should you use Pixeltable?
|
|
96
|
+
|
|
97
|
+
- **It gives you transparency and reproducibility**
|
|
98
|
+
- All generated data is automatically recorded and versioned
|
|
99
|
+
- You will never need to re-run a workload because you lost track of the input data
|
|
100
|
+
- **It saves you money**
|
|
101
|
+
- All data changes are automatically incremental
|
|
102
|
+
- You never need to re-run pipelines from scratch because you’re adding data
|
|
103
|
+
- **It integrates with any existing Python code or libraries**
|
|
104
|
+
- Bring your ever-changing code and workloads
|
|
105
|
+
- You choose the models, tools, and AI practices (e.g., your embedding model for a vector index); Pixeltable orchestrates the data
|
|
106
|
+
|
|
107
|
+
### What is Pixeltable not providing?
|
|
108
|
+
|
|
109
|
+
- Pixeltable is not a low-code, prescriptive AI solution. We empower you to use the best frameworks and techniques for your specific needs.
|
|
110
|
+
- We do not aim to replace your existing AI toolkit, but rather enhance it by streamlining the underlying data infrastructure and orchestration.
|
|
111
|
+
|
|
112
|
+
> [!TIP]
|
|
113
|
+
> Check out the [Integrations](https://pixeltable.readme.io/docs/working-with-openai) section, and feel free to submit a request for additional ones.
|
|
114
|
+
|
|
115
|
+
## 📙 Example Use Cases
|
|
116
|
+
|
|
117
|
+
- **Interact with video data at the frame level** without having to think about frame extraction, intermediate file storage, or storage space explosion.
|
|
118
|
+
- **Augment your data incrementally and interactively with built-in functions and UDFs**, such as image transformations, model inference, and visualizations, without having to think about data pipelines, incremental updates, or capturing function output.
|
|
119
|
+
- **Interact with all the data relevant to your AI application** (video, images, documents, audio, structured data, JSON) through a simple dataframe-style API directly in Python. This includes:
|
|
120
|
+
- similarity search on embeddings, supported by high-dimensional vector indexing;
|
|
121
|
+
- path expressions and transformations on JSON data;
|
|
122
|
+
- PIL and OpenCV image operations;
|
|
123
|
+
- assembling frames into videos.
|
|
124
|
+
- **Perform keyword and image similarity search at the video frame level** without having to worry about frame storage.
|
|
125
|
+
- **Access all Pixeltable-resident data directly as a PyTorch dataset** in your training scripts.
|
|
126
|
+
- **Understand the compute and storage costs of your data at the granularity** of individual augmentations and get cost projections before adding new data and new augmentations.
|
|
127
|
+
- **Rely on Pixeltable's automatic versioning and snapshot functionality** to protect against regressions and to ensure reproducibility.
|
|
128
|
+
|
|
129
|
+
## 🐛 Contributions & Feedback
|
|
130
|
+
|
|
131
|
+
Are you experiencing issues or bugs with Pixeltable? File an [Issue](https://github.com/pixeltable/pixeltable/issues).
|
|
132
|
+
<br>Do you want to contribute? Feel free to open a [PR](https://github.com/pixeltable/pixeltable/pulls).
|
|
133
|
+
|
|
134
|
+
## 🏛️ License
|
|
135
|
+
|
|
136
|
+
This library is licensed under the Apache 2.0 License.
|
|
137
|
+
|