pixeltable 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +3 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +8 -2
- pixeltable/catalog/insertable_table.py +32 -17
- pixeltable/catalog/table.py +167 -12
- pixeltable/catalog/table_version.py +185 -106
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +452 -0
- pixeltable/datatransfer/remote.py +85 -0
- pixeltable/env.py +148 -69
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +11 -12
- pixeltable/exprs/function_call.py +0 -3
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/similarity_expr.py +5 -3
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/aggregate_function.py +2 -2
- pixeltable/func/expr_template_function.py +3 -1
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +25 -1
- pixeltable/functions/openai.py +15 -10
- pixeltable/functions/together.py +11 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +20 -2
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +6 -1
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +4 -1
- pixeltable/io/__init__.py +1 -0
- pixeltable/io/globals.py +58 -0
- pixeltable/iterators/base.py +4 -4
- pixeltable/iterators/document.py +26 -15
- pixeltable/iterators/video.py +9 -1
- pixeltable/metadata/__init__.py +2 -2
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/schema.py +9 -6
- pixeltable/plan.py +9 -5
- pixeltable/store.py +14 -21
- pixeltable/tool/create_test_db_dump.py +14 -0
- pixeltable/type_system.py +14 -4
- pixeltable/utils/coco.py +94 -0
- pixeltable-0.2.8.dist-info/METADATA +137 -0
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.8.dist-info}/RECORD +50 -45
- pixeltable/func/nos_function.py +0 -202
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.6.dist-info/METADATA +0 -131
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.8.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.8.dist-info}/WHEEL +0 -0
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,11 +10,11 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 15
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
17
|
-
"""Create the
|
|
17
|
+
"""Create the system metadata record"""
|
|
18
18
|
system_md = SystemInfoMd(schema_version=VERSION)
|
|
19
19
|
record = SystemInfo(md=dataclasses.asdict(system_md))
|
|
20
20
|
with orm.Session(engine, future=True) as session:
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import sqlalchemy as sql
|
|
2
|
+
|
|
3
|
+
from pixeltable.metadata.schema import Table
|
|
4
|
+
from pixeltable.metadata import register_converter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def convert_14(engine: sql.engine.Engine) -> None:
|
|
8
|
+
default_remotes = {'remotes': []}
|
|
9
|
+
with engine.begin() as conn:
|
|
10
|
+
conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
register_converter(14, convert_14)
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
|
|
2
|
-
import platform
|
|
3
|
-
import uuid
|
|
4
1
|
import dataclasses
|
|
2
|
+
import uuid
|
|
3
|
+
from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
|
|
5
4
|
|
|
6
5
|
import sqlalchemy as sql
|
|
7
|
-
from sqlalchemy import
|
|
6
|
+
from sqlalchemy import ForeignKey
|
|
7
|
+
from sqlalchemy import Integer, BigInteger, LargeBinary
|
|
8
8
|
from sqlalchemy.dialects.postgresql import UUID, JSONB
|
|
9
|
-
from sqlalchemy import ForeignKey, UniqueConstraint, ForeignKeyConstraint
|
|
10
9
|
from sqlalchemy.orm import declarative_base
|
|
11
10
|
|
|
12
11
|
Base = declarative_base()
|
|
@@ -143,6 +142,10 @@ class TableMd:
|
|
|
143
142
|
# - every row is assigned a unique and immutable rowid on insertion
|
|
144
143
|
next_row_id: int
|
|
145
144
|
|
|
145
|
+
# Metadata format for remotes:
|
|
146
|
+
# {'class': 'pixeltable.datatransfer.LabelStudioProject', 'md': {'project_id': 3}}
|
|
147
|
+
remotes: list[dict[str, Any]]
|
|
148
|
+
|
|
146
149
|
column_md: dict[int, ColumnMd] # col_id -> ColumnMd
|
|
147
150
|
index_md: dict[int, IndexMd] # index_id -> IndexMd
|
|
148
151
|
view_md: Optional[ViewMd]
|
|
@@ -160,7 +163,7 @@ class Table(Base):
|
|
|
160
163
|
|
|
161
164
|
MAX_VERSION = 9223372036854775807 # 2^63 - 1
|
|
162
165
|
|
|
163
|
-
id = sql.Column(UUID(as_uuid=True), primary_key=True,
|
|
166
|
+
id = sql.Column(UUID(as_uuid=True), primary_key=True, nullable=False)
|
|
164
167
|
dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
|
|
165
168
|
md = sql.Column(JSONB, nullable=False) # TableMd
|
|
166
169
|
|
pixeltable/plan.py
CHANGED
|
@@ -251,7 +251,7 @@ class Planner:
|
|
|
251
251
|
Returns:
|
|
252
252
|
- root node of the plan
|
|
253
253
|
- list of qualified column names that are getting updated
|
|
254
|
-
- list of columns that are being recomputed
|
|
254
|
+
- list of user-visible columns that are being recomputed
|
|
255
255
|
"""
|
|
256
256
|
# retrieve all stored cols and all target exprs
|
|
257
257
|
assert isinstance(tbl, catalog.TableVersionPath)
|
|
@@ -260,7 +260,10 @@ class Planner:
|
|
|
260
260
|
if len(recompute_targets) > 0:
|
|
261
261
|
recomputed_cols = recompute_targets.copy()
|
|
262
262
|
else:
|
|
263
|
-
recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else
|
|
263
|
+
recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
|
|
264
|
+
# regardless of cascade, we need to update all indices on any updated column
|
|
265
|
+
idx_val_cols = target.get_idx_val_columns(updated_cols)
|
|
266
|
+
recomputed_cols.update(idx_val_cols)
|
|
264
267
|
# we only need to recompute stored columns (unstored ones are substituted away)
|
|
265
268
|
recomputed_cols = {c for c in recomputed_cols if c.is_stored}
|
|
266
269
|
recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
|
|
@@ -273,8 +276,8 @@ class Planner:
|
|
|
273
276
|
recomputed_exprs = \
|
|
274
277
|
[c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
|
|
275
278
|
# recomputed cols reference the new values of the updated cols
|
|
276
|
-
for col, e in update_targets.items()
|
|
277
|
-
|
|
279
|
+
spec = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
|
|
280
|
+
exprs.Expr.list_substitute(recomputed_exprs, spec)
|
|
278
281
|
select_list.extend(recomputed_exprs)
|
|
279
282
|
|
|
280
283
|
# we need to retrieve the PK columns of the existing rows
|
|
@@ -282,7 +285,8 @@ class Planner:
|
|
|
282
285
|
all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols) # same order as select_list
|
|
283
286
|
# update row builder with column information
|
|
284
287
|
[plan.row_builder.add_table_column(col, select_list[i].slot_idx) for i, col in enumerate(all_base_cols)]
|
|
285
|
-
|
|
288
|
+
recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
|
|
289
|
+
return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
|
|
286
290
|
|
|
287
291
|
@classmethod
|
|
288
292
|
def create_view_update_plan(
|
pixeltable/store.py
CHANGED
|
@@ -66,7 +66,6 @@ class StoreBase:
|
|
|
66
66
|
"""Create self.sa_tbl from self.tbl_version."""
|
|
67
67
|
system_cols = self._create_system_columns()
|
|
68
68
|
all_cols = system_cols.copy()
|
|
69
|
-
idxs: List[sql.Index] = []
|
|
70
69
|
for col in [c for c in self.tbl_version.cols if c.is_stored]:
|
|
71
70
|
# re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
|
|
72
71
|
# to the last sql.Table version we created and cannot be reused
|
|
@@ -76,26 +75,18 @@ class StoreBase:
|
|
|
76
75
|
all_cols.append(col.sa_errormsg_col)
|
|
77
76
|
all_cols.append(col.sa_errortype_col)
|
|
78
77
|
|
|
79
|
-
# we create an index for:
|
|
80
|
-
# - scalar columns (except for strings, because long strings can't be used for B-tree indices)
|
|
81
|
-
# - non-computed video and image columns (they will contain external paths/urls that users might want to
|
|
82
|
-
# filter on)
|
|
83
|
-
if (col.col_type.is_scalar_type() and not col.col_type.is_string_type()) \
|
|
84
|
-
or (col.col_type.is_media_type() and not col.is_computed):
|
|
85
|
-
# index names need to be unique within the Postgres instance
|
|
86
|
-
idx_name = f'idx_{col.id}_{self.tbl_version.id.hex}'
|
|
87
|
-
idxs.append(sql.Index(idx_name, col.sa_col))
|
|
88
|
-
|
|
89
78
|
if self.sa_tbl is not None:
|
|
90
79
|
# if we're called in response to a schema change, we need to remove the old table first
|
|
91
80
|
self.sa_md.remove(self.sa_tbl)
|
|
92
81
|
|
|
82
|
+
idxs: List[sql.Index] = []
|
|
93
83
|
# index for all system columns:
|
|
94
84
|
# - base x view joins can be executed as merge joins
|
|
95
85
|
# - speeds up ORDER BY rowid DESC
|
|
96
86
|
# - allows filtering for a particular table version in index scan
|
|
97
87
|
idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
|
|
98
88
|
idxs.append(sql.Index(idx_name, *system_cols))
|
|
89
|
+
|
|
99
90
|
# v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
|
|
100
91
|
idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
|
|
101
92
|
idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using='brin'))
|
|
@@ -264,7 +255,8 @@ class StoreBase:
|
|
|
264
255
|
return num_excs
|
|
265
256
|
|
|
266
257
|
def insert_rows(
|
|
267
|
-
self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None
|
|
258
|
+
self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
|
|
259
|
+
show_progress: bool = True
|
|
268
260
|
) -> Tuple[int, int, Set[int]]:
|
|
269
261
|
"""Insert rows into the store table and update the catalog table's md
|
|
270
262
|
Returns:
|
|
@@ -293,15 +285,16 @@ class StoreBase:
|
|
|
293
285
|
self._create_table_row(row, row_builder, media_cols, cols_with_excs, v_min=v_min)
|
|
294
286
|
num_excs += num_row_exc
|
|
295
287
|
table_rows.append(table_row)
|
|
296
|
-
if
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
288
|
+
if show_progress:
|
|
289
|
+
if progress_bar is None:
|
|
290
|
+
warnings.simplefilter("ignore", category=TqdmWarning)
|
|
291
|
+
progress_bar = tqdm(
|
|
292
|
+
desc=f'Inserting rows into `{self.tbl_version.name}`',
|
|
293
|
+
unit=' rows',
|
|
294
|
+
ncols=100,
|
|
295
|
+
file=sys.stdout
|
|
296
|
+
)
|
|
297
|
+
progress_bar.update(1)
|
|
305
298
|
self._move_tmp_media_files(table_rows, media_cols, v_min)
|
|
306
299
|
conn.execute(sql.insert(self.sa_tbl), table_rows)
|
|
307
300
|
if progress_bar is not None:
|
|
@@ -162,6 +162,20 @@ class Dumper:
|
|
|
162
162
|
# astype
|
|
163
163
|
v['astype'] = t.c1.astype(pxt.FloatType())
|
|
164
164
|
|
|
165
|
+
# Add remotes
|
|
166
|
+
from pixeltable.datatransfer.remote import MockRemote
|
|
167
|
+
v.link_remote(
|
|
168
|
+
MockRemote({'int_field': pxt.IntType()}, {'str_field': pxt.StringType()}),
|
|
169
|
+
col_mapping={'test_udf': 'int_field', 'c1': 'str_field'}
|
|
170
|
+
)
|
|
171
|
+
# We're just trying to test metadata here, so reach "under the covers" and link a fake
|
|
172
|
+
# Label Studio project without validation (so we don't need a real Label Studio server)
|
|
173
|
+
from pixeltable.datatransfer.label_studio import LabelStudioProject
|
|
174
|
+
v.tbl_version_path.tbl_version.link(
|
|
175
|
+
LabelStudioProject(4171780),
|
|
176
|
+
col_mapping={'str_format': 'str_format'}
|
|
177
|
+
)
|
|
178
|
+
|
|
165
179
|
|
|
166
180
|
@pxt.udf(_force_stored=True)
|
|
167
181
|
def test_udf_stored(n: int) -> int:
|
pixeltable/type_system.py
CHANGED
|
@@ -7,7 +7,7 @@ import json
|
|
|
7
7
|
import typing
|
|
8
8
|
import urllib.parse
|
|
9
9
|
import urllib.request
|
|
10
|
-
from copy import
|
|
10
|
+
from copy import deepcopy
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Any, Optional, Tuple, Dict, Callable, List, Union, Sequence, Mapping
|
|
13
13
|
|
|
@@ -82,7 +82,11 @@ class ColumnType:
|
|
|
82
82
|
|
|
83
83
|
def __init__(self, t: Type, nullable: bool = False):
|
|
84
84
|
self._type = t
|
|
85
|
-
self.
|
|
85
|
+
self._nullable = nullable
|
|
86
|
+
|
|
87
|
+
@property
|
|
88
|
+
def nullable(self) -> bool:
|
|
89
|
+
return self._nullable
|
|
86
90
|
|
|
87
91
|
@property
|
|
88
92
|
def type_enum(self) -> Type:
|
|
@@ -91,6 +95,12 @@ class ColumnType:
|
|
|
91
95
|
def serialize(self) -> str:
|
|
92
96
|
return json.dumps(self.as_dict())
|
|
93
97
|
|
|
98
|
+
def copy(self, nullable: Optional[bool] = None) -> ColumnType:
|
|
99
|
+
result = deepcopy(self)
|
|
100
|
+
if nullable is not None:
|
|
101
|
+
result._nullable = nullable
|
|
102
|
+
return result
|
|
103
|
+
|
|
94
104
|
@classmethod
|
|
95
105
|
def serialize_list(cls, type_list: List[ColumnType]) -> str:
|
|
96
106
|
return json.dumps([t.as_dict() for t in type_list])
|
|
@@ -177,7 +187,7 @@ class ColumnType:
|
|
|
177
187
|
if type(self) != type(other):
|
|
178
188
|
return False
|
|
179
189
|
for member_var in vars(self).keys():
|
|
180
|
-
if member_var == '
|
|
190
|
+
if member_var == '_nullable':
|
|
181
191
|
continue
|
|
182
192
|
if getattr(self, member_var) != getattr(other, member_var):
|
|
183
193
|
return False
|
|
@@ -250,7 +260,7 @@ class ColumnType:
|
|
|
250
260
|
# We treat it as the underlying type but with nullable=True.
|
|
251
261
|
underlying = cls.from_python_type(union_args[0])
|
|
252
262
|
if underlying is not None:
|
|
253
|
-
underlying.
|
|
263
|
+
underlying._nullable = True
|
|
254
264
|
return underlying
|
|
255
265
|
else:
|
|
256
266
|
# Discard type parameters to ensure that parameterized types such as `list[T]`
|
pixeltable/utils/coco.py
CHANGED
|
@@ -134,3 +134,97 @@ def write_coco_dataset(df: 'pixeltable.DataFrame', dest_path: Path) -> Path:
|
|
|
134
134
|
json.dump(result, f)
|
|
135
135
|
return output_path
|
|
136
136
|
|
|
137
|
+
|
|
138
|
+
COCO_2017_CATEGORIES = {
|
|
139
|
+
0: 'N/A',
|
|
140
|
+
1: 'person',
|
|
141
|
+
2: 'bicycle',
|
|
142
|
+
3: 'car',
|
|
143
|
+
4: 'motorcycle',
|
|
144
|
+
5: 'airplane',
|
|
145
|
+
6: 'bus',
|
|
146
|
+
7: 'train',
|
|
147
|
+
8: 'truck',
|
|
148
|
+
9: 'boat',
|
|
149
|
+
10: 'traffic light',
|
|
150
|
+
11: 'fire hydrant',
|
|
151
|
+
12: 'N/A',
|
|
152
|
+
13: 'stop sign',
|
|
153
|
+
14: 'parking meter',
|
|
154
|
+
15: 'bench',
|
|
155
|
+
16: 'bird',
|
|
156
|
+
17: 'cat',
|
|
157
|
+
18: 'dog',
|
|
158
|
+
19: 'horse',
|
|
159
|
+
20: 'sheep',
|
|
160
|
+
21: 'cow',
|
|
161
|
+
22: 'elephant',
|
|
162
|
+
23: 'bear',
|
|
163
|
+
24: 'zebra',
|
|
164
|
+
25: 'giraffe',
|
|
165
|
+
26: 'N/A',
|
|
166
|
+
27: 'backpack',
|
|
167
|
+
28: 'umbrella',
|
|
168
|
+
29: 'N/A',
|
|
169
|
+
30: 'N/A',
|
|
170
|
+
31: 'handbag',
|
|
171
|
+
32: 'tie',
|
|
172
|
+
33: 'suitcase',
|
|
173
|
+
34: 'frisbee',
|
|
174
|
+
35: 'skis',
|
|
175
|
+
36: 'snowboard',
|
|
176
|
+
37: 'sports ball',
|
|
177
|
+
38: 'kite',
|
|
178
|
+
39: 'baseball bat',
|
|
179
|
+
40: 'baseball glove',
|
|
180
|
+
41: 'skateboard',
|
|
181
|
+
42: 'surfboard',
|
|
182
|
+
43: 'tennis racket',
|
|
183
|
+
44: 'bottle',
|
|
184
|
+
45: 'N/A',
|
|
185
|
+
46: 'wine glass',
|
|
186
|
+
47: 'cup',
|
|
187
|
+
48: 'fork',
|
|
188
|
+
49: 'knife',
|
|
189
|
+
50: 'spoon',
|
|
190
|
+
51: 'bowl',
|
|
191
|
+
52: 'banana',
|
|
192
|
+
53: 'apple',
|
|
193
|
+
54: 'sandwich',
|
|
194
|
+
55: 'orange',
|
|
195
|
+
56: 'broccoli',
|
|
196
|
+
57: 'carrot',
|
|
197
|
+
58: 'hot dog',
|
|
198
|
+
59: 'pizza',
|
|
199
|
+
60: 'donut',
|
|
200
|
+
61: 'cake',
|
|
201
|
+
62: 'chair',
|
|
202
|
+
63: 'couch',
|
|
203
|
+
64: 'potted plant',
|
|
204
|
+
65: 'bed',
|
|
205
|
+
66: 'N/A',
|
|
206
|
+
67: 'dining table',
|
|
207
|
+
68: 'N/A',
|
|
208
|
+
69: 'N/A',
|
|
209
|
+
70: 'toilet',
|
|
210
|
+
71: 'N/A',
|
|
211
|
+
72: 'tv',
|
|
212
|
+
73: 'laptop',
|
|
213
|
+
74: 'mouse',
|
|
214
|
+
75: 'remote',
|
|
215
|
+
76: 'keyboard',
|
|
216
|
+
77: 'cell phone',
|
|
217
|
+
78: 'microwave',
|
|
218
|
+
79: 'oven',
|
|
219
|
+
80: 'toaster',
|
|
220
|
+
81: 'sink',
|
|
221
|
+
82: 'refrigerator',
|
|
222
|
+
83: 'N/A',
|
|
223
|
+
84: 'book',
|
|
224
|
+
85: 'clock',
|
|
225
|
+
86: 'vase',
|
|
226
|
+
87: 'scissors',
|
|
227
|
+
88: 'teddy bear',
|
|
228
|
+
89: 'hair drier',
|
|
229
|
+
90: 'toothbrush'
|
|
230
|
+
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pixeltable
|
|
3
|
+
Version: 0.2.8
|
|
4
|
+
Summary: Pixeltable: The Multimodal AI Data Plane
|
|
5
|
+
Author: Marcel Kornacker
|
|
6
|
+
Author-email: marcelk@gmail.com
|
|
7
|
+
Requires-Python: >=3.9,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Dist: av (>=10.0.0)
|
|
14
|
+
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
15
|
+
Requires-Dist: cloudpickle (>=2.2.1,<3.0.0)
|
|
16
|
+
Requires-Dist: ftfy (>=6.2.0,<7.0.0)
|
|
17
|
+
Requires-Dist: jinja2 (>=3.1.3,<4.0.0)
|
|
18
|
+
Requires-Dist: jmespath (>=1.0.1,<2.0.0)
|
|
19
|
+
Requires-Dist: mistune (>=3.0.2,<4.0.0)
|
|
20
|
+
Requires-Dist: more-itertools (>=10.2,<11.0)
|
|
21
|
+
Requires-Dist: numpy (>=1.25)
|
|
22
|
+
Requires-Dist: opencv-python-headless (>=4.7.0.68,<5.0.0.0)
|
|
23
|
+
Requires-Dist: pandas (>=2.0,<3.0)
|
|
24
|
+
Requires-Dist: pgserver (==0.1.4)
|
|
25
|
+
Requires-Dist: pgvector (>=0.2.1,<0.3.0)
|
|
26
|
+
Requires-Dist: pillow (>=9.3.0)
|
|
27
|
+
Requires-Dist: psutil (>=5.9.5,<6.0.0)
|
|
28
|
+
Requires-Dist: psycopg2-binary (>=2.9.5,<3.0.0)
|
|
29
|
+
Requires-Dist: pymupdf (>=1.24.1,<2.0.0)
|
|
30
|
+
Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
|
|
31
|
+
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
|
32
|
+
Requires-Dist: setuptools (==69.1.1)
|
|
33
|
+
Requires-Dist: sqlalchemy[mypy] (>=2.0.23,<3.0.0)
|
|
34
|
+
Requires-Dist: tenacity (>=8.2,<9.0)
|
|
35
|
+
Requires-Dist: tqdm (>=4.64)
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
<div align="center">
|
|
39
|
+
<img src="https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/release/pixeltable-banner.png" alt="Pixeltable" width="45%" />
|
|
40
|
+
|
|
41
|
+
# Unifying Data, Models, and Orchestration for AI Products
|
|
42
|
+
|
|
43
|
+
[](https://opensource.org/licenses/Apache-2.0)
|
|
44
|
+

|
|
45
|
+
[]()
|
|
46
|
+
[](https://github.com/pixeltable/pixeltable/actions)
|
|
47
|
+
[](https://pypi.org/project/pixeltable/)
|
|
48
|
+
|
|
49
|
+
[Installation](https://pixeltable.github.io/pixeltable/getting-started/) | [Documentation](https://pixeltable.readme.io/) | [API Reference](https://pixeltable.github.io/pixeltable/) | [Code Samples](https://pixeltable.readme.io/recipes) | [Examples](https://github.com/pixeltable/pixeltable/tree/master/docs/release/tutorials)
|
|
50
|
+
</div>
|
|
51
|
+
|
|
52
|
+
Pixeltable is a Python library that lets AI engineers and data scientists focus on exploration, modeling, and app development without dealing with the customary data plumbing.
|
|
53
|
+
|
|
54
|
+
## What problems does Pixeltable solve?
|
|
55
|
+
|
|
56
|
+
Today’s solutions for AI app development require extensive custom coding and infrastructure plumbing. Tracking lineage and versions between and across data transformations, models, and deployment is cumbersome. With Pixeltable you can store, transform, index, and iterate on your data within the same table interface, whether it's text, images, embeddings, or even video. Built-in lineage and versioning ensure transparency and reproducibility, while the development-to-production mirror streamlines deployment.
|
|
57
|
+
|
|
58
|
+
## 💾 Installation
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
%pip install pixeltable
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
To verify that it's working:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import pixeltable as pxt
|
|
68
|
+
pxt.init()
|
|
69
|
+
```
|
|
70
|
+
> [!NOTE]
|
|
71
|
+
> Check out the [Pixeltable Basics](https://pixeltable.readme.io/docs/pixeltable-basics) tutorial for a tour of its most important features.
|
|
72
|
+
|
|
73
|
+
## 💡 Get Started
|
|
74
|
+
Learn how to create tables, populate them with data, and enhance them with built-in or user-defined transformations and AI operations.
|
|
75
|
+
|
|
76
|
+
| Topic | Notebook | API |
|
|
77
|
+
|:--------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------:|
|
|
78
|
+
| Get Started | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/tutorials/pixeltable-basics.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [](https://pixeltable.github.io/pixeltable/api/pixeltable/) |
|
|
79
|
+
| User-Defined Functions (UDFs) | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/howto/udfs-in-pixeltable.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [](https://pixeltable.github.io/pixeltable/api/iterators/document-splitter/) |
|
|
80
|
+
| Comparing Object Detection Models | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/tutorials/object-detection-in-videos.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [](https://pixeltable.github.io/pixeltable/api-cheat-sheet/#frame-extraction-for-video-data) |
|
|
81
|
+
| Experimenting with Chunking (RAG) | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/tutorials/rag-operations.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [](https://pixeltable.github.io/pixeltable/api/iterators/document-splitter/) |
|
|
82
|
+
| Working with External Files | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/release/howto/working-with-external-files.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | [](https://pixeltable.github.io/pixeltable/api-cheat-sheet/#inserting-data-into-a-table) |
|
|
83
|
+
|
|
84
|
+
## ❓ FAQ
|
|
85
|
+
|
|
86
|
+
### What does Pixeltable provide me with? Pixeltable provides:
|
|
87
|
+
|
|
88
|
+
- Data storage and versioning
|
|
89
|
+
- Combined Data and Model Lineage
|
|
90
|
+
- Indexing (e.g. embedding vectors) and Data Retrieval
|
|
91
|
+
- Orchestration of multimodal workloads
|
|
92
|
+
- Incremental updates
|
|
93
|
+
- Code is automatically production-ready
|
|
94
|
+
|
|
95
|
+
### Why should you use Pixeltable?
|
|
96
|
+
|
|
97
|
+
- **It gives you transparency and reproducibility**
|
|
98
|
+
- All generated data is automatically recorded and versioned
|
|
99
|
+
- You will never need to re-run a workload because you lost track of the input data
|
|
100
|
+
- **It saves you money**
|
|
101
|
+
- All data changes are automatically incremental
|
|
102
|
+
- You never need to re-run pipelines from scratch because you’re adding data
|
|
103
|
+
- **It integrates with any existing Python code or libraries**
|
|
104
|
+
- Bring your ever-changing code and workloads
|
|
105
|
+
- You choose the models, tools, and AI practices (e.g., your embedding model for a vector index); Pixeltable orchestrates the data
|
|
106
|
+
|
|
107
|
+
### What is Pixeltable not providing?
|
|
108
|
+
|
|
109
|
+
- Pixeltable is not a low-code, prescriptive AI solution. We empower you to use the best frameworks and techniques for your specific needs.
|
|
110
|
+
- We do not aim to replace your existing AI toolkit, but rather enhance it by streamlining the underlying data infrastructure and orchestration.
|
|
111
|
+
|
|
112
|
+
> [!TIP]
|
|
113
|
+
> Check out the [Integrations](https://pixeltable.readme.io/docs/working-with-openai) section, and feel free to submit a request for additional ones.
|
|
114
|
+
|
|
115
|
+
## 📙 Example Use Cases
|
|
116
|
+
|
|
117
|
+
- **Interact with video data at the frame level** without having to think about frame extraction, intermediate file storage, or storage space explosion.
|
|
118
|
+
- **Augment your data incrementally and interactively with built-in functions and UDFs**, such as image transformations, model inference, and visualizations, without having to think about data pipelines, incremental updates, or capturing function output.
|
|
119
|
+
- **Interact with all the data relevant to your AI application** (video, images, documents, audio, structured data, JSON) through a simple dataframe-style API directly in Python. This includes:
|
|
120
|
+
- similarity search on embeddings, supported by high-dimensional vector indexing;
|
|
121
|
+
- path expressions and transformations on JSON data;
|
|
122
|
+
- PIL and OpenCV image operations;
|
|
123
|
+
- assembling frames into videos.
|
|
124
|
+
- **Perform keyword and image similarity search at the video frame level** without having to worry about frame storage.
|
|
125
|
+
- **Access all Pixeltable-resident data directly as a PyTorch dataset** in your training scripts.
|
|
126
|
+
- **Understand the compute and storage costs of your data at the granularity of individual augmentations** and get cost projections before adding new data and new augmentations.
|
|
127
|
+
- **Rely on Pixeltable's automatic versioning and snapshot functionality** to protect against regressions and to ensure reproducibility.
|
|
128
|
+
|
|
129
|
+
## 🐛 Contributions & Feedback
|
|
130
|
+
|
|
131
|
+
Are you experiencing issues or bugs with Pixeltable? File an [Issue](https://github.com/pixeltable/pixeltable/issues).
|
|
132
|
+
<br>Do you want to contribute? Feel free to open a [PR](https://github.com/pixeltable/pixeltable/pulls).
|
|
133
|
+
|
|
134
|
+
## :classical_building: License
|
|
135
|
+
|
|
136
|
+
This library is licensed under the Apache 2.0 License.
|
|
137
|
+
|