pixeltable 0.4.12__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

pixeltable/__init__.py CHANGED
@@ -1,7 +1,17 @@
1
1
  # ruff: noqa: F401
2
2
 
3
3
  from .__version__ import __version__, __version_tuple__
4
- from .catalog import Column, ColumnMetadata, IndexMetadata, InsertableTable, Table, TableMetadata, UpdateStatus, View
4
+ from .catalog import (
5
+ Column,
6
+ ColumnMetadata,
7
+ IndexMetadata,
8
+ InsertableTable,
9
+ Table,
10
+ TableMetadata,
11
+ UpdateStatus,
12
+ VersionMetadata,
13
+ View,
14
+ )
5
15
  from .dataframe import DataFrame
6
16
  from .exceptions import Error, ExprEvalError, PixeltableWarning
7
17
  from .func import Aggregator, Function, Tool, ToolChoice, Tools, expr_udf, mcp_udfs, query, retrieval_udf, uda, udf
pixeltable/catalog/__init__.py CHANGED
@@ -8,7 +8,8 @@ from .insertable_table import InsertableTable
8
8
  from .named_function import NamedFunction
9
9
  from .path import Path
10
10
  from .schema_object import SchemaObject
11
- from .table import ColumnMetadata, IndexMetadata, Table, TableMetadata
11
+ from .table import Table
12
+ from .table_metadata import ColumnMetadata, IndexMetadata, TableMetadata, VersionMetadata
12
13
  from .table_version import TableVersion
13
14
  from .table_version_handle import ColumnHandle, TableVersionHandle
14
15
  from .table_version_path import TableVersionPath
pixeltable/catalog/table.py CHANGED
@@ -7,9 +7,7 @@ import json
7
7
  import logging
8
8
  from keyword import iskeyword as is_python_keyword
9
9
  from pathlib import Path
10
- from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Optional, TypedDict, overload
11
-
12
- from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
10
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, overload
13
11
  from uuid import UUID
14
12
 
15
13
  import pandas as pd
@@ -17,6 +15,13 @@ import sqlalchemy as sql
17
15
 
18
16
  import pixeltable as pxt
19
17
  from pixeltable import catalog, env, exceptions as excs, exprs, index, type_system as ts
18
+ from pixeltable.catalog.table_metadata import (
19
+ ColumnMetadata,
20
+ EmbeddingIndexParams,
21
+ IndexMetadata,
22
+ TableMetadata,
23
+ VersionMetadata,
24
+ )
20
25
  from pixeltable.metadata import schema
21
26
  from pixeltable.metadata.utils import MetadataUtils
22
27
 
@@ -37,6 +42,9 @@ from .table_version_handle import TableVersionHandle
37
42
  from .table_version_path import TableVersionPath
38
43
  from .update_status import UpdateStatus
39
44
 
45
+ from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
46
+
47
+
40
48
  if TYPE_CHECKING:
41
49
  import torch.utils.data
42
50
 
@@ -95,7 +103,7 @@ class Table(SchemaObject):
95
103
 
96
104
  return op()
97
105
 
98
- def _get_metadata(self) -> 'TableMetadata':
106
+ def _get_metadata(self) -> TableMetadata:
99
107
  columns = self._tbl_version_path.columns()
100
108
  column_info: dict[str, ColumnMetadata] = {}
101
109
  for col in columns:
@@ -1690,43 +1698,35 @@ class Table(SchemaObject):
1690
1698
  def _ipython_key_completions_(self) -> list[str]:
1691
1699
  return list(self._get_schema().keys())
1692
1700
 
1693
- _REPORT_SCHEMA: ClassVar[dict[str, ts.ColumnType]] = {
1694
- 'version': ts.IntType(),
1695
- 'created_at': ts.TimestampType(),
1696
- 'user': ts.StringType(nullable=True),
1697
- 'note': ts.StringType(),
1698
- 'inserts': ts.IntType(nullable=True),
1699
- 'updates': ts.IntType(nullable=True),
1700
- 'deletes': ts.IntType(nullable=True),
1701
- 'errors': ts.IntType(nullable=True),
1702
- 'computed': ts.IntType(),
1703
- 'schema_change': ts.StringType(),
1704
- }
1705
-
1706
- def history(self, n: Optional[int] = None) -> pixeltable.dataframe.DataFrameResultSet:
1707
- """Returns rows of information about the versions of this table, most recent first.
1701
+ def get_versions(self, n: Optional[int] = None) -> list[VersionMetadata]:
1702
+ """
1703
+ Returns information about versions of this table, most recent first.
1704
+
1705
+ `get_versions()` is intended for programmatic access to version metadata; for human-readable
1706
+ output, use [`history()`][pixeltable.Table.history] instead.
1708
1707
 
1709
1708
  Args:
1710
- n: a limit to the number of versions listed
1709
+ n: if specified, will return at most `n` versions
1711
1710
 
1712
- Examples:
1713
- Report history:
1711
+ Returns:
1712
+ A list of [VersionMetadata][pixeltable.VersionMetadata] dictionaries, one per version retrieved, most
1713
+ recent first.
1714
1714
 
1715
- >>> tbl.history()
1715
+ Examples:
1716
+ Retrieve metadata about all versions of the table `tbl`:
1716
1717
 
1717
- Report only the most recent 5 changes to the table:
1718
+ >>> tbl.get_versions()
1718
1719
 
1719
- >>> tbl.history(n=5)
1720
+ Retrieve metadata about the most recent 5 versions of the table `tbl`:
1720
1721
 
1721
- Returns:
1722
- A list of information about each version, ordered from most recent to oldest version.
1722
+ >>> tbl.get_versions(n=5)
1723
1723
  """
1724
1724
  from pixeltable.catalog import Catalog
1725
1725
 
1726
1726
  if n is None:
1727
1727
  n = 1_000_000_000
1728
1728
  if not isinstance(n, int) or n < 1:
1729
- raise excs.Error(f'Invalid value for n: {n}')
1729
+ raise excs.Error(f'Invalid value for `n`: {n}')
1730
1730
 
1731
1731
  # Retrieve the table history components from the catalog
1732
1732
  tbl_id = self._id
@@ -1744,104 +1744,60 @@ class Table(SchemaObject):
1744
1744
  else:
1745
1745
  over_count = 0
1746
1746
 
1747
- report_lines: list[list[Any]] = []
1747
+ metadata_dicts: list[VersionMetadata] = []
1748
1748
  for vers_md in vers_list[0 : len(vers_list) - over_count]:
1749
1749
  version = vers_md.version_md.version
1750
- schema_change = md_dict.get(version, '')
1750
+ schema_change = md_dict.get(version, None)
1751
1751
  update_status = vers_md.version_md.update_status
1752
1752
  if update_status is None:
1753
1753
  update_status = UpdateStatus()
1754
- change_type = 'schema' if schema_change != '' else ''
1755
- if change_type == '':
1756
- change_type = 'data'
1754
+ change_type: Literal['schema', 'data'] = 'schema' if schema_change is not None else 'data'
1757
1755
  rcs = update_status.row_count_stats + update_status.cascade_row_count_stats
1758
- report_line = [
1759
- version,
1760
- datetime.datetime.fromtimestamp(vers_md.version_md.created_at),
1761
- vers_md.version_md.user,
1762
- change_type,
1763
- rcs.ins_rows,
1764
- rcs.upd_rows,
1765
- rcs.del_rows,
1766
- rcs.num_excs,
1767
- rcs.computed_values,
1768
- schema_change,
1769
- ]
1770
- report_lines.append(report_line)
1756
+ metadata_dicts.append(
1757
+ VersionMetadata(
1758
+ version=version,
1759
+ created_at=datetime.datetime.fromtimestamp(vers_md.version_md.created_at, tz=datetime.timezone.utc),
1760
+ user=vers_md.version_md.user,
1761
+ change_type=change_type,
1762
+ inserts=rcs.ins_rows,
1763
+ updates=rcs.upd_rows,
1764
+ deletes=rcs.del_rows,
1765
+ errors=rcs.num_excs,
1766
+ computed=rcs.computed_values,
1767
+ schema_change=schema_change,
1768
+ )
1769
+ )
1771
1770
 
1772
- return pxt.dataframe.DataFrameResultSet(report_lines, self._REPORT_SCHEMA)
1771
+ return metadata_dicts
1772
+
1773
+ def history(self, n: Optional[int] = None) -> pd.DataFrame:
1774
+ """
1775
+ Returns a human-readable report about versions of this table.
1776
+
1777
+ `history()` is intended for human-readable output of version metadata; for programmatic access,
1778
+ use [`get_versions()`][pixeltable.Table.get_versions] instead.
1779
+
1780
+ Args:
1781
+ n: if specified, will return at most `n` versions
1782
+
1783
+ Returns:
1784
+ A report with information about each version, one per row, most recent first.
1785
+
1786
+ Examples:
1787
+ Report all versions of the table:
1788
+
1789
+ >>> tbl.history()
1790
+
1791
+ Report only the most recent 5 changes to the table:
1792
+
1793
+ >>> tbl.history(n=5)
1794
+ """
1795
+ versions = self.get_versions(n)
1796
+ assert len(versions) > 0
1797
+ return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
1773
1798
 
1774
1799
  def __check_mutable(self, op_descr: str) -> None:
1775
1800
  if self._tbl_version_path.is_snapshot():
1776
1801
  raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')
1777
1802
  if self._tbl_version_path.is_replica():
1778
1803
  raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a {self._display_name()}.')
1779
-
1780
-
1781
- class ColumnMetadata(TypedDict):
1782
- """Metadata for a column of a Pixeltable table."""
1783
-
1784
- name: str
1785
- """The name of the column."""
1786
- type_: str
1787
- """The type specifier of the column."""
1788
- version_added: int
1789
- """The table version when this column was added."""
1790
- is_stored: bool
1791
- """`True` if this is a stored column; `False` if it is dynamically computed."""
1792
- is_primary_key: bool
1793
- """`True` if this column is part of the table's primary key."""
1794
- media_validation: Optional[Literal['on_read', 'on_write']]
1795
- """The media validation policy for this column."""
1796
- computed_with: Optional[str]
1797
- """Expression used to compute this column; `None` if this is not a computed column."""
1798
-
1799
-
1800
- class IndexMetadata(TypedDict):
1801
- """Metadata for a column of a Pixeltable table."""
1802
-
1803
- name: str
1804
- """The name of the index."""
1805
- columns: list[str]
1806
- """The table columns that are indexed."""
1807
- index_type: Literal['embedding']
1808
- """The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
1809
- parameters: EmbeddingIndexParams
1810
-
1811
-
1812
- class EmbeddingIndexParams(TypedDict):
1813
- metric: Literal['cosine', 'ip', 'l2']
1814
- """Index metric."""
1815
- embeddings: list[str]
1816
- """List of embeddings defined for this index."""
1817
-
1818
-
1819
- class TableMetadata(TypedDict):
1820
- """Metadata for a Pixeltable table."""
1821
-
1822
- name: str
1823
- """The name of the table (ex: `'my_table'`)."""
1824
- path: str
1825
- """The full path of the table (ex: `'my_dir.my_subdir.my_table'`)."""
1826
- columns: dict[str, ColumnMetadata]
1827
- """Column metadata for all of the visible columns of the table."""
1828
- indices: dict[str, IndexMetadata]
1829
- """Index metadata for all of the indices of the table."""
1830
- is_replica: bool
1831
- """`True` if this table is a replica of another (shared) table."""
1832
- is_view: bool
1833
- """`True` if this table is a view."""
1834
- is_snapshot: bool
1835
- """`True` if this table is a snapshot."""
1836
- version: int
1837
- """The current version of the table."""
1838
- version_created: datetime.datetime
1839
- """The timestamp when this table version was created."""
1840
- schema_version: int
1841
- """The current schema version of the table."""
1842
- comment: Optional[str]
1843
- """User-provided table comment, if one exists."""
1844
- media_validation: Literal['on_read', 'on_write']
1845
- """The media validation policy for this table."""
1846
- base: Optional[str]
1847
- """If this table is a view or snapshot, the full path of its base table; otherwise `None`."""
pixeltable/catalog/table_metadata.py ADDED
@@ -0,0 +1,96 @@
1
+ import datetime
2
+ from typing import Literal, Optional, TypedDict
3
+
4
+
5
+ class ColumnMetadata(TypedDict):
6
+ """Metadata for a column of a Pixeltable table."""
7
+
8
+ name: str
9
+ """The name of the column."""
10
+ type_: str
11
+ """The type specifier of the column."""
12
+ version_added: int
13
+ """The table version when this column was added."""
14
+ is_stored: bool
15
+ """`True` if this is a stored column; `False` if it is dynamically computed."""
16
+ is_primary_key: bool
17
+ """`True` if this column is part of the table's primary key."""
18
+ media_validation: Optional[Literal['on_read', 'on_write']]
19
+ """The media validation policy for this column."""
20
+ computed_with: Optional[str]
21
+ """Expression used to compute this column; `None` if this is not a computed column."""
22
+
23
+
24
+ class EmbeddingIndexParams(TypedDict):
25
+ metric: Literal['cosine', 'ip', 'l2']
26
+ """Index metric."""
27
+ embeddings: list[str]
28
+ """List of embeddings defined for this index."""
29
+
30
+
31
+ class IndexMetadata(TypedDict):
32
+ """Metadata for a column of a Pixeltable table."""
33
+
34
+ name: str
35
+ """The name of the index."""
36
+ columns: list[str]
37
+ """The table columns that are indexed."""
38
+ index_type: Literal['embedding']
39
+ """The type of index (currently only `'embedding'` is supported, but others will be added in the future)."""
40
+ parameters: EmbeddingIndexParams
41
+
42
+
43
+ class TableMetadata(TypedDict):
44
+ """Metadata for a Pixeltable table."""
45
+
46
+ name: str
47
+ """The name of the table (ex: `'my_table'`)."""
48
+ path: str
49
+ """The full path of the table (ex: `'my_dir.my_subdir.my_table'`)."""
50
+ columns: dict[str, ColumnMetadata]
51
+ """Column metadata for all of the visible columns of the table."""
52
+ indices: dict[str, IndexMetadata]
53
+ """Index metadata for all of the indices of the table."""
54
+ is_replica: bool
55
+ """`True` if this table is a replica of another (shared) table."""
56
+ is_view: bool
57
+ """`True` if this table is a view."""
58
+ is_snapshot: bool
59
+ """`True` if this table is a snapshot."""
60
+ version: int
61
+ """The current version of the table."""
62
+ version_created: datetime.datetime
63
+ """The timestamp when this table version was created."""
64
+ schema_version: int
65
+ """The current schema version of the table."""
66
+ comment: Optional[str]
67
+ """User-provided table comment, if one exists."""
68
+ media_validation: Literal['on_read', 'on_write']
69
+ """The media validation policy for this table."""
70
+ base: Optional[str]
71
+ """If this table is a view or snapshot, the full path of its base table; otherwise `None`."""
72
+
73
+
74
+ class VersionMetadata(TypedDict):
75
+ """Metadata for a specific version of a Pixeltable table."""
76
+
77
+ """The version number."""
78
+ version: int
79
+ """The timestamp when this version was created."""
80
+ created_at: datetime.datetime
81
+ """The user who created this version, if defined."""
82
+ user: str | None
83
+ """The type of table transformation that this version represents (`'data'` or `'schema'`)."""
84
+ change_type: Literal['data', 'schema']
85
+ """The number of rows inserted in this version."""
86
+ inserts: int
87
+ """The number of rows updated in this version."""
88
+ updates: int
89
+ """The number of rows deleted in this version."""
90
+ deletes: int
91
+ """The number of errors encountered during this version."""
92
+ errors: int
93
+ """The number of computed values calculated in this version."""
94
+ computed: int
95
+ """A description of the schema change that occurred in this version, if any."""
96
+ schema_change: str | None
pixeltable/env.py CHANGED
@@ -743,6 +743,7 @@ class Env:
743
743
  self.__register_package('whisper', library_name='openai-whisper')
744
744
  self.__register_package('whisperx')
745
745
  self.__register_package('yolox', library_name='pixeltable-yolox')
746
+ self.__register_package('lancedb')
746
747
 
747
748
  def __register_package(self, package_name: str, library_name: Optional[str] = None) -> None:
748
749
  is_installed: bool
pixeltable/globals.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import logging
4
4
  import os
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Sequence, Union
6
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union
7
7
 
8
8
  import pandas as pd
9
9
  import pydantic
@@ -24,9 +24,8 @@ if TYPE_CHECKING:
24
24
  str,
25
25
  os.PathLike,
26
26
  Path, # OS paths, filenames, URLs
27
- Iterator[dict[str, Any]], # iterator producing dictionaries of values
28
- RowData, # list of dictionaries
29
- Sequence[pydantic.BaseModel], # list of Pydantic models
27
+ Iterable[dict[str, Any]], # dictionaries of values
28
+ Iterable[pydantic.BaseModel], # Pydantic model instances
30
29
  DataFrame, # Pixeltable DataFrame
31
30
  pd.DataFrame, # pandas DataFrame
32
31
  datasets.Dataset,
pixeltable/io/__init__.py CHANGED
@@ -4,11 +4,12 @@ from .datarows import import_json, import_rows
4
4
  from .external_store import ExternalStore
5
5
  from .globals import create_label_studio_project, export_images_as_fo_dataset
6
6
  from .hf_datasets import import_huggingface_dataset
7
+ from .lancedb import export_lancedb
7
8
  from .pandas import import_csv, import_excel, import_pandas
8
9
  from .parquet import export_parquet, import_parquet
9
10
 
10
11
  __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
11
- __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
12
+ __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
12
13
  __all__ = sorted(__default_dir - __removed_symbols)
13
14
 
14
15
 
pixeltable/io/lancedb.py ADDED
@@ -0,0 +1,3 @@
1
+ from pixeltable.utils.lancedb import export_lancedb
2
+
3
+ __all__ = ['export_lancedb']
pixeltable/io/parquet.py CHANGED
@@ -1,46 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
- import datetime
4
- import io
5
3
  import json
6
4
  import logging
7
5
  import typing
8
- from collections import deque
9
6
  from pathlib import Path
10
7
  from typing import Any, Optional
11
8
 
12
- import numpy as np
13
- import PIL.Image
14
-
15
9
  import pixeltable as pxt
16
10
  import pixeltable.exceptions as excs
17
11
  from pixeltable.catalog import Catalog
18
12
  from pixeltable.utils.transactional_directory import transactional_directory
19
13
 
20
14
  if typing.TYPE_CHECKING:
21
- import pyarrow as pa
22
-
23
15
  import pixeltable as pxt
24
16
 
25
17
  _logger = logging.getLogger('pixeltable')
26
18
 
27
19
 
28
- def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
29
- import pyarrow as pa
30
- from pyarrow import parquet
31
-
32
- pydict = {}
33
- for field in schema:
34
- if isinstance(field.type, pa.FixedShapeTensorType):
35
- stacked_arr = np.stack(value_batch[field.name])
36
- pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
37
- else:
38
- pydict[field.name] = value_batch[field.name]
39
-
40
- tab = pa.Table.from_pydict(pydict, schema=schema)
41
- parquet.write_table(tab, str(output_path))
42
-
43
-
44
20
  def export_parquet(
45
21
  table_or_df: pxt.Table | pxt.DataFrame,
46
22
  parquet_path: Path,
@@ -63,7 +39,9 @@ def export_parquet(
63
39
  If False, will raise an error if the Dataframe has any image column.
64
40
  Default False.
65
41
  """
66
- from pixeltable.utils.arrow import to_arrow_schema
42
+ import pyarrow as pa
43
+
44
+ from pixeltable.utils.arrow import to_record_batches
67
45
 
68
46
  df: pxt.DataFrame
69
47
  if isinstance(table_or_df, pxt.catalog.Table):
@@ -71,9 +49,6 @@ def export_parquet(
71
49
  else:
72
50
  df = table_or_df
73
51
 
74
- type_dict = {k: v.as_dict() for k, v in df.schema.items()}
75
- arrow_schema = to_arrow_schema(df.schema)
76
-
77
52
  if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
78
53
  raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
79
54
 
@@ -81,70 +56,15 @@ def export_parquet(
81
56
  with transactional_directory(parquet_path) as temp_path:
82
57
  # dump metadata json file so we can inspect what was the source of the parquet file later on.
83
58
  json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
59
+ type_dict = {k: v.as_dict() for k, v in df.schema.items()}
84
60
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
85
-
86
61
  batch_num = 0
87
- current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
88
- current_byte_estimate = 0
89
-
90
62
  with Catalog.get().begin_xact(for_write=False):
91
- for data_row in df._exec():
92
- for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
93
- val = data_row[e.slot_idx]
94
- if val is None:
95
- current_value_batch[col_name].append(val)
96
- continue
97
-
98
- assert val is not None
99
- if col_type.is_image_type():
100
- # images get inlined into the parquet file
101
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
102
- # if there is a file, read directly to preserve information
103
- with open(data_row.file_paths[e.slot_idx], 'rb') as f:
104
- val = f.read()
105
- elif isinstance(val, PIL.Image.Image):
106
- # if no file available, eg. bc it is computed, convert to png
107
- buf = io.BytesIO()
108
- val.save(buf, format='PNG')
109
- val = buf.getvalue()
110
- else:
111
- raise excs.Error(f'unknown image type {type(val)}')
112
- length = len(val)
113
- elif col_type.is_string_type():
114
- length = len(val)
115
- elif col_type.is_video_type() or col_type.is_audio_type():
116
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
117
- val = data_row.file_paths[e.slot_idx]
118
- else:
119
- raise excs.Error(f'unknown audio/video type {type(val)}')
120
- length = len(val)
121
- elif col_type.is_json_type():
122
- val = json.dumps(val)
123
- length = len(val)
124
- elif col_type.is_array_type():
125
- length = val.nbytes
126
- elif col_type.is_int_type() or col_type.is_float_type():
127
- length = 8
128
- elif col_type.is_bool_type():
129
- length = 1
130
- elif col_type.is_date_type():
131
- length = 4
132
- elif col_type.is_timestamp_type():
133
- val = val.astimezone(datetime.timezone.utc)
134
- length = 8
135
- else:
136
- raise excs.Error(f'unknown type {col_type} for {col_name}')
137
-
138
- current_value_batch[col_name].append(val)
139
- current_byte_estimate += length
140
- if current_byte_estimate > partition_size_bytes:
141
- assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
142
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
143
- batch_num += 1
144
- current_value_batch = {k: deque() for k in df.schema}
145
- current_byte_estimate = 0
146
-
147
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
63
+ for record_batch in to_record_batches(df, partition_size_bytes):
64
+ output_path = temp_path / f'part-{batch_num:05d}.parquet'
65
+ arrow_tbl = pa.Table.from_batches([record_batch]) # type: ignore
66
+ pa.parquet.write_table(arrow_tbl, str(output_path))
67
+ batch_num += 1
148
68
 
149
69
 
150
70
  def import_parquet(
pixeltable/io/table_data_conduit.py CHANGED
@@ -469,12 +469,12 @@ class ParquetTableDataConduit(TableDataConduit):
469
469
  return t
470
470
 
471
471
  def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
472
- from pixeltable.utils.arrow import ar_infer_schema
472
+ from pixeltable.utils.arrow import to_pxt_schema
473
473
 
474
474
  if self.source_column_map is None:
475
475
  if self.src_schema_overrides is None:
476
476
  self.src_schema_overrides = {}
477
- self.src_schema = ar_infer_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
477
+ self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
478
478
  inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
479
479
  self.src_schema, self.src_pk, self.src_schema_overrides
480
480
  )
pixeltable/utils/arrow.py CHANGED
@@ -1,11 +1,18 @@
1
1
  import datetime
2
- from typing import Any, Iterator, Optional
2
+ import io
3
+ import json
4
+ from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
3
5
 
4
6
  import numpy as np
7
+ import PIL.Image
5
8
  import pyarrow as pa
6
9
 
10
+ import pixeltable.exceptions as excs
7
11
  import pixeltable.type_system as ts
8
12
 
13
+ if TYPE_CHECKING:
14
+ import pixeltable as pxt
15
+
9
16
  PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
10
17
  pa.string(): ts.StringType(nullable=True),
11
18
  pa.large_string(): ts.StringType(nullable=True),
@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
71
78
  return None
72
79
 
73
80
 
74
- def ar_infer_schema(
81
+ def to_pxt_schema(
75
82
  arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
76
83
  ) -> dict[str, ts.ColumnType]:
77
84
  """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
@@ -88,6 +95,94 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
88
95
  return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]
89
96
 
90
97
 
98
+ def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
99
+ import pyarrow as pa
100
+
101
+ pa_arrays: list[pa.Array] = []
102
+ for field in schema:
103
+ if isinstance(field.type, pa.FixedShapeTensorType):
104
+ stacked_arr = np.stack(column_vals[field.name])
105
+ pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
106
+ else:
107
+ pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
108
+ pa_arrays.append(pa_array)
109
+ return pa.RecordBatch.from_arrays(pa_arrays, schema=schema) # type: ignore
110
+
111
+
112
+ def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
113
+ arrow_schema = to_arrow_schema(df.schema)
114
+ batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
115
+ current_byte_estimate = 0
116
+ num_batch_rows = 0
117
+
118
+ # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
119
+ # over _exec()
120
+ try:
121
+ for data_row in df._exec():
122
+ num_batch_rows += 1
123
+ for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
124
+ val = data_row[e.slot_idx]
125
+ val_size_bytes: int
126
+ if val is None:
127
+ batch_columns[col_name].append(val)
128
+ continue
129
+
130
+ assert val is not None
131
+ if col_type.is_image_type():
132
+ # images get inlined into the parquet file
133
+ if data_row.file_paths[e.slot_idx] is not None:
134
+ # if there is a file, read directly to preserve information
135
+ with open(data_row.file_paths[e.slot_idx], 'rb') as f:
136
+ val = f.read()
137
+ elif isinstance(val, PIL.Image.Image):
138
+ # no file available: save as png
139
+ buf = io.BytesIO()
140
+ val.save(buf, format='png')
141
+ val = buf.getvalue()
142
+ else:
143
+ raise excs.Error(f'unknown image type {type(val)}')
144
+ val_size_bytes = len(val)
145
+ elif col_type.is_string_type():
146
+ val_size_bytes = len(val)
147
+ elif col_type.is_media_type():
148
+ assert data_row.file_paths[e.slot_idx] is not None
149
+ val = data_row.file_paths[e.slot_idx]
150
+ val_size_bytes = len(val)
151
+ elif col_type.is_json_type():
152
+ val = json.dumps(val)
153
+ val_size_bytes = len(val)
154
+ elif col_type.is_array_type():
155
+ val_size_bytes = val.nbytes
156
+ elif col_type.is_int_type() or col_type.is_float_type():
157
+ val_size_bytes = 8
158
+ elif col_type.is_bool_type():
159
+ val_size_bytes = 1
160
+ elif col_type.is_date_type():
161
+ val_size_bytes = 4
162
+ elif col_type.is_timestamp_type():
163
+ val = val.astimezone(datetime.timezone.utc)
164
+ val_size_bytes = 8
165
+ else:
166
+ raise excs.Error(f'unknown type {col_type} for {col_name}')
167
+
168
+ batch_columns[col_name].append(val)
169
+ current_byte_estimate += val_size_bytes
170
+
171
+ if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
172
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
173
+ yield record_batch
174
+ batch_columns = {k: [] for k in df.schema}
175
+ current_byte_estimate = 0
176
+ num_batch_rows = 0
177
+
178
+ except excs.ExprEvalError as e:
179
+ df._raise_expr_eval_err(e)
180
+
181
+ if num_batch_rows > 0:
182
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
183
+ yield record_batch
184
+
185
+
91
186
  def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
92
187
  """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
93
188
  this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
pixeltable/utils/lancedb.py ADDED
@@ -0,0 +1,88 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import shutil
5
+ from pathlib import Path
6
+ from typing import Literal
7
+
8
+ import pixeltable as pxt
9
+ import pixeltable.exceptions as excs
10
+ from pixeltable.catalog import Catalog
11
+ from pixeltable.env import Env
12
+
13
+ _logger = logging.getLogger('pixeltable')
14
+
15
+
16
+ def export_lancedb(
17
+ table_or_df: pxt.Table | pxt.DataFrame,
18
+ db_uri: Path,
19
+ table_name: str,
20
+ batch_size_bytes: int = 128 * 2**20,
21
+ if_exists: Literal['error', 'overwrite', 'append'] = 'error',
22
+ ) -> None:
23
+ """
24
+ Exports a dataframe's data to a LanceDB table.
25
+
26
+ This utilizes LanceDB's streaming interface for efficient table creation, via a sequence of in-memory pyarrow
27
+ `RecordBatches`, the size of which can be controlled with the `batch_size_bytes` parameter.
28
+
29
+ __Requirements:__
30
+
31
+ - `pip install lancedb`
32
+
33
+ Args:
34
+ table_or_df : Table or Dataframe to export.
35
+ db_uri: Local Path to the LanceDB database.
36
+ table_name : Name of the table in the LanceDB database.
37
+ batch_size_bytes : Maximum size in bytes for each batch.
38
+ if_exists: Determines the behavior if the table already exists. Must be one of the following:
39
+
40
+ - `'error'`: raise an error
41
+ - `'overwrite'`: overwrite the existing table
42
+ - `'append'`: append to the existing table
43
+ """
44
+ Env.get().require_package('lancedb')
45
+
46
+ import lancedb # type: ignore[import-untyped]
47
+
48
+ from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
49
+
50
+ if if_exists not in ('error', 'overwrite', 'append'):
51
+ raise excs.Error("export_lancedb(): 'if_exists' must be one of: ['error', 'overwrite', 'append']")
52
+
53
+ df: pxt.DataFrame
54
+ if isinstance(table_or_df, pxt.catalog.Table):
55
+ df = table_or_df._df()
56
+ else:
57
+ df = table_or_df
58
+
59
+ db_exists = False
60
+ if db_uri.exists():
61
+ if not db_uri.is_dir():
62
+ raise excs.Error(f"export_lancedb(): '{db_uri!s}' exists and is not a directory")
63
+ db_exists = True
64
+
65
+ try:
66
+ db = lancedb.connect(str(db_uri))
67
+ lance_tbl: lancedb.LanceTable | None = None
68
+ try:
69
+ lance_tbl = db.open_table(table_name)
70
+ if if_exists == 'error':
71
+ raise excs.Error(f'export_lancedb(): table {table_name!r} already exists in {db_uri!r}')
72
+ except ValueError:
73
+ # table doesn't exist
74
+ pass
75
+
76
+ with Catalog.get().begin_xact(for_write=False):
77
+ if lance_tbl is None or if_exists == 'overwrite':
78
+ mode = 'overwrite' if lance_tbl is not None else 'create'
79
+ arrow_schema = to_arrow_schema(df.schema)
80
+ _ = db.create_table(table_name, to_record_batches(df, batch_size_bytes), schema=arrow_schema, mode=mode)
81
+ else:
82
+ lance_tbl.add(to_record_batches(df, batch_size_bytes))
83
+
84
+ except Exception as e:
85
+ # cleanup
86
+ if not db_exists:
87
+ shutil.rmtree(db_uri)
88
+ raise e
pixeltable-0.4.13.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixeltable
3
- Version: 0.4.12
3
+ Version: 0.4.13
4
4
  Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
5
5
  Project-URL: homepage, https://pixeltable.com/
6
6
  Project-URL: repository, https://github.com/pixeltable/pixeltable
@@ -55,44 +55,41 @@ Requires-Dist: toml>=0.10
55
55
  Requires-Dist: tqdm>=4.64
56
56
  Description-Content-Type: text/markdown
57
57
 
58
- <div align="center">
59
- <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/resources/pixeltable-logo-large.png"
60
- alt="Pixeltable Logo" width="50%" />
61
- <br></br>
58
+ <picture class="github-only">
59
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/e9bf82b2-cace-4bd8-9523-b65495eb8131">
60
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/c5ab123e-806c-49bf-93e7-151353719b16">
61
+ <img alt="Pixeltable Logo" src="https://github.com/user-attachments/assets/e9bf82b2-cace-4bd8-9523-b65495eb8131" width="40%">
62
+ </picture>
62
63
 
63
- <h2>Declarative Data Infrastructure for Multimodal AI Apps</h2>
64
+ <div>
65
+ <br>
66
+ </div>
67
+
68
+ Pixeltable is the only open-source Python library that provides declarative data infrastructure for building multimodal AI applications, enabling incremental storage, transformation, indexing, retrieval, and orchestration of data.
64
69
 
65
70
  [![License](https://img.shields.io/badge/License-Apache%202.0-0530AD.svg)](https://opensource.org/licenses/Apache-2.0)
66
- ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pixeltable?logo=python&logoColor=white&)
67
- ![Platform Support](https://img.shields.io/badge/platform-Linux%20%7C%20macOS%20%7C%20Windows-E5DDD4)
68
- <br>
69
71
  [![tests status](https://github.com/pixeltable/pixeltable/actions/workflows/pytest.yml/badge.svg)](https://github.com/pixeltable/pixeltable/actions/workflows/pytest.yml)
70
72
  [![nightly status](https://github.com/pixeltable/pixeltable/actions/workflows/nightly.yml/badge.svg)](https://github.com/pixeltable/pixeltable/actions/workflows/nightly.yml)
71
73
  [![stress-tests status](https://github.com/pixeltable/pixeltable/actions/workflows/stress-tests.yml/badge.svg)](https://github.com/pixeltable/pixeltable/actions/workflows/stress-tests.yml)
72
74
  [![PyPI Package](https://img.shields.io/pypi/v/pixeltable?color=4D148C)](https://pypi.org/project/pixeltable/)
73
75
  [![My Discord (1306431018890166272)](https://img.shields.io/badge/💬-Discord-%235865F2.svg)](https://discord.gg/QPyqFYx2UN)
74
76
 
75
- [**Installation**](https://docs.pixeltable.com/docs/overview/installation) |
76
77
  [**Quick Start**](https://docs.pixeltable.com/docs/overview/quick-start) |
77
78
  [**Documentation**](https://docs.pixeltable.com/) |
78
79
  [**API Reference**](https://pixeltable.github.io/pixeltable/) |
79
- [**Examples**](https://docs.pixeltable.com/docs/examples/use-cases) |
80
+ [**Sample Apps**](https://github.com/pixeltable/pixeltable/tree/main/docs/sample-apps) |
80
81
  [**Discord Community**](https://discord.gg/QPyqFYx2UN)
81
82
 
82
- </div>
83
-
84
83
  ---
85
84
 
86
- ## 💾 Installation
85
+ ## Installation
87
86
 
88
87
  ```bash
89
88
  pip install pixeltable
90
89
  ```
90
+ Pixeltable replaces the complex multi-system architecture typically needed for AI applications (databases, file storage, vector DBs, APIs, orchestration) with a single declarative table interface that natively handles multimodal data like images, videos, and documents.
91
91
 
92
- **Pixeltable unifies storage, retrieval, and orchestration for multimodal data.**
93
- It stores metadata and computed results persistently, typically in a `.pixeltable` directory in your workspace.
94
-
95
- ## Pixeltable Demo
92
+ ## Demo
96
93
 
97
94
  https://github.com/user-attachments/assets/b50fd6df-5169-4881-9dbe-1b6e5d06cede
98
95
 
@@ -152,7 +149,7 @@ results = t.select(
152
149
  ).collect()
153
150
  ```
154
151
 
155
- ## What Happened?
152
+ ## What Happened?
156
153
 
157
154
  * **Data Ingestion & Storage:** References [files](https://docs.pixeltable.com/docs/datastore/bringing-data)
158
155
  (images, videos, audio, docs) in place, handles structured data.
@@ -174,7 +171,7 @@ as in the `insert` statement above, Pixeltable caches them locally before proces
174
171
  [Working with External Files](https://github.com/pixeltable/pixeltable/blob/main/docs/notebooks/feature-guides/working-with-external-files.ipynb)
175
172
  notebook for more details.
176
173
 
177
- ## 🗄️ Where Did My Data Go?
174
+ ## Where Did My Data Go?
178
175
 
179
176
  Pixeltable workloads generate various outputs, including both structured outputs (such as bounding boxes for detected
180
177
  objects) and/or unstructured outputs (such as generated images or video). By default, everything resides in your
@@ -186,125 +183,163 @@ a unified table interface over both structured and unstructured data.
186
183
  In general, the user is not expected to interact directly with the data in `~/.pixeltable`; the data store is fully
187
184
  managed by Pixeltable and is intended to be accessed through the Pixeltable Python SDK.
188
185
 
189
- ## ⚖️ Key Principles
186
+ ## Key Principles
190
187
 
191
- * **[Unified Multimodal Interface:](https://docs.pixeltable.com/docs/datastore/tables-and-operations)** `pxt.Image`,
192
- `pxt.Video`, `pxt.Audio`, `pxt.Document`, etc. – manage diverse data consistently.
188
+ **[Unified Multimodal Interface:](https://docs.pixeltable.com/docs/datastore/tables-and-operations)** `pxt.Image`,
189
+ `pxt.Video`, `pxt.Audio`, `pxt.Document`, etc. – manage diverse data consistently.
193
190
 
194
- ```python
195
- t = pxt.create_table(
196
- 'media',
197
- {
198
- 'img': pxt.Image,
199
- 'video': pxt.Video
200
- }
201
- )
202
- ```
191
+ ```python
192
+ t = pxt.create_table(
193
+ 'media',
194
+ {
195
+ 'img': pxt.Image,
196
+ 'video': pxt.Video
197
+ }
198
+ )
199
+ ```
203
200
 
204
- * **[Declarative Computed Columns:](https://docs.pixeltable.com/docs/datastore/computed-columns)** Define processing
205
- steps once; they run automatically on new/updated data.
201
+ **[Declarative Computed Columns:](https://docs.pixeltable.com/docs/datastore/computed-columns)** Define processing
202
+ steps once; they run automatically on new/updated data.
206
203
 
207
- ```python
208
- t.add_computed_column(
209
- classification=huggingface.vit_for_image_classification(
210
- t.image
211
- )
212
- )
213
- ```
204
+ ```python
205
+ t.add_computed_column(
206
+ classification=huggingface.vit_for_image_classification(
207
+ t.image
208
+ )
209
+ )
210
+ ```
214
211
 
215
- * **[Built-in Vector Search:](https://docs.pixeltable.com/docs/datastore/embedding-index)** Add embedding indexes and
216
- perform similarity searches directly on tables/views.
212
+ **[Built-in Vector Search:](https://docs.pixeltable.com/docs/datastore/embedding-index)** Add embedding indexes and
213
+ perform similarity searches directly on tables/views.
217
214
 
218
- ```python
219
- t.add_embedding_index(
220
- 'img',
221
- embedding=clip.using(
222
- model_id='openai/clip-vit-base-patch32'
223
- )
224
- )
215
+ ```python
216
+ t.add_embedding_index(
217
+ 'img',
218
+ embedding=clip.using(
219
+ model_id='openai/clip-vit-base-patch32'
220
+ )
221
+ )
225
222
 
226
- sim = t.img.similarity("cat playing with yarn")
227
- ```
223
+ sim = t.img.similarity("cat playing with yarn")
224
+ ```
228
225
 
229
- * **[On-the-Fly Data Views:](https://docs.pixeltable.com/docs/datastore/views)** Create virtual tables using iterators
230
- for efficient processing without data duplication.
226
+ **[Incremental View Maintenance:](https://docs.pixeltable.com/docs/datastore/views)** Create virtual tables using iterators
227
+ for efficient processing without data duplication.
231
228
 
232
- ```python
233
- frames = pxt.create_view(
234
- 'frames',
235
- videos,
236
- iterator=FrameIterator.create(
237
- video=videos.video,
238
- fps=1
239
- )
240
- )
241
- ```
229
+ ```python
230
+ # Document chunking with overlap & metadata and many more options to build your own iterator
231
+ chunks = pxt.create_view('chunks', docs,
232
+ iterator=DocumentSplitter.create(
233
+ document=docs.doc,
234
+ separators='sentence,token_limit',
235
+ overlap=50, limit=500
236
+ ))
237
+
238
+ # Video frame extraction
239
+ frames = pxt.create_view('frames', videos,
240
+ iterator=FrameIterator.create(video=videos.video, fps=0.5))
241
+ ```
242
242
 
243
- * **[Seamless AI Integration:](https://docs.pixeltable.com/docs/integrations/frameworks)** Built-in functions for
244
- OpenAI, Anthropic, Hugging Face, CLIP, YOLOX, and more.
243
+ **[Seamless AI Integration:](https://docs.pixeltable.com/docs/integrations/frameworks)** Built-in functions for
244
+ OpenAI, Anthropic, Hugging Face, CLIP, YOLOX, and more.
245
245
 
246
- ```python
247
- t.add_computed_column(
248
- response=openai.chat_completions(
249
- messages=[{"role": "user", "content": t.prompt}]
250
- )
251
- )
252
- ```
246
+ ```python
247
+ # LLM integration (OpenAI, Anthropic, etc.)
248
+ t.add_computed_column(
249
+ response=openai.chat_completions(
250
+ messages=[{"role": "user", "content": t.prompt}], model='gpt-4o-mini'
251
+ )
252
+ )
253
253
 
254
- * **[Bring Your Own Code:](https://docs.pixeltable.com/docs/datastore/custom-functions)** Extend Pixeltable with simple
255
- Python User-Defined Functions.
254
+ # Computer vision (YOLOX object detection)
255
+ t.add_computed_column(
256
+ detections=yolox(t.image, model_id='yolox_s', threshold=0.5)
257
+ )
256
258
 
257
- ```python
258
- @pxt.udf
259
- def format_prompt(context: list, question: str) -> str:
260
- return f"Context: {context}\nQuestion: {question}"
261
- ```
259
+ # Embedding models (Hugging Face, CLIP)
260
+ t.add_computed_column(
261
+ embeddings=huggingface.sentence_transformer(
262
+ t.text, model_id='all-MiniLM-L6-v2'
263
+ )
264
+ )
265
+ ```
262
266
 
263
- * **[Agentic Workflows / Tool Calling:](https://docs.pixeltable.com/docs/examples/chat/tools)** Register `@pxt.udf` or
264
- `@pxt.query` functions as tools and orchestrate LLM-based tool use (incl. multimodal).
267
+ **[Bring Your Own Code:](https://docs.pixeltable.com/docs/datastore/custom-functions)** Extend Pixeltable with UDFs, batch processing, and custom aggregators.
268
+
269
+ ```python
270
+ @pxt.udf
271
+ def format_prompt(context: list, question: str) -> str:
272
+ return f"Context: {context}\nQuestion: {question}"
273
+ ```
265
274
 
266
- ```python
267
- # Example tools: a UDF and a Query function for RAG
268
- tools = pxt.tools(get_weather_udf, search_context_query)
275
+ **[Agentic Workflows / Tool Calling:](https://docs.pixeltable.com/docs/examples/chat/tools)** Register `@pxt.udf`,
276
+ `@pxt.query` functions, or **MCP tools** as tools.
269
277
 
270
- # LLM decides which tool to call; Pixeltable executes it
271
- t.add_computed_column(
272
- tool_output=invoke_tools(tools, t.llm_tool_choice)
273
- )
274
- ```
275
-
276
- * **[Data Persistence:](https://docs.pixeltable.com/docs/datastore/tables-and-operations#data-operations)** All data,
277
- metadata, and computed results are automatically stored and versioned.
278
-
279
- ```python
280
- t = pxt.get_table('my_table') # Get a handle to an existing table
281
- t.select(t.account, t.balance).collect() # Query its contents
282
- t.revert() # Undo the last modification to the table and restore its previous state
283
- ```
284
-
285
- * **[Time Travel:](https://docs.pixeltable.com/docs/datastore/tables-and-operations#data-operations)** By default,
286
- Pixeltable preserves the full change history of each table, and any prior version can be selected and queried.
287
-
288
- ```python
289
- t.history() # Display a human-readable list of all prior versions of the table
290
- old_version = pxt.get_table('my_table:472') # Get a handle to a specific table version
291
- old_version.select(t.account, t.balance).collect() # Query the older version
292
- ```
293
-
294
- * **[SQL-like Python Querying:](https://docs.pixeltable.com/docs/datastore/filtering-and-selecting)** Familiar syntax
295
- combined with powerful AI capabilities.
296
-
297
- ```python
298
- results = (
299
- t.where(t.score > 0.8)
300
- .order_by(t.timestamp)
301
- .select(t.image, score=t.score)
302
- .limit(10)
303
- .collect()
304
- )
305
- ```
278
+ ```python
279
+ # Example tools: UDFs, Query functions, and MCP tools
280
+ mcp_tools = pxt.mcp_udfs('http://localhost:8000/mcp') # Load from MCP server
281
+ tools = pxt.tools(get_weather_udf, search_context_query, *mcp_tools)
282
+
283
+ # LLM decides which tool to call; Pixeltable executes it
284
+ t.add_computed_column(
285
+ tool_output=invoke_tools(tools, t.llm_tool_choice)
286
+ )
287
+ ```
288
+
289
+ **[Data Persistence:](https://docs.pixeltable.com/docs/datastore/tables-and-operations#data-operations)** All data,
290
+ metadata, and computed results are automatically stored and versioned.
291
+
292
+ ```python
293
+ t = pxt.get_table('my_table') # Get a handle to an existing table
294
+ t.select(t.account, t.balance).collect() # Query its contents
295
+ t.revert() # Undo the last modification to the table and restore its previous state
296
+ ```
297
+
298
+ **[Time Travel:](https://docs.pixeltable.com/docs/datastore/tables-and-operations#data-operations)** By default,
299
+ Pixeltable preserves the full change history of each table, and any prior version can be selected and queried.
300
+
301
+ ```python
302
+ t.history() # Display a human-readable list of all prior versions of the table
303
+ old_version = pxt.get_table('my_table:472') # Get a handle to a specific table version
304
+ old_version.select(t.account, t.balance).collect() # Query the older version
305
+ ```
306
+
307
+ **[SQL-like Python Querying:](https://docs.pixeltable.com/docs/datastore/filtering-and-selecting)** Familiar syntax
308
+ combined with powerful AI capabilities.
309
+
310
+ ```python
311
+ results = (
312
+ t.where(t.score > 0.8)
313
+ .order_by(t.timestamp)
314
+ .select(t.image, score=t.score)
315
+ .limit(10)
316
+ .collect()
317
+ )
318
+ ```
319
+
320
+ **[I/O & Integration:](https://pixeltable.github.io/pixeltable/pixeltable/io/)** Export to multiple
321
+ formats and integrate with the ML/AI tool ecosystem.
322
+
323
+ ```python
324
+ # Export to analytics/ML formats
325
+ pxt.export_parquet(table, 'data.parquet', partition_size_bytes=100_000_000)
326
+ pxt.export_lancedb(table, 'vector_db')
327
+
328
+ # DataFrame conversions
329
+ results = table.select(table.image, table.labels).collect()
330
+ df = results.to_pandas() # → pandas DataFrame
331
+ models = results.to_pydantic(MyModel) # → Pydantic models
332
+
333
+ # Specialized ML dataset formats
334
+ coco_path = table.to_coco_dataset() # → COCO annotations
335
+ pytorch_ds = table.to_pytorch_dataset('pt') # → PyTorch DataLoader ready
336
+
337
+ # ML tool integrations
338
+ pxt.create_label_studio_project(table, label_config) # Annotation
339
+ pxt.export_images_as_fo_dataset(table, table.image) # FiftyOne
340
+ ```
306
341
 
307
- ## 💡 Key Examples
342
+ ## Key Examples
308
343
 
309
344
  *(See the [Full Quick Start](https://docs.pixeltable.com/docs/overview/quick-start) or
310
345
  [Notebook Gallery](#notebook-gallery) for more details)*
@@ -497,7 +532,7 @@ print("--- Final Answer ---")
497
532
  print(qa.select(qa.answer).collect())
498
533
  ```
499
534
 
500
- ## 📚 Notebook Gallery
535
+ ## Notebook Gallery
501
536
 
502
537
  Explore Pixeltable's capabilities interactively:
503
538
 
@@ -514,7 +549,7 @@ Explore Pixeltable's capabilities interactively:
514
549
  | Object Detection | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/release/docs/notebooks/use-cases/object-detection-in-videos.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | Image/Text Search | <a target="_blank" href="https://github.com/pixeltable/pixeltable/tree/main/docs/sample-apps/text-and-image-similarity-search-nextjs-fastapi"> <img src="https://img.shields.io/badge/🖥️%20App-black.svg" alt="GitHub App"/> </a> |
515
550
  | Audio Transcription | <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/release/docs/notebooks/use-cases/audio-transcriptions.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a> | Discord Bot | <a target="_blank" href="https://github.com/pixeltable/pixeltable/blob/main/docs/sample-apps/context-aware-discord-bot"> <img src="https://img.shields.io/badge/%F0%9F%92%AC%20Bot-%235865F2.svg" alt="GitHub App"/></a> |
516
551
 
517
- ## 🚨 Maintaining Production-Ready Multimodal AI Apps is Still Too Hard
552
+ ## Maintaining Production-Ready Multimodal AI Apps is Still Too Hard
518
553
 
519
554
  Building robust AI applications, especially [multimodal](https://docs.pixeltable.com/docs/datastore/bringing-data) ones,
520
555
  requires stitching together numerous tools:
@@ -528,7 +563,7 @@ requires stitching together numerous tools:
528
563
 
529
564
  This complex "data plumbing" slows down development, increases costs, and makes applications brittle and hard to reproduce.
530
565
 
531
- ## 🔮 Roadmap (2025)
566
+ ## Roadmap (2025)
532
567
 
533
568
  ### Cloud Infrastructure and Deployment
534
569
 
@@ -538,13 +573,13 @@ We're working on a hosted Pixeltable service that will:
538
573
  * Provide a persistent cloud instance
539
574
  * Turn Pixeltable workflows (Tables, Queries, UDFs) into API endpoints/[MCP Servers](https://github.com/pixeltable/pixeltable-mcp-server)
540
575
 
541
- ## 🤝 Contributing
576
+ ## Contributing
542
577
 
543
578
  We love contributions! Whether it's reporting bugs, suggesting features, improving documentation, or submitting code
544
579
  changes, please check out our [Contributing Guide](CONTRIBUTING.md) and join the
545
580
  [Discussions](https://github.com/pixeltable/pixeltable/discussions) or our
546
581
  [Discord Server](https://discord.gg/QPyqFYx2UN).
547
582
 
548
- ## 🏢 License
583
+ ## License
549
584
 
550
585
  Pixeltable is licensed under the Apache 2.0 License.
@@ -1,15 +1,15 @@
1
- pixeltable/__init__.py,sha256=wJ_4oQdkBAaaVKM8XiZKKSsWPnoemZxh34o6_5vDcxk,1562
1
+ pixeltable/__init__.py,sha256=PDfphK_WypPopRbBNhJ0wXiX5T9Vp4Vq9Hf8Oz_oXZA,1620
2
2
  pixeltable/__version__.py,sha256=LnMIuAxx6nAQDMev_jnZyUdgsaiE3F8lulfXQBRl9qQ,112
3
3
  pixeltable/config.py,sha256=-aoSVF0Aak83IC-u-XANw3if76TDq5VnnWNWoFDR5Hc,8390
4
4
  pixeltable/dataframe.py,sha256=XbrzPjnPgZKJ5lVgPO71cK-nRHCpqGCGWFc52kUO8_E,64213
5
- pixeltable/env.py,sha256=FlE7s649xBiE5WSs65WwQ4bKbPjMYQaF0Z0HeuEuCs4,44160
5
+ pixeltable/env.py,sha256=LUTOi3DcinsVFoqiOmsG8Dlhe8yWBEfgIdY9rOlJMME,44203
6
6
  pixeltable/exceptions.py,sha256=Gm8d3TL2iiv6Pj2DLd29wp_j41qNBhxXL9iTQnL4Nk4,1116
7
- pixeltable/globals.py,sha256=nR6XJKFlsb12oo_wOWAoAMlnPbHY7FhM3dgEKoM9iSM,39262
7
+ pixeltable/globals.py,sha256=dktqUbpsiLorB4-1VjYDp7LH0rfqfh_3c8OD819K_H4,39183
8
8
  pixeltable/plan.py,sha256=4yAe7ExAqaSvkFxwK7LPH_HpmoumwqoLeOo7czJ8CyQ,48001
9
9
  pixeltable/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  pixeltable/store.py,sha256=CneWUmgN-EwaPYLcizlAxONC7WYwMr8SNpSFeNBBmOA,22885
11
11
  pixeltable/type_system.py,sha256=UfPZZy4zJ2kGvdHXI9rqxOGAjgIxCZ9QGvvidPWcq-M,56153
12
- pixeltable/catalog/__init__.py,sha256=zw6hiyAIjMBxCExtsr7G51ul2XQ9fTQQKcs45rIy7xA,682
12
+ pixeltable/catalog/__init__.py,sha256=GL0MLxqCBHlhKWqhC3e9B4kwTazagTOiqBHHRjyWbTg,726
13
13
  pixeltable/catalog/catalog.py,sha256=gaq10XFwkr6jyv8yVi5xV3_oiDkPvqVe55vxOo14W6k,93853
14
14
  pixeltable/catalog/column.py,sha256=MXa5o3ku94T8ZFEL7wnAvqvlk65fOmmHPqIvrUVf3uo,13514
15
15
  pixeltable/catalog/dir.py,sha256=VYTscPlKR6XhupPTXlJ8txAHxS5GSpPJ3LIleDJagVQ,2047
@@ -18,7 +18,8 @@ pixeltable/catalog/insertable_table.py,sha256=VUuJ8z7OtMqgy_LMzkn1KzeLXdR-9poTtt
18
18
  pixeltable/catalog/named_function.py,sha256=vZ-j7P4HugWh9OmUzBMwyRYvO3tQn9jWyJz_1stPavU,1210
19
19
  pixeltable/catalog/path.py,sha256=O3FfxrvyX2crijBhp_2k4-3mG3BFxwba-tlPB74QtJQ,3780
20
20
  pixeltable/catalog/schema_object.py,sha256=rQ6-3rzqnOHyEEHi97kai2S7BO3D9AkH7rirnfbGc14,1785
21
- pixeltable/catalog/table.py,sha256=Ug65hRZhzjp3sIUSppA-mXUEWLXgPK22bq22f7WFy0M,81816
21
+ pixeltable/catalog/table.py,sha256=phOf59IZJO7xPPR91F2trJpA4TC9lic-dd13mbiUz5Q,80222
22
+ pixeltable/catalog/table_metadata.py,sha256=MVxJLS6Tz2PVOerlnoOOjjhq6LxUdDLeN0BUJf42Smw,3518
22
23
  pixeltable/catalog/table_version.py,sha256=SRF2ACp_DcPMLTbc4dbZSgYEfW6-o-UzDOBehecKbb0,65073
23
24
  pixeltable/catalog/table_version_handle.py,sha256=FTPRqcGY-h-POcWyZbd9b8P2D5zIw5OSUvwF_dbyCGo,3608
24
25
  pixeltable/catalog/table_version_path.py,sha256=IaFVDH06_6ZMuBv5eLNCRTlWizpvz95jgAzqp4OVx_o,9713
@@ -113,16 +114,17 @@ pixeltable/index/__init__.py,sha256=97aFuxiP_oz1ldn5iq8IWApkOV8XG6ZIBW5-9rkS0vM,
113
114
  pixeltable/index/base.py,sha256=200s7v3Zy810bRlbSAYzxxaEjVssl6r8esTHiSvWRwQ,1704
114
115
  pixeltable/index/btree.py,sha256=8B06D67ay0DFUtEBC5q4bLjxMq7ILpKyyoLAiSaamzA,2503
115
116
  pixeltable/index/embedding_index.py,sha256=B_k_3UJmSv7t2ljUg8GC_D4t1jc03PVsTAvxqiTmHBA,11754
116
- pixeltable/io/__init__.py,sha256=chVGh3ygtZwSY6g_skIyCsjxwzo2847jDq9YGObAY98,608
117
+ pixeltable/io/__init__.py,sha256=SO9xvWuQHfg_YyVahDmstB3lSuMoPKRarW8qgUR81jM,655
117
118
  pixeltable/io/datarows.py,sha256=s2fDQTttGxq7cS5JwKFEJRSKn6WsXTaGdmm9VJSl_2M,6154
118
119
  pixeltable/io/external_store.py,sha256=rOYBwTqcZZVU2toWxJ_9Iy2w2YO0DhuABrM2xGmqHSo,14787
119
120
  pixeltable/io/fiftyone.py,sha256=JcAL9zFszSTcsws6ioF1KZZJFmUeg-11W-c4Gyh3FyQ,6891
120
121
  pixeltable/io/globals.py,sha256=B9ubI9Z0m2wGPZXWmZm10vlaP0UCuUsVyrMWvyudZSc,11360
121
122
  pixeltable/io/hf_datasets.py,sha256=5WfWfXoQppG1Bx_pS5n44KO1Vo_mEb_S82PLB8cLfAU,5606
122
123
  pixeltable/io/label_studio.py,sha256=OCQBVgGjXRSdukFQv2ZKdaBmpxanqH9ibDLxZd1L3mc,31469
124
+ pixeltable/io/lancedb.py,sha256=kNcYXptieMlJ6yxEIZHVFklEMOEB2mrSyp7XZmOw4qs,82
123
125
  pixeltable/io/pandas.py,sha256=xQmkwbqE9_fjbbPUgeG5yNICrbVVK73UHxDL-cgrQw0,9007
124
- pixeltable/io/parquet.py,sha256=qoVDuCoW-Tq14IlzN_psoNP7z83hIQ3ZEg_pKzHSqoY,7796
125
- pixeltable/io/table_data_conduit.py,sha256=--UWwG6agBtOA5PLPfjxp2XKoAQ-f5nSPJqOgA5DAAI,22062
126
+ pixeltable/io/parquet.py,sha256=qVvg9nixJnK9gXYxZocD8HE13SznyLrgW9IsehtT4j4,4101
127
+ pixeltable/io/table_data_conduit.py,sha256=8jwQ3IOoOBS-8j2TEfgiqsFUD85kEP5IjoC0dg2uPEk,22058
126
128
  pixeltable/io/utils.py,sha256=qzBTmqdIawXMt2bfXQOraYnEstL69eC2Z33nl8RrwJk,4244
127
129
  pixeltable/iterators/__init__.py,sha256=hI937cmBRU3eWbfJ7miFthAGUo_xmcYciw6gAjOCg9g,470
128
130
  pixeltable/iterators/audio.py,sha256=HYE8JcqaJsTGdrq4NkwV5tn7lcyMp6Fjrm59efOLzb0,9671
@@ -171,7 +173,7 @@ pixeltable/share/__init__.py,sha256=PTX1mw61Ss4acEOI-sUlu0HaoVsosLqwDfh0ldn8Hkg,
171
173
  pixeltable/share/packager.py,sha256=5rSKnQCs3YP5h48d79bXEK4L8tLUSeTSbXaB8X9SmBI,31265
172
174
  pixeltable/share/publish.py,sha256=VE_H3ux56gdSHd8_ganxCnNYtxrjaalMPgwAIYmdbE8,11300
173
175
  pixeltable/utils/__init__.py,sha256=45qEM20L2VuIe-Cc3BTKWFqQb-S7A8qDtmmgl77zYK0,1728
174
- pixeltable/utils/arrow.py,sha256=Rooa02GL5k--D2utlKATtYKrrlsHbbi6JmkarXMux1M,6384
176
+ pixeltable/utils/arrow.py,sha256=U7vb_ffPCR7zv-phyBMPMDosPdKN6LK4IVMpfm2mRy8,10424
175
177
  pixeltable/utils/av.py,sha256=omJufz62dzaTTwlR7quKfcT7apf8KkBLJ9cQ9240dt0,4016
176
178
  pixeltable/utils/coco.py,sha256=Y1DWVYguZD4VhKyf7JruYfHWvhkJLq39fzbiSm5cdyY,7304
177
179
  pixeltable/utils/code.py,sha256=3CZMVJm69JIG5sxmd56mjB4Fo4L-s0_Y8YvQeJIj0F0,1280
@@ -185,14 +187,15 @@ pixeltable/utils/filecache.py,sha256=3TTEqhGg0pEAP_l0GKn34uspC4dha1jPab1Ka9_oTBM
185
187
  pixeltable/utils/formatter.py,sha256=tbMxE9rBw6wdKUnJhNZ8h9uAF8dZKcihQ2KesqAag9A,10096
186
188
  pixeltable/utils/http_server.py,sha256=6khOAtpVj1lDIm9Dx8VIECLm87cFEp4IFbAg8T92A2o,2441
187
189
  pixeltable/utils/iceberg.py,sha256=COeNqqy5RRMkDGLS8CTnaUeAccG10x2fwP3e1veuqIA,522
190
+ pixeltable/utils/lancedb.py,sha256=Otr-t47YACRo0Cq9-FyelcUuan1Kgs4gxCOpLOckj3s,2988
188
191
  pixeltable/utils/media_store.py,sha256=-rYfpZOUrWU1YtEFrxdrn9Na0NeyRW3HJYsOdH-kJO4,10898
189
192
  pixeltable/utils/pydantic.py,sha256=-ztUsuRXA7B6bywb5Yy1h5pNQ2DnsT1d0oHMxqtK3WY,2011
190
193
  pixeltable/utils/pytorch.py,sha256=564VHRdDHwD9h0v5lBHEDTJ8c6zx8wuzWYx8ZYjBxlI,3621
191
194
  pixeltable/utils/s3.py,sha256=pxip2MlCqd2Qon2dzJXzfxvwtZyc-BAsjAnLL4J_OXY,587
192
195
  pixeltable/utils/sql.py,sha256=Sa4Lh-VGe8GToU5W7DRiWf2lMl9B6saPqemiT0ZdHEc,806
193
196
  pixeltable/utils/transactional_directory.py,sha256=OFKmu90oP7KwBAljwjnzP_w8euGdAXob3y4Nx9SCNHA,1357
194
- pixeltable-0.4.12.dist-info/METADATA,sha256=mJSdwTquIKJqX6iziu0ZKHMg0guXT1HrGwqNCGvjZJw,24248
195
- pixeltable-0.4.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
196
- pixeltable-0.4.12.dist-info/entry_points.txt,sha256=rrKugZmxDtGnXCnEQ5UJMaaSYY7-g1cLjUZ4W1moIhM,98
197
- pixeltable-0.4.12.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
198
- pixeltable-0.4.12.dist-info/RECORD,,
197
+ pixeltable-0.4.13.dist-info/METADATA,sha256=VSQp0eAebSMwoxcFkjAwTQbtuLISMx-PZ-LoCJo55hg,25631
198
+ pixeltable-0.4.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
199
+ pixeltable-0.4.13.dist-info/entry_points.txt,sha256=rrKugZmxDtGnXCnEQ5UJMaaSYY7-g1cLjUZ4W1moIhM,98
200
+ pixeltable-0.4.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
201
+ pixeltable-0.4.13.dist-info/RECORD,,