pixeltable 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable has been flagged as potentially problematic by the registry.
- pixeltable/__init__.py +2 -27
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +309 -59
- pixeltable/catalog/globals.py +5 -5
- pixeltable/catalog/insertable_table.py +13 -1
- pixeltable/catalog/path.py +13 -6
- pixeltable/catalog/table.py +28 -41
- pixeltable/catalog/table_version.py +100 -72
- pixeltable/catalog/view.py +35 -9
- pixeltable/dataframe.py +2 -2
- pixeltable/exceptions.py +20 -2
- pixeltable/exec/expr_eval/evaluators.py +0 -4
- pixeltable/exec/expr_eval/expr_eval_node.py +0 -1
- pixeltable/exec/sql_node.py +3 -3
- pixeltable/exprs/json_path.py +1 -5
- pixeltable/func/__init__.py +1 -1
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +1 -1
- pixeltable/func/expr_template_function.py +2 -2
- pixeltable/func/function.py +3 -4
- pixeltable/func/query_template_function.py +87 -4
- pixeltable/func/tools.py +1 -1
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +1 -1
- pixeltable/functions/bedrock.py +130 -0
- pixeltable/functions/huggingface.py +7 -6
- pixeltable/functions/image.py +15 -16
- pixeltable/functions/mistralai.py +3 -2
- pixeltable/functions/openai.py +9 -8
- pixeltable/functions/together.py +4 -3
- pixeltable/globals.py +7 -2
- pixeltable/io/datarows.py +4 -3
- pixeltable/io/label_studio.py +17 -17
- pixeltable/io/pandas.py +13 -12
- pixeltable/io/table_data_conduit.py +8 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +26 -1
- pixeltable/plan.py +2 -3
- pixeltable/share/packager.py +9 -25
- pixeltable/share/publish.py +20 -9
- pixeltable/store.py +7 -4
- pixeltable/utils/exception_handler.py +59 -0
- {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/METADATA +1 -1
- {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/RECORD +53 -48
- {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED

```diff
@@ -616,9 +616,14 @@ def _extract_paths(
         matches = [name for name, entry in dir_entries.items() if entry.dir is not None]
     else:
         matches = [name for name, entry in dir_entries.items() if entry.table is not None]
+
+    # Filter out system paths
+    matches = [name for name in matches if catalog.is_valid_identifier(name)]
     result = [parent.append(name) for name in matches]
-
-
+
+    for name, entry in dir_entries.items():
+        if len(entry.dir_entries) > 0 and catalog.is_valid_identifier(name):
+            result.extend(_extract_paths(entry.dir_entries, parent=parent.append(name), entry_type=entry_type))
     return result
 
 
```
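The new lines do two things: they drop entries whose names are not valid identifiers (system paths), and they recurse into non-empty subdirectories. A minimal standalone sketch of that traversal pattern follows; `DirEntry` and the leading-underscore convention for system names are hypothetical stand-ins, not Pixeltable's actual catalog types or rules:

```python
from dataclasses import dataclass, field

@dataclass
class DirEntry:
    # hypothetical stand-in for the catalog's directory-entry type
    children: dict[str, 'DirEntry'] = field(default_factory=dict)

def is_valid_identifier(name: str) -> bool:
    # assumption: system paths are distinguished by a leading underscore
    return name.isidentifier() and not name.startswith('_')

def extract_paths(entries: dict[str, DirEntry], parent: str = '') -> list[str]:
    # keep only user-visible names at this level
    result = [f'{parent}.{n}' if parent else n for n in entries if is_valid_identifier(n)]
    # recurse into non-empty subdirectories, mirroring the new loop above
    for name, entry in entries.items():
        if entry.children and is_valid_identifier(name):
            result.extend(extract_paths(entry.children, f'{parent}.{name}' if parent else name))
    return result

root = {'docs': DirEntry({'films': DirEntry()}), '_system': DirEntry()}
print(extract_paths(root))  # ['docs', 'docs.films']
```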
pixeltable/io/datarows.py CHANGED

```diff
@@ -3,13 +3,14 @@ from __future__ import annotations
 from typing import Any, Iterable, Optional, Union
 
 import pixeltable as pxt
+import pixeltable.type_system as ts
 from pixeltable import exceptions as excs
 
 
 def _infer_schema_from_rows(
     rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
-) -> dict[str, pxt.ColumnType]:
-    schema: dict[str, pxt.ColumnType] = {}
+) -> dict[str, ts.ColumnType]:
+    schema: dict[str, ts.ColumnType] = {}
     cols_with_nones: set[str] = set()
 
     for n, row in enumerate(rows):
@@ -23,7 +24,7 @@ def _infer_schema_from_rows(
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
             # The column type will always be nullable by default.
-            col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
+            col_type = ts.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
             if col_type is None:
                 raise excs.Error(
                     f'Could not infer type for column `{col_name}`; the value in row {n} '
```
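For context, here is a toy version of the inference contract used above: a literal maps to a column type, and primary-key columns come out non-nullable. The names below are illustrative stand-ins, not Pixeltable's real API:

```python
from typing import Any, Iterable, Optional

def infer_literal_type(value: Any, nullable: bool) -> Optional[str]:
    # maps a Python literal to a type name; None if unsupported (bool checked before int,
    # since bool is a subclass of int)
    for py_type, name in ((bool, 'Bool'), (int, 'Int'), (float, 'Float'), (str, 'String')):
        if isinstance(value, py_type):
            return f'{name}(nullable={nullable})'
    return None

def infer_schema(rows: Iterable[dict[str, Any]], primary_key: list[str]) -> dict[str, str]:
    schema: dict[str, str] = {}
    for row in rows:
        for col_name, value in row.items():
            if col_name not in schema and value is not None:
                # primary-key columns are inferred as non-nullable
                col_type = infer_literal_type(value, nullable=col_name not in primary_key)
                if col_type is None:
                    raise ValueError(f'could not infer type for column {col_name!r}')
                schema[col_name] = col_type
    return schema

print(infer_schema([{'id': 1, 'name': 'a'}], primary_key=['id']))
# {'id': 'Int(nullable=False)', 'name': 'String(nullable=True)'}
```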
pixeltable/io/label_studio.py CHANGED

````diff
@@ -11,7 +11,7 @@ import label_studio_sdk  # type: ignore[import-untyped]
 import PIL.Image
 from requests.exceptions import HTTPError
 
-import pixeltable as pxt
+import pixeltable.type_system as ts
 from pixeltable import Column, Table, env, exceptions as excs
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
@@ -89,21 +89,21 @@ class LabelStudioProject(Project):
     def __project_config(self) -> '_LabelStudioConfig':
         return self.__parse_project_config(self.project_params['label_config'])
 
-    def get_export_columns(self) -> dict[str, pxt.ColumnType]:
+    def get_export_columns(self) -> dict[str, ts.ColumnType]:
         """
         The data keys and preannotation fields specified in this Label Studio project.
         """
         return self.__project_config.export_columns
 
-    def get_import_columns(self) -> dict[str, pxt.ColumnType]:
+    def get_import_columns(self) -> dict[str, ts.ColumnType]:
         """
         Always contains a single entry:
 
         ```
-        {"annotations": pxt.JsonType(nullable=True)}
+        {"annotations": ts.JsonType(nullable=True)}
         ```
         """
-        return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
+        return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
 
     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
         _logger.info(
@@ -412,8 +412,8 @@ class LabelStudioProject(Project):
         # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
         ancestor = t
         while local_annotations_col not in ancestor._tbl_version.get().cols:
-            assert ancestor._base is not None
-            ancestor = ancestor._base
+            assert ancestor._base_table is not None
+            ancestor = ancestor._base_table
         update_status = ancestor.batch_update(updates)
         env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
         return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -577,10 +577,10 @@ class LabelStudioProject(Project):
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
         if local_annotations_column not in t._schema:
-            t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})
+            t.add_columns({local_annotations_column: ts.JsonType(nullable=True)})
 
         resolved_col_mapping = cls.validate_columns(
-            t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping
+            t, config.export_columns, {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}, col_mapping
         )
 
         # Perform some additional validation
@@ -649,7 +649,7 @@ class LabelStudioProject(Project):
 @dataclass(frozen=True)
 class _DataKey:
     name: Optional[str]  # The 'name' attribute of the data key; may differ from the field name
-    column_type: pxt.ColumnType
+    column_type: ts.ColumnType
 
 
 @dataclass(frozen=True)
@@ -673,18 +673,18 @@ class _LabelStudioConfig:
     )
 
     @property
-    def export_columns(self) -> dict[str, pxt.ColumnType]:
+    def export_columns(self) -> dict[str, ts.ColumnType]:
         data_key_cols = {key_id: key_info.column_type for key_id, key_info in self.data_keys.items()}
-        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels}
+        rl_cols = {name: ts.JsonType() for name in self.rectangle_labels}
         return {**data_key_cols, **rl_cols}
 
 
 ANNOTATIONS_COLUMN = 'annotations'
 _PAGE_SIZE = 100  # This is the default used in the LS SDK
 _LS_TAG_MAP = {
-    'header': pxt.StringType(),
-    'text': pxt.StringType(),
-    'image': pxt.ImageType(),
-    'video': pxt.VideoType(),
-    'audio': pxt.AudioType(),
+    'header': ts.StringType(),
+    'text': ts.StringType(),
+    'image': ts.ImageType(),
+    'video': ts.VideoType(),
+    'audio': ts.AudioType(),
 }
````
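The tag map is what ties a Label Studio config to Pixeltable column types. A sketch of how `export_columns` combines it with `RectangleLabels` controls, assuming pixeltable is installed; the standalone helper function here is illustrative, not the project's actual method:

```python
import pixeltable.type_system as ts

_LS_TAG_MAP = {
    'header': ts.StringType(),
    'text': ts.StringType(),
    'image': ts.ImageType(),
    'video': ts.VideoType(),
    'audio': ts.AudioType(),
}

def export_columns(data_keys: dict[str, str], rectangle_labels: list[str]) -> dict[str, ts.ColumnType]:
    # each data key maps to the Pixeltable type of its Label Studio tag;
    # each <RectangleLabels> control contributes a JSON column of annotations
    data_key_cols = {name: _LS_TAG_MAP[tag] for name, tag in data_keys.items()}
    rl_cols = {name: ts.JsonType() for name in rectangle_labels}
    return {**data_key_cols, **rl_cols}

print(export_columns({'frame': 'image'}, ['bboxes']))
```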
pixeltable/io/pandas.py CHANGED

```diff
@@ -8,6 +8,7 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
 
 
 def import_pandas(
@@ -119,15 +120,15 @@ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
 
 
 def df_infer_schema(
-    df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
-) -> dict[str, pxt.ColumnType]:
+    df: pd.DataFrame, schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
     """
     Infers a Pixeltable schema from a Pandas DataFrame.
 
     Returns:
         A tuple containing a Pixeltable schema and a list of primary key column names.
     """
-    pd_schema: dict[str, pxt.ColumnType] = {}
+    pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
@@ -138,7 +139,7 @@ def df_infer_schema(
     return pd_schema
 
 
-def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.ColumnType]:
     """
     Determines a pixeltable ColumnType from a pandas dtype
 
@@ -146,21 +147,21 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
         pd_dtype: A pandas dtype object
 
     Returns:
-        pxt.ColumnType: A pixeltable ColumnType
+        ts.ColumnType: A pixeltable ColumnType
     """
     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
     # compatible with NumPy dtypes
     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
     if is_datetime64_any_dtype(pd_dtype):
-        return pxt.TimestampType(nullable=nullable)
+        return ts.TimestampType(nullable=nullable)
     if is_extension_array_dtype(pd_dtype):
         return None
     # Most other pandas dtypes are directly NumPy compatible
     assert isinstance(pd_dtype, np.dtype)
-    return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
+    return ts.ArrayType.from_np_dtype(pd_dtype, nullable)
 
 
-def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
+def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
     """
     Infers a Pixeltable type based on a pandas dtype.
     """
@@ -176,12 +177,12 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
 
     if len(data_col) == 0:
         # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
-        return pxt.FloatType(nullable=nullable)
+        return ts.FloatType(nullable=nullable)
 
-    inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+    inferred_type = ts.ColumnType.infer_common_literal_type(data_col)
     if inferred_type is None:
         # Fallback on StringType if everything else fails
-        return pxt.StringType(nullable=nullable)
+        return ts.StringType(nullable=nullable)
     else:
         return inferred_type.copy(nullable=nullable)
 
@@ -189,7 +190,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
 
 
 def _df_row_to_pxt_row(
-    row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
+    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: Optional[dict[str, str]]
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
```
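The branch order in `__pd_dtype_to_pxt_type` matters: timezone-aware datetimes are pandas extension dtypes, so the datetime check must run before the extension-dtype check. A standalone sketch of the same decision tree, with illustrative type names standing in for the Pixeltable types:

```python
from typing import Optional

import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

def pd_dtype_to_type_name(pd_dtype) -> Optional[str]:
    if is_datetime64_any_dtype(pd_dtype):  # also matches tz-aware datetime64[ns, tz]
        return 'Timestamp'
    if is_extension_array_dtype(pd_dtype):  # Int64, boolean, string[pyarrow], ...
        return None  # resolved later from the actual column values
    assert isinstance(pd_dtype, np.dtype)  # plain NumPy dtype
    return f'Array[{pd_dtype.name}]'

df = pd.DataFrame({
    't': pd.to_datetime(['2024-01-01']),
    'x': [1.0],
    'n': pd.array([1], dtype='Int64'),
})
print({c: pd_dtype_to_type_name(dt) for c, dt in zip(df.columns, df.dtypes)})
# {'t': 'Timestamp', 'x': 'Array[float64]', 'n': None}
```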
pixeltable/io/table_data_conduit.py CHANGED

```diff
@@ -15,6 +15,7 @@ from pyarrow.parquet import ParquetDataset
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
 from pixeltable.io.pandas import _df_check_primary_key_values, _df_row_to_pxt_row, df_infer_schema
 from pixeltable.utils import parse_local_file_path
 
@@ -72,6 +73,11 @@ class TableDataConduit:
     def check_source_format(self) -> None:
         assert self.source_format is None or TableDataConduitFormat.is_valid(self.source_format)
 
+    def __post_init__(self) -> None:
+        """If no extra_fields were provided, initialize to empty dict"""
+        if self.extra_fields is None:
+            self.extra_fields = {}
+
     @classmethod
     def is_rowdata_structure(cls, d: TableDataSource) -> bool:
         if not isinstance(d, list) or len(d) == 0:
@@ -83,7 +89,7 @@ class TableDataConduit:
 
     def normalize_pxt_schema_types(self) -> None:
         for name, coltype in self.pxt_schema.items():
-            self.pxt_schema[name] = pxt.ColumnType.normalize_type(coltype)
+            self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
 
     def infer_schema(self) -> dict[str, Any]:
         raise NotImplementedError
@@ -393,7 +399,7 @@ class HFTableDataConduit(TableDataConduit):
                 f'Column name `{self.column_name_for_split}` already exists in dataset schema;'
                 f'provide a different `column_name_for_split`'
             )
-        self.src_schema[self.column_name_for_split] = pxt.StringType(nullable=True)
+        self.src_schema[self.column_name_for_split] = ts.StringType(nullable=True)
 
         inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
             self.src_schema, self.src_pk, self.src_schema_overrides, True
```
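The new `__post_init__` is the standard dataclass idiom for defaulting an optional mutable field after construction. A minimal generic illustration (not the real `TableDataConduit` class):

```python
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class Conduit:
    extra_fields: Optional[dict[str, Any]] = None

    def __post_init__(self) -> None:
        # if no extra_fields were provided, initialize to an empty dict;
        # a plain `= {}` field default would be shared across instances
        if self.extra_fields is None:
            self.extra_fields = {}

print(Conduit().extra_fields)                    # {}
print(Conduit(extra_fields={'a': 1}).extra_fields)  # {'a': 1}
```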
pixeltable/metadata/__init__.py CHANGED

```diff
@@ -16,7 +16,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 31
+VERSION = 34
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
```
pixeltable/metadata/converters/convert_19.py CHANGED

```diff
@@ -3,7 +3,7 @@ from typing import Any, Optional
 
 import sqlalchemy as sql
 
-import pixeltable as pxt
+import pixeltable.type_system as ts
 from pixeltable.metadata import register_converter, schema
 from pixeltable.metadata.converters.util import convert_table_md
 
@@ -34,7 +34,7 @@ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
     # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
     # We convert it to an aware datetime, stored in UTC.
     assert v['_classname'] == 'Literal'
-    assert v['val_t'] == pxt.ColumnType.Type.TIMESTAMP.name
+    assert v['val_t'] == ts.ColumnType.Type.TIMESTAMP.name
     assert isinstance(v['val'], str)
     dt = datetime.datetime.fromisoformat(v['val'])
     assert dt.tzinfo is None  # In version 19 all timestamps are naive
```
pixeltable/metadata/converters/convert_31.py ADDED

```diff
@@ -0,0 +1,11 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+
+
+@register_converter(version=31)
+def _(engine: sql.engine.Engine) -> None:
+    # Add a column "lock_dummy: int8" to the dirs table in the store
+    # This column is the target of an UPDATE operation to synchronize directory operations
+    with engine.begin() as conn:
+        conn.execute(sql.text('ALTER TABLE dirs ADD COLUMN lock_dummy int8'))
```
pixeltable/metadata/converters/convert_32.py ADDED

```diff
@@ -0,0 +1,15 @@
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=32)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    table_md['is_replica'] = False
```
pixeltable/metadata/converters/convert_33.py ADDED

```diff
@@ -0,0 +1,17 @@
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=33)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Set default value of 'is_pk' field in column metadata to False"""
+    for col_md in table_md['column_md'].values():
+        col_md['is_pk'] = False if col_md['is_pk'] is None else col_md['is_pk']
```
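These converters follow the migration scheme implied by `register_converter(version=N)`: the converter registered at version N upgrades metadata from N to N+1, so a store at version 31 runs all three in sequence to reach 34. A toy registry showing the mechanism (not pixeltable's actual implementation):

```python
from typing import Callable

_converters: dict[int, Callable[[dict], None]] = {}

def register_converter(version: int):
    # decorator that records the converter for one version step
    def decorator(fn: Callable[[dict], None]):
        _converters[version] = fn
        return fn
    return decorator

@register_converter(version=33)
def _(md: dict) -> None:
    # mirrors convert_33: give 'is_pk' an explicit False default
    for col_md in md['column_md'].values():
        col_md['is_pk'] = False if col_md['is_pk'] is None else col_md['is_pk']

def upgrade(md: dict, from_version: int, to_version: int) -> None:
    # run each registered step in order
    for v in range(from_version, to_version):
        _converters[v](md)

md = {'column_md': {'0': {'is_pk': None}}}
upgrade(md, from_version=33, to_version=34)
print(md)  # {'column_md': {'0': {'is_pk': False}}}
```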
pixeltable/metadata/notes.py CHANGED

```diff
@@ -2,6 +2,9 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    34: 'Set default value for is_pk field in column metadata to False',
+    33: 'Add is_replica field to table metadata',
+    32: 'Add the lock_dummy BIGINT column to the dirs table',
     31: 'Add table ids to metadata structs',
     30: 'Store default values and constant arguments as literals',
     29: 'Add user and additional_md fields to metadata structs',
```
pixeltable/metadata/schema.py CHANGED

```diff
@@ -1,7 +1,7 @@
 import dataclasses
 import typing
 import uuid
-from typing import Any, Optional, TypeVar, Union, get_type_hints
+from typing import Any, NamedTuple, Optional, TypeVar, Union, get_type_hints
 
 import sqlalchemy as sql
 from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
@@ -84,6 +84,8 @@ class Dir(Base):
     )
     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # DirMd
+    # This field is updated to synchronize database operations across multiple sessions
+    lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
 @dataclasses.dataclass
@@ -155,6 +157,7 @@ class ViewMd:
 class TableMd:
     tbl_id: str  # uuid.UUID
     name: str
+    is_replica: bool
 
     user: Optional[str]
 
@@ -286,3 +289,25 @@ class Function(Base):
     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # FunctionMd
     binary_obj: orm.Mapped[Optional[bytes]] = orm.mapped_column(LargeBinary, nullable=True)
+
+
+class FullTableMd(NamedTuple):
+    tbl_md: TableMd
+    version_md: TableVersionMd
+    schema_version_md: TableSchemaVersionMd
+
+    def as_dict(self) -> dict[str, Any]:
+        return {
+            'table_id': self.tbl_md.tbl_id,
+            'table_md': dataclasses.asdict(self.tbl_md),
+            'table_version_md': dataclasses.asdict(self.version_md),
+            'table_schema_version_md': dataclasses.asdict(self.schema_version_md),
+        }
+
+    @classmethod
+    def from_dict(cls, data_dict: dict[str, Any]) -> 'FullTableMd':
+        return FullTableMd(
+            tbl_md=md_from_dict(TableMd, data_dict['table_md']),
+            version_md=md_from_dict(TableVersionMd, data_dict['table_version_md']),
+            schema_version_md=md_from_dict(TableSchemaVersionMd, data_dict['table_schema_version_md']),
+        )
```
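`FullTableMd` is the serialization unit used by the packager and by `clone_snapshot` below: `as_dict()` produces the wire format, `from_dict()` reconstitutes it. A toy mirror of that round-trip contract, with a placeholder field so the sketch runs standalone (the real tuple carries the three metadata dataclasses and deserializes via `md_from_dict`):

```python
import dataclasses
from typing import Any, NamedTuple

@dataclasses.dataclass
class TableMd:
    # placeholder fields; the real TableMd has many more
    tbl_id: str
    name: str
    is_replica: bool

class FullTableMd(NamedTuple):
    tbl_md: TableMd

    def as_dict(self) -> dict[str, Any]:
        return {'table_id': self.tbl_md.tbl_id, 'table_md': dataclasses.asdict(self.tbl_md)}

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> 'FullTableMd':
        return cls(tbl_md=TableMd(**d['table_md']))

md = FullTableMd(TableMd('0123', 'films', is_replica=False))
assert FullTableMd.from_dict(md.as_dict()) == md  # lossless round trip
```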
pixeltable/plan.py CHANGED

```diff
@@ -768,8 +768,7 @@ class Planner:
         # - select list subexprs that aren't aggregates
         # - join clause subexprs
         # - subexprs of Where clause conjuncts that can't be run in SQL
-        # - all grouping exprs
-        # run in Python)
+        # - all grouping exprs
         candidates = list(
             exprs.Expr.list_subexprs(
                 analyzer.select_list,
@@ -784,7 +783,7 @@ class Planner:
         candidates.extend(
             exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
         )
-        if …
+        if analyzer.group_by_clause is not None:
             candidates.extend(
                 exprs.Expr.list_subexprs(analyzer.group_by_clause, filter=sql_elements.contains, traverse_matches=False)
             )
```
pixeltable/share/packager.py CHANGED

```diff
@@ -1,4 +1,3 @@
-import dataclasses
 import io
 import json
 import logging
@@ -6,7 +5,6 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Iterator, Optional
 
@@ -58,28 +56,14 @@ class TablePackager:
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
-        # …
-        …
-        # These are temporary; will replace with a better solution once the concurrency
-        # changes to catalog have been merged
-                'table_md': dataclasses.asdict(t._tbl_version.get()._create_tbl_md()),
-                'table_version_md': dataclasses.asdict(
-                    t._tbl_version.get()._create_version_md(datetime.now().timestamp())
-                ),
-                'table_schema_version_md': dataclasses.asdict(
-                    t._tbl_version.get()._create_schema_version_md(0)
-                ),
-            }
-            for t in (table, *table._bases)
-        ]
-        },
-        }
+        # Load metadata
+        with Env.get().begin_xact():
+            tbl_md = catalog.Catalog.get().load_replica_md(table)
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {'tables': [md.as_dict() for md in tbl_md]},
+        }
         if additional_md is not None:
             self.md.update(additional_md)
 
@@ -94,7 +78,7 @@ class TablePackager:
             json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
         with Env.get().begin_xact():
-            ancestors = (self.table, *self.table._bases)
+            ancestors = (self.table, *self.table._base_tables)
             for t in ancestors:
                 _logger.info(f"Exporting table '{t._path}'.")
                 self.__export_table(t)
```
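Putting the packager changes together, the bundle metadata now has roughly this shape. Values are placeholders: `tables` carries one `FullTableMd.as_dict()` entry per table in the snapshot's ancestor chain, and callers can merge extra keys via `additional_md`:

```python
bundle_md = {
    'pxt_version': '0.3.13',
    'pxt_md_version': 34,
    'md': {
        'tables': [
            {  # one FullTableMd.as_dict() per ancestor table
                'table_id': '<uuid>',
                'table_md': {'...': '...'},
                'table_version_md': {'...': '...'},
                'table_schema_version_md': {'...': '...'},
            }
        ]
    },
    'table_uri': 'pxt://user/snapshot',  # merged in via additional_md by publish_snapshot
}
```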
pixeltable/share/publish.py CHANGED

```diff
@@ -1,4 +1,3 @@
-import os
 import sys
 import urllib.parse
 import urllib.request
@@ -10,22 +9,22 @@ from tqdm import tqdm
 import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
+from pixeltable.metadata.schema import FullTableMd
 from pixeltable.utils import sha256sum
 
 from .packager import TablePackager
 
 # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
 # pixeltable.com URLs are available.
-…
-…
+
+PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'
 
 
 def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
-    request_json = packager.md
-    headers_json = {'X-api-key': Env.get().pxt_api_key}
-
-    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    request_json = packager.md | {'operation_type': 'publish_snapshot'}
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
     if response.status_code != 200:
         raise excs.Error(f'Error publishing snapshot: {response.text}')
     response_json = response.json()
@@ -47,14 +46,14 @@ def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     Env.get().console_logger.info('Finalizing snapshot ...')
 
     finalize_request_json = {
+        'operation_type': 'finalize_snapshot',
         'upload_id': upload_id,
         'datafile': bundle.name,
         'size': bundle.stat().st_size,
         'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
     }
-
     # TODO: Use Pydantic for validation
-    finalize_response = requests.post(…)
+    finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)
     if finalize_response.status_code != 200:
         raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
     finalize_response_json = finalize_response.json()
@@ -66,6 +65,18 @@ def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     return confirmed_tbl_uri
 
 
+def clone_snapshot(dest_tbl_uri: str) -> list[FullTableMd]:
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': dest_tbl_uri}
+    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error cloning snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+        raise excs.Error(f'Unexpected response from server.\n{response_json}')
+    return [FullTableMd.from_dict(t) for t in response_json['md']['tables']]
+
+
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
     from pixeltable.utils.s3 import get_client
 
```
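The publish, finalize, and clone calls above all hit the single `PIXELTABLE_API_URL` endpoint and are dispatched by an `operation_type` field in the JSON body. A condensed sketch of that request pattern; the API key, helper name, and error handling here are simplified stand-ins, not the module's actual helpers:

```python
import requests

API_URL = 'https://internal-api.pixeltable.com'
HEADERS = {'X-api-key': '<your-api-key>', 'Content-Type': 'application/json'}

def call_api(operation_type: str, **payload) -> dict:
    # every operation posts to the same endpoint; the body's operation_type selects it
    response = requests.post(API_URL, json={'operation_type': operation_type, **payload}, headers=HEADERS)
    if response.status_code != 200:
        raise RuntimeError(f'API error: {response.text}')
    return response.json()

# e.g. cloning a published snapshot returns the table metadata bundle:
# tables_md = call_api('clone_snapshot', table_uri='pxt://user/snapshot')['md']['tables']
```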
pixeltable/store.py CHANGED

```diff
@@ -16,6 +16,7 @@ from pixeltable import catalog, exceptions as excs, exprs
 from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
+from pixeltable.utils.exception_handler import run_cleanup
 from pixeltable.utils.media_store import MediaStore
 from pixeltable.utils.sql import log_explain, log_stmt
 
@@ -232,7 +233,6 @@ class StoreBase:
         assert col.tbl.id == self.tbl_version.id
         num_excs = 0
         num_rows = 0
-
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
         tmp_pk_cols = [sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns()]
@@ -301,10 +301,13 @@ class StoreBase:
             )
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
-
         finally:
-            self.sa_md.remove(tmp_tbl)
-            tmp_tbl.drop(bind=conn)
+
+            def remove_tmp_tbl() -> None:
+                self.sa_md.remove(tmp_tbl)
+                tmp_tbl.drop(bind=conn)
+
+            run_cleanup(remove_tmp_tbl, raise_error=True)
         return num_excs
 
     def insert_rows(
```
pixeltable/utils/exception_handler.py ADDED

```diff
@@ -0,0 +1,59 @@
+import logging
+import sys
+from typing import Any, Callable, Optional, TypeVar
+
+R = TypeVar('R')
+
+
+def _is_in_exception() -> bool:
+    """
+    Check if code is currently executing within an exception context.
+    """
+    current_exception = sys.exc_info()[1]
+    return current_exception is not None
+
+
+def run_cleanup_on_exception(cleanup_func: Callable[..., R], *args: Any, **kwargs: Any) -> Optional[R]:
+    """
+    Runs cleanup only when running in an exception context.
+
+    The function `run_cleanup_on_exception()` should be used to clean up resources when an operation fails.
+    This is typically done using a try, except, and finally block, with the resource cleanup logic placed within
+    the except block. However, this pattern may not handle KeyboardInterrupt exceptions.
+    To ensure that resources are always cleaned up at least once when an exception or KeyboardInterrupt occurs,
+    create an idempotent function for cleaning up resources and pass it to the `run_cleanup_on_exception()` function
+    from the finally block.
+    """
+    if _is_in_exception():
+        return run_cleanup(cleanup_func, *args, raise_error=False, **kwargs)
+    return None
+
+
+def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
+    """
+    Runs a cleanup function. If interrupted, retry cleanup.
+    The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
+    If the `cleanup_func()` is interrupted during execution, it will be retried.
+
+    Args:
+        cleanup_func: an idempotent function
+        raise_error: raise an exception if an error occurs during cleanup.
+    """
+    try:
+        logging.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
+        return cleanup_func(*args, **kwargs)
+    except KeyboardInterrupt as interrupt:
+        # Save original exception and re-attempt cleanup
+        original_exception = interrupt
+        logging.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
+        try:
+            return cleanup_func(*args, **kwargs)
+        except Exception as e:
+            # Suppress this exception
+            logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+        raise KeyboardInterrupt from original_exception
+    except Exception as e:
+        logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+        if raise_error:
+            raise e
+        return None
```
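Following the docstrings above, a hypothetical usage sketch: pair `run_cleanup_on_exception()` with a `finally:` block so an idempotent cleanup runs exactly when the operation failed, including on Ctrl-C. The file-copy operation and helper names here are illustrative, not from the package:

```python
import os

from pixeltable.utils.exception_handler import run_cleanup_on_exception

def copy_with_cleanup(src: str, dst: str) -> None:
    # hypothetical operation; the cleanup below is idempotent, as required
    def remove_partial_output() -> None:
        if os.path.exists(dst):
            os.remove(dst)

    try:
        with open(src, 'rb') as f_in, open(dst, 'wb') as f_out:
            f_out.write(f_in.read())
    finally:
        # no-op on success; removes the partial file when an exception
        # (including KeyboardInterrupt) is propagating out of the try block
        run_cleanup_on_exception(remove_partial_output)
```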
{pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/METADATA CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pixeltable
-Version: 0.3.11
+Version: 0.3.13
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 License: Apache-2.0
 Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
```