pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +11 -1
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +179 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +96 -124
- pixeltable/catalog/table_metadata.py +96 -0
- pixeltable/catalog/table_version.py +15 -6
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +3 -2
- pixeltable/env.py +43 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +5 -6
- pixeltable/globals.py +6 -7
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/__init__.py +2 -1
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/parquet.py +9 -89
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -9
- pixeltable/type_system.py +1 -3
- pixeltable/utils/arrow.py +97 -2
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/index/embedding_index.py
CHANGED

@@ -138,15 +138,12 @@ class EmbeddingIndex(IndexBase):
 
     def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Create the index on the index value column"""
-
-            index_name,
-            index_value_col.sa_col,
-
-
-            postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
+        Env.get().dbms.create_vector_index(
+            index_name=index_name,
+            index_value_sa_col=index_value_col.sa_col,
+            conn=Env.get().conn,
+            metric=self.PGVECTOR_OPS[self.metric],
         )
-        conn = Env.get().conn
-        idx.create(bind=conn)
 
     def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
        """Drop the index on the index value column"""
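The rewritten create_index() above delegates index DDL to the Dbms abstraction rather than assembling a sqlalchemy Index inline. A minimal sketch of what a pgvector-backed Dbms.create_vector_index() might look like, reconstructed from the removed lines; the body and the 'hnsw' access method are assumptions (this diff lists pixeltable/utils/dbms.py as changed but does not show the method):

import sqlalchemy as sa

def create_vector_index(
    self, index_name: str, index_value_sa_col: sa.Column, conn: 'sa.engine.Connection', metric: str
) -> None:
    # Hypothetical body: recreate the index the old inline code built, with the
    # pgvector operator class supplied via `metric` (e.g. 'vector_cosine_ops').
    idx = sa.Index(
        index_name,
        index_value_sa_col,
        postgresql_using='hnsw',  # assumed access method
        postgresql_ops={index_value_sa_col.name: metric},
    )
    idx.create(bind=conn)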
pixeltable/io/__init__.py
CHANGED

@@ -4,11 +4,12 @@ from .datarows import import_json, import_rows
 
 from .external_store import ExternalStore
 from .globals import create_label_studio_project, export_images_as_fo_dataset
 from .hf_datasets import import_huggingface_dataset
+from .lancedb import export_lancedb
 from .pandas import import_csv, import_excel, import_pandas
 from .parquet import export_parquet, import_parquet
 
 __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
-__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
+__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
 __all__ = sorted(__default_dir - __removed_symbols)
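pixeltable.io now re-exports export_lancedb, implemented in the new pixeltable/io/lancedb.py (listed as ADDED below, body not shown in this diff). A hypothetical usage sketch; since the signature is not visible here, the URI and table-name parameters are assumptions:

import pixeltable as pxt

films = pxt.get_table('films')
# Assumed parameters: a LanceDB database URI/path and a destination table name.
pxt.io.export_lancedb(films, 'lancedb_data', table_name='films')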
pixeltable/io/fiftyone.py
CHANGED

@@ -9,7 +9,7 @@ import puremagic
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import exprs
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
 
 
 class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
pixeltable/io/label_studio.py
CHANGED

@@ -19,7 +19,7 @@ from pixeltable.config import Config
 
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project
 from pixeltable.utils import coco
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
 
 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
 # the import two different ways to insure intercompatibility

@@ -46,6 +46,9 @@ class LabelStudioProject(Project):
     """
     An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
     for synchronizing between a Pixeltable table and a Label Studio project.
+
+    The constructor will NOT create a new Label Studio project; it is also used when loading
+    metadata for existing projects.
     """
 
     project_id: int  # Label Studio project ID

@@ -60,10 +63,6 @@ class LabelStudioProject(Project):
         col_mapping: dict[ColumnHandle, str],
         stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
     ):
-        """
-        The constructor will NOT create a new Label Studio project; it is also used when loading
-        metadata for existing projects.
-        """
        self.project_id = project_id
        self.media_import_method = media_import_method
        self._project = None
pixeltable/io/lancedb.py
ADDED
pixeltable/io/parquet.py
CHANGED
@@ -1,46 +1,22 @@
 from __future__ import annotations
 
-import datetime
-import io
 import json
 import logging
 import typing
-from collections import deque
 from pathlib import Path
 from typing import Any, Optional
 
-import numpy as np
-import PIL.Image
-
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable.catalog import Catalog
 from pixeltable.utils.transactional_directory import transactional_directory
 
 if typing.TYPE_CHECKING:
-    import pyarrow as pa
-
     import pixeltable as pxt
 
 _logger = logging.getLogger('pixeltable')
 
 
-def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
-    import pyarrow as pa
-    from pyarrow import parquet
-
-    pydict = {}
-    for field in schema:
-        if isinstance(field.type, pa.FixedShapeTensorType):
-            stacked_arr = np.stack(value_batch[field.name])
-            pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
-        else:
-            pydict[field.name] = value_batch[field.name]
-
-    tab = pa.Table.from_pydict(pydict, schema=schema)
-    parquet.write_table(tab, str(output_path))
-
-
 def export_parquet(
     table_or_df: pxt.Table | pxt.DataFrame,
     parquet_path: Path,

@@ -63,7 +39,9 @@ def export_parquet(
         If False, will raise an error if the Dataframe has any image column.
         Default False.
     """
-
+    import pyarrow as pa
+
+    from pixeltable.utils.arrow import to_record_batches
 
     df: pxt.DataFrame
     if isinstance(table_or_df, pxt.catalog.Table):

@@ -71,9 +49,6 @@ def export_parquet(
     else:
         df = table_or_df
 
-    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
-    arrow_schema = to_arrow_schema(df.schema)
-
     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
         raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')

@@ -81,70 +56,15 @@ def export_parquet(
     with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        type_dict = {k: v.as_dict() for k, v in df.schema.items()}
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
-
         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
-        current_byte_estimate = 0
-
         with Catalog.get().begin_xact(for_write=False):
-            for data_row in df._exec():
-                for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
-                    val = data_row[e.slot_idx]
-                    if val is None:
-                        current_value_batch[col_name].append(val)
-                        continue
-
-                    assert val is not None
-                    if col_type.is_image_type():
-                        # images get inlined into the parquet file
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            # if there is a file, read directly to preserve information
-                            with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                                val = f.read()
-                        elif isinstance(val, PIL.Image.Image):
-                            # if no file available, eg. bc it is computed, convert to png
-                            buf = io.BytesIO()
-                            val.save(buf, format='PNG')
-                            val = buf.getvalue()
-                        else:
-                            raise excs.Error(f'unknown image type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_string_type():
-                        length = len(val)
-                    elif col_type.is_video_type() or col_type.is_audio_type():
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            val = data_row.file_paths[e.slot_idx]
-                        else:
-                            raise excs.Error(f'unknown audio/video type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_json_type():
-                        val = json.dumps(val)
-                        length = len(val)
-                    elif col_type.is_array_type():
-                        length = val.nbytes
-                    elif col_type.is_int_type() or col_type.is_float_type():
-                        length = 8
-                    elif col_type.is_bool_type():
-                        length = 1
-                    elif col_type.is_date_type():
-                        length = 4
-                    elif col_type.is_timestamp_type():
-                        val = val.astimezone(datetime.timezone.utc)
-                        length = 8
-                    else:
-                        raise excs.Error(f'unknown type {col_type} for {col_name}')
-
-                    current_value_batch[col_name].append(val)
-                    current_byte_estimate += length
-                    if current_byte_estimate > partition_size_bytes:
-                        assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-                        batch_num += 1
-                        current_value_batch = {k: deque() for k in df.schema}
-                        current_byte_estimate = 0
-
-            _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+            for record_batch in to_record_batches(df, partition_size_bytes):
+                output_path = temp_path / f'part-{batch_num:05d}.parquet'
+                arrow_tbl = pa.Table.from_batches([record_batch])  # type: ignore
+                pa.parquet.write_table(arrow_tbl, str(output_path))
+                batch_num += 1
 
 
 def import_parquet(
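export_parquet() now streams pyarrow RecordBatches from to_record_batches() (added in pixeltable/utils/arrow.py below) and writes one part file per batch, replacing the hand-rolled deque batching. A short usage sketch; the table name is an assumption, and per the docstring above inline_images=True is required when image columns are present:

from pathlib import Path

import pixeltable as pxt

photos = pxt.get_table('photos')  # assumed table with an image column
# Writes part-00000.parquet, part-00001.parquet, ... into the target directory,
# with image columns inlined as encoded bytes.
pxt.io.export_parquet(photos, Path('/tmp/photos_export'), inline_images=True)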
pixeltable/io/table_data_conduit.py
CHANGED

@@ -469,12 +469,12 @@ class ParquetTableDataConduit(TableDataConduit):
         return t
 
     def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        from pixeltable.utils.arrow import
+        from pixeltable.utils.arrow import to_pxt_schema
 
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
                 self.src_schema_overrides = {}
-            self.src_schema =
+            self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
             inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
                 self.src_schema, self.src_pk, self.src_schema_overrides
             )
pixeltable/iterators/audio.py
CHANGED

@@ -6,7 +6,7 @@ from typing import Any, ClassVar, Optional
 
 import av
 
 from pixeltable import exceptions as excs, type_system as ts
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
pixeltable/iterators/document.py
CHANGED

@@ -94,6 +94,16 @@ class DocumentSplitter(ComponentIterator):
     include additional metadata fields if specified in the `metadata` parameter, as explained below.
 
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
+
+    Args:
+        separators: separators to use to chunk the document. Options are:
+            `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+            This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+            or `'char_limit'` is specified.
+        metadata: additional metadata fields to include in the output. Options are:
+            `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+            (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
     """
 
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {

@@ -116,18 +126,6 @@ class DocumentSplitter(ComponentIterator):
         tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None,
     ):
-        """Init method for `DocumentSplitter` class.
-
-        Args:
-            separators: separators to use to chunk the document. Options are:
-                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
-                This may be a comma-separated string, e.g., `'heading,token_limit'`.
-            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
-                or `'char_limit'` is specified.
-            metadata: additional metadata fields to include in the output. Options are:
-                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
-                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
-        """
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
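The Args documentation moves from the __init__ docstring into the class docstring, where it is picked up by the generated API docs. A usage sketch built from those documented options; the table and column names are assumptions:

import pixeltable as pxt
from pixeltable.iterators.document import DocumentSplitter

docs = pxt.get_table('docs')  # assumed table with a 'document' column
chunks = pxt.create_view(
    'doc_chunks',
    docs,
    iterator=DocumentSplitter.create(
        document=docs.document,
        separators='heading,token_limit',  # split on headings, then cap chunk size
        limit=512,  # max tokens per chunk, applies to 'token_limit'
        metadata='title,heading',  # extra metadata fields in the output schema
    ),
)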
pixeltable/iterators/video.py
CHANGED

@@ -14,7 +14,7 @@ import pixeltable as pxt
 
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 import pixeltable.utils.av as av_utils
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
pixeltable/metadata/schema.py
CHANGED

@@ -115,6 +115,9 @@ class ColumnMd:
     # if True, the column is present in the stored table
     stored: Optional[bool]
 
+    # If present, the URI for the destination for column values
+    destination: Optional[str] = None
+
 
 @dataclasses.dataclass
 class IndexMd:

@@ -244,6 +247,9 @@ class TableVersionMd:
     schema_version: int
     user: Optional[str] = None  # User that created this version
     update_status: Optional[UpdateStatus] = None  # UpdateStatus of the change that created this version
+    # A version fragment cannot be queried or instantiated via get_table(). A fragment represents a version of a
+    # replica table that has incomplete data, and exists only to provide base table support for a dependent view.
+    is_fragment: bool = False
     additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)

@@ -353,6 +359,7 @@ class FullTableMd(NamedTuple):
     def is_pure_snapshot(self) -> bool:
         return (
             self.tbl_md.view_md is not None
+            and self.tbl_md.view_md.is_snapshot
             and self.tbl_md.view_md.predicate is None
             and len(self.schema_version_md.columns) == 0
         )
pixeltable/plan.py
CHANGED

@@ -403,6 +403,8 @@ class Planner:
                 ignore_errors=ignore_errors,
             )
         )
+        plan = cls._insert_save_node(tbl.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod

@@ -499,6 +501,9 @@ class Planner:
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         plan.ctx.num_computed_exprs = len(recomputed_exprs)
+
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols

@@ -597,6 +602,7 @@ class Planner:
         # we're returning everything to the user, so we might as well do it in a single batch
         ctx.batch_size = 0
         plan.set_ctx(ctx)
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return (
             plan,

@@ -650,6 +656,8 @@ class Planner:
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod

@@ -718,6 +726,8 @@ class Planner:
         exec_ctx.ignore_errors = True
         plan.set_ctx(exec_ctx)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan, len(row_builder.default_eval_ctx.target_exprs)
 
     @classmethod

@@ -762,6 +772,17 @@ class Planner:
             combined_ordering = combined
         return combined_ordering
 
+    @classmethod
+    def _insert_save_node(
+        cls, tbl_id: UUID, stored_media_cols: list[exprs.ColumnSlotIdx], input_node: exec.ExecNode
+    ) -> exec.ExecNode:
+        """Return an ObjectStoreSaveNode if stored media columns are present, otherwise return input"""
+        if len(stored_media_cols) == 0:
+            return input_node
+        save_node = exec.ObjectStoreSaveNode(tbl_id, stored_media_cols, input_node)
+        save_node.set_ctx(input_node.ctx)
+        return save_node
+
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
         """Returns True if l1 is contained in l2"""

@@ -771,7 +792,7 @@ class Planner:
     def _insert_prefetch_node(
         cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
     ) -> exec.ExecNode:
-        """Return a
+        """Return a node to prefetch data if needed, otherwise return input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
         # aren't explicitly captured as dependencies

@@ -989,6 +1010,7 @@ class Planner:
             if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
                 plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
+            plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
         else:
             if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs

@@ -1034,10 +1056,13 @@ class Planner:
         plan = cls._create_query_plan(
             row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
         )
+
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
+        plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
pixeltable/share/packager.py
CHANGED

@@ -24,7 +24,8 @@ from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps
 
 _logger = logging.getLogger('pixeltable')
 

@@ -362,6 +363,8 @@ class TableRestorer:
         for md in tbl_md:
             md.tbl_md.is_replica = True
 
+        assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment
+
         cat = catalog.Catalog.get()
 
         with cat.begin_xact(for_write=True):

@@ -369,6 +372,9 @@ class TableRestorer:
             # versions that have not been seen before.
             cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
+            _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
+            _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))
+
             # Now we need to load data for replica_tbl and its ancestors, except that we skip
             # replica_tbl itself if it's a pure snapshot.
             for md in tbl_md[::-1]:  # Base table first

@@ -619,7 +625,7 @@ class TableRestorer:
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
             # Move the file to the media store and update the URL.
-            self.media_files[url] =
+            self.media_files[url] = ObjectOps.put_file(media_col, src_path, relocate_or_delete=True)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
pixeltable/share/publish.py
CHANGED

@@ -14,7 +14,7 @@ import pixeltable as pxt
 
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
 
 from .packager import TablePackager, TableRestorer
 

@@ -79,16 +79,13 @@ def push_replica(
 
 
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
 
-
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     upload_args = {'ChecksumAlgorithm': 'SHA256'}
 

@@ -135,16 +132,13 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
 
 def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_filename: str) -> Path:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle_filename)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Downloading snapshot from: {bucket}:{remote_path}')
 
-
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
     bundle_size = obj['ContentLength']
pixeltable/type_system.py
CHANGED

@@ -1081,9 +1081,7 @@ class ImageType(ColumnType):
         mode: Optional[str] = None,
         nullable: bool = False,
     ):
-
-        TODO: does it make sense to specify only width or height?
-        """
+        # TODO: does it make sense to specify only width or height?
         super().__init__(self.Type.IMAGE, nullable=nullable)
         assert not (width is not None and size is not None)
         assert not (height is not None and size is not None)
pixeltable/utils/arrow.py
CHANGED

@@ -1,11 +1,18 @@
 import datetime
-
+import io
+import json
+from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
 
 import numpy as np
+import PIL.Image
 import pyarrow as pa
 
+import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 
+if TYPE_CHECKING:
+    import pixeltable as pxt
+
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),

@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     return None
 
 
-def
+def to_pxt_schema(
     arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""

@@ -88,6 +95,94 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
     return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
 
 
+def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+    import pyarrow as pa
+
+    pa_arrays: list[pa.Array] = []
+    for field in schema:
+        if isinstance(field.type, pa.FixedShapeTensorType):
+            stacked_arr = np.stack(column_vals[field.name])
+            pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+        else:
+            pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+            pa_arrays.append(pa_array)
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)  # type: ignore
+
+
+def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+    arrow_schema = to_arrow_schema(df.schema)
+    batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
+    current_byte_estimate = 0
+    num_batch_rows = 0
+
+    # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
+    # over _exec()
+    try:
+        for data_row in df._exec():
+            num_batch_rows += 1
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+                val = data_row[e.slot_idx]
+                val_size_bytes: int
+                if val is None:
+                    batch_columns[col_name].append(val)
+                    continue
+
+                assert val is not None
+                if col_type.is_image_type():
+                    # images get inlined into the parquet file
+                    if data_row.file_paths[e.slot_idx] is not None:
+                        # if there is a file, read directly to preserve information
+                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                            val = f.read()
+                    elif isinstance(val, PIL.Image.Image):
+                        # no file available: save as png
+                        buf = io.BytesIO()
+                        val.save(buf, format='png')
+                        val = buf.getvalue()
+                    else:
+                        raise excs.Error(f'unknown image type {type(val)}')
+                    val_size_bytes = len(val)
+                elif col_type.is_string_type():
+                    val_size_bytes = len(val)
+                elif col_type.is_media_type():
+                    assert data_row.file_paths[e.slot_idx] is not None
+                    val = data_row.file_paths[e.slot_idx]
+                    val_size_bytes = len(val)
+                elif col_type.is_json_type():
+                    val = json.dumps(val)
+                    val_size_bytes = len(val)
+                elif col_type.is_array_type():
+                    val_size_bytes = val.nbytes
+                elif col_type.is_int_type() or col_type.is_float_type():
+                    val_size_bytes = 8
+                elif col_type.is_bool_type():
+                    val_size_bytes = 1
+                elif col_type.is_date_type():
+                    val_size_bytes = 4
+                elif col_type.is_timestamp_type():
+                    val = val.astimezone(datetime.timezone.utc)
+                    val_size_bytes = 8
+                else:
+                    raise excs.Error(f'unknown type {col_type} for {col_name}')
+
+                batch_columns[col_name].append(val)
+                current_byte_estimate += val_size_bytes
+
+            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+                record_batch = _to_record_batch(batch_columns, arrow_schema)
+                yield record_batch
+                batch_columns = {k: [] for k in df.schema}
+                current_byte_estimate = 0
+                num_batch_rows = 0
+
+    except excs.ExprEvalError as e:
+        df._raise_expr_eval_err(e)
+
+    if num_batch_rows > 0:
+        record_batch = _to_record_batch(batch_columns, arrow_schema)
+        yield record_batch
+
+
 def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
     """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
     this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.