pixeltable 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- pixeltable/__init__.py +11 -1
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/table.py +88 -118
- pixeltable/catalog/table_metadata.py +96 -0
- pixeltable/catalog/table_version.py +23 -26
- pixeltable/catalog/view.py +3 -1
- pixeltable/dataframe.py +29 -8
- pixeltable/env.py +2 -6
- pixeltable/exprs/compound_predicate.py +2 -1
- pixeltable/functions/anthropic.py +17 -6
- pixeltable/functions/groq.py +2 -2
- pixeltable/functions/openai.py +6 -3
- pixeltable/globals.py +11 -7
- pixeltable/io/__init__.py +2 -1
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/parquet.py +9 -89
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/publish.py +12 -10
- pixeltable/utils/arrow.py +97 -2
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/media_store.py +11 -0
- {pixeltable-0.4.11.dist-info → pixeltable-0.4.13.dist-info}/METADATA +162 -127
- {pixeltable-0.4.11.dist-info → pixeltable-0.4.13.dist-info}/RECORD +27 -24
- {pixeltable-0.4.11.dist-info → pixeltable-0.4.13.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.11.dist-info → pixeltable-0.4.13.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.11.dist-info → pixeltable-0.4.13.dist-info}/licenses/LICENSE +0 -0
pixeltable/functions/anthropic.py
CHANGED

@@ -41,9 +41,9 @@ def _anthropic_client() -> 'anthropic.AsyncAnthropic':
 def _get_header_info(
     headers: httpx.Headers,
 ) -> tuple[
-
-
-
+    tuple[int, int, datetime.datetime] | None,
+    tuple[int, int, datetime.datetime] | None,
+    tuple[int, int, datetime.datetime] | None,
 ]:
     """Extract rate limit info from Anthropic API response headers."""
     requests_limit_str = headers.get('anthropic-ratelimit-requests-limit')

@@ -54,7 +54,9 @@ def _get_header_info(
     requests_reset = (
         datetime.datetime.fromisoformat(requests_reset_str.replace('Z', '+00:00')) if requests_reset_str else None
     )
-    requests_info = (
+    requests_info = (
+        (requests_limit, requests_remaining, requests_reset) if requests_reset and requests_remaining else None
+    )
 
     input_tokens_limit_str = headers.get('anthropic-ratelimit-input-tokens-limit')
     input_tokens_limit = int(input_tokens_limit_str) if input_tokens_limit_str is not None else None

@@ -66,7 +68,11 @@ def _get_header_info(
         if input_tokens_reset_str
         else None
     )
-    input_tokens_info = (
+    input_tokens_info = (
+        (input_tokens_limit, input_tokens_remaining, input_tokens_reset)
+        if input_tokens_reset and input_tokens_remaining
+        else None
+    )
 
     output_tokens_limit_str = headers.get('anthropic-ratelimit-output-tokens-limit')
     output_tokens_limit = int(output_tokens_limit_str) if output_tokens_limit_str is not None else None

@@ -79,9 +85,14 @@ def _get_header_info(
         else None
     )
     output_tokens_info = (
-        (output_tokens_limit, output_tokens_remaining, output_tokens_reset)
+        (output_tokens_limit, output_tokens_remaining, output_tokens_reset)
+        if output_tokens_reset and output_tokens_remaining
+        else None
     )
 
+    if requests_info is None or input_tokens_info is None or output_tokens_info is None:
+        _logger.debug(f'get_header_info(): incomplete rate limit info: {headers}')
+
     return requests_info, input_tokens_info, output_tokens_info
pixeltable/functions/groq.py
CHANGED
@@ -62,11 +62,11 @@ async def chat_completions(
         A dictionary containing the response and other metadata.
 
     Examples:
-        Add a computed column that applies the model `
+        Add a computed column that applies the model `llama-3.1-8b-instant`
         to an existing Pixeltable column `tbl.prompt` of the table `tbl`:
 
         >>> messages = [{'role': 'user', 'content': tbl.prompt}]
-        ... tbl.add_computed_column(response=chat_completions(messages, model='
+        ... tbl.add_computed_column(response=chat_completions(messages, model='llama-3.1-8b-instant'))
     """
     if model_kwargs is None:
         model_kwargs = {}
pixeltable/functions/openai.py
CHANGED
@@ -113,7 +113,7 @@ def _parse_header_duration(duration_str: str) -> datetime.timedelta:
 
 def _get_header_info(
     headers: httpx.Headers,
-) -> tuple[
+) -> tuple[tuple[int, int, datetime.datetime] | None, tuple[int, int, datetime.datetime] | None]:
     now = datetime.datetime.now(tz=datetime.timezone.utc)
 
     requests_limit_str = headers.get('x-ratelimit-limit-requests')

@@ -122,7 +122,7 @@ def _get_header_info(
     requests_remaining = int(requests_remaining_str) if requests_remaining_str is not None else None
     requests_reset_str = headers.get('x-ratelimit-reset-requests', '5s')  # Default to 5 seconds
     requests_reset_ts = now + _parse_header_duration(requests_reset_str)
-    requests_info = (requests_limit, requests_remaining, requests_reset_ts)
+    requests_info = (requests_limit, requests_remaining, requests_reset_ts) if requests_remaining is not None else None
 
     tokens_limit_str = headers.get('x-ratelimit-limit-tokens')
     tokens_limit = int(tokens_limit_str) if tokens_limit_str is not None else None

@@ -130,7 +130,10 @@ def _get_header_info(
     tokens_remaining = int(tokens_remaining_str) if tokens_remaining_str is not None else None
     tokens_reset_str = headers.get('x-ratelimit-reset-tokens', '5s')  # Default to 5 seconds
     tokens_reset_ts = now + _parse_header_duration(tokens_reset_str)
-    tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts)
+    tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts) if tokens_remaining is not None else None
+
+    if requests_info is None or tokens_info is None:
+        _logger.debug(f'get_header_info(): incomplete rate limit info: {headers}')
 
     return requests_info, tokens_info
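Both the Anthropic and OpenAI helpers were changed the same way: a rate-limit category is now reported only when its headers are actually present; otherwise the whole tuple is None and a debug message is logged. A minimal sketch of the new behavior, assuming the `openai` optional dependency is installed (`_get_header_info` is a private helper and is called here purely for illustration):

    import httpx
    from pixeltable.functions.openai import _get_header_info

    # No 'x-ratelimit-*' headers at all: both categories degrade to None
    requests_info, tokens_info = _get_header_info(httpx.Headers({}))
    assert requests_info is None and tokens_info is None

    # Request headers present, token headers missing: only tokens_info is None
    headers = httpx.Headers({
        'x-ratelimit-limit-requests': '500',
        'x-ratelimit-remaining-requests': '499',
        'x-ratelimit-reset-requests': '5s',
    })
    requests_info, tokens_info = _get_header_info(headers)
    assert requests_info is not None and tokens_info is None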
pixeltable/globals.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import logging
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable,
+from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union
 
 import pandas as pd
 import pydantic

@@ -24,9 +24,8 @@ if TYPE_CHECKING:
         str,
         os.PathLike,
         Path,  # OS paths, filenames, URLs
-
-
-        Sequence[pydantic.BaseModel],  # list of Pydantic models
+        Iterable[dict[str, Any]],  # dictionaries of values
+        Iterable[pydantic.BaseModel],  # Pydantic model instances
         DataFrame,  # Pixeltable DataFrame
         pd.DataFrame,  # pandas DataFrame
         datasets.Dataset,

@@ -542,9 +541,14 @@ def drop_table(
     assert isinstance(table, str)
     tbl_path = table
 
-
-
-
+    if tbl_path.startswith('pxt://'):
+        # Remote table
+        share.delete_replica(tbl_path)
+    else:
+        # Local table
+        path_obj = catalog.Path.parse(tbl_path)
+        if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
+        Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
 
 
 def get_dir_contents(dir_path: str = '', recursive: bool = True) -> 'DirContents':
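The drop_table() hunk above routes `pxt://` URIs to replica deletion and everything else through the local catalog. A hedged usage sketch (the table name and the cloud URI are placeholders, and `if_not_exists='ignore'` assumes the existing parameter values of drop_table):

    import pixeltable as pxt

    # Local table: resolved through the catalog as before
    pxt.drop_table('my_dir.my_table', if_not_exists='ignore')

    # Cloud replica: a pxt:// URI is now forwarded to share.delete_replica()
    pxt.drop_table('pxt://my-org/my_dir.my_table')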
pixeltable/io/__init__.py
CHANGED
@@ -4,11 +4,12 @@ from .datarows import import_json, import_rows
 from .external_store import ExternalStore
 from .globals import create_label_studio_project, export_images_as_fo_dataset
 from .hf_datasets import import_huggingface_dataset
+from .lancedb import export_lancedb
 from .pandas import import_csv, import_excel, import_pandas
 from .parquet import export_parquet, import_parquet
 
 __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
-__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
+__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
 __all__ = sorted(__default_dir - __removed_symbols)
pixeltable/io/lancedb.py
ADDED
pixeltable/io/parquet.py
CHANGED
@@ -1,46 +1,22 @@
 from __future__ import annotations
 
-import datetime
-import io
 import json
 import logging
 import typing
-from collections import deque
 from pathlib import Path
 from typing import Any, Optional
 
-import numpy as np
-import PIL.Image
-
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable.catalog import Catalog
 from pixeltable.utils.transactional_directory import transactional_directory
 
 if typing.TYPE_CHECKING:
-    import pyarrow as pa
-
     import pixeltable as pxt
 
 _logger = logging.getLogger('pixeltable')
 
 
-def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
-    import pyarrow as pa
-    from pyarrow import parquet
-
-    pydict = {}
-    for field in schema:
-        if isinstance(field.type, pa.FixedShapeTensorType):
-            stacked_arr = np.stack(value_batch[field.name])
-            pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
-        else:
-            pydict[field.name] = value_batch[field.name]
-
-    tab = pa.Table.from_pydict(pydict, schema=schema)
-    parquet.write_table(tab, str(output_path))
-
-
 def export_parquet(
     table_or_df: pxt.Table | pxt.DataFrame,
     parquet_path: Path,

@@ -63,7 +39,9 @@ def export_parquet(
             If False, will raise an error if the Dataframe has any image column.
             Default False.
     """
-
+    import pyarrow as pa
+
+    from pixeltable.utils.arrow import to_record_batches
 
     df: pxt.DataFrame
     if isinstance(table_or_df, pxt.catalog.Table):

@@ -71,9 +49,6 @@ def export_parquet(
     else:
         df = table_or_df
 
-    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
-    arrow_schema = to_arrow_schema(df.schema)
-
     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
         raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
 

@@ -81,70 +56,15 @@ def export_parquet(
     with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        type_dict = {k: v.as_dict() for k, v in df.schema.items()}
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
-
         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
-        current_byte_estimate = 0
-
         with Catalog.get().begin_xact(for_write=False):
-            for
-
-
-
-
-                        continue
-
-                    assert val is not None
-                    if col_type.is_image_type():
-                        # images get inlined into the parquet file
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            # if there is a file, read directly to preserve information
-                            with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                                val = f.read()
-                        elif isinstance(val, PIL.Image.Image):
-                            # if no file available, eg. bc it is computed, convert to png
-                            buf = io.BytesIO()
-                            val.save(buf, format='PNG')
-                            val = buf.getvalue()
-                        else:
-                            raise excs.Error(f'unknown image type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_string_type():
-                        length = len(val)
-                    elif col_type.is_video_type() or col_type.is_audio_type():
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            val = data_row.file_paths[e.slot_idx]
-                        else:
-                            raise excs.Error(f'unknown audio/video type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_json_type():
-                        val = json.dumps(val)
-                        length = len(val)
-                    elif col_type.is_array_type():
-                        length = val.nbytes
-                    elif col_type.is_int_type() or col_type.is_float_type():
-                        length = 8
-                    elif col_type.is_bool_type():
-                        length = 1
-                    elif col_type.is_date_type():
-                        length = 4
-                    elif col_type.is_timestamp_type():
-                        val = val.astimezone(datetime.timezone.utc)
-                        length = 8
-                    else:
-                        raise excs.Error(f'unknown type {col_type} for {col_name}')
-
-                    current_value_batch[col_name].append(val)
-                    current_byte_estimate += length
-                    if current_byte_estimate > partition_size_bytes:
-                        assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-                        batch_num += 1
-                        current_value_batch = {k: deque() for k in df.schema}
-                        current_byte_estimate = 0
-
-            _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+            for record_batch in to_record_batches(df, partition_size_bytes):
+                output_path = temp_path / f'part-{batch_num:05d}.parquet'
+                arrow_tbl = pa.Table.from_batches([record_batch])  # type: ignore
+                pa.parquet.write_table(arrow_tbl, str(output_path))
+                batch_num += 1
 
 
 def import_parquet(
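With this change export_parquet() no longer accumulates per-column deques itself; it streams pyarrow RecordBatches from pixeltable.utils.arrow.to_record_batches and writes one part-NNNNN.parquet file per batch. A hedged usage sketch (table name and output path are placeholders):

    from pathlib import Path

    import pixeltable as pxt
    from pixeltable.io import export_parquet

    t = pxt.get_table('my_dir.my_table')  # assumes this table already exists
    # Image columns are inlined into the parquet files; batch size is controlled by partition_size_bytes
    export_parquet(t, Path('/tmp/my_table_export'), inline_images=True)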
pixeltable/io/table_data_conduit.py
CHANGED

@@ -469,12 +469,12 @@ class ParquetTableDataConduit(TableDataConduit):
         return t
 
     def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        from pixeltable.utils.arrow import
+        from pixeltable.utils.arrow import to_pxt_schema
 
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
                 self.src_schema_overrides = {}
-            self.src_schema =
+            self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
             inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
                 self.src_schema, self.src_pk, self.src_schema_overrides
             )
pixeltable/share/__init__.py
CHANGED
pixeltable/share/publish.py
CHANGED
@@ -27,15 +27,11 @@ PIXELTABLE_API_URL = os.environ.get('PIXELTABLE_API_URL', 'https://internal-api.
 def push_replica(
     dest_tbl_uri: str, src_tbl: pxt.Table, bucket: str | None = None, access: Literal['public', 'private'] = 'private'
 ) -> str:
-    if not src_tbl._tbl_version_path.is_snapshot():
-        raise excs.Error('Only snapshots may be published.')
-
     packager = TablePackager(
         src_tbl, additional_md={'table_uri': dest_tbl_uri, 'bucket_name': bucket, 'is_public': access == 'public'}
     )
     request_json = packager.md | {'operation_type': 'publish_snapshot'}
-
-    response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
+    response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=_api_headers())
     if response.status_code != 200:
         raise excs.Error(f'Error publishing snapshot: {response.text}')
     response_json = response.json()

@@ -70,7 +66,7 @@ def push_replica(
         'preview_data': packager.md['preview_data'],
     }
     # TODO: Use Pydantic for validation
-    finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=
+    finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=_api_headers())
     if finalize_response.status_code != 200:
         raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
     finalize_response_json = finalize_response.json()

@@ -112,9 +108,8 @@ def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult
 
 
 def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
-    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
     clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': src_tbl_uri}
-    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=
+    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=_api_headers())
     if response.status_code != 200:
         raise excs.Error(f'Error cloning snapshot: {response.text}')
     response_json = response.json()

@@ -268,11 +263,18 @@ def _download_from_presigned_url(
 # TODO: This will be replaced by drop_table with cloud table uri
 def delete_replica(dest_path: str) -> None:
     """Delete cloud replica"""
-    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
     delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
-    response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=
+    response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=_api_headers())
     if response.status_code != 200:
         raise excs.Error(f'Error deleting replica: {response.text}')
     response_json = response.json()
     if not isinstance(response_json, dict) or 'table_uri' not in response_json:
         raise excs.Error(f'Error deleting replica: unexpected response from server.\n{response_json}')
+
+
+def _api_headers() -> dict[str, str]:
+    headers = {'Content-Type': 'application/json'}
+    api_key = Env.get().pxt_api_key
+    if api_key is not None:
+        headers['X-api-key'] = api_key
+    return headers
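The three requests.post call sites above now share _api_headers(), which attaches the X-api-key header only when an API key is configured. A behaviorally equivalent standalone sketch (the key value is a placeholder):

    def api_headers(api_key: str | None) -> dict[str, str]:
        # mirrors _api_headers(): Content-Type always, X-api-key only when a key is set
        headers = {'Content-Type': 'application/json'}
        if api_key is not None:
            headers['X-api-key'] = api_key
        return headers

    assert api_headers(None) == {'Content-Type': 'application/json'}
    assert api_headers('pxt-example-key') == {'Content-Type': 'application/json', 'X-api-key': 'pxt-example-key'}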
pixeltable/utils/arrow.py
CHANGED
@@ -1,11 +1,18 @@
 import datetime
-
+import io
+import json
+from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
 
 import numpy as np
+import PIL.Image
 import pyarrow as pa
 
+import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 
+if TYPE_CHECKING:
+    import pixeltable as pxt
+
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),

@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     return None
 
 
-def
+def to_pxt_schema(
     arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""

@@ -88,6 +95,94 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
     return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
 
 
+def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+    import pyarrow as pa
+
+    pa_arrays: list[pa.Array] = []
+    for field in schema:
+        if isinstance(field.type, pa.FixedShapeTensorType):
+            stacked_arr = np.stack(column_vals[field.name])
+            pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+        else:
+            pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+            pa_arrays.append(pa_array)
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)  # type: ignore
+
+
+def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+    arrow_schema = to_arrow_schema(df.schema)
+    batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
+    current_byte_estimate = 0
+    num_batch_rows = 0
+
+    # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
+    # over _exec()
+    try:
+        for data_row in df._exec():
+            num_batch_rows += 1
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+                val = data_row[e.slot_idx]
+                val_size_bytes: int
+                if val is None:
+                    batch_columns[col_name].append(val)
+                    continue
+
+                assert val is not None
+                if col_type.is_image_type():
+                    # images get inlined into the parquet file
+                    if data_row.file_paths[e.slot_idx] is not None:
+                        # if there is a file, read directly to preserve information
+                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                            val = f.read()
+                    elif isinstance(val, PIL.Image.Image):
+                        # no file available: save as png
+                        buf = io.BytesIO()
+                        val.save(buf, format='png')
+                        val = buf.getvalue()
+                    else:
+                        raise excs.Error(f'unknown image type {type(val)}')
+                    val_size_bytes = len(val)
+                elif col_type.is_string_type():
+                    val_size_bytes = len(val)
+                elif col_type.is_media_type():
+                    assert data_row.file_paths[e.slot_idx] is not None
+                    val = data_row.file_paths[e.slot_idx]
+                    val_size_bytes = len(val)
+                elif col_type.is_json_type():
+                    val = json.dumps(val)
+                    val_size_bytes = len(val)
+                elif col_type.is_array_type():
+                    val_size_bytes = val.nbytes
+                elif col_type.is_int_type() or col_type.is_float_type():
+                    val_size_bytes = 8
+                elif col_type.is_bool_type():
+                    val_size_bytes = 1
+                elif col_type.is_date_type():
+                    val_size_bytes = 4
+                elif col_type.is_timestamp_type():
+                    val = val.astimezone(datetime.timezone.utc)
+                    val_size_bytes = 8
+                else:
+                    raise excs.Error(f'unknown type {col_type} for {col_name}')
+
+                batch_columns[col_name].append(val)
+                current_byte_estimate += val_size_bytes
+
+            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+                record_batch = _to_record_batch(batch_columns, arrow_schema)
+                yield record_batch
+                batch_columns = {k: [] for k in df.schema}
+                current_byte_estimate = 0
+                num_batch_rows = 0
+
+    except excs.ExprEvalError as e:
+        df._raise_expr_eval_err(e)
+
+    if num_batch_rows > 0:
+        record_batch = _to_record_batch(batch_columns, arrow_schema)
+        yield record_batch
+
+
 def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
     """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
     this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
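to_record_batches() is the streaming core now shared by export_parquet and export_lancedb: it sizes batches by an estimated byte count and yields pyarrow RecordBatches. A hedged sketch of driving it directly (internal API; the table name is a placeholder, and the read transaction mirrors how the exporters call it):

    import pixeltable as pxt
    from pixeltable.catalog import Catalog
    from pixeltable.utils.arrow import to_record_batches

    df = pxt.get_table('my_dir.my_table')._df()  # assumes this table already exists
    with Catalog.get().begin_xact(for_write=False):
        for batch in to_record_batches(df, batch_size_bytes=32 * 2**20):  # ~32 MiB per batch
            print(batch.num_rows, batch.nbytes)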
pixeltable/utils/lancedb.py
ADDED

@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+import logging
+import shutil
+from pathlib import Path
+from typing import Literal
+
+import pixeltable as pxt
+import pixeltable.exceptions as excs
+from pixeltable.catalog import Catalog
+from pixeltable.env import Env
+
+_logger = logging.getLogger('pixeltable')
+
+
+def export_lancedb(
+    table_or_df: pxt.Table | pxt.DataFrame,
+    db_uri: Path,
+    table_name: str,
+    batch_size_bytes: int = 128 * 2**20,
+    if_exists: Literal['error', 'overwrite', 'append'] = 'error',
+) -> None:
+    """
+    Exports a dataframe's data to a LanceDB table.
+
+    This utilizes LanceDB's streaming interface for efficient table creation, via a sequence of in-memory pyarrow
+    `RecordBatches`, the size of which can be controlled with the `batch_size_bytes` parameter.
+
+    __Requirements:__
+
+    - `pip install lancedb`
+
+    Args:
+        table_or_df : Table or Dataframe to export.
+        db_uri: Local Path to the LanceDB database.
+        table_name : Name of the table in the LanceDB database.
+        batch_size_bytes : Maximum size in bytes for each batch.
+        if_exists: Determines the behavior if the table already exists. Must be one of the following:
+
+            - `'error'`: raise an error
+            - `'overwrite'`: overwrite the existing table
+            - `'append'`: append to the existing table
+    """
+    Env.get().require_package('lancedb')
+
+    import lancedb  # type: ignore[import-untyped]
+
+    from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
+
+    if if_exists not in ('error', 'overwrite', 'append'):
+        raise excs.Error("export_lancedb(): 'if_exists' must be one of: ['error', 'overwrite', 'append']")
+
+    df: pxt.DataFrame
+    if isinstance(table_or_df, pxt.catalog.Table):
+        df = table_or_df._df()
+    else:
+        df = table_or_df
+
+    db_exists = False
+    if db_uri.exists():
+        if not db_uri.is_dir():
+            raise excs.Error(f"export_lancedb(): '{db_uri!s}' exists and is not a directory")
+        db_exists = True
+
+    try:
+        db = lancedb.connect(str(db_uri))
+        lance_tbl: lancedb.LanceTable | None = None
+        try:
+            lance_tbl = db.open_table(table_name)
+            if if_exists == 'error':
+                raise excs.Error(f'export_lancedb(): table {table_name!r} already exists in {db_uri!r}')
+        except ValueError:
+            # table doesn't exist
+            pass
+
+        with Catalog.get().begin_xact(for_write=False):
+            if lance_tbl is None or if_exists == 'overwrite':
+                mode = 'overwrite' if lance_tbl is not None else 'create'
+                arrow_schema = to_arrow_schema(df.schema)
+                _ = db.create_table(table_name, to_record_batches(df, batch_size_bytes), schema=arrow_schema, mode=mode)
+            else:
+                lance_tbl.add(to_record_batches(df, batch_size_bytes))
+
+    except Exception as e:
+        # cleanup
+        if not db_exists:
+            shutil.rmtree(db_uri)
+        raise e
pixeltable/utils/media_store.py
CHANGED
@@ -189,6 +189,12 @@ class MediaStore:
         result.sort(key=lambda e: e[3], reverse=True)
         return result
 
+    def clear(self) -> None:
+        """Clear all files from the media store."""
+        assert self.__base_dir.exists()
+        shutil.rmtree(self.__base_dir)
+        self.__base_dir.mkdir()
+
 
 class TempStore:
     """

@@ -235,3 +241,8 @@
         if tbl_id is not None:
            return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
         return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'
+
+    @classmethod
+    def clear(cls) -> None:
+        """Clear all files from the temporary store."""
+        MediaStore(cls._tmp_dir()).clear()