pixeltable 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
- pixeltable/__init__.py +4 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +7 -9
- pixeltable/catalog/column.py +49 -0
- pixeltable/catalog/insertable_table.py +0 -7
- pixeltable/catalog/schema_object.py +1 -14
- pixeltable/catalog/table.py +180 -67
- pixeltable/catalog/table_version.py +42 -146
- pixeltable/catalog/table_version_path.py +6 -5
- pixeltable/catalog/view.py +2 -1
- pixeltable/config.py +24 -9
- pixeltable/dataframe.py +5 -6
- pixeltable/env.py +113 -21
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +4 -3
- pixeltable/exec/exec_node.py +0 -8
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/expr_eval/globals.py +1 -0
- pixeltable/exec/expr_eval/schedulers.py +52 -19
- pixeltable/exec/in_memory_data_node.py +2 -3
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/data_row.py +15 -2
- pixeltable/exprs/expr.py +9 -9
- pixeltable/exprs/function_call.py +61 -23
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/json_path.py +3 -3
- pixeltable/exprs/row_builder.py +25 -21
- pixeltable/exprs/string_op.py +3 -3
- pixeltable/func/expr_template_function.py +6 -3
- pixeltable/func/query_template_function.py +2 -2
- pixeltable/func/signature.py +30 -3
- pixeltable/func/tools.py +2 -2
- pixeltable/functions/anthropic.py +76 -27
- pixeltable/functions/deepseek.py +5 -1
- pixeltable/functions/gemini.py +11 -2
- pixeltable/functions/globals.py +2 -2
- pixeltable/functions/huggingface.py +6 -12
- pixeltable/functions/llama_cpp.py +9 -1
- pixeltable/functions/openai.py +76 -55
- pixeltable/functions/video.py +59 -6
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +86 -13
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/fiftyone.py +7 -7
- pixeltable/io/globals.py +3 -3
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +2 -1
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +3 -3
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +3 -2
- pixeltable/iterators/document.py +2 -8
- pixeltable/iterators/video.py +49 -9
- pixeltable/plan.py +0 -16
- pixeltable/share/packager.py +51 -42
- pixeltable/share/publish.py +134 -7
- pixeltable/store.py +5 -25
- pixeltable/type_system.py +5 -8
- pixeltable/utils/__init__.py +2 -2
- pixeltable/utils/arrow.py +5 -5
- pixeltable/utils/description_helper.py +3 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/media_store.py +131 -66
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/METADATA +238 -122
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/RECORD +69 -69
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/licenses/LICENSE +0 -0
pixeltable/share/publish.py
CHANGED
@@ -1,36 +1,45 @@
+import os
 import sys
 import urllib.parse
 import urllib.request
 from pathlib import Path
+from typing import Literal, Optional

 import requests
+from requests.adapters import HTTPAdapter
 from tqdm import tqdm
+from urllib3.util.retry import Retry

 import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
+from pixeltable.utils.media_store import TempStore

 from .packager import TablePackager, TableRestorer

 # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
 # pixeltable.com URLs are available.

-PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'
+PIXELTABLE_API_URL = os.environ.get('PIXELTABLE_API_URL', 'https://internal-api.pixeltable.com')


-def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
-    if not src_tbl._tbl_version_path.is_snapshot():
+def push_replica(
+    dest_tbl_uri: str, src_tbl: pxt.Table, bucket: str | None = None, access: Literal['public', 'private'] = 'private'
+) -> str:
+    if not src_tbl._tbl_version_path.is_snapshot():
         raise excs.Error('Only snapshots may be published.')

-    packager = TablePackager(
+    packager = TablePackager(
+        src_tbl, additional_md={'table_uri': dest_tbl_uri, 'bucket_name': bucket, 'is_public': access == 'public'}
+    )
     request_json = packager.md | {'operation_type': 'publish_snapshot'}
     headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
     response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
     if response.status_code != 200:
         raise excs.Error(f'Error publishing snapshot: {response.text}')
     response_json = response.json()
-    if not isinstance(response_json, dict)
+    if not isinstance(response_json, dict):
         raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']

@@ -42,17 +51,23 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     parsed_location = urllib.parse.urlparse(destination_uri)
     if parsed_location.scheme == 's3':
         _upload_bundle_to_s3(bundle, parsed_location)
+    elif parsed_location.scheme == 'https':
+        _upload_to_presigned_url(file_path=bundle, url=parsed_location.geturl())
     else:
         raise excs.Error(f'Unsupported destination: {destination_uri}')

     Env.get().console_logger.info('Finalizing snapshot ...')

     finalize_request_json = {
+        'table_uri': dest_tbl_uri,
         'operation_type': 'finalize_snapshot',
         'upload_id': upload_id,
         'datafile': bundle.name,
         'size': bundle.stat().st_size,
         'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+        'rows': packager.md['row_count'],  # TODO rename rows to row_count once cloud side changes are complete
+        'preview_header': packager.md['preview_header'],
+        'preview_data': packager.md['preview_data'],
     }
     # TODO: Use Pydantic for validation
     finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)

@@ -107,11 +122,14 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
         raise excs.Error(f'Error cloning shapshot: unexpected response from server.\n{response_json}')

     primary_tbl_additional_md = response_json['md']['tables'][0]['table_md']['additional_md']
-    bundle_uri =
+    bundle_uri = response_json['destination_uri']
     bundle_filename = primary_tbl_additional_md['datafile']
     parsed_location = urllib.parse.urlparse(bundle_uri)
     if parsed_location.scheme == 's3':
         bundle_path = _download_bundle_from_s3(parsed_location, bundle_filename)
+    elif parsed_location.scheme == 'https':
+        bundle_path = TempStore.create_path()
+        _download_from_presigned_url(url=parsed_location.geturl(), output_path=bundle_path)
     else:
         raise excs.Error(f'Unexpected response from server: unsupported bundle uri: {bundle_uri}')

@@ -136,7 +154,7 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
     obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
     bundle_size = obj['ContentLength']

-    bundle_path =
+    bundle_path = TempStore.create_path()
     progress_bar = tqdm(
         desc='Downloading',
         total=bundle_size,

@@ -149,3 +167,112 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
     )
     s3_client.download_file(Bucket=bucket, Key=remote_path, Filename=str(bundle_path), Callback=progress_bar.update)
     return bundle_path
+
+
+def _create_retry_session(
+    max_retries: int = 3, backoff_factor: float = 1.0, status_forcelist: Optional[list] = None
+) -> requests.Session:
+    """Create a requests session with retry configuration"""
+    if status_forcelist is None:
+        status_forcelist = [
+            408,  # Request Timeout
+            429,  # Too Many Requests (rate limiting)
+            500,  # Internal Server Error (server-side error)
+            502,  # Bad Gateway (proxy/gateway got invalid response)
+            503,  # Service Unavailable (server overloaded or down)
+            504,  # Gateway Timeout (proxy/gateway timeout)
+        ]
+    retry_strategy = Retry(
+        total=max_retries,
+        read=max_retries,
+        connect=max_retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+        allowed_methods=['GET', 'PUT', 'POST', 'DELETE'],
+    )
+
+    session = requests.Session()
+    adapter = HTTPAdapter(max_retries=retry_strategy)
+    session.mount('https://', adapter)
+    return session
+
+
+def _upload_to_presigned_url(file_path: Path, url: str, max_retries: int = 3) -> requests.Response:
+    """Upload file with progress bar and retries"""
+    file_size = file_path.stat().st_size
+
+    headers = {'Content-Length': str(file_size), 'Content-Type': 'application/octet-stream'}
+
+    session = _create_retry_session(max_retries=max_retries)
+    try:
+        with (
+            open(file_path, 'rb') as f,
+            tqdm.wrapattr(
+                f,
+                method='read',
+                total=file_size,
+                desc='Uploading',
+                unit='B',
+                unit_scale=True,
+                unit_divisor=1024,
+                miniters=1,  # Update every iteration (should be fine for an upload)
+                ncols=100,
+                file=sys.stdout,
+            ) as file_with_progress,
+        ):
+            response = session.put(
+                url,
+                data=file_with_progress,
+                headers=headers,
+                timeout=(60, 1800),  # 60 seconds to connect and 300 seconds for server response
+            )
+            response.raise_for_status()
+            return response
+    finally:
+        session.close()
+
+
+def _download_from_presigned_url(
+    url: str, output_path: Path, headers: Optional[dict[str, str]] = None, max_retries: int = 3
+) -> None:
+    """Download file with progress bar and retries"""
+    session = _create_retry_session(max_retries=max_retries)
+
+    try:
+        # Stream download with progress
+        response = session.get(
+            url, headers=headers, stream=True, timeout=(60, 300)
+        )  # 60 seconds to connect and 300 seconds for server response
+        response.raise_for_status()
+
+        total_size = int(response.headers.get('content-length', 0))
+        progress_bar = tqdm(
+            desc='Downloading',
+            total=total_size,
+            unit='B',
+            unit_scale=True,
+            unit_divisor=1024,
+            miniters=1,
+            ncols=100,
+            file=sys.stdout,
+        )
+        with open(output_path, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+                    progress_bar.update(len(chunk))
+    finally:
+        session.close()
+
+
+# TODO: This will be replaced by drop_table with cloud table uri
+def delete_replica(dest_path: str) -> None:
+    """Delete cloud replica"""
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
+    response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error deleting replica: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+        raise excs.Error(f'Error deleting replica: unexpected response from server.\n{response_json}')
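For orientation, a minimal usage sketch of the updated entry points, assuming direct use of the internal pixeltable.share.publish module. The table paths, URIs, and endpoint below are hypothetical; note that a PIXELTABLE_API_URL override must be in the environment before the module is first imported, since the default is resolved at import time:

import os
os.environ['PIXELTABLE_API_URL'] = 'https://staging-api.example.com'  # hypothetical endpoint; set before import

import pixeltable as pxt
from pixeltable.share import publish

snap = pxt.get_table('my_dir.my_snapshot')   # hypothetical path; must refer to a snapshot
dest_uri = publish.push_replica('pxt://my-org/my_snapshot', snap, access='public')
publish.delete_replica('pxt://my-org/my_snapshot')
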
pixeltable/store.py
CHANGED
@@ -4,7 +4,7 @@ import abc
 import logging
 import sys
 import warnings
-from typing import Any, Iterable, Iterator, Optional
+from typing import Any, Iterable, Iterator, Optional

 import more_itertools
 import psycopg

@@ -17,7 +17,6 @@ from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
 from pixeltable.utils.exception_handler import run_cleanup
-from pixeltable.utils.media_store import MediaStore
 from pixeltable.utils.sql import log_explain, log_stmt

 _logger = logging.getLogger('pixeltable')

@@ -123,21 +122,6 @@ class StoreBase:
     def _storage_name(self) -> str:
         """Return the name of the data store table"""

-    def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column) -> str:
-        src_path = MediaStore.resolve_tmp_url(file_url)
-        if src_path is None:
-            return file_url
-        assert col.tbl.id == self.tbl_version.id  # Ensure the column belongs to the same table as this store
-        new_file_url = MediaStore.relocate_local_media_file(src_path, col)
-        return new_file_url
-
-    def _move_tmp_media_files(
-        self, table_row: list[Any], media_cols_by_sql_idx: dict[int, catalog.Column], v_min: int
-    ) -> None:
-        """Move tmp media files that we generated to a permanent location"""
-        for n, col in media_cols_by_sql_idx.items():
-            table_row[n] = self._move_tmp_media_file(table_row[n], col)
-
     def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
         stmt = (

@@ -235,7 +219,6 @@ class StoreBase:
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
         tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
-        tmp_val_col_sql_idx = len(tmp_pk_cols)
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
         tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors

@@ -262,9 +245,7 @@ class StoreBase:
             if abort_on_exc and row.has_exc():
                 exc = row.get_first_exc()
                 raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
-            table_row, num_row_exc = row_builder.
-            if col.col_type.is_media_type():
-                table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(table_row[tmp_val_col_sql_idx], col)
+            table_row, num_row_exc = row_builder.create_store_table_row(row, None, row.pk)
             num_excs += num_row_exc
             batch_table_rows.append(tuple(table_row))

@@ -317,7 +298,7 @@ class StoreBase:
         progress_bar: Optional[tqdm] = None  # create this only after we started executing
         row_builder = exec_plan.row_builder

-        store_col_names
+        store_col_names = row_builder.store_column_names()

         try:
             table_rows: list[tuple[Any]] = []

@@ -337,7 +318,7 @@ class StoreBase:
                 rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
                 pk = (*rowid, v_min)
                 assert len(pk) == len(self._pk_cols)
-                table_row, num_row_exc = row_builder.
+                table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
                 num_excs += num_row_exc

                 if show_progress:

@@ -351,7 +332,6 @@ class StoreBase:
                     )
                     progress_bar.update(1)

-                self._move_tmp_media_files(table_row, media_cols_by_idx, v_min)
                 batch_table_rows.append(tuple(table_row))

             table_rows.extend(batch_table_rows)

@@ -427,7 +407,7 @@ class StoreBase:
         base_versions_clause = (
             sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
         )
-        set_clause: dict[sql.Column,
+        set_clause: dict[sql.Column, int | sql.Column] = {self.v_max_col: current_version}
         for index_info in self.tbl_version.get().idxs_by_name.values():
             # copy value column to undo column
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
pixeltable/type_system.py
CHANGED
@@ -292,7 +292,7 @@ class ColumnType:

     @classmethod
     def from_python_type(
-        cls, t:
+        cls, t: type | _GenericAlias, nullable_default: bool = False, allow_builtin_types: bool = True
     ) -> Optional[ColumnType]:
         """
         Convert a Python type into a Pixeltable `ColumnType` instance.

@@ -311,7 +311,7 @@ class ColumnType:
         if origin in (typing.Union, types.UnionType):
             # Check if `t` has the form Optional[T].
             if len(type_args) == 2 and type(None) in type_args:
-                # `t` is a type of the form Optional[T] (equivalently,
+                # `t` is a type of the form Optional[T] (equivalently, T | None or None | T).
                 # We treat it as the underlying type but with nullable=True.
                 underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
                 underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)

@@ -361,10 +361,7 @@ class ColumnType:

     @classmethod
     def normalize_type(
-        cls,
-        t: Union[ColumnType, type, _AnnotatedAlias],
-        nullable_default: bool = False,
-        allow_builtin_types: bool = True,
+        cls, t: ColumnType | type | _AnnotatedAlias, nullable_default: bool = False, allow_builtin_types: bool = True
     ) -> ColumnType:
         """
         Convert any type recognizable by Pixeltable to its corresponding ColumnType.

@@ -389,7 +386,7 @@ class ColumnType:
     ]

     @classmethod
-    def __raise_exc_for_invalid_type(cls, t:
+    def __raise_exc_for_invalid_type(cls, t: type | _AnnotatedAlias) -> None:
         for builtin_type, suggestion in cls.__TYPE_SUGGESTIONS:
             if t is builtin_type or (isinstance(t, type) and issubclass(t, builtin_type)):
                 name = t.__name__ if t.__module__ == 'builtins' else f'{t.__module__}.{t.__name__}'

@@ -405,7 +402,7 @@ class ColumnType:
         return cls.from_python_type(py_type) if py_type is not None else None

     @classmethod
-    def __json_schema_to_py_type(cls, schema: dict[str, Any]) ->
+    def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> type | _GenericAlias | None:
         if 'type' in schema:
             if schema['type'] == 'null':
                 return type(None)
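As a quick illustration of the Optional[T] handling documented in the comment above, a sketch (hedged: the exact repr of the resulting ColumnType is not shown in this diff):

from typing import Optional

import pixeltable.type_system as ts

t = ts.ColumnType.normalize_type(Optional[str])  # Optional[str] is the same type as str | None
print(t)  # expected: the string column type, marked nullable per the comment above
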
pixeltable/utils/__init__.py
CHANGED
@@ -2,7 +2,7 @@ import hashlib
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import Optional
+from typing import Optional


 def print_perf_counter_delta(delta: float) -> str:

@@ -24,7 +24,7 @@ def print_perf_counter_delta(delta: float) -> str:
     return f'{delta:.2f} s'


-def sha256sum(path:
+def sha256sum(path: Path | str) -> str:
     """
     Compute the SHA256 hash of a file.
     """
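With the widened annotation, sha256sum accepts either a pathlib.Path or a plain string; a trivial sketch (file name hypothetical):

from pathlib import Path

from pixeltable.utils import sha256sum

print(sha256sum('bundle.tar.bz2'))        # str path
print(sha256sum(Path('bundle.tar.bz2')))  # pathlib.Path
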
pixeltable/utils/arrow.py
CHANGED
@@ -1,5 +1,5 @@
 import datetime
-from typing import Any, Iterator, Optional, Union
+from typing import Any, Iterator, Optional

 import numpy as np
 import pyarrow as pa

@@ -88,11 +88,11 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
     return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]


-def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, np.ndarray]]:
+def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
     """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
     this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
     """
-    out: dict[str, Union[list, np.ndarray]] = {}
+    out: dict[str, list | np.ndarray] = {}
     for k, name in enumerate(batch.schema.names):
         col = batch.column(k)
         if isinstance(col.type, pa.FixedShapeTensorType):

@@ -105,7 +105,7 @@ def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, np.ndarray]]:
     return out


-def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:
+def iter_tuples(batch: pa.Table | pa.RecordBatch) -> Iterator[dict[str, Any]]:
     """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
     pydict = to_pydict(batch)
     assert len(pydict) > 0, 'empty record batch'

@@ -145,7 +145,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:


 def iter_tuples2(
-    batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+    batch: pa.Table | pa.RecordBatch, col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
 ) -> Iterator[dict[str, Any]]:
     """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
     pydict = to_pydict(batch)
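A small sketch of the two helpers whose signatures changed above, with behavior as described by their docstrings (the sample data is hypothetical):

import pyarrow as pa

from pixeltable.utils.arrow import iter_tuples, to_pydict

batch = pa.RecordBatch.from_pydict({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
cols = to_pydict(batch)          # numeric columns come back as numpy arrays, dtype preserved
rows = list(iter_tuples(batch))  # [{'a': 1, 'b': 'x'}, ...]
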
pixeltable/utils/description_helper.py
CHANGED
@@ -1,5 +1,5 @@
 import dataclasses
-from typing import Optional
+from typing import Optional

 import pandas as pd
 from pandas.io.formats.style import Styler

@@ -7,7 +7,7 @@ from pandas.io.formats.style import Styler

 @dataclasses.dataclass
 class _Descriptor:
-    body:
+    body: str | pd.DataFrame
     # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
     show_index: bool
     show_header: bool

@@ -33,7 +33,7 @@ class DescriptionHelper:

     def append(
         self,
-        descriptor:
+        descriptor: str | pd.DataFrame,
         show_index: bool = False,
         show_header: bool = True,
         styler: Optional[Styler] = None,
pixeltable/utils/iceberg.py
CHANGED
@@ -1,10 +1,9 @@
 from pathlib import Path
-from typing import Union

 from pyiceberg.catalog.sql import SqlCatalog


-def sqlite_catalog(warehouse_path: Union[str, Path], name: str = 'pixeltable') -> SqlCatalog:
+def sqlite_catalog(warehouse_path: str | Path, name: str = 'pixeltable') -> SqlCatalog:
     """
     Instantiate a sqlite Iceberg catalog at the specified path. If no catalog exists, one will be created.
     """
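Since the annotation is now str | Path, a plain string path type-checks as well as a pathlib.Path; a one-line sketch (warehouse location hypothetical):

from pixeltable.utils.iceberg import sqlite_catalog

catalog = sqlite_catalog('/tmp/iceberg_warehouse', name='pixeltable')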