pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Note: this release has been flagged as potentially problematic.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -2
- pixeltable/catalog/column.py +1 -1
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/table.py +3 -1
- pixeltable/catalog/table_version.py +12 -2
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +64 -20
- pixeltable/dataframe.py +11 -6
- pixeltable/env.py +12 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -2
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
- pixeltable/exprs/comparison.py +8 -4
- pixeltable/exprs/data_row.py +9 -7
- pixeltable/exprs/expr.py +2 -2
- pixeltable/exprs/function_call.py +155 -313
- pixeltable/exprs/json_mapper.py +25 -8
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/object_ref.py +16 -5
- pixeltable/exprs/row_builder.py +10 -3
- pixeltable/func/aggregate_function.py +29 -15
- pixeltable/func/callable_function.py +11 -8
- pixeltable/func/expr_template_function.py +3 -9
- pixeltable/func/function.py +148 -74
- pixeltable/func/signature.py +65 -30
- pixeltable/func/tools.py +26 -26
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +9 -3
- pixeltable/functions/deepseek.py +121 -0
- pixeltable/functions/image.py +7 -7
- pixeltable/functions/openai.py +30 -13
- pixeltable/functions/video.py +14 -7
- pixeltable/globals.py +14 -3
- pixeltable/index/embedding_index.py +4 -13
- pixeltable/io/globals.py +88 -77
- pixeltable/io/hf_datasets.py +34 -34
- pixeltable/io/pandas.py +75 -76
- pixeltable/io/parquet.py +19 -27
- pixeltable/io/utils.py +115 -0
- pixeltable/iterators/audio.py +2 -1
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/metadata/converters/convert_15.py +18 -8
- pixeltable/metadata/converters/convert_27.py +31 -0
- pixeltable/metadata/converters/convert_28.py +15 -0
- pixeltable/metadata/converters/convert_29.py +111 -0
- pixeltable/metadata/converters/util.py +12 -1
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/share/__init__.py +1 -0
- pixeltable/share/packager.py +41 -13
- pixeltable/share/publish.py +97 -0
- pixeltable/type_system.py +40 -14
- pixeltable/utils/__init__.py +41 -0
- pixeltable/utils/arrow.py +40 -7
- pixeltable/utils/formatter.py +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/METADATA +34 -49
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/RECORD +63 -57
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py
CHANGED

@@ -1,3 +1,4 @@
+import dataclasses
 import io
 import json
 import logging
@@ -5,8 +6,9 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
+from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 
 import more_itertools
 import numpy as np
@@ -15,7 +17,8 @@ import pyiceberg.catalog
 
 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exprs
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
 from pixeltable.env import Env
 from pixeltable.utils.arrow import PXT_TO_PA_TYPES
 from pixeltable.utils.iceberg import sqlite_catalog
@@ -28,6 +31,7 @@ class TablePackager:
    Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
    is as follows:

+        metadata.json  # Pixeltable metadata for the packaged table
        warehouse/catalog.db  # sqlite Iceberg catalog
        warehouse/pxt.db/**  # Iceberg metadata and data files (parquet/avro/json)
        media/**  # Local media files
@@ -43,16 +47,40 @@ class TablePackager:
    'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
    """

-    table:
+    table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
     iceberg_catalog: pyiceberg.catalog.Catalog
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]
 
-    def __init__(self, table:
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to catalog have
+                        # been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
     def package(self) -> Path:
         """
         Export the table to a tarball containing Iceberg tables and media files.
@@ -60,8 +88,10 @@
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-        ancestors =
+        ancestors = (self.table, *self.table._bases)
         for t in ancestors:
             _logger.info(f"Exporting table '{t._path}'.")
             self.__export_table(t)
@@ -70,7 +100,7 @@
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path
 
-    def __export_table(self, t:
+    def __export_table(self, t: catalog.Table) -> None:
         """
         Exports the data from `t` into an Iceberg table.
         """
@@ -116,7 +146,7 @@
             iceberg_tbl.append(pa_table)
 
     @classmethod
-    def __iceberg_namespace(cls, table:
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
@@ -149,11 +179,7 @@
         return PXT_TO_PA_TYPES.get(col_type.__class__)
 
     def __to_pa_tables(
-        self,
-        df: pxt.DataFrame,
-        actual_col_types: list[pxt.ColumnType],
-        arrow_schema: pa.Schema,
-        batch_size: int = 1_000,
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
     ) -> Iterator[pa.Table]:
         """
         Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
@@ -165,7 +191,7 @@
             cols['_v_min'] = [row[-1] for row in rows]
             yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_rows(self, df:
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
         for row in df._exec():
             vals = [row[e.slot_idx] for e in df._select_list_exprs]
             result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
@@ -210,6 +236,8 @@
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
             # Add the Iceberg warehouse dir (including the catalog)
             tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
             # Add the media files
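The new metadata.json entry makes the bundle self-describing. As a hedged sketch (the bundle path is hypothetical), the packaged metadata can be read back with nothing more than the standard library:

    import json
    import tarfile

    # Sketch: inspect a bundle produced by TablePackager.package().
    # 'bundle.tar.bz2' is a hypothetical path to such a bundle.
    with tarfile.open('bundle.tar.bz2', 'r:bz2') as tf:
        with tf.extractfile('metadata.json') as fp:
            md = json.load(fp)

    print(md['pxt_version'], md['pxt_md_version'])
    for tbl_md in md['md']['tables']:
        print(tbl_md['table_id'])  # one entry per table in (table, *table._bases)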
pixeltable/share/publish.py
ADDED

@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,  # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
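For context, a minimal usage sketch of the new publishing entry point. The table name and destination URI are hypothetical, and per the module above, PIXELTABLE_PUBLISH_URL, PIXELTABLE_FINALIZE_URL, and a valid API key must be configured:

    import pixeltable as pxt

    from pixeltable.share.publish import publish_snapshot

    # Hypothetical table and pxt:// destination URI.
    tbl = pxt.get_table('films')
    confirmed_uri = publish_snapshot('pxt://my_org/films_snapshot', tbl)
    print(confirmed_uri)  # URI reported back by the finalize endpoint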
pixeltable/type_system.py
CHANGED

@@ -8,10 +8,9 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from pathlib import Path
 from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union
 
-import av
+import av
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators
@@ -22,6 +21,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.utils import parse_local_file_path
 
 from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
 
@@ -47,8 +47,8 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: 'ColumnType.Type',
-        type2: 'ColumnType.Type',
+        type1: Optional['ColumnType.Type'],
+        type2: Optional['ColumnType.Type'],
         # we need to pass this in because we can't easily append it as a class member
         common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
     ) -> Optional['ColumnType.Type']:
@@ -93,6 +93,9 @@ class ColumnType:
         self._type = t
         self._nullable = nullable
 
+    def has_supertype(self) -> bool:
+        return True
+
     @property
     def nullable(self) -> bool:
         return self._nullable
@@ -271,8 +274,10 @@ class ColumnType:
                 inferred_type = val_type
             else:
                 inferred_type = inferred_type.supertype(val_type)
-
-
+        if inferred_type is None:
+            return None
+        if not inferred_type.has_supertype():
+            return inferred_type
         return inferred_type
 
     @classmethod
@@ -397,12 +402,9 @@ class ColumnType:
     def _validate_file_path(self, val: Any) -> None:
         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
         if isinstance(val, str):
-
-            if
-
-            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
-            if not path.is_file():
-                raise TypeError(f'File not found: {str(path)}')
+            path = parse_local_file_path(val)
+            if path is not None and not path.is_file():
+                raise TypeError(f'File not found: {path}')
         else:
             if not isinstance(val, bytes):
                 raise TypeError(f'expected file path or bytes, got {type(val)}')
@@ -495,7 +497,7 @@ class InvalidType(ColumnType):
         super().__init__(self.Type.INVALID, nullable=nullable)
 
     def to_sa_type(self) -> sql.types.TypeEngine:
-
+        return sql.types.NullType()
 
     def print_value(self, val: Any) -> str:
         return str(val)
@@ -508,6 +510,9 @@ class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
@@ -591,6 +596,9 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP(timezone=True)
 
@@ -601,6 +609,8 @@
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
+        if isinstance(val, datetime.datetime):
+            return val
         return val
 
 
@@ -651,6 +661,10 @@ class JsonType(ColumnType):
             return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, tuple):
+            val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            val = val.model_dump()
         if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
         if self.__validator is not None:
@@ -818,14 +832,20 @@ class ArrayType(ColumnType):
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
+
         super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
         super_shape: Optional[tuple[Optional[int], ...]]
-        if len(self.shape) != len(other.shape):
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
             super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
@@ -1009,8 +1029,14 @@ class ImageType(ColumnType):
         return hash((self._type, self.nullable, self.size, self.mode))
 
     def supertype(self, other: ColumnType) -> Optional[ImageType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None
pixeltable/utils/__init__.py
CHANGED

@@ -1,3 +1,10 @@
+import hashlib
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from typing import Optional, Union
+
+
 def print_perf_counter_delta(delta: float) -> str:
     """Prints a performance counter delta in a human-readable format.
 
@@ -15,3 +22,37 @@ def print_perf_counter_delta(delta: float) -> str:
         return f'{delta * 1e3:.2f} ms'
     else:
         return f'{delta:.2f} s'
+
+
+def sha256sum(path: Union[Path, str]) -> str:
+    """
+    Compute the SHA256 hash of a file.
+    """
+    if isinstance(path, str):
+        path = Path(path)
+
+    h = hashlib.sha256()
+    with open(path, 'rb') as file:
+        while chunk := file.read(h.block_size):
+            h.update(chunk)
+
+    return h.hexdigest()
+
+
+def parse_local_file_path(file_or_url: str) -> Optional[Path]:
+    """
+    Parses a string that may be either a URL or a local file path.
+
+    If the string is a local file path or a file-scheme URL (file://), then a Path object will be returned.
+    Otherwise, None will be returned.
+    """
+    parsed = urllib.parse.urlparse(file_or_url)
+    if len(parsed.scheme) <= 1:
+        # We're using `urlparse` to help distinguish file paths from URLs. If there is no scheme, then it's a file path.
+        # If there's a single-character scheme, we also interpret this as a file path; this insures that drive letters
+        # on Windows pathnames are correctly handled.
+        return Path(file_or_url).absolute()
+    elif parsed.scheme == 'file':
+        return Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
+    else:
+        return None
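To illustrate the three branches of parse_local_file_path (the paths are hypothetical; parsing does not require the file to exist):

    from pathlib import Path

    from pixeltable.utils import parse_local_file_path

    # No scheme: treated as a local file path (returned absolute).
    assert parse_local_file_path('/tmp/frame.png') == Path('/tmp/frame.png')
    # file:// URL: converted back to a local path.
    assert parse_local_file_path('file:///tmp/frame.png') == Path('/tmp/frame.png')
    # Any other scheme (http, s3, ...): not a local file.
    assert parse_local_file_path('https://example.com/frame.png') is None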
pixeltable/utils/arrow.py
CHANGED

@@ -8,6 +8,8 @@ import pixeltable.type_system as ts
 
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
+    pa.large_string(): ts.StringType(nullable=True),
+    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
@@ -16,6 +18,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.int32(): ts.IntType(nullable=True),
     pa.int64(): ts.IntType(nullable=True),
     pa.float32(): ts.FloatType(nullable=True),
+    pa.float64(): ts.FloatType(nullable=True),
 }
 
 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
@@ -32,19 +35,20 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
 }
 
 
-def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
+def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
     if isinstance(arrow_type, pa.TimestampType):
-        return ts.TimestampType(nullable=
+        return ts.TimestampType(nullable=nullable)
     elif arrow_type in PA_TO_PXT_TYPES:
-
+        pt = PA_TO_PXT_TYPES[arrow_type]
+        return pt.copy(nullable=nullable)
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
-        dtype = to_pixeltable_type(arrow_type.value_type)
+        dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype)
+        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
     else:
         return None
 
@@ -61,8 +65,17 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
         return None
 
 
-def
-
+def ar_infer_schema(
+    arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
+    """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
+    ar_schema = {
+        field.name: to_pixeltable_type(field.type, field.name not in primary_key)
+        if field.name not in schema_overrides
+        else schema_overrides[field.name]
+        for field in arrow_schema
+    }
+    return ar_schema
 
 
 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
@@ -96,3 +109,23 @@ def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, An
 
     for i in range(batch_size):
         yield {col_name: values[i] for col_name, values in pydict.items()}
+
+
+def iter_tuples2(
+    batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+) -> Iterator[dict[str, Any]]:
+    """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
+    pydict = to_pydict(batch)
+    assert len(pydict) > 0, 'empty record batch'
+    for _, v in pydict.items():
+        batch_size = len(v)
+        break
+
+    for i in range(batch_size):
+        # Convert a row to insertable format
+        yield {
+            (pxt_name := col_name if col_mapping is None else col_mapping[col_name]): schema[pxt_name].create_literal(
+                values[i]
+            )
+            for col_name, values in pydict.items()
+        }
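A brief sketch of the revised to_pixeltable_type signature, which now threads nullability through explicitly instead of always yielding nullable types (assumes ColumnType value equality, consistent with the __hash__ definitions in type_system.py above):

    import pyarrow as pa

    import pixeltable.type_system as ts
    from pixeltable.utils.arrow import to_pixeltable_type

    # The nullable flag is applied via ColumnType.copy(nullable=...).
    assert to_pixeltable_type(pa.int64(), nullable=True) == ts.IntType(nullable=True)
    assert to_pixeltable_type(pa.large_string(), nullable=False) == ts.StringType(nullable=False)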