pixeltable 0.2.17__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +8 -7
- pixeltable/catalog/column.py +11 -8
- pixeltable/catalog/insertable_table.py +1 -1
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/table.py +20 -13
- pixeltable/catalog/table_version.py +91 -54
- pixeltable/catalog/table_version_path.py +7 -9
- pixeltable/catalog/view.py +2 -1
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +173 -83
- pixeltable/exec/aggregation_node.py +2 -1
- pixeltable/exec/component_iteration_node.py +1 -1
- pixeltable/exec/sql_node.py +11 -8
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -1
- pixeltable/exprs/column_property_ref.py +9 -7
- pixeltable/exprs/column_ref.py +2 -1
- pixeltable/exprs/comparison.py +10 -7
- pixeltable/exprs/compound_predicate.py +3 -2
- pixeltable/exprs/data_row.py +19 -4
- pixeltable/exprs/expr.py +46 -35
- pixeltable/exprs/expr_set.py +32 -9
- pixeltable/exprs/function_call.py +56 -32
- pixeltable/exprs/in_predicate.py +3 -2
- pixeltable/exprs/inline_array.py +2 -1
- pixeltable/exprs/inline_dict.py +2 -1
- pixeltable/exprs/is_null.py +3 -2
- pixeltable/exprs/json_mapper.py +5 -4
- pixeltable/exprs/json_path.py +7 -1
- pixeltable/exprs/literal.py +34 -7
- pixeltable/exprs/method_ref.py +3 -3
- pixeltable/exprs/object_ref.py +6 -5
- pixeltable/exprs/row_builder.py +25 -17
- pixeltable/exprs/rowid_ref.py +2 -1
- pixeltable/exprs/similarity_expr.py +2 -1
- pixeltable/exprs/sql_element_cache.py +30 -0
- pixeltable/exprs/type_cast.py +3 -3
- pixeltable/exprs/variable.py +2 -1
- pixeltable/ext/functions/whisperx.py +4 -4
- pixeltable/ext/functions/yolox.py +6 -6
- pixeltable/func/aggregate_function.py +1 -0
- pixeltable/func/function.py +28 -4
- pixeltable/functions/__init__.py +4 -2
- pixeltable/functions/anthropic.py +15 -5
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +6 -1
- pixeltable/functions/huggingface.py +2 -2
- pixeltable/functions/image.py +17 -2
- pixeltable/functions/json.py +5 -5
- pixeltable/functions/mistralai.py +188 -0
- pixeltable/functions/openai.py +6 -10
- pixeltable/functions/string.py +3 -2
- pixeltable/functions/timestamp.py +95 -7
- pixeltable/functions/together.py +4 -4
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +27 -17
- pixeltable/functions/whisper.py +1 -1
- pixeltable/io/hf_datasets.py +17 -15
- pixeltable/io/pandas.py +0 -2
- pixeltable/io/parquet.py +15 -14
- pixeltable/iterators/document.py +16 -15
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_19.py +46 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +5 -4
- pixeltable/plan.py +100 -78
- pixeltable/store.py +5 -1
- pixeltable/tool/create_test_db_dump.py +4 -3
- pixeltable/type_system.py +12 -14
- pixeltable/utils/documents.py +45 -42
- pixeltable/utils/formatter.py +2 -2
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/METADATA +79 -21
- pixeltable-0.2.18.dist-info/RECORD +147 -0
- pixeltable-0.2.17.dist-info/RECORD +0 -144
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/entry_points.txt +0 -0
pixeltable/io/parquet.py
CHANGED
|
@@ -7,7 +7,7 @@ import random
|
|
|
7
7
|
import typing
|
|
8
8
|
from collections import deque
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Dict, Optional
|
|
10
|
+
from typing import Dict, Optional, Any
|
|
11
11
|
|
|
12
12
|
import PIL.Image
|
|
13
13
|
import numpy as np
|
|
@@ -142,21 +142,22 @@ def import_parquet(
|
|
|
142
142
|
table_path: str,
|
|
143
143
|
*,
|
|
144
144
|
parquet_path: str,
|
|
145
|
-
|
|
146
|
-
**kwargs,
|
|
147
|
-
) -> pxt.
|
|
148
|
-
"""
|
|
145
|
+
schema_overrides: Optional[Dict[str, ts.ColumnType]] = None,
|
|
146
|
+
**kwargs: Any,
|
|
147
|
+
) -> pxt.Table:
|
|
148
|
+
"""Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
|
|
149
|
+
|
|
149
150
|
Args:
|
|
150
|
-
|
|
151
|
+
table_path: Path to the table.
|
|
151
152
|
parquet_path: Path to an individual Parquet file or directory of Parquet files.
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
153
|
+
schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
|
|
154
|
+
name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
|
|
155
|
+
`schema_overrides` should be the column names of the Parquet dataset (whether or not they are valid
|
|
156
|
+
Pixeltable identifiers).
|
|
156
157
|
kwargs: Additional arguments to pass to `create_table`.
|
|
157
158
|
|
|
158
159
|
Returns:
|
|
159
|
-
|
|
160
|
+
A handle to the newly created [`Table`][pixeltable.Table].
|
|
160
161
|
"""
|
|
161
162
|
import pixeltable as pxt
|
|
162
163
|
from pyarrow import parquet
|
|
@@ -166,10 +167,10 @@ def import_parquet(
|
|
|
166
167
|
parquet_dataset = parquet.ParquetDataset(input_path)
|
|
167
168
|
|
|
168
169
|
schema = parquet_schema_to_pixeltable_schema(parquet_path)
|
|
169
|
-
if
|
|
170
|
-
|
|
170
|
+
if schema_overrides is None:
|
|
171
|
+
schema_overrides = {}
|
|
171
172
|
|
|
172
|
-
schema.update(
|
|
173
|
+
schema.update(schema_overrides)
|
|
173
174
|
for k, v in schema.items():
|
|
174
175
|
if v is None:
|
|
175
176
|
raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
|
pixeltable/iterators/document.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
3
|
import logging
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Any, Iterable, Iterator, Optional
|
|
5
5
|
|
|
6
6
|
import ftfy
|
|
7
7
|
|
|
8
8
|
from pixeltable.env import Env
|
|
9
9
|
from pixeltable.exceptions import Error
|
|
10
|
-
from pixeltable.type_system import ColumnType, DocumentType,
|
|
10
|
+
from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
|
|
11
11
|
from pixeltable.utils.documents import get_document_handle
|
|
12
|
+
|
|
12
13
|
from .base import ComponentIterator
|
|
13
14
|
|
|
14
15
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -38,12 +39,12 @@ class DocumentSectionMetadata:
|
|
|
38
39
|
sourceline: Optional[int] = None
|
|
39
40
|
# the stack of headings up to the most recently observed one;
|
|
40
41
|
# eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
|
|
41
|
-
heading: Optional[
|
|
42
|
+
heading: Optional[dict[str, str]] = None
|
|
42
43
|
|
|
43
44
|
# pdf-specific metadata
|
|
44
45
|
page: Optional[int] = None
|
|
45
46
|
# bounding box as an {x1, y1, x2, y2} dictionary
|
|
46
|
-
bounding_box: Optional[
|
|
47
|
+
bounding_box: Optional[dict[str, float]] = None
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
@dataclasses.dataclass
|
|
@@ -53,7 +54,7 @@ class DocumentSection:
|
|
|
53
54
|
metadata: Optional[DocumentSectionMetadata]
|
|
54
55
|
|
|
55
56
|
|
|
56
|
-
def _parse_separators(separators: str) ->
|
|
57
|
+
def _parse_separators(separators: str) -> list[Separator]:
|
|
57
58
|
ret = []
|
|
58
59
|
for s in separators.split(','):
|
|
59
60
|
clean_s = s.strip().upper()
|
|
@@ -67,7 +68,7 @@ def _parse_separators(separators: str) -> List[Separator]:
|
|
|
67
68
|
return ret
|
|
68
69
|
|
|
69
70
|
|
|
70
|
-
def _parse_metadata(metadata: str) ->
|
|
71
|
+
def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
71
72
|
ret = []
|
|
72
73
|
for m in metadata.split(','):
|
|
73
74
|
clean_m = m.strip().upper()
|
|
@@ -161,7 +162,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
161
162
|
self._sections = self._char_chunks(self._sections)
|
|
162
163
|
|
|
163
164
|
@classmethod
|
|
164
|
-
def input_schema(cls) ->
|
|
165
|
+
def input_schema(cls) -> dict[str, ColumnType]:
|
|
165
166
|
return {
|
|
166
167
|
'document': DocumentType(nullable=False),
|
|
167
168
|
'separators': StringType(nullable=False),
|
|
@@ -174,7 +175,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
174
175
|
}
|
|
175
176
|
|
|
176
177
|
@classmethod
|
|
177
|
-
def output_schema(cls, *args: Any, **kwargs: Any) ->
|
|
178
|
+
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
|
|
178
179
|
schema = {'text': StringType()}
|
|
179
180
|
md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
|
|
180
181
|
|
|
@@ -208,7 +209,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
208
209
|
|
|
209
210
|
return schema, []
|
|
210
211
|
|
|
211
|
-
def __next__(self) ->
|
|
212
|
+
def __next__(self) -> dict[str, Any]:
|
|
212
213
|
while True:
|
|
213
214
|
section = next(self._sections)
|
|
214
215
|
if section.text is None:
|
|
@@ -236,7 +237,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
236
237
|
accumulated_text = [] # currently accumulated text
|
|
237
238
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
238
239
|
|
|
239
|
-
headings:
|
|
240
|
+
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
240
241
|
sourceline = 0 # most recently seen sourceline
|
|
241
242
|
|
|
242
243
|
def update_metadata(el: bs4.Tag) -> None:
|
|
@@ -250,7 +251,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
250
251
|
del headings[l]
|
|
251
252
|
headings[el.name] = el.get_text().strip()
|
|
252
253
|
|
|
253
|
-
def emit() ->
|
|
254
|
+
def emit() -> Iterator[DocumentSection]:
|
|
254
255
|
nonlocal accumulated_text, headings, sourceline
|
|
255
256
|
if len(accumulated_text) > 0:
|
|
256
257
|
md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
|
|
@@ -294,9 +295,9 @@ class DocumentSplitter(ComponentIterator):
|
|
|
294
295
|
# current state
|
|
295
296
|
accumulated_text = [] # currently accumulated text
|
|
296
297
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
297
|
-
headings:
|
|
298
|
+
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
298
299
|
|
|
299
|
-
def update_headings(heading:
|
|
300
|
+
def update_headings(heading: dict) -> None:
|
|
300
301
|
# update current state
|
|
301
302
|
nonlocal headings
|
|
302
303
|
assert 'type' in heading and heading['type'] == 'heading'
|
|
@@ -309,14 +310,14 @@ class DocumentSplitter(ComponentIterator):
|
|
|
309
310
|
del headings[l]
|
|
310
311
|
headings[level] = text
|
|
311
312
|
|
|
312
|
-
def emit() ->
|
|
313
|
+
def emit() -> Iterator[DocumentSection]:
|
|
313
314
|
nonlocal accumulated_text, headings
|
|
314
315
|
if len(accumulated_text) > 0:
|
|
315
316
|
metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
|
|
316
317
|
yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
|
|
317
318
|
accumulated_text = []
|
|
318
319
|
|
|
319
|
-
def process_element(el:
|
|
320
|
+
def process_element(el: dict) -> Iterator[DocumentSection]:
|
|
320
321
|
# process the element and emit sections as necessary
|
|
321
322
|
nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
|
|
322
323
|
assert 'type' in el
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 20
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
import sqlalchemy as sql
|
|
5
|
+
|
|
6
|
+
import pixeltable as pxt
|
|
7
|
+
from pixeltable.metadata import register_converter, schema
|
|
8
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register_converter(version=19)
|
|
12
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
13
|
+
# Convert all timestamp literals to aware datetimes
|
|
14
|
+
convert_table_md(engine, substitution_fn=__update_timestamp_literals)
|
|
15
|
+
|
|
16
|
+
# Convert all timestamp columns to TIMESTAMPTZ. (This conversion will take place in the database
|
|
17
|
+
# default time zone, which is what we want, since in versions <= 19 they were naive timestamps.)
|
|
18
|
+
with engine.begin() as conn:
|
|
19
|
+
tables = conn.execute(sql.select(schema.Table.id, schema.Table.md))
|
|
20
|
+
for id, md in tables:
|
|
21
|
+
store_prefix = 'view' if md['view_md'] is not None else 'tbl'
|
|
22
|
+
store_name = f'{store_prefix}_{id.hex}'
|
|
23
|
+
column_md = md['column_md']
|
|
24
|
+
timestamp_cols = [
|
|
25
|
+
col_id for col_id, col in column_md.items()
|
|
26
|
+
if col['col_type']['_classname'] == 'TimestampType'
|
|
27
|
+
]
|
|
28
|
+
for col_id in timestamp_cols:
|
|
29
|
+
conn.execute(
|
|
30
|
+
sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ')
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
|
|
35
|
+
if isinstance(v, dict) and 'val_t' in v:
|
|
36
|
+
# It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
|
|
37
|
+
# timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
|
|
38
|
+
# We convert it to an aware datetime, stored in UTC.
|
|
39
|
+
assert v['_classname'] == 'Literal'
|
|
40
|
+
assert v['val_t'] == pxt.ColumnType.Type.TIMESTAMP.name
|
|
41
|
+
assert isinstance(v['val'], str)
|
|
42
|
+
dt = datetime.datetime.fromisoformat(v['val'])
|
|
43
|
+
assert dt.tzinfo is None # In version 19 all timestamps are naive
|
|
44
|
+
dt_utc = dt.astimezone(datetime.timezone.utc)
|
|
45
|
+
v['val'] = dt_utc.isoformat()
|
|
46
|
+
return k, v
|
pixeltable/metadata/notes.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
3
|
# the unit tests when new versions are added.
|
|
4
4
|
VERSION_NOTES = {
|
|
5
|
+
20: 'Store DB timestamps in UTC',
|
|
5
6
|
19: 'UDF renames; ImageMemberAccess removal',
|
|
6
7
|
18: 'Restructured index metadata',
|
|
7
8
|
17: 'Renamed remotes to external_stores',
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -3,6 +3,7 @@ import uuid
|
|
|
3
3
|
from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
|
|
4
4
|
|
|
5
5
|
import sqlalchemy as sql
|
|
6
|
+
import sqlalchemy.orm as orm
|
|
6
7
|
from sqlalchemy import ForeignKey
|
|
7
8
|
from sqlalchemy import Integer, BigInteger, LargeBinary
|
|
8
9
|
from sqlalchemy.dialects.postgresql import UUID, JSONB
|
|
@@ -64,8 +65,8 @@ class DirMd:
|
|
|
64
65
|
class Dir(Base):
|
|
65
66
|
__tablename__ = 'dirs'
|
|
66
67
|
|
|
67
|
-
id =
|
|
68
|
-
parent_id =
|
|
68
|
+
id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
|
|
69
|
+
parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
|
|
69
70
|
md = sql.Column(JSONB, nullable=False)
|
|
70
71
|
|
|
71
72
|
|
|
@@ -163,8 +164,8 @@ class Table(Base):
|
|
|
163
164
|
|
|
164
165
|
MAX_VERSION = 9223372036854775807 # 2^63 - 1
|
|
165
166
|
|
|
166
|
-
id =
|
|
167
|
-
dir_id =
|
|
167
|
+
id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, nullable=False)
|
|
168
|
+
dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
|
|
168
169
|
md = sql.Column(JSONB, nullable=False) # TableMd
|
|
169
170
|
|
|
170
171
|
|