pixeltable 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +5 -3
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -0
- pixeltable/catalog/catalog.py +335 -128
- pixeltable/catalog/column.py +21 -5
- pixeltable/catalog/dir.py +19 -6
- pixeltable/catalog/insertable_table.py +34 -37
- pixeltable/catalog/named_function.py +0 -4
- pixeltable/catalog/schema_object.py +28 -42
- pixeltable/catalog/table.py +195 -158
- pixeltable/catalog/table_version.py +187 -232
- pixeltable/catalog/table_version_handle.py +50 -0
- pixeltable/catalog/table_version_path.py +49 -33
- pixeltable/catalog/view.py +56 -96
- pixeltable/config.py +103 -0
- pixeltable/dataframe.py +90 -90
- pixeltable/env.py +98 -168
- pixeltable/exec/aggregation_node.py +5 -4
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/component_iteration_node.py +13 -9
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +0 -4
- pixeltable/exec/exec_node.py +3 -2
- pixeltable/exec/expr_eval/schedulers.py +2 -1
- pixeltable/exec/in_memory_data_node.py +9 -4
- pixeltable/exec/row_update_node.py +1 -2
- pixeltable/exec/sql_node.py +20 -16
- pixeltable/exprs/column_ref.py +9 -9
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +4 -4
- pixeltable/exprs/expr.py +20 -5
- pixeltable/exprs/function_call.py +98 -58
- pixeltable/exprs/json_mapper.py +25 -8
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/object_ref.py +16 -5
- pixeltable/exprs/row_builder.py +15 -15
- pixeltable/exprs/rowid_ref.py +21 -7
- pixeltable/func/__init__.py +1 -1
- pixeltable/func/function.py +38 -6
- pixeltable/func/query_template_function.py +3 -6
- pixeltable/func/tools.py +26 -26
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/anthropic.py +9 -3
- pixeltable/functions/fireworks.py +7 -4
- pixeltable/functions/globals.py +4 -5
- pixeltable/functions/huggingface.py +1 -5
- pixeltable/functions/image.py +17 -7
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +4 -4
- pixeltable/functions/openai.py +26 -23
- pixeltable/functions/string.py +23 -30
- pixeltable/functions/timestamp.py +11 -6
- pixeltable/functions/together.py +14 -12
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +5 -4
- pixeltable/functions/vision.py +6 -9
- pixeltable/functions/whisper.py +3 -3
- pixeltable/globals.py +246 -260
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +1 -1
- pixeltable/index/btree.py +3 -1
- pixeltable/index/embedding_index.py +11 -5
- pixeltable/io/external_store.py +11 -12
- pixeltable/io/label_studio.py +4 -3
- pixeltable/io/parquet.py +57 -56
- pixeltable/iterators/__init__.py +4 -2
- pixeltable/iterators/audio.py +11 -11
- pixeltable/iterators/document.py +10 -10
- pixeltable/iterators/string.py +1 -2
- pixeltable/iterators/video.py +14 -15
- pixeltable/metadata/__init__.py +9 -5
- pixeltable/metadata/converters/convert_10.py +0 -1
- pixeltable/metadata/converters/convert_15.py +0 -2
- pixeltable/metadata/converters/convert_23.py +0 -2
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_27.py +0 -2
- pixeltable/metadata/converters/convert_28.py +0 -2
- pixeltable/metadata/converters/convert_29.py +7 -8
- pixeltable/metadata/converters/util.py +7 -7
- pixeltable/metadata/schema.py +27 -19
- pixeltable/plan.py +68 -40
- pixeltable/share/packager.py +12 -9
- pixeltable/store.py +37 -38
- pixeltable/type_system.py +41 -28
- pixeltable/utils/filecache.py +2 -1
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/METADATA +1 -1
- pixeltable-0.3.7.dist-info/RECORD +174 -0
- pixeltable-0.3.5.dist-info/RECORD +0 -172
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/entry_points.txt +0 -0
pixeltable/index/base.py
CHANGED
|
@@ -37,7 +37,7 @@ class IndexBase(abc.ABC):
|
|
|
37
37
|
pass
|
|
38
38
|
|
|
39
39
|
@abc.abstractmethod
|
|
40
|
-
def create_index(self, index_name: str, index_value_col: catalog.Column
|
|
40
|
+
def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
|
|
41
41
|
"""Create the index on the index value column"""
|
|
42
42
|
pass
|
|
43
43
|
|
pixeltable/index/btree.py
CHANGED
|
@@ -6,6 +6,7 @@ import sqlalchemy as sql
|
|
|
6
6
|
# import pixeltable.catalog as catalog
|
|
7
7
|
import pixeltable.exceptions as excs
|
|
8
8
|
from pixeltable import catalog, exprs
|
|
9
|
+
from pixeltable.env import Env
|
|
9
10
|
from pixeltable.func.udf import udf
|
|
10
11
|
|
|
11
12
|
from .base import IndexBase
|
|
@@ -52,9 +53,10 @@ class BtreeIndex(IndexBase):
|
|
|
52
53
|
"""Return the sqlalchemy type of the index value column"""
|
|
53
54
|
return self.value_expr.col_type.to_sa_type()
|
|
54
55
|
|
|
55
|
-
def create_index(self, index_name: str, index_value_col: 'catalog.Column'
|
|
56
|
+
def create_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
|
|
56
57
|
"""Create the index on the index value column"""
|
|
57
58
|
idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
|
|
59
|
+
conn = Env.get().conn
|
|
58
60
|
idx.create(bind=conn)
|
|
59
61
|
|
|
60
62
|
@classmethod
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
|
-
from typing import Any, Optional
|
|
4
|
+
from typing import Any, ClassVar, Optional
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pgvector.sqlalchemy # type: ignore[import-untyped]
|
|
@@ -11,6 +11,7 @@ import sqlalchemy as sql
|
|
|
11
11
|
import pixeltable.exceptions as excs
|
|
12
12
|
import pixeltable.type_system as ts
|
|
13
13
|
from pixeltable import catalog, exprs, func
|
|
14
|
+
from pixeltable.env import Env
|
|
14
15
|
|
|
15
16
|
from .base import IndexBase
|
|
16
17
|
|
|
@@ -31,7 +32,11 @@ class EmbeddingIndex(IndexBase):
|
|
|
31
32
|
IP = 2
|
|
32
33
|
L2 = 3
|
|
33
34
|
|
|
34
|
-
PGVECTOR_OPS
|
|
35
|
+
PGVECTOR_OPS: ClassVar[dict[Metric, str]] = {
|
|
36
|
+
Metric.COSINE: 'vector_cosine_ops',
|
|
37
|
+
Metric.IP: 'vector_ip_ops',
|
|
38
|
+
Metric.L2: 'vector_l2_ops',
|
|
39
|
+
}
|
|
35
40
|
|
|
36
41
|
metric: Metric
|
|
37
42
|
value_expr: exprs.FunctionCall
|
|
@@ -55,7 +60,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
55
60
|
if metric.lower() not in metric_names:
|
|
56
61
|
raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
|
|
57
62
|
if not c.col_type.is_string_type() and not c.col_type.is_image_type():
|
|
58
|
-
raise excs.Error(
|
|
63
|
+
raise excs.Error('Embedding index requires string or image column')
|
|
59
64
|
|
|
60
65
|
self.string_embed = None
|
|
61
66
|
self.image_embed = None
|
|
@@ -131,7 +136,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
131
136
|
"""Return the sqlalchemy type of the index value column"""
|
|
132
137
|
return self.index_col_type
|
|
133
138
|
|
|
134
|
-
def create_index(self, index_name: str, index_value_col: catalog.Column
|
|
139
|
+
def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
|
|
135
140
|
"""Create the index on the index value column"""
|
|
136
141
|
idx = sql.Index(
|
|
137
142
|
index_name,
|
|
@@ -140,6 +145,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
140
145
|
postgresql_with={'m': 16, 'ef_construction': 64},
|
|
141
146
|
postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
|
|
142
147
|
)
|
|
148
|
+
conn = Env.get().conn
|
|
143
149
|
idx.create(bind=conn)
|
|
144
150
|
|
|
145
151
|
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
|
|
@@ -219,7 +225,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
219
225
|
)
|
|
220
226
|
|
|
221
227
|
shape = return_type.shape
|
|
222
|
-
if len(shape) != 1 or shape[0]
|
|
228
|
+
if len(shape) != 1 or shape[0] is None:
|
|
223
229
|
raise excs.Error(
|
|
224
230
|
f'The function `{embed_fn.name}` is not a valid embedding: '
|
|
225
231
|
f'it must return a 1-dimensional array of a specific length, but returns {return_type}'
|
pixeltable/io/external_store.py
CHANGED
|
@@ -8,12 +8,10 @@ from dataclasses import dataclass
|
|
|
8
8
|
from typing import Any, Optional
|
|
9
9
|
from uuid import UUID
|
|
10
10
|
|
|
11
|
-
import sqlalchemy as sql
|
|
12
|
-
|
|
13
11
|
import pixeltable.exceptions as excs
|
|
14
12
|
import pixeltable.type_system as ts
|
|
15
13
|
from pixeltable import Column, Table
|
|
16
|
-
from pixeltable.catalog import TableVersion
|
|
14
|
+
from pixeltable.catalog import TableVersion, TableVersionHandle
|
|
17
15
|
|
|
18
16
|
_logger = logging.getLogger('pixeltable')
|
|
19
17
|
|
|
@@ -33,13 +31,13 @@ class ExternalStore(abc.ABC):
|
|
|
33
31
|
return self.__name
|
|
34
32
|
|
|
35
33
|
@abc.abstractmethod
|
|
36
|
-
def link(self, tbl_version: TableVersion
|
|
34
|
+
def link(self, tbl_version: TableVersion) -> None:
|
|
37
35
|
"""
|
|
38
36
|
Called by `TableVersion.link()` to implement store-specific logic.
|
|
39
37
|
"""
|
|
40
38
|
|
|
41
39
|
@abc.abstractmethod
|
|
42
|
-
def unlink(self, tbl_version: TableVersion
|
|
40
|
+
def unlink(self, tbl_version: TableVersion) -> None:
|
|
43
41
|
"""
|
|
44
42
|
Called by `TableVersion.unlink()` to implement store-specific logic.
|
|
45
43
|
"""
|
|
@@ -94,7 +92,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
94
92
|
def get_local_columns(self) -> list[Column]:
|
|
95
93
|
return list(self.col_mapping.keys())
|
|
96
94
|
|
|
97
|
-
def link(self, tbl_version: TableVersion
|
|
95
|
+
def link(self, tbl_version: TableVersion) -> None:
|
|
98
96
|
# All of the media columns being linked need to either be stored computed columns, or else have stored proxies.
|
|
99
97
|
# This ensures that the media in those columns resides in the media store.
|
|
100
98
|
# First determine which columns (if any) need stored proxies, but don't have one yet.
|
|
@@ -110,6 +108,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
110
108
|
if col not in self.stored_proxies:
|
|
111
109
|
# We didn't find it in an existing Project
|
|
112
110
|
stored_proxies_needed.append(col)
|
|
111
|
+
|
|
113
112
|
if len(stored_proxies_needed) > 0:
|
|
114
113
|
_logger.info(f'Creating stored proxies for columns: {[col.name for col in stored_proxies_needed]}')
|
|
115
114
|
# Create stored proxies for columns that need one. Increment the schema version
|
|
@@ -119,12 +118,12 @@ class Project(ExternalStore, abc.ABC):
|
|
|
119
118
|
tbl_version.schema_version = tbl_version.version
|
|
120
119
|
proxy_cols = [self.create_stored_proxy(tbl_version, col) for col in stored_proxies_needed]
|
|
121
120
|
# Add the columns; this will also update table metadata.
|
|
122
|
-
tbl_version._add_columns(proxy_cols,
|
|
121
|
+
tbl_version._add_columns(proxy_cols, print_stats=False, on_error='ignore')
|
|
123
122
|
# We don't need to retain `UpdateStatus` since the stored proxies are intended to be
|
|
124
123
|
# invisible to the user.
|
|
125
|
-
tbl_version._update_md(time.time(),
|
|
124
|
+
tbl_version._update_md(time.time(), preceding_schema_version=preceding_schema_version)
|
|
126
125
|
|
|
127
|
-
def unlink(self, tbl_version: TableVersion
|
|
126
|
+
def unlink(self, tbl_version: TableVersion) -> None:
|
|
128
127
|
# Determine which stored proxies can be deleted. (A stored proxy can be deleted if it is not referenced by
|
|
129
128
|
# any *other* external store for this table.)
|
|
130
129
|
deletions_needed: set[Column] = set(self.stored_proxies.values())
|
|
@@ -139,7 +138,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
139
138
|
tbl_version.schema_version = tbl_version.version
|
|
140
139
|
tbl_version._drop_columns(deletions_needed)
|
|
141
140
|
self.stored_proxies.clear()
|
|
142
|
-
tbl_version._update_md(time.time(),
|
|
141
|
+
tbl_version._update_md(time.time(), preceding_schema_version=preceding_schema_version)
|
|
143
142
|
|
|
144
143
|
def create_stored_proxy(self, tbl_version: TableVersion, col: Column) -> Column:
|
|
145
144
|
"""
|
|
@@ -163,7 +162,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
163
162
|
sa_col_type=col.col_type.to_sa_type(),
|
|
164
163
|
schema_version_add=tbl_version.schema_version,
|
|
165
164
|
)
|
|
166
|
-
proxy_col.tbl = tbl_version
|
|
165
|
+
proxy_col.tbl = TableVersionHandle(tbl_version.id, tbl_version.effective_version, tbl_version=tbl_version)
|
|
167
166
|
tbl_version.next_col_id += 1
|
|
168
167
|
self.stored_proxies[col] = proxy_col
|
|
169
168
|
return proxy_col
|
|
@@ -279,7 +278,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
279
278
|
|
|
280
279
|
tbl_id = UUID(d['tbl_id'])
|
|
281
280
|
col_id = d['col_id']
|
|
282
|
-
return Catalog.get().
|
|
281
|
+
return Catalog.get().get_tbl_version(tbl_id, None).cols_by_id[col_id]
|
|
283
282
|
|
|
284
283
|
|
|
285
284
|
@dataclass(frozen=True)
|
pixeltable/io/label_studio.py
CHANGED
|
@@ -15,6 +15,7 @@ import pixeltable as pxt
|
|
|
15
15
|
import pixeltable.env as env
|
|
16
16
|
import pixeltable.exceptions as excs
|
|
17
17
|
from pixeltable import Column, Table
|
|
18
|
+
from pixeltable.config import Config
|
|
18
19
|
from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
19
20
|
from pixeltable.io.external_store import Project, SyncStatus
|
|
20
21
|
from pixeltable.utils import coco
|
|
@@ -356,7 +357,7 @@ class LabelStudioProject(Project):
|
|
|
356
357
|
@classmethod
|
|
357
358
|
def __localpath_to_lspath(cls, localpath: str) -> str:
|
|
358
359
|
# Transform the local path into Label Studio's bespoke path format.
|
|
359
|
-
relpath = Path(localpath).relative_to(
|
|
360
|
+
relpath = Path(localpath).relative_to(Config.get().home)
|
|
360
361
|
return f'/data/local-files/?d={str(relpath)}'
|
|
361
362
|
|
|
362
363
|
def __delete_stale_tasks(
|
|
@@ -410,7 +411,7 @@ class LabelStudioProject(Project):
|
|
|
410
411
|
# batch_update on the actual ancestor table that holds the annotations column.
|
|
411
412
|
# TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
|
|
412
413
|
ancestor = t
|
|
413
|
-
while local_annotations_col not in ancestor._tbl_version.cols:
|
|
414
|
+
while local_annotations_col not in ancestor._tbl_version.get().cols:
|
|
414
415
|
assert ancestor._base is not None
|
|
415
416
|
ancestor = ancestor._base
|
|
416
417
|
update_status = ancestor.batch_update(updates)
|
|
@@ -618,7 +619,7 @@ class LabelStudioProject(Project):
|
|
|
618
619
|
|
|
619
620
|
if media_import_method == 'file':
|
|
620
621
|
# We need to set up a local storage connection to receive media files
|
|
621
|
-
os.environ['LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT'] = str(
|
|
622
|
+
os.environ['LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT'] = str(Config.get().home)
|
|
622
623
|
try:
|
|
623
624
|
project.connect_local_import_storage(local_store_path=str(env.Env.get().media_dir))
|
|
624
625
|
except HTTPError as exc:
|
pixeltable/io/parquet.py
CHANGED
|
@@ -90,63 +90,64 @@ def export_parquet(
|
|
|
90
90
|
current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
|
|
91
91
|
current_byte_estimate = 0
|
|
92
92
|
|
|
93
|
-
|
|
94
|
-
for
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
93
|
+
with Env.get().begin_xact():
|
|
94
|
+
for data_row in df._exec():
|
|
95
|
+
for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
|
|
96
|
+
val = data_row[e.slot_idx]
|
|
97
|
+
if val is None:
|
|
98
|
+
current_value_batch[col_name].append(val)
|
|
99
|
+
continue
|
|
100
|
+
|
|
101
|
+
assert val is not None
|
|
102
|
+
if col_type.is_image_type():
|
|
103
|
+
# images get inlined into the parquet file
|
|
104
|
+
if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
|
|
105
|
+
# if there is a file, read directly to preserve information
|
|
106
|
+
with open(data_row.file_paths[e.slot_idx], 'rb') as f:
|
|
107
|
+
val = f.read()
|
|
108
|
+
elif isinstance(val, PIL.Image.Image):
|
|
109
|
+
# if no file available, eg. bc it is computed, convert to png
|
|
110
|
+
buf = io.BytesIO()
|
|
111
|
+
val.save(buf, format='PNG')
|
|
112
|
+
val = buf.getvalue()
|
|
113
|
+
else:
|
|
114
|
+
assert False, f'unknown image type {type(val)}'
|
|
115
|
+
length = len(val)
|
|
116
|
+
elif col_type.is_string_type():
|
|
117
|
+
length = len(val)
|
|
118
|
+
elif col_type.is_video_type():
|
|
119
|
+
if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
|
|
120
|
+
val = data_row.file_paths[e.slot_idx]
|
|
121
|
+
else:
|
|
122
|
+
assert False, f'unknown video type {type(val)}'
|
|
123
|
+
length = len(val)
|
|
124
|
+
elif col_type.is_json_type():
|
|
125
|
+
val = json.dumps(val)
|
|
126
|
+
length = len(val)
|
|
127
|
+
elif col_type.is_array_type():
|
|
128
|
+
length = val.nbytes
|
|
129
|
+
elif col_type.is_int_type():
|
|
130
|
+
length = 8
|
|
131
|
+
elif col_type.is_float_type():
|
|
132
|
+
length = 8
|
|
133
|
+
elif col_type.is_bool_type():
|
|
134
|
+
length = 1
|
|
135
|
+
elif col_type.is_timestamp_type():
|
|
136
|
+
val = val.astimezone(datetime.timezone.utc)
|
|
137
|
+
length = 8
|
|
120
138
|
else:
|
|
121
|
-
assert False, f'unknown
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
length = 1
|
|
134
|
-
elif col_type.is_timestamp_type():
|
|
135
|
-
val = val.astimezone(datetime.timezone.utc)
|
|
136
|
-
length = 8
|
|
137
|
-
else:
|
|
138
|
-
assert False, f'unknown type {col_type} for {col_name}'
|
|
139
|
-
|
|
140
|
-
current_value_batch[col_name].append(val)
|
|
141
|
-
current_byte_estimate += length
|
|
142
|
-
if current_byte_estimate > partition_size_bytes:
|
|
143
|
-
assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
|
|
144
|
-
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
145
|
-
batch_num += 1
|
|
146
|
-
current_value_batch = {k: deque() for k in df.schema.keys()}
|
|
147
|
-
current_byte_estimate = 0
|
|
148
|
-
|
|
149
|
-
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
139
|
+
assert False, f'unknown type {col_type} for {col_name}'
|
|
140
|
+
|
|
141
|
+
current_value_batch[col_name].append(val)
|
|
142
|
+
current_byte_estimate += length
|
|
143
|
+
if current_byte_estimate > partition_size_bytes:
|
|
144
|
+
assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
|
|
145
|
+
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
146
|
+
batch_num += 1
|
|
147
|
+
current_value_batch = {k: deque() for k in df.schema.keys()}
|
|
148
|
+
current_byte_estimate = 0
|
|
149
|
+
|
|
150
|
+
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
150
151
|
|
|
151
152
|
|
|
152
153
|
def import_parquet(
|
pixeltable/iterators/__init__.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# ruff: noqa: F401
|
|
2
|
+
|
|
1
3
|
from .audio import AudioSplitter
|
|
2
4
|
from .base import ComponentIterator
|
|
3
5
|
from .document import DocumentSplitter
|
|
@@ -5,9 +7,9 @@ from .image import TileIterator
|
|
|
5
7
|
from .string import StringSplitter
|
|
6
8
|
from .video import FrameIterator
|
|
7
9
|
|
|
8
|
-
__default_dir =
|
|
10
|
+
__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
|
|
9
11
|
__removed_symbols = {'base', 'document', 'video'}
|
|
10
|
-
__all__ = sorted(
|
|
12
|
+
__all__ = sorted(__default_dir - __removed_symbols)
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
def __dir__():
|
pixeltable/iterators/audio.py
CHANGED
|
@@ -1,15 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import math
|
|
3
2
|
import uuid
|
|
4
3
|
from fractions import Fraction
|
|
5
4
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Optional
|
|
5
|
+
from typing import Any, ClassVar, Optional
|
|
7
6
|
|
|
8
7
|
import av
|
|
9
8
|
|
|
10
|
-
import
|
|
11
|
-
import pixeltable.exceptions as excs
|
|
12
|
-
import pixeltable.type_system as ts
|
|
9
|
+
from pixeltable import env, exceptions as excs, type_system as ts
|
|
13
10
|
|
|
14
11
|
from .base import ComponentIterator
|
|
15
12
|
|
|
@@ -18,7 +15,8 @@ _logger = logging.getLogger('pixeltable')
|
|
|
18
15
|
|
|
19
16
|
class AudioSplitter(ComponentIterator):
|
|
20
17
|
"""
|
|
21
|
-
Iterator over chunks of an audio file. The audio file is split into smaller chunks,
|
|
18
|
+
Iterator over chunks of an audio file. The audio file is split into smaller chunks,
|
|
19
|
+
where the duration of each chunk is determined by chunk_duration_sec.
|
|
22
20
|
The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
|
|
23
21
|
If the input contains no audio, no chunks are yielded.
|
|
24
22
|
|
|
@@ -39,11 +37,11 @@ class AudioSplitter(ComponentIterator):
|
|
|
39
37
|
|
|
40
38
|
# List of chunks to extract
|
|
41
39
|
# Each chunk is defined by start and end presentation timestamps in audio file (int)
|
|
42
|
-
chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
|
|
40
|
+
chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
|
|
43
41
|
# next chunk to extract
|
|
44
42
|
next_pos: int
|
|
45
43
|
|
|
46
|
-
__codec_map = {
|
|
44
|
+
__codec_map: ClassVar[dict[str, str]] = {
|
|
47
45
|
'mp3': 'mp3', # MP3 decoder -> mp3/libmp3lame encoder
|
|
48
46
|
'mp3float': 'mp3', # MP3float decoder -> mp3 encoder
|
|
49
47
|
'aac': 'aac', # AAC decoder -> AAC encoder
|
|
@@ -88,7 +86,8 @@ class AudioSplitter(ComponentIterator):
|
|
|
88
86
|
)
|
|
89
87
|
]
|
|
90
88
|
_logger.debug(
|
|
91
|
-
f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts}
|
|
89
|
+
f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} '
|
|
90
|
+
f'chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
|
|
92
91
|
)
|
|
93
92
|
|
|
94
93
|
@classmethod
|
|
@@ -155,7 +154,7 @@ class AudioSplitter(ComponentIterator):
|
|
|
155
154
|
try:
|
|
156
155
|
frame = next(self.container.decode(audio=0))
|
|
157
156
|
except EOFError as e:
|
|
158
|
-
raise excs.Error(f
|
|
157
|
+
raise excs.Error(f"Failed to read audio file '{self.audio_path}': {e}") from e
|
|
159
158
|
except StopIteration:
|
|
160
159
|
# no more frames to scan
|
|
161
160
|
break
|
|
@@ -163,7 +162,8 @@ class AudioSplitter(ComponentIterator):
|
|
|
163
162
|
# Current frame is behind chunk's start time, always get frame next to chunk's start time
|
|
164
163
|
continue
|
|
165
164
|
if frame.pts >= target_chunk_end:
|
|
166
|
-
# Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away
|
|
165
|
+
# Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away
|
|
166
|
+
# the current frame
|
|
167
167
|
break
|
|
168
168
|
frame_end = frame.pts + frame.samples
|
|
169
169
|
if frame_count == 0:
|
pixeltable/iterators/document.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Any, Iterable, Iterator, Optional, Union
|
|
4
|
+
from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
|
|
5
5
|
|
|
6
6
|
import ftfy
|
|
7
7
|
|
|
@@ -96,7 +96,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
96
96
|
Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
|
|
97
97
|
"""
|
|
98
98
|
|
|
99
|
-
METADATA_COLUMN_TYPES = {
|
|
99
|
+
METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
|
|
100
100
|
ChunkMetadata.TITLE: StringType(nullable=True),
|
|
101
101
|
ChunkMetadata.HEADING: JsonType(nullable=True),
|
|
102
102
|
ChunkMetadata.SOURCELINE: IntType(nullable=True),
|
|
@@ -164,7 +164,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
164
164
|
assert self._doc_handle.txt_doc is not None
|
|
165
165
|
self._sections = self._txt_sections()
|
|
166
166
|
else:
|
|
167
|
-
|
|
167
|
+
raise AssertionError(f'Unsupported document format: {self._doc_handle.format}')
|
|
168
168
|
|
|
169
169
|
if Separator.SENTENCE in self._separators:
|
|
170
170
|
self._sections = self._sentence_sections(self._sections)
|
|
@@ -215,7 +215,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
215
215
|
|
|
216
216
|
# check dependencies at the end
|
|
217
217
|
if Separator.SENTENCE in separators:
|
|
218
|
-
Env.get().
|
|
218
|
+
_ = Env.get().spacy_nlp
|
|
219
219
|
if Separator.TOKEN_LIMIT in separators:
|
|
220
220
|
Env.get().require_package('tiktoken')
|
|
221
221
|
|
|
@@ -259,9 +259,9 @@ class DocumentSplitter(ComponentIterator):
|
|
|
259
259
|
sourceline = el.sourceline
|
|
260
260
|
if el.name in _HTML_HEADINGS:
|
|
261
261
|
# remove the previously seen lower levels
|
|
262
|
-
lower_levels = [
|
|
263
|
-
for
|
|
264
|
-
del headings[
|
|
262
|
+
lower_levels = [lv for lv in headings if lv > el.name]
|
|
263
|
+
for lv in lower_levels:
|
|
264
|
+
del headings[lv]
|
|
265
265
|
headings[el.name] = el.get_text().strip()
|
|
266
266
|
|
|
267
267
|
def emit() -> Iterator[DocumentSection]:
|
|
@@ -320,9 +320,9 @@ class DocumentSplitter(ComponentIterator):
|
|
|
320
320
|
level = f'h{lint}'
|
|
321
321
|
text = heading['children'][0]['raw'].strip()
|
|
322
322
|
# remove the previously seen lower levels
|
|
323
|
-
lower_levels = [
|
|
324
|
-
for
|
|
325
|
-
del headings[
|
|
323
|
+
lower_levels = [lv for lv in headings if lv > level]
|
|
324
|
+
for lv in lower_levels:
|
|
325
|
+
del headings[lv]
|
|
326
326
|
headings[level] = text
|
|
327
327
|
|
|
328
328
|
def emit() -> Iterator[DocumentSection]:
|
pixeltable/iterators/string.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from typing import Any, Iterator
|
|
2
2
|
|
|
3
|
-
import
|
|
4
|
-
import pixeltable.type_system as ts
|
|
3
|
+
from pixeltable import exceptions as excs, type_system as ts
|
|
5
4
|
from pixeltable.env import Env
|
|
6
5
|
from pixeltable.iterators.base import ComponentIterator
|
|
7
6
|
|
pixeltable/iterators/video.py
CHANGED
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
import math
|
|
3
3
|
from fractions import Fraction
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Optional
|
|
5
|
+
from typing import Any, Optional
|
|
6
6
|
|
|
7
7
|
import av
|
|
8
8
|
import pandas as pd
|
|
@@ -91,21 +91,20 @@ class FrameIterator(ComponentIterator):
|
|
|
91
91
|
self.frames_to_extract = None
|
|
92
92
|
else:
|
|
93
93
|
spacing = float(self.video_frame_count) / float(num_frames)
|
|
94
|
-
self.frames_to_extract =
|
|
94
|
+
self.frames_to_extract = [round(i * spacing) for i in range(num_frames)]
|
|
95
95
|
assert len(self.frames_to_extract) == num_frames
|
|
96
|
+
elif fps is None or fps == 0.0:
|
|
97
|
+
# Extract all frames
|
|
98
|
+
self.frames_to_extract = None
|
|
99
|
+
elif fps > float(self.video_framerate):
|
|
100
|
+
raise excs.Error(
|
|
101
|
+
f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
|
|
102
|
+
)
|
|
96
103
|
else:
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
raise excs.Error(
|
|
102
|
-
f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
|
|
103
|
-
)
|
|
104
|
-
else:
|
|
105
|
-
# Extract frames at the implied frequency
|
|
106
|
-
freq = fps / float(self.video_framerate)
|
|
107
|
-
n = math.ceil(self.video_frame_count * freq) # number of frames to extract
|
|
108
|
-
self.frames_to_extract = list(round(i / freq) for i in range(n))
|
|
104
|
+
# Extract frames at the implied frequency
|
|
105
|
+
freq = fps / float(self.video_framerate)
|
|
106
|
+
n = math.ceil(self.video_frame_count * freq) # number of frames to extract
|
|
107
|
+
self.frames_to_extract = [round(i / freq) for i in range(n)]
|
|
109
108
|
|
|
110
109
|
_logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
|
|
111
110
|
self.next_pos = 0
|
|
@@ -149,7 +148,7 @@ class FrameIterator(ComponentIterator):
|
|
|
149
148
|
try:
|
|
150
149
|
frame = next(self.container.decode(video=0))
|
|
151
150
|
except EOFError:
|
|
152
|
-
raise StopIteration
|
|
151
|
+
raise StopIteration from None
|
|
153
152
|
# Compute the index of the current frame in the video based on the presentation timestamp (pts);
|
|
154
153
|
# this ensures we have a canonical understanding of frame index, regardless of how we got here
|
|
155
154
|
# (seek or iteration)
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -1,14 +1,20 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import importlib
|
|
3
|
+
import logging
|
|
3
4
|
import os
|
|
4
5
|
import pkgutil
|
|
5
6
|
from typing import Callable
|
|
6
7
|
|
|
7
8
|
import sqlalchemy as sql
|
|
8
|
-
|
|
9
|
+
from sqlalchemy import orm
|
|
10
|
+
|
|
11
|
+
from pixeltable.utils.console_output import ConsoleLogger
|
|
9
12
|
|
|
10
13
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
14
|
|
|
15
|
+
_console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
16
|
+
|
|
17
|
+
|
|
12
18
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
19
|
VERSION = 30
|
|
14
20
|
|
|
@@ -30,7 +36,6 @@ converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
|
|
|
30
36
|
|
|
31
37
|
def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
|
|
32
38
|
def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
|
|
33
|
-
global converter_cbs
|
|
34
39
|
assert version not in converter_cbs
|
|
35
40
|
converter_cbs[version] = fn
|
|
36
41
|
|
|
@@ -53,9 +58,8 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
|
|
|
53
58
|
while md_version < VERSION:
|
|
54
59
|
if md_version not in converter_cbs:
|
|
55
60
|
raise RuntimeError(f'No metadata converter for version {md_version}')
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
Env.get().console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
|
|
61
|
+
# We can't use the console logger in Env, because Env might not have been initialized yet.
|
|
62
|
+
_console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
|
|
59
63
|
converter_cbs[md_version](engine)
|
|
60
64
|
md_version += 1
|
|
61
65
|
# update system info
|
|
@@ -13,4 +13,3 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
13
13
|
conn.execute(sql.update(Table).values(md=Table.md - 'parameters'))
|
|
14
14
|
# Add `table_attrs` to all instances of tableschemaversions.md.
|
|
15
15
|
conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
|
|
16
|
-
return
|
|
@@ -5,8 +5,6 @@ from typing import Any
|
|
|
5
5
|
import cloudpickle # type: ignore[import-untyped]
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
|
|
8
|
-
import pixeltable.func as func
|
|
9
|
-
import pixeltable.type_system as ts
|
|
10
8
|
from pixeltable.metadata import register_converter
|
|
11
9
|
from pixeltable.metadata.schema import Function
|
|
12
10
|
|
|
@@ -1,12 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, Optional
|
|
3
2
|
from uuid import UUID
|
|
4
3
|
|
|
5
4
|
import sqlalchemy as sql
|
|
6
5
|
|
|
7
6
|
from pixeltable.metadata import register_converter
|
|
8
7
|
from pixeltable.metadata.converters.util import convert_table_md
|
|
9
|
-
from pixeltable.metadata.schema import Table
|
|
10
8
|
|
|
11
9
|
_logger = logging.getLogger('pixeltable')
|
|
12
10
|
|
|
@@ -19,11 +19,11 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
|
|
|
19
19
|
isinstance(v, dict)
|
|
20
20
|
and '_classpath' in v
|
|
21
21
|
and v['_classpath']
|
|
22
|
-
in
|
|
22
|
+
in {
|
|
23
23
|
'pixeltable.func.callable_function.CallableFunction',
|
|
24
24
|
'pixeltable.func.aggregate_function.AggregateFunction',
|
|
25
25
|
'pixeltable.func.expr_template_function.ExprTemplateFunction',
|
|
26
|
-
|
|
26
|
+
}
|
|
27
27
|
):
|
|
28
28
|
if 'path' in v:
|
|
29
29
|
assert 'signature' not in v
|
|
@@ -50,6 +50,6 @@ def __substitute_path(path: str) -> str:
|
|
|
50
50
|
# versions, it's necessary to resolve the function symbol to get the signature. The following
|
|
51
51
|
# adjustment is necessary for function names that are stored in db artifacts of version < 25, but
|
|
52
52
|
# have changed in some version > 25.
|
|
53
|
-
if path in
|
|
53
|
+
if path in {'pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image'}:
|
|
54
54
|
return 'pixeltable.functions.huggingface.clip'
|
|
55
55
|
return path
|