pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
|
@@ -8,9 +8,11 @@ import urllib.parse
|
|
|
8
8
|
import urllib.request
|
|
9
9
|
from dataclasses import dataclass, field, fields
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal,
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, cast
|
|
12
12
|
|
|
13
|
+
import numpy as np
|
|
13
14
|
import pandas as pd
|
|
15
|
+
import PIL
|
|
14
16
|
from pyarrow.parquet import ParquetDataset
|
|
15
17
|
|
|
16
18
|
import pixeltable as pxt
|
|
@@ -47,16 +49,16 @@ class TableDataConduitFormat(str, enum.Enum):
|
|
|
47
49
|
|
|
48
50
|
@dataclass
|
|
49
51
|
class TableDataConduit:
|
|
50
|
-
source: TableDataSource
|
|
51
|
-
source_format:
|
|
52
|
-
source_column_map:
|
|
52
|
+
source: 'TableDataSource'
|
|
53
|
+
source_format: str | None = None
|
|
54
|
+
source_column_map: dict[str, str] | None = None
|
|
53
55
|
if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
|
|
54
|
-
pxt_schema:
|
|
55
|
-
src_schema_overrides:
|
|
56
|
-
src_schema:
|
|
57
|
-
pxt_pk:
|
|
58
|
-
src_pk:
|
|
59
|
-
valid_rows:
|
|
56
|
+
pxt_schema: dict[str, ts.ColumnType] | None = None
|
|
57
|
+
src_schema_overrides: dict[str, ts.ColumnType] | None = None
|
|
58
|
+
src_schema: dict[str, ts.ColumnType] | None = None
|
|
59
|
+
pxt_pk: list[str] | None = None
|
|
60
|
+
src_pk: list[str] | None = None
|
|
61
|
+
valid_rows: RowData | None = None
|
|
60
62
|
extra_fields: dict[str, Any] = field(default_factory=dict)
|
|
61
63
|
|
|
62
64
|
reqd_col_names: set[str] = field(default_factory=set)
|
|
@@ -87,7 +89,7 @@ class TableDataConduit:
|
|
|
87
89
|
for name, coltype in self.pxt_schema.items():
|
|
88
90
|
self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
|
|
89
91
|
|
|
90
|
-
def infer_schema(self) -> dict[str,
|
|
92
|
+
def infer_schema(self) -> dict[str, ts.ColumnType]:
|
|
91
93
|
raise NotImplementedError
|
|
92
94
|
|
|
93
95
|
def valid_row_batch(self) -> Iterator[RowData]:
|
|
@@ -101,7 +103,7 @@ class TableDataConduit:
|
|
|
101
103
|
def add_table_info(self, table: pxt.Table) -> None:
|
|
102
104
|
"""Add information about the table into which we are inserting data"""
|
|
103
105
|
assert isinstance(table, pxt.Table)
|
|
104
|
-
self.pxt_schema = table.
|
|
106
|
+
self.pxt_schema = table._get_schema()
|
|
105
107
|
self.pxt_pk = table._tbl_version.get().primary_key
|
|
106
108
|
for col in table._tbl_version_path.columns():
|
|
107
109
|
if col.is_required_for_insert:
|
|
@@ -137,7 +139,7 @@ class DFTableDataConduit(TableDataConduit):
|
|
|
137
139
|
t.pxt_df = tds.source
|
|
138
140
|
return t
|
|
139
141
|
|
|
140
|
-
def infer_schema(self) -> dict[str,
|
|
142
|
+
def infer_schema(self) -> dict[str, ts.ColumnType]:
|
|
141
143
|
self.pxt_schema = self.pxt_df.schema
|
|
142
144
|
self.pxt_pk = self.src_pk
|
|
143
145
|
return self.pxt_schema
|
|
@@ -149,7 +151,7 @@ class DFTableDataConduit(TableDataConduit):
|
|
|
149
151
|
|
|
150
152
|
|
|
151
153
|
class RowDataTableDataConduit(TableDataConduit):
|
|
152
|
-
raw_rows:
|
|
154
|
+
raw_rows: RowData | None = None
|
|
153
155
|
disable_mapping: bool = True
|
|
154
156
|
batch_count: int = 0
|
|
155
157
|
|
|
@@ -168,7 +170,7 @@ class RowDataTableDataConduit(TableDataConduit):
|
|
|
168
170
|
t.batch_count = 0
|
|
169
171
|
return t
|
|
170
172
|
|
|
171
|
-
def infer_schema(self) -> dict[str,
|
|
173
|
+
def infer_schema(self) -> dict[str, ts.ColumnType]:
|
|
172
174
|
from .datarows import _infer_schema_from_rows
|
|
173
175
|
|
|
174
176
|
if self.source_column_map is None:
|
|
@@ -239,7 +241,7 @@ class PandasTableDataConduit(TableDataConduit):
|
|
|
239
241
|
t.batch_count = 0
|
|
240
242
|
return t
|
|
241
243
|
|
|
242
|
-
def infer_schema_part1(self) -> tuple[dict[str,
|
|
244
|
+
def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
243
245
|
"""Return inferred schema, inferred primary key, and source column map"""
|
|
244
246
|
if self.source_column_map is None:
|
|
245
247
|
if self.src_schema_overrides is None:
|
|
@@ -252,7 +254,7 @@ class PandasTableDataConduit(TableDataConduit):
|
|
|
252
254
|
else:
|
|
253
255
|
raise NotImplementedError()
|
|
254
256
|
|
|
255
|
-
def infer_schema(self) -> dict[str,
|
|
257
|
+
def infer_schema(self) -> dict[str, ts.ColumnType]:
|
|
256
258
|
self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
|
|
257
259
|
self.normalize_pxt_schema_types()
|
|
258
260
|
_df_check_primary_key_values(self.pd_df, self.src_pk)
|
|
@@ -325,10 +327,13 @@ class JsonTableDataConduit(TableDataConduit):
|
|
|
325
327
|
|
|
326
328
|
|
|
327
329
|
class HFTableDataConduit(TableDataConduit):
|
|
328
|
-
|
|
329
|
-
|
|
330
|
+
"""
|
|
331
|
+
TODO:
|
|
332
|
+
- use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
|
|
333
|
+
"""
|
|
334
|
+
|
|
335
|
+
column_name_for_split: str | None = None
|
|
330
336
|
categorical_features: dict[str, dict[int, str]]
|
|
331
|
-
hf_schema: dict[str, Any] = None
|
|
332
337
|
dataset_dict: dict[str, datasets.Dataset] = None
|
|
333
338
|
hf_schema_source: dict[str, Any] = None
|
|
334
339
|
|
|
@@ -340,9 +345,19 @@ class HFTableDataConduit(TableDataConduit):
|
|
|
340
345
|
import datasets
|
|
341
346
|
|
|
342
347
|
assert isinstance(tds.source, (datasets.Dataset, datasets.DatasetDict))
|
|
343
|
-
t.hf_ds = tds.source
|
|
344
348
|
if 'column_name_for_split' in t.extra_fields:
|
|
345
349
|
t.column_name_for_split = t.extra_fields['column_name_for_split']
|
|
350
|
+
|
|
351
|
+
# make sure we get numpy arrays for arrays, not Python lists
|
|
352
|
+
source = tds.source.with_format(type='numpy')
|
|
353
|
+
if isinstance(source, datasets.Dataset):
|
|
354
|
+
# when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
|
|
355
|
+
raw_name = source.split._name
|
|
356
|
+
split_name = raw_name.split('[')[0] if raw_name is not None else None
|
|
357
|
+
t.dataset_dict = {split_name: source}
|
|
358
|
+
else:
|
|
359
|
+
assert isinstance(source, datasets.DatasetDict)
|
|
360
|
+
t.dataset_dict = source
|
|
346
361
|
return t
|
|
347
362
|
|
|
348
363
|
@classmethod
|
|
@@ -356,13 +371,13 @@ class HFTableDataConduit(TableDataConduit):
|
|
|
356
371
|
except ImportError:
|
|
357
372
|
return False
|
|
358
373
|
|
|
359
|
-
def infer_schema_part1(self) -> tuple[dict[str,
|
|
374
|
+
def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
360
375
|
from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
|
|
361
376
|
|
|
362
377
|
if self.source_column_map is None:
|
|
363
378
|
if self.src_schema_overrides is None:
|
|
364
379
|
self.src_schema_overrides = {}
|
|
365
|
-
self.hf_schema_source = _get_hf_schema(self.
|
|
380
|
+
self.hf_schema_source = _get_hf_schema(self.source)
|
|
366
381
|
self.src_schema = huggingface_schema_to_pxt_schema(
|
|
367
382
|
self.hf_schema_source, self.src_schema_overrides, self.src_pk
|
|
368
383
|
)
|
|
@@ -397,15 +412,6 @@ class HFTableDataConduit(TableDataConduit):
|
|
|
397
412
|
def prepare_insert(self) -> None:
|
|
398
413
|
import datasets
|
|
399
414
|
|
|
400
|
-
if isinstance(self.source, datasets.Dataset):
|
|
401
|
-
# when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
|
|
402
|
-
raw_name = self.source.split._name
|
|
403
|
-
split_name = raw_name.split('[')[0] if raw_name is not None else None
|
|
404
|
-
self.dataset_dict = {split_name: self.source}
|
|
405
|
-
else:
|
|
406
|
-
assert isinstance(self.source, datasets.DatasetDict)
|
|
407
|
-
self.dataset_dict = self.source
|
|
408
|
-
|
|
409
415
|
# extract all class labels from the dataset to translate category ints to strings
|
|
410
416
|
self.categorical_features = {
|
|
411
417
|
feature_name: feature_type.names
|
|
@@ -416,26 +422,44 @@ class HFTableDataConduit(TableDataConduit):
|
|
|
416
422
|
self.source_column_map = {}
|
|
417
423
|
self.check_source_columns_are_insertable(self.hf_schema_source.keys())
|
|
418
424
|
|
|
419
|
-
def _translate_row(self, row: dict[str, Any], split_name: str) -> dict[str, Any]:
|
|
425
|
+
def _translate_row(self, row: dict[str, Any], split_name: str, features: datasets.Features) -> dict[str, Any]:
|
|
420
426
|
output_row: dict[str, Any] = {}
|
|
421
427
|
for col_name, val in row.items():
|
|
422
428
|
# translate category ints to strings
|
|
423
429
|
new_val = self.categorical_features[col_name][val] if col_name in self.categorical_features else val
|
|
424
430
|
mapped_col_name = self.source_column_map.get(col_name, col_name)
|
|
425
431
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
checked_val = self.pxt_schema[mapped_col_name].create_literal(new_val)
|
|
429
|
-
except TypeError as e:
|
|
430
|
-
msg = str(e)
|
|
431
|
-
raise excs.Error(f'Error in column {col_name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
|
|
432
|
-
output_row[mapped_col_name] = checked_val
|
|
432
|
+
new_val = self._translate_val(new_val, features[col_name])
|
|
433
|
+
output_row[mapped_col_name] = new_val
|
|
433
434
|
|
|
434
435
|
# add split name to output row
|
|
435
436
|
if self.column_name_for_split is not None:
|
|
436
437
|
output_row[self.column_name_for_split] = split_name
|
|
437
438
|
return output_row
|
|
438
439
|
|
|
440
|
+
def _translate_val(self, val: Any, feature: datasets.Feature) -> Any:
|
|
441
|
+
"""Convert numpy scalars to Python types and images to PIL.Image.Image"""
|
|
442
|
+
import datasets
|
|
443
|
+
|
|
444
|
+
if isinstance(feature, datasets.Value):
|
|
445
|
+
if isinstance(val, (np.generic, np.ndarray)):
|
|
446
|
+
# a scalar, which we want as a standard Python type
|
|
447
|
+
assert np.ndim(val) == 0
|
|
448
|
+
return val.item()
|
|
449
|
+
else:
|
|
450
|
+
# a standard Python object
|
|
451
|
+
return val
|
|
452
|
+
elif isinstance(feature, datasets.Sequence):
|
|
453
|
+
assert np.ndim(val) > 0
|
|
454
|
+
return val
|
|
455
|
+
elif isinstance(feature, datasets.Image):
|
|
456
|
+
return PIL.Image.fromarray(val)
|
|
457
|
+
elif isinstance(feature, dict):
|
|
458
|
+
assert isinstance(val, dict)
|
|
459
|
+
return {k: self._translate_val(v, feature[k]) for k, v in val.items()}
|
|
460
|
+
else:
|
|
461
|
+
return val
|
|
462
|
+
|
|
439
463
|
def valid_row_batch(self) -> Iterator[RowData]:
|
|
440
464
|
for split_name, split_dataset in self.dataset_dict.items():
|
|
441
465
|
num_batches = split_dataset.size_in_bytes / self._K_BATCH_SIZE_BYTES
|
|
@@ -444,7 +468,7 @@ class HFTableDataConduit(TableDataConduit):
|
|
|
444
468
|
|
|
445
469
|
batch = []
|
|
446
470
|
for row in split_dataset:
|
|
447
|
-
batch.append(self._translate_row(row, split_name))
|
|
471
|
+
batch.append(self._translate_row(row, split_name, split_dataset.features))
|
|
448
472
|
if len(batch) >= tuples_per_batch:
|
|
449
473
|
yield batch
|
|
450
474
|
batch = []
|
|
@@ -454,7 +478,7 @@ class HFTableDataConduit(TableDataConduit):
|
|
|
454
478
|
|
|
455
479
|
|
|
456
480
|
class ParquetTableDataConduit(TableDataConduit):
|
|
457
|
-
pq_ds:
|
|
481
|
+
pq_ds: ParquetDataset | None = None
|
|
458
482
|
|
|
459
483
|
@classmethod
|
|
460
484
|
def from_tds(cls, tds: TableDataConduit) -> 'ParquetTableDataConduit':
|
|
@@ -469,13 +493,13 @@ class ParquetTableDataConduit(TableDataConduit):
|
|
|
469
493
|
t.pq_ds = parquet.ParquetDataset(str(input_path))
|
|
470
494
|
return t
|
|
471
495
|
|
|
472
|
-
def infer_schema_part1(self) -> tuple[dict[str,
|
|
473
|
-
from pixeltable.utils.arrow import
|
|
496
|
+
def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
497
|
+
from pixeltable.utils.arrow import to_pxt_schema
|
|
474
498
|
|
|
475
499
|
if self.source_column_map is None:
|
|
476
500
|
if self.src_schema_overrides is None:
|
|
477
501
|
self.src_schema_overrides = {}
|
|
478
|
-
self.src_schema =
|
|
502
|
+
self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
|
|
479
503
|
inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
|
|
480
504
|
self.src_schema, self.src_pk, self.src_schema_overrides
|
|
481
505
|
)
|
|
@@ -483,7 +507,7 @@ class ParquetTableDataConduit(TableDataConduit):
|
|
|
483
507
|
else:
|
|
484
508
|
raise NotImplementedError()
|
|
485
509
|
|
|
486
|
-
def infer_schema(self) -> dict[str,
|
|
510
|
+
def infer_schema(self) -> dict[str, ts.ColumnType]:
|
|
487
511
|
self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
|
|
488
512
|
self.normalize_pxt_schema_types()
|
|
489
513
|
self.prepare_insert()
|
|
@@ -504,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
|
|
|
504
528
|
from pixeltable.utils.arrow import iter_tuples2
|
|
505
529
|
|
|
506
530
|
try:
|
|
507
|
-
for fragment in self.pq_ds.fragments:
|
|
531
|
+
for fragment in self.pq_ds.fragments:
|
|
508
532
|
for batch in fragment.to_batches():
|
|
509
533
|
dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
|
|
510
534
|
self.total_rows += len(dict_batch)
|
pixeltable/io/utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from keyword import iskeyword as is_python_keyword
|
|
2
|
-
from typing import Any
|
|
2
|
+
from typing import Any
|
|
3
3
|
|
|
4
4
|
import pixeltable as pxt
|
|
5
5
|
import pixeltable.exceptions as excs
|
|
@@ -21,7 +21,7 @@ def normalize_pxt_col_name(name: str) -> str:
|
|
|
21
21
|
return id
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
def normalize_primary_key_parameter(primary_key:
|
|
24
|
+
def normalize_primary_key_parameter(primary_key: str | list[str] | None = None) -> list[str]:
|
|
25
25
|
if primary_key is None:
|
|
26
26
|
primary_key = []
|
|
27
27
|
elif isinstance(primary_key, str):
|
|
@@ -40,7 +40,7 @@ def normalize_schema_names(
|
|
|
40
40
|
primary_key: list[str],
|
|
41
41
|
schema_overrides: dict[str, Any],
|
|
42
42
|
require_valid_pxt_column_names: bool = False,
|
|
43
|
-
) -> tuple[dict[str, Any], list[str],
|
|
43
|
+
) -> tuple[dict[str, Any], list[str], dict[str, str] | None]:
|
|
44
44
|
"""
|
|
45
45
|
Convert all names in the input schema from source names to valid Pixeltable identifiers
|
|
46
46
|
- Ensure that all names are unique.
|
pixeltable/iterators/__init__.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
"""Iterators for splitting media and documents into components."""
|
|
1
2
|
# ruff: noqa: F401
|
|
2
3
|
|
|
3
4
|
from .audio import AudioSplitter
|
|
@@ -5,7 +6,7 @@ from .base import ComponentIterator
|
|
|
5
6
|
from .document import DocumentSplitter
|
|
6
7
|
from .image import TileIterator
|
|
7
8
|
from .string import StringSplitter
|
|
8
|
-
from .video import FrameIterator
|
|
9
|
+
from .video import FrameIterator, VideoSplitter
|
|
9
10
|
|
|
10
11
|
__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
|
|
11
12
|
__removed_symbols = {'base', 'document', 'video'}
|
pixeltable/iterators/audio.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import uuid
|
|
3
2
|
from fractions import Fraction
|
|
4
3
|
from pathlib import Path
|
|
5
|
-
from typing import Any, ClassVar
|
|
4
|
+
from typing import Any, ClassVar
|
|
6
5
|
|
|
7
6
|
import av
|
|
8
7
|
|
|
9
|
-
from pixeltable import
|
|
8
|
+
from pixeltable import exceptions as excs, type_system as ts
|
|
9
|
+
from pixeltable.utils.local_store import TempStore
|
|
10
10
|
|
|
11
11
|
from .base import ComponentIterator
|
|
12
12
|
|
|
@@ -37,7 +37,7 @@ class AudioSplitter(ComponentIterator):
|
|
|
37
37
|
|
|
38
38
|
# List of chunks to extract
|
|
39
39
|
# Each chunk is defined by start and end presentation timestamps in audio file (int)
|
|
40
|
-
chunks_to_extract_in_pts:
|
|
40
|
+
chunks_to_extract_in_pts: list[tuple[int, int]] | None
|
|
41
41
|
# next chunk to extract
|
|
42
42
|
next_pos: int
|
|
43
43
|
|
|
@@ -55,12 +55,9 @@ class AudioSplitter(ComponentIterator):
|
|
|
55
55
|
def __init__(
|
|
56
56
|
self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
|
|
57
57
|
):
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
|
|
62
|
-
if overlap_sec >= chunk_duration_sec:
|
|
63
|
-
raise excs.Error('overlap_sec must be less than chunk_duration_sec')
|
|
58
|
+
assert chunk_duration_sec > 0.0
|
|
59
|
+
assert chunk_duration_sec >= min_chunk_duration_sec
|
|
60
|
+
assert overlap_sec < chunk_duration_sec
|
|
64
61
|
audio_path = Path(audio)
|
|
65
62
|
assert audio_path.exists() and audio_path.is_file()
|
|
66
63
|
self.audio_path = audio_path
|
|
@@ -128,6 +125,19 @@ class AudioSplitter(ComponentIterator):
|
|
|
128
125
|
|
|
129
126
|
@classmethod
|
|
130
127
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
128
|
+
param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
|
|
129
|
+
params = dict(zip(param_names, args))
|
|
130
|
+
params.update(kwargs)
|
|
131
|
+
|
|
132
|
+
chunk_duration_sec = params['chunk_duration_sec']
|
|
133
|
+
min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
|
|
134
|
+
overlap_sec = params.get('overlap_sec', 0.0)
|
|
135
|
+
if chunk_duration_sec <= 0.0:
|
|
136
|
+
raise excs.Error('chunk_duration_sec must be a positive number')
|
|
137
|
+
if chunk_duration_sec < min_chunk_duration_sec:
|
|
138
|
+
raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
|
|
139
|
+
if overlap_sec >= chunk_duration_sec:
|
|
140
|
+
raise excs.Error('overlap_sec must be less than chunk_duration_sec')
|
|
131
141
|
return {
|
|
132
142
|
'start_time_sec': ts.FloatType(),
|
|
133
143
|
'end_time_sec': ts.FloatType(),
|
|
@@ -140,7 +150,7 @@ class AudioSplitter(ComponentIterator):
|
|
|
140
150
|
target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
|
|
141
151
|
chunk_start_pts = 0
|
|
142
152
|
chunk_end_pts = 0
|
|
143
|
-
chunk_file = str(
|
|
153
|
+
chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
|
|
144
154
|
output_container = av.open(chunk_file, mode='w')
|
|
145
155
|
input_stream = self.container.streams.audio[0]
|
|
146
156
|
codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
|