pixeltable 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

pixeltable/__init__.py CHANGED
@@ -30,7 +30,7 @@ from .globals import (
30
30
  from .type_system import Array, Audio, Bool, Date, Document, Float, Image, Int, Json, Required, String, Timestamp, Video
31
31
 
32
32
  # This import must go last to avoid circular imports.
33
- from . import ext, functions, io, iterators # isort: skip
33
+ from . import functions, io, iterators # isort: skip
34
34
 
35
35
  # This is the safest / most maintainable way to construct __all__: start with the default and "blacklist"
36
36
  # stuff that we don't want in there. (Using a "whitelist" is considerably harder to maintain.)
@@ -2,13 +2,17 @@ from __future__ import annotations
2
2
 
3
3
  import enum
4
4
  import logging
5
- from typing import TYPE_CHECKING, Any, Literal, Optional, overload
5
+ from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, cast, overload
6
6
  from uuid import UUID
7
7
 
8
+ import pydantic
9
+ import pydantic_core
10
+
8
11
  import pixeltable as pxt
9
12
  from pixeltable import exceptions as excs, type_system as ts
10
13
  from pixeltable.env import Env
11
14
  from pixeltable.utils.filecache import FileCache
15
+ from pixeltable.utils.pydantic import is_json_convertible
12
16
 
13
17
  from .globals import MediaValidation
14
18
  from .table import Table
@@ -137,8 +141,24 @@ class InsertableTable(Table):
137
141
  from pixeltable.catalog import Catalog
138
142
  from pixeltable.io.table_data_conduit import UnkTableDataConduit
139
143
 
144
+ if source is not None and isinstance(source, Sequence) and len(source) == 0:
145
+ raise excs.Error('Cannot insert an empty sequence')
146
+ fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
147
+
140
148
  with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
141
149
  table = self
150
+
151
+ # TODO: unify with TableDataConduit
152
+ if source is not None and isinstance(source, Sequence) and isinstance(source[0], pydantic.BaseModel):
153
+ status = self._insert_pydantic(
154
+ cast(Sequence[pydantic.BaseModel], source), # needed for mypy
155
+ print_stats=print_stats,
156
+ fail_on_exception=fail_on_exception,
157
+ )
158
+ Env.get().console_logger.info(status.insert_msg)
159
+ FileCache.get().emit_eviction_warnings()
160
+ return status
161
+
142
162
  if source is None:
143
163
  source = [kwargs]
144
164
  kwargs = None
@@ -154,7 +174,6 @@ class InsertableTable(Table):
154
174
  data_source.add_table_info(table)
155
175
  data_source.prepare_for_insert_into_table()
156
176
 
157
- fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
158
177
  return table.insert_table_data_source(
159
178
  data_source=data_source, fail_on_exception=fail_on_exception, print_stats=print_stats
160
179
  )
@@ -184,32 +203,110 @@ class InsertableTable(Table):
184
203
  FileCache.get().emit_eviction_warnings()
185
204
  return status
186
205
 
187
- def _validate_input_rows(self, rows: list[dict[str, Any]]) -> None:
188
- """Verify that the input rows match the table schema"""
189
- valid_col_names = set(self._get_schema().keys())
190
- reqd_col_names = set(self._tbl_version_path.tbl_version.get().get_required_col_names())
191
- computed_col_names = set(self._tbl_version_path.tbl_version.get().get_computed_col_names())
192
- for row in rows:
193
- assert isinstance(row, dict)
194
- col_names = set(row.keys())
195
- if len(reqd_col_names - col_names) > 0:
196
- raise excs.Error(f'Missing required column(s) ({", ".join(reqd_col_names - col_names)}) in row {row}')
197
-
198
- for col_name, val in row.items():
199
- if col_name not in valid_col_names:
200
- raise excs.Error(f'Unknown column name {col_name!r} in row {row}')
201
- if col_name in computed_col_names:
202
- raise excs.Error(f'Value for computed column {col_name!r} in row {row}')
203
-
204
- # validate data
205
- col = self._tbl_version_path.get_column(col_name)
206
- try:
207
- # basic sanity checks here
208
- checked_val = col.col_type.create_literal(val)
209
- row[col_name] = checked_val
210
- except TypeError as e:
211
- msg = str(e)
212
- raise excs.Error(f'Error in column {col.name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
206
+ def _insert_pydantic(
207
+ self, rows: Sequence[pydantic.BaseModel], print_stats: bool = False, fail_on_exception: bool = True
208
+ ) -> UpdateStatus:
209
+ model_class = type(rows[0])
210
+ self._validate_pydantic_model(model_class)
211
+ # convert rows one-by-one in order to be able to print meaningful error messages
212
+ pxt_rows: list[dict[str, Any]] = []
213
+ for i, row in enumerate(rows):
214
+ try:
215
+ pxt_rows.append(row.model_dump(mode='json'))
216
+ except pydantic_core.PydanticSerializationError as e:
217
+ raise excs.Error(f'Row {i}: error serializing pydantic model to JSON:\n{e!s}') from e
218
+
219
+ # explicitly check that all required columns are present and non-None in the rows,
220
+ # because we ignore nullability when validating the pydantic model
221
+ reqd_col_names = [col.name for col in self._tbl_version_path.columns() if col.is_required_for_insert]
222
+ for i, pxt_row in enumerate(pxt_rows):
223
+ if type(rows[i]) is not model_class:
224
+ raise excs.Error(
225
+ f'Expected {model_class.__name__!r} instance, got {type(rows[i]).__name__!r} (in row {i})'
226
+ )
227
+ for col_name in reqd_col_names:
228
+ if pxt_row.get(col_name) is None:
229
+ raise excs.Error(f'Missing required column {col_name!r} in row {i}')
230
+
231
+ status = self._tbl_version.get().insert(
232
+ rows=pxt_rows, df=None, print_stats=print_stats, fail_on_exception=fail_on_exception
233
+ )
234
+ return status
235
+
236
+ def _validate_pydantic_model(self, model: type[pydantic.BaseModel]) -> None:
237
+ """
238
+ Check if a Pydantic model is compatible with this table for insert operations.
239
+
240
+ A model is compatible if:
241
+ - All required table columns have corresponding model fields with compatible types
242
+ - Model does not define fields for computed columns
243
+ - Model field types are compatible with table column types
244
+ """
245
+ assert isinstance(model, type) and issubclass(model, pydantic.BaseModel)
246
+
247
+ schema = self._get_schema()
248
+ required_cols = set(self._tbl_version.get().get_required_col_names())
249
+ computed_cols = set(self._tbl_version.get().get_computed_col_names())
250
+ model_fields = model.model_fields
251
+ model_field_names = set(model_fields.keys())
252
+
253
+ missing_required = required_cols - model_field_names
254
+ if missing_required:
255
+ raise excs.Error(
256
+ f'Pydantic model {model.__name__!r} is missing required columns: '
257
+ f'{", ".join(f"{col_name!r}" for col_name in missing_required)}'
258
+ )
259
+
260
+ computed_in_model = computed_cols & model_field_names
261
+ if computed_in_model:
262
+ raise excs.Error(
263
+ f'Pydantic model {model.__name__!r} has fields for computed columns: '
264
+ f'{", ".join(f"{col_name!r}" for col_name in computed_in_model)}'
265
+ )
266
+
267
+ # validate type compatibility
268
+ common_fields = model_field_names & set(schema.keys())
269
+ if len(common_fields) == 0:
270
+ raise excs.Error(
271
+ f'Pydantic model {model.__name__!r} has no fields that map to columns in table {self._name!r}'
272
+ )
273
+ for field_name in common_fields:
274
+ pxt_col_type = schema[field_name]
275
+ model_field = model_fields[field_name]
276
+ model_type = model_field.annotation
277
+
278
+ # we ignore nullability: we want to accept optional model fields for required table columns, as long as
279
+ # the model instances provide a non-null value
280
+ # infer_pydantic_json=True: model_dump(mode='json') converts enums to their values
281
+ inferred_pxt_type = ts.ColumnType.from_python_type(model_type, infer_pydantic_json=True)
282
+ if inferred_pxt_type is None:
283
+ raise excs.Error(
284
+ f'Pydantic model {model.__name__!r}: cannot infer Pixeltable type for column {field_name!r}'
285
+ )
286
+
287
+ if pxt_col_type.is_media_type():
288
+ # media types require file paths, either as str or Path
289
+ if not inferred_pxt_type.is_string_type():
290
+ raise excs.Error(
291
+ f"Column {field_name!r} requires a 'str' or 'Path' field in {model.__name__!r}, but it is "
292
+ f'{model_type.__name__!r}'
293
+ )
294
+ else:
295
+ if not pxt_col_type.is_supertype_of(inferred_pxt_type, ignore_nullable=True):
296
+ raise excs.Error(
297
+ f'Pydantic model {model.__name__!r} has incompatible type ({model_type.__name__}) '
298
+ f'for column {field_name!r} ({pxt_col_type})'
299
+ )
300
+
301
+ if (
302
+ isinstance(model_type, type)
303
+ and issubclass(model_type, pydantic.BaseModel)
304
+ and not is_json_convertible(model_type)
305
+ ):
306
+ raise excs.Error(
307
+ f'Pydantic model {model.__name__!r} has field {field_name!r} with nested model '
308
+ f'{model_type.__name__!r}, which is not JSON-convertible'
309
+ )
213
310
 
214
311
  def delete(self, where: Optional['exprs.Expr'] = None) -> UpdateStatus:
215
312
  """Delete rows in this table.
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import abc
4
4
  import builtins
5
+ import datetime
5
6
  import json
6
7
  import logging
7
8
  from keyword import iskeyword as is_python_keyword
@@ -9,7 +10,6 @@ from pathlib import Path
9
10
  from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Optional, TypedDict, overload
10
11
 
11
12
  from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
12
- import datetime
13
13
  from uuid import UUID
14
14
 
15
15
  import pandas as pd
@@ -1356,6 +1356,15 @@ class Table(SchemaObject):
1356
1356
  Insert rows from a CSV file:
1357
1357
 
1358
1358
  >>> tbl.insert(source='path/to/file.csv')
1359
+
1360
+ Insert Pydantic model instances into a table with two `pxt.Int` columns `a` and `b`:
1361
+
1362
+ >>> class MyModel(pydantic.BaseModel):
1363
+ ... a: int
1364
+ ... b: int
1365
+ ...
1366
+ >>> models = [MyModel(a=1, b=2), MyModel(a=3, b=4)]
1367
+ >>> tbl.insert(models)
1359
1368
  """
1360
1369
  raise NotImplementedError
1361
1370
 
pixeltable/config.py CHANGED
@@ -167,6 +167,7 @@ KNOWN_CONFIG_OPTIONS = {
167
167
  'deepseek': {'api_key': 'Deepseek API key', 'rate_limit': 'Rate limit for Deepseek API requests'},
168
168
  'fireworks': {'api_key': 'Fireworks API key', 'rate_limit': 'Rate limit for Fireworks API requests'},
169
169
  'gemini': {'api_key': 'Gemini API key', 'rate_limits': 'Per-model rate limits for Gemini API requests'},
170
+ 'hf': {'auth_token': 'Hugging Face access token'},
170
171
  'imagen': {'rate_limits': 'Per-model rate limits for Imagen API requests'},
171
172
  'veo': {'rate_limits': 'Per-model rate limits for Veo API requests'},
172
173
  'groq': {'api_key': 'Groq API key', 'rate_limit': 'Rate limit for Groq API requests'},
pixeltable/env.py CHANGED
@@ -27,6 +27,7 @@ import nest_asyncio # type: ignore[import-untyped]
27
27
  import pixeltable_pgserver
28
28
  import sqlalchemy as sql
29
29
  from pillow_heif import register_heif_opener # type: ignore[import-untyped]
30
+ from tenacity import retry, stop_after_attempt, wait_exponential_jitter
30
31
  from tqdm import TqdmWarning
31
32
 
32
33
  from pixeltable import exceptions as excs
@@ -104,10 +105,14 @@ class Env:
104
105
  cls._instance._clean_up()
105
106
  cls._instance = None
106
107
  env = Env()
107
- env._set_up(reinit_db=reinit_db)
108
- env._upgrade_metadata()
109
- cls._instance = env
110
- cls.__initializing = False
108
+ try:
109
+ env._set_up(reinit_db=reinit_db)
110
+ env._upgrade_metadata()
111
+ cls._instance = env
112
+ finally:
113
+ # Reset the initializing flag, even if setup fails.
114
+ # This prevents the environment from being left in a broken state.
115
+ cls.__initializing = False
111
116
 
112
117
  def __init__(self) -> None:
113
118
  assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
@@ -500,14 +505,24 @@ class Env:
500
505
  assert self._db_url is not None
501
506
  assert self._db_name is not None
502
507
 
508
+ @retry(
509
+ stop=stop_after_attempt(3), # Stop after 3 attempts
510
+ wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2), # Exponential backoff with jitter
511
+ )
503
512
  def _init_metadata(self) -> None:
504
513
  """
505
514
  Create pixeltable metadata tables and system metadata.
506
515
  This is an idempotent operation.
516
+
517
+ Retry logic handles race conditions when multiple Pixeltable processes
518
+ attempt to initialize metadata tables simultaneously. The first process may succeed
519
+ in creating tables while others encounter database constraints (e.g., "table already exists").
520
+ Exponential backoff with jitter reduces contention between competing processes.
507
521
  """
508
522
  assert self._sa_engine is not None
509
523
  from pixeltable import metadata
510
524
 
525
+ self._logger.debug('Creating pixeltable metadata')
511
526
  metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
512
527
  metadata.create_system_info(self._sa_engine)
513
528
 
@@ -26,6 +26,8 @@ from . import (
26
26
  video,
27
27
  vision,
28
28
  whisper,
29
+ whisperx,
30
+ yolox,
29
31
  )
30
32
  from .globals import count, map, max, mean, min, sum
31
33
 
@@ -0,0 +1,177 @@
1
+ from typing import TYPE_CHECKING, Any, Optional
2
+
3
+ import numpy as np
4
+
5
+ import pixeltable as pxt
6
+ from pixeltable.config import Config
7
+ from pixeltable.functions.util import resolve_torch_device
8
+ from pixeltable.utils.code import local_public_names
9
+
10
+ if TYPE_CHECKING:
11
+ from transformers import Wav2Vec2Model
12
+ from whisperx.asr import FasterWhisperPipeline # type: ignore[import-untyped]
13
+ from whisperx.diarize import DiarizationPipeline # type: ignore[import-untyped]
14
+
15
+
16
+ @pxt.udf
17
+ def transcribe(
18
+ audio: pxt.Audio,
19
+ *,
20
+ model: str,
21
+ diarize: bool = False,
22
+ compute_type: Optional[str] = None,
23
+ language: Optional[str] = None,
24
+ task: Optional[str] = None,
25
+ chunk_size: Optional[int] = None,
26
+ alignment_model_name: Optional[str] = None,
27
+ interpolate_method: Optional[str] = None,
28
+ return_char_alignments: Optional[bool] = None,
29
+ diarization_model_name: Optional[str] = None,
30
+ num_speakers: Optional[int] = None,
31
+ min_speakers: Optional[int] = None,
32
+ max_speakers: Optional[int] = None,
33
+ ) -> dict:
34
+ """
35
+ Transcribe an audio file using WhisperX.
36
+
37
+ This UDF runs a transcription model _locally_ using the WhisperX library,
38
+ equivalent to the WhisperX `transcribe` function, as described in the
39
+ [WhisperX library documentation](https://github.com/m-bain/whisperX).
40
+
41
+ If `diarize=True`, then speaker diarization will also be performed. Several of the UDF parameters are only valid if
42
+ `diarize=True`, as documented in the parameters list below.
43
+
44
+ __Requirements:__
45
+
46
+ - `pip install whisperx`
47
+
48
+ Args:
49
+ audio: The audio file to transcribe.
50
+ model: The name of the model to use for transcription.
51
+ diarize: Whether to perform speaker diarization.
52
+ compute_type: The compute type to use for the model (e.g., `'int8'`, `'float16'`). If `None`,
53
+ defaults to `'float16'` on CUDA devices and `'int8'` otherwise.
54
+ language: The language code for the transcription (e.g., `'en'` for English).
55
+ task: The task to perform (e.g., `'transcribe'` or `'translate'`). Defaults to `'transcribe'`.
56
+ chunk_size: The size of the audio chunks to process, in seconds. Defaults to `30`.
57
+ alignment_model_name: The name of the alignment model to use. If `None`, uses the default model for the given
58
+ language. Only valid if `diarize=True`.
59
+ interpolate_method: The method to use for interpolation of the alignment results. If not specified, uses the
60
+ WhisperX default (`'nearest'`). Only valid if `diarize=True`.
61
+ return_char_alignments: Whether to return character-level alignments. Defaults to `False`.
62
+ Only valid if `diarize=True`.
63
+ diarization_model_name: The name of the diarization model to use. Defaults to
64
+ `pyannote/speaker-diarization-3.1`. Only valid if `diarize=True`.
65
+ num_speakers: The number of speakers to expect in the audio. By default, the model with try to detect the
66
+ number of speakers. Only valid if `diarize=True`.
67
+ min_speakers: If specified, the minimum number of speakers to expect in the audio.
68
+ Only valid if `diarize=True`.
69
+ max_speakers: If specified, the maximum number of speakers to expect in the audio.
70
+ Only valid if `diarize=True`.
71
+
72
+ Returns:
73
+ A dictionary containing the audio transcription, diarization (if enabled), and various other metadata.
74
+
75
+ Examples:
76
+ Add a computed column that applies the model `tiny.en` to an existing Pixeltable column `tbl.audio`
77
+ of the table `tbl`:
78
+
79
+ >>> tbl.add_computed_column(result=transcribe(tbl.audio, model='tiny.en'))
80
+
81
+ Add a computed column that applies the model `tiny.en` to an existing Pixeltable column `tbl.audio`
82
+ of the table `tbl`, with speaker diarization enabled, expecting at least 2 speakers:
83
+
84
+ >>> tbl.add_computed_column(
85
+ ... result=transcribe(
86
+ ... tbl.audio, model='tiny.en', diarize=True, min_speakers=2
87
+ ... )
88
+ ... )
89
+ """
90
+ import whisperx # type: ignore[import-untyped]
91
+
92
+ if not diarize:
93
+ args = locals()
94
+ for param in (
95
+ 'alignment_model_name',
96
+ 'interpolate_method',
97
+ 'return_char_alignments',
98
+ 'diarization_model_name',
99
+ 'num_speakers',
100
+ 'min_speakers',
101
+ 'max_speakers',
102
+ ):
103
+ if args[param] is not None:
104
+ raise pxt.Error(f'`{param}` can only be set if `diarize=True`')
105
+
106
+ device = resolve_torch_device('auto', allow_mps=False)
107
+ compute_type = compute_type or ('float16' if device == 'cuda' else 'int8')
108
+ transcription_model = _lookup_transcription_model(model, device, compute_type)
109
+ audio_array: np.ndarray = whisperx.load_audio(audio)
110
+ kwargs: dict[str, Any] = {'language': language, 'task': task}
111
+ if chunk_size is not None:
112
+ kwargs['chunk_size'] = chunk_size
113
+ result: dict[str, Any] = transcription_model.transcribe(audio_array, batch_size=16, **kwargs)
114
+
115
+ if diarize:
116
+ # Alignment
117
+ alignment_model, metadata = _lookup_alignment_model(result['language'], device, alignment_model_name)
118
+ kwargs = {}
119
+ if interpolate_method is not None:
120
+ kwargs['interpolate_method'] = interpolate_method
121
+ if return_char_alignments is not None:
122
+ kwargs['return_char_alignments'] = return_char_alignments
123
+ result = whisperx.align(result['segments'], alignment_model, metadata, audio_array, device, **kwargs)
124
+
125
+ # Diarization
126
+ diarization_model = _lookup_diarization_model(device, diarization_model_name)
127
+ diarization_segments = diarization_model(
128
+ audio_array, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers
129
+ )
130
+ result = whisperx.assign_word_speakers(diarization_segments, result)
131
+
132
+ return result
133
+
134
+
135
+ def _lookup_transcription_model(model: str, device: str, compute_type: str) -> 'FasterWhisperPipeline':
136
+ import whisperx
137
+
138
+ key = (model, device, compute_type)
139
+ if key not in _model_cache:
140
+ transcription_model = whisperx.load_model(model, device, compute_type=compute_type)
141
+ _model_cache[key] = transcription_model
142
+ return _model_cache[key]
143
+
144
+
145
+ def _lookup_alignment_model(language_code: str, device: str, model_name: Optional[str]) -> tuple['Wav2Vec2Model', dict]:
146
+ import whisperx
147
+
148
+ key = (language_code, device, model_name)
149
+ if key not in _alignment_model_cache:
150
+ model, metadata = whisperx.load_align_model(language_code=language_code, device=device, model_name=model_name)
151
+ _alignment_model_cache[key] = (model, metadata)
152
+ return _alignment_model_cache[key]
153
+
154
+
155
+ def _lookup_diarization_model(device: str, model_name: Optional[str]) -> 'DiarizationPipeline':
156
+ from whisperx.diarize import DiarizationPipeline
157
+
158
+ key = (device, model_name)
159
+ if key not in _diarization_model_cache:
160
+ auth_token = Config.get().get_string_value('auth_token', section='hf')
161
+ kwargs: dict[str, Any] = {'device': device, 'use_auth_token': auth_token}
162
+ if model_name is not None:
163
+ kwargs['model_name'] = model_name
164
+ _diarization_model_cache[key] = DiarizationPipeline(**kwargs)
165
+ return _diarization_model_cache[key]
166
+
167
+
168
+ _model_cache: dict[tuple[str, str, str], 'FasterWhisperPipeline'] = {}
169
+ _alignment_model_cache: dict[tuple[str, str, Optional[str]], tuple['Wav2Vec2Model', dict]] = {}
170
+ _diarization_model_cache: dict[tuple[str, Optional[str]], 'DiarizationPipeline'] = {}
171
+
172
+
173
+ __all__ = local_public_names(__name__)
174
+
175
+
176
+ def __dir__() -> list[str]:
177
+ return __all__
@@ -20,8 +20,6 @@ def yolox(images: Batch[PIL.Image.Image], *, model_id: str, threshold: float = 0
20
20
  Computes YOLOX object detections for the specified image. `model_id` should reference one of the models
21
21
  defined in the [YOLOX documentation](https://github.com/Megvii-BaseDetection/YOLOX).
22
22
 
23
- YOLOX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.
24
-
25
23
  __Requirements__:
26
24
 
27
25
  - `pip install pixeltable-yolox`
@@ -55,8 +53,6 @@ def yolo_to_coco(detections: dict) -> list:
55
53
  """
56
54
  Converts the output of a YOLOX object detection model to COCO format.
57
55
 
58
- YOLOX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.
59
-
60
56
  Args:
61
57
  detections: The output of a YOLOX object detection model, as returned by `yolox`.
62
58
 
pixeltable/globals.py CHANGED
@@ -3,9 +3,10 @@ from __future__ import annotations
3
3
  import logging
4
4
  import os
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Union
6
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Sequence, Union
7
7
 
8
8
  import pandas as pd
9
+ import pydantic
9
10
  from pandas.io.formats.style import Styler
10
11
 
11
12
  from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
@@ -25,6 +26,7 @@ if TYPE_CHECKING:
25
26
  Path, # OS paths, filenames, URLs
26
27
  Iterator[dict[str, Any]], # iterator producing dictionaries of values
27
28
  RowData, # list of dictionaries
29
+ Sequence[pydantic.BaseModel], # list of Pydantic models
28
30
  DataFrame, # Pixeltable DataFrame
29
31
  pd.DataFrame, # pandas DataFrame
30
32
  datasets.Dataset,
@@ -25,6 +25,7 @@ def create_system_info(engine: sql.engine.Engine) -> None:
25
25
  """Create the system metadata record"""
26
26
  system_md = SystemInfoMd(schema_version=VERSION)
27
27
  record = SystemInfo(md=dataclasses.asdict(system_md))
28
+ _logger.debug(f'Creating pixeltable system info record {record}')
28
29
  with orm.Session(engine, future=True) as session:
29
30
  # Write system metadata only once for idempotency
30
31
  if session.query(SystemInfo).count() == 0:
@@ -54,7 +55,8 @@ for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/convert
54
55
  def upgrade_md(engine: sql.engine.Engine) -> None:
55
56
  """Upgrade the metadata schema to the current version"""
56
57
  with orm.Session(engine) as session:
57
- system_info = session.query(SystemInfo).one().md
58
+ # Get exclusive lock on SystemInfo row
59
+ system_info = session.query(SystemInfo).with_for_update().one().md
58
60
  md_version = system_info['schema_version']
59
61
  assert isinstance(md_version, int)
60
62
  _logger.info(f'Current database version: {md_version}, installed version: {VERSION}')
@@ -0,0 +1,3 @@
1
+ from .mypy_plugin import plugin
2
+
3
+ __all__ = ['plugin']
@@ -0,0 +1,123 @@
1
+ from typing import Callable, ClassVar, Optional
2
+
3
+ from mypy import nodes
4
+ from mypy.plugin import AnalyzeTypeContext, ClassDefContext, FunctionContext, MethodSigContext, Plugin
5
+ from mypy.plugins.common import add_attribute_to_class, add_method_to_class
6
+ from mypy.types import AnyType, FunctionLike, Instance, NoneType, Type, TypeOfAny
7
+
8
+ import pixeltable as pxt
9
+ from pixeltable import exprs
10
+
11
+
12
+ class PxtPlugin(Plugin):
13
+ __UDA_FULLNAME = f'{pxt.uda.__module__}.{pxt.uda.__name__}'
14
+ __ARRAY_GETITEM_FULLNAME = f'{pxt.Array.__module__}.{pxt.Array.__name__}.__class_getitem__'
15
+ __ADD_COLUMN_FULLNAME = f'{pxt.Table.__module__}.{pxt.Table.__name__}.{pxt.Table.add_column.__name__}'
16
+ __ADD_COMPUTED_COLUMN_FULLNAME = (
17
+ f'{pxt.Table.__module__}.{pxt.Table.__name__}.{pxt.Table.add_computed_column.__name__}'
18
+ )
19
+ __TYPE_MAP: ClassVar[dict] = {
20
+ pxt.Json: 'typing.Any',
21
+ pxt.Array: 'numpy.ndarray',
22
+ pxt.Image: 'PIL.Image.Image',
23
+ pxt.Video: 'builtins.str',
24
+ pxt.Audio: 'builtins.str',
25
+ pxt.Document: 'builtins.str',
26
+ }
27
+ __FULLNAME_MAP: ClassVar[dict] = {f'{k.__module__}.{k.__name__}': v for k, v in __TYPE_MAP.items()}
28
+
29
+ def get_function_hook(self, fullname: str) -> Optional[Callable[[FunctionContext], Type]]:
30
+ return adjust_uda_type
31
+
32
+ def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], Type]]:
33
+ if fullname in self.__FULLNAME_MAP:
34
+ subst_name = self.__FULLNAME_MAP[fullname]
35
+ return lambda ctx: adjust_pxt_type(ctx, subst_name)
36
+ return None
37
+
38
+ def get_method_signature_hook(self, fullname: str) -> Optional[Callable[[MethodSigContext], FunctionLike]]:
39
+ if fullname in (self.__ADD_COLUMN_FULLNAME, self.__ADD_COMPUTED_COLUMN_FULLNAME):
40
+ return adjust_kwargs
41
+ return None
42
+
43
+ def get_class_decorator_hook_2(self, fullname: str) -> Optional[Callable[[ClassDefContext], bool]]:
44
+ if fullname == self.__UDA_FULLNAME:
45
+ return adjust_uda_methods
46
+ return None
47
+
48
+
49
+ def plugin(version: str) -> type:
50
+ return PxtPlugin
51
+
52
+
53
+ _AGGREGATOR_FULLNAME = f'{pxt.Aggregator.__module__}.{pxt.Aggregator.__name__}'
54
+ _FN_CALL_FULLNAME = f'{exprs.Expr.__module__}.{exprs.Expr.__name__}'
55
+
56
+
57
+ def adjust_uda_type(ctx: FunctionContext) -> Type:
58
+ """
59
+ Mypy doesn't understand that a class with a @uda decorator isn't actually a class, so it assumes
60
+ that sum(expr), for example, actually returns an instance of sum. We correct this by changing the
61
+ return type of any subclass of `Aggregator` to `Any` (in place of the actual `FunctionCall`).
62
+ """
63
+ ret_type = ctx.default_return_type
64
+ if isinstance(ret_type, Instance) and (
65
+ ret_type.type.fullname == _AGGREGATOR_FULLNAME
66
+ or any(base.type.fullname == _AGGREGATOR_FULLNAME for base in ret_type.type.bases)
67
+ ):
68
+ ret_type = AnyType(TypeOfAny.special_form)
69
+ return ret_type
70
+
71
+
72
+ def adjust_pxt_type(ctx: AnalyzeTypeContext, subst_name: str) -> Type:
73
+ """
74
+ Replaces the special Pixeltable classes (such as pxt.Array) with their standard equivalents (such as np.ndarray).
75
+ """
76
+ if subst_name == 'typing.Any':
77
+ return AnyType(TypeOfAny.special_form)
78
+ return ctx.api.named_type(subst_name, [])
79
+
80
+
81
+ def adjust_kwargs(ctx: MethodSigContext) -> FunctionLike:
82
+ """
83
+ Mypy has a "feature" where it will spit out multiple warnings if a method with signature
84
+ ```
85
+ def my_func(*, arg1: int, arg2: str, **kwargs: Expr)
86
+ ```
87
+ (for example) is called with bare kwargs:
88
+ ```
89
+ my_func(my_kwarg=value)
90
+ ```
91
+ This is a disaster for type-checking of add_column and add_computed_column. Here we adjust the signature so
92
+ that mypy thinks it is simply
93
+ ```
94
+ def my_func(**kwargs: Any)
95
+ ```
96
+ thereby avoiding any type-checking errors. For details, see: <https://github.com/python/mypy/issues/18481>
97
+ """
98
+ sig = ctx.default_signature
99
+ new_arg_names = sig.arg_names[-1:]
100
+ new_arg_types = [AnyType(TypeOfAny.special_form)]
101
+ new_arg_kinds = sig.arg_kinds[-1:]
102
+ return sig.copy_modified(arg_names=new_arg_names, arg_types=new_arg_types, arg_kinds=new_arg_kinds)
103
+
104
+
105
+ def adjust_uda_methods(ctx: ClassDefContext) -> bool:
106
+ """
107
+ Mypy does not handle the `@pxt.uda` aggregator well; it continues to treat the decorated class as a class,
108
+ even though it has been replaced by an `AggregateFunction`. Here we add static methods to the class that
109
+ imitate various (instance) methods of `AggregateFunction` so that they can be properly type-checked.
110
+ """
111
+ list_type = ctx.api.named_type('builtins.list', [AnyType(TypeOfAny.special_form)])
112
+ fn_arg = nodes.Argument(nodes.Var('fn'), AnyType(TypeOfAny.special_form), None, nodes.ARG_POS)
113
+ args_arg = nodes.Argument(nodes.Var('args'), AnyType(TypeOfAny.special_form), None, nodes.ARG_STAR)
114
+ kwargs_arg = nodes.Argument(nodes.Var('kwargs'), AnyType(TypeOfAny.special_form), None, nodes.ARG_STAR2)
115
+ add_method_to_class(ctx.api, ctx.cls, '__init__', args=[args_arg, kwargs_arg], return_type=NoneType())
116
+ add_method_to_class(
117
+ ctx.api, ctx.cls, 'to_sql', args=[fn_arg], return_type=AnyType(TypeOfAny.special_form), is_staticmethod=True
118
+ )
119
+ add_method_to_class(
120
+ ctx.api, ctx.cls, 'overload', args=[fn_arg], return_type=AnyType(TypeOfAny.special_form), is_staticmethod=True
121
+ )
122
+ add_attribute_to_class(ctx.api, ctx.cls, 'signatures', typ=list_type, is_classvar=True)
123
+ return True
pixeltable/type_system.py CHANGED
@@ -9,8 +9,11 @@ import types
9
9
  import typing
10
10
  import urllib.parse
11
11
  import urllib.request
12
+ from pathlib import Path
12
13
  from typing import Any, ClassVar, Iterable, Literal, Mapping, Optional, Sequence, Union
13
14
 
15
+ from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
16
+
14
17
  import av
15
18
  import jsonschema
16
19
  import jsonschema.protocols
@@ -24,8 +27,6 @@ from typing_extensions import _AnnotatedAlias
24
27
  import pixeltable.exceptions as excs
25
28
  from pixeltable.utils import parse_local_file_path
26
29
 
27
- from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
28
-
29
30
 
30
31
  class ColumnType:
31
32
  @enum.unique
@@ -292,7 +293,11 @@ class ColumnType:
292
293
 
293
294
  @classmethod
294
295
  def from_python_type(
295
- cls, t: type | _GenericAlias, nullable_default: bool = False, allow_builtin_types: bool = True
296
+ cls,
297
+ t: type | _GenericAlias,
298
+ nullable_default: bool = False,
299
+ allow_builtin_types: bool = True,
300
+ infer_pydantic_json: bool = False,
296
301
  ) -> Optional[ColumnType]:
297
302
  """
298
303
  Convert a Python type into a Pixeltable `ColumnType` instance.
@@ -305,6 +310,8 @@ class ColumnType:
305
310
  allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
306
311
  `pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
307
312
  designations will be allowed regardless.
313
+ infer_pydantic_json: If True, accepts an extended set of built-ins (e.g., Enum, Path) and returns the type to
314
+ which pydantic.BaseModel.model_dump(mode='json') serializes it.
308
315
  """
309
316
  origin = typing.get_origin(t)
310
317
  type_args = typing.get_args(t)
@@ -314,7 +321,9 @@ class ColumnType:
314
321
  # `t` is a type of the form Optional[T] (equivalently, T | None or None | T).
315
322
  # We treat it as the underlying type but with nullable=True.
316
323
  underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
317
- underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
324
+ underlying = cls.from_python_type(
325
+ underlying_py_type, allow_builtin_types=allow_builtin_types, infer_pydantic_json=infer_pydantic_json
326
+ )
318
327
  if underlying is not None:
319
328
  return underlying.copy(nullable=True)
320
329
  elif origin is Required:
@@ -341,6 +350,13 @@ class ColumnType:
341
350
  if literal_type is None:
342
351
  return None
343
352
  return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
353
+ if infer_pydantic_json and isinstance(t, type) and issubclass(t, enum.Enum):
354
+ literal_type = cls.infer_common_literal_type(member.value for member in t)
355
+ if literal_type is None:
356
+ return None
357
+ return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
358
+ if infer_pydantic_json and t is Path:
359
+ return StringType(nullable=nullable_default)
344
360
  if t is str:
345
361
  return StringType(nullable=nullable_default)
346
362
  if t is int:
@@ -0,0 +1,60 @@
1
+ import typing
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from types import UnionType
5
+ from typing import Any, Union
6
+
7
+ import pydantic
8
+
9
+
10
def is_json_convertible(model: type[pydantic.BaseModel]) -> bool:
    """
    Report whether a Pydantic model class is made up solely of JSON-convertible field types.

    Only the model's resolved type hints are inspected; no instance is created or serialized.

    Args:
        model: the Pydantic model class to inspect

    Returns:
        True if every type hint of `model` passes `_type_is_json_convertible()`, False otherwise.
    """
    for field_type in typing.get_type_hints(model).values():
        # stop at the first offending field, no need to look at the rest
        if not _type_is_json_convertible(field_type):
            return False
    return True
17
+
18
+
19
+ def _type_is_json_convertible(type_hint: Any) -> bool:
20
+ """
21
+ Recursively check if a type hint represents a JSON-compatible type.
22
+
23
+ TODO: also allow ndarrays and PIL.Image.Image, once we support those within json structures.
24
+ """
25
+ if type_hint is type(None):
26
+ return True
27
+ if type_hint is Any:
28
+ return False
29
+
30
+ if type_hint in (str, int, float, bool, datetime):
31
+ return True
32
+
33
+ if isinstance(type_hint, type) and issubclass(type_hint, Enum):
34
+ return all(isinstance(member.value, (str, int, float, bool, type(None))) for member in type_hint)
35
+
36
+ if isinstance(type_hint, type) and issubclass(type_hint, pydantic.BaseModel):
37
+ return is_json_convertible(type_hint)
38
+
39
+ origin = typing.get_origin(type_hint)
40
+ args = typing.get_args(type_hint)
41
+
42
+ if origin in (Union, UnionType):
43
+ return all(_type_is_json_convertible(arg) for arg in args)
44
+
45
+ if origin in (list, tuple):
46
+ return all(_type_is_json_convertible(arg) for arg in args) if len(args) > 0 else False
47
+
48
+ if origin is dict:
49
+ if len(args) != 2:
50
+ # we can't tell what this is
51
+ return False
52
+ key_type, value_type = args
53
+ # keys must be strings, values must be json-convertible
54
+ return key_type is str and _type_is_json_convertible(value_type)
55
+
56
+ # Literal types are json-convertible if their values are
57
+ if origin is typing.Literal:
58
+ return all(isinstance(val, (str, int, float, bool, type(None))) for val in args)
59
+
60
+ return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pixeltable
3
- Version: 0.4.8
3
+ Version: 0.4.9
4
4
  Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
5
5
  Project-URL: homepage, https://pixeltable.com/
6
6
  Project-URL: repository, https://github.com/pixeltable/pixeltable
@@ -1,24 +1,24 @@
1
- pixeltable/__init__.py,sha256=fAeDmkHIxf7jbpIAWhygnBy5iqTeH2blX9L1yNqLFFY,1567
1
+ pixeltable/__init__.py,sha256=wJ_4oQdkBAaaVKM8XiZKKSsWPnoemZxh34o6_5vDcxk,1562
2
2
  pixeltable/__version__.py,sha256=LnMIuAxx6nAQDMev_jnZyUdgsaiE3F8lulfXQBRl9qQ,112
3
- pixeltable/config.py,sha256=jRNlg_d6-qsQeX2OZCqsaBkinww-laTmybeiRH9YWlY,8335
3
+ pixeltable/config.py,sha256=-aoSVF0Aak83IC-u-XANw3if76TDq5VnnWNWoFDR5Hc,8390
4
4
  pixeltable/dataframe.py,sha256=I6iEJGD4pivUN-cPVFq_rcniZN7C55xpr37sMJ2BIdE,62986
5
- pixeltable/env.py,sha256=qU3wADj9lWdsllqXG92EP4E5M81G1T8QmBrSR2MiTQ8,42095
5
+ pixeltable/env.py,sha256=EZXZPx-OKNo-QqOik1tZyJKSK0brM_b3p2r9ksS6JJs,42964
6
6
  pixeltable/exceptions.py,sha256=Gm8d3TL2iiv6Pj2DLd29wp_j41qNBhxXL9iTQnL4Nk4,1116
7
- pixeltable/globals.py,sha256=oho6dsuREOYYYhbXyiKfLYZvRJuuvw5SQYM2BqkhsNw,39027
7
+ pixeltable/globals.py,sha256=8NijkEmtjY5me6J8zF4G-t1v5_z4q7btOK2yjUREUak,39118
8
8
  pixeltable/plan.py,sha256=4yAe7ExAqaSvkFxwK7LPH_HpmoumwqoLeOo7czJ8CyQ,48001
9
9
  pixeltable/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  pixeltable/store.py,sha256=CneWUmgN-EwaPYLcizlAxONC7WYwMr8SNpSFeNBBmOA,22885
11
- pixeltable/type_system.py,sha256=CvePMwTSr8ygc0wnYa-etgoXQLvzMyjNDLIC8TmIXa4,55294
11
+ pixeltable/type_system.py,sha256=UfPZZy4zJ2kGvdHXI9rqxOGAjgIxCZ9QGvvidPWcq-M,56153
12
12
  pixeltable/catalog/__init__.py,sha256=zw6hiyAIjMBxCExtsr7G51ul2XQ9fTQQKcs45rIy7xA,682
13
13
  pixeltable/catalog/catalog.py,sha256=gaq10XFwkr6jyv8yVi5xV3_oiDkPvqVe55vxOo14W6k,93853
14
14
  pixeltable/catalog/column.py,sha256=MXa5o3ku94T8ZFEL7wnAvqvlk65fOmmHPqIvrUVf3uo,13514
15
15
  pixeltable/catalog/dir.py,sha256=VYTscPlKR6XhupPTXlJ8txAHxS5GSpPJ3LIleDJagVQ,2047
16
16
  pixeltable/catalog/globals.py,sha256=uMIDsbeDzFxZbcgKDTOiT5plC1gAKgz1oxxdh1odIPw,2648
17
- pixeltable/catalog/insertable_table.py,sha256=PDjUhsiVKPT5utHi9ETCEnJWhQSCLBzF_1ITDIIvFRI,9292
17
+ pixeltable/catalog/insertable_table.py,sha256=VUuJ8z7OtMqgy_LMzkn1KzeLXdR-9poTttClscQ_uaU,13899
18
18
  pixeltable/catalog/named_function.py,sha256=vZ-j7P4HugWh9OmUzBMwyRYvO3tQn9jWyJz_1stPavU,1210
19
19
  pixeltable/catalog/path.py,sha256=O3FfxrvyX2crijBhp_2k4-3mG3BFxwba-tlPB74QtJQ,3780
20
20
  pixeltable/catalog/schema_object.py,sha256=rQ6-3rzqnOHyEEHi97kai2S7BO3D9AkH7rirnfbGc14,1785
21
- pixeltable/catalog/table.py,sha256=xsslPl5CS6nvXxV9Wwky0J4CDTervD6LxLuBKMhvVVY,80996
21
+ pixeltable/catalog/table.py,sha256=NLo8mcM8SKM8jC4uzRT5elhrC0XTGunjQYznqrCz3w0,81315
22
22
  pixeltable/catalog/table_version.py,sha256=jTYKzAdQuHBrknQLADBqjLhKwwqeVxmAPosMKsL051Q,64983
23
23
  pixeltable/catalog/table_version_handle.py,sha256=FTPRqcGY-h-POcWyZbd9b8P2D5zIw5OSUvwF_dbyCGo,3608
24
24
  pixeltable/catalog/table_version_path.py,sha256=IaFVDH06_6ZMuBv5eLNCRTlWizpvz95jgAzqp4OVx_o,9713
@@ -69,10 +69,6 @@ pixeltable/exprs/sql_element_cache.py,sha256=c7Q6vFK4xnf9vmcRYnXiAcwPBBwmw0dolft
69
69
  pixeltable/exprs/string_op.py,sha256=PGWRH1yUaqj7xszdumIBOTHzVkXE0k831jXxIeFPDog,4131
70
70
  pixeltable/exprs/type_cast.py,sha256=_nDzTxg5kXVGLewI0FrH2zmwJzgptdxYd5Jvuyig0UI,2322
71
71
  pixeltable/exprs/variable.py,sha256=UwWwaNECbtwyC8v0g8iqCa3a6mO8z9lK7ta5NrlCwvs,1493
72
- pixeltable/ext/__init__.py,sha256=UgDXWzGWiQIrwOuEvWTePLBcR2kecllPAE7gp-42Awg,457
73
- pixeltable/ext/functions/__init__.py,sha256=Ox3kUHn5IslVEmEKsjrHfkHDrUkmLl9RCO2YkrPJkgc,193
74
- pixeltable/ext/functions/whisperx.py,sha256=qda6kFQSvZTY2asfrYPwHb1cvSa03LbhJ-Wf9b7qPhw,2355
75
- pixeltable/ext/functions/yolox.py,sha256=dX22nMb-0n2hZi7WhZ1Y4LIpFk5loyeXXuSUcc2Fgrg,3724
76
72
  pixeltable/func/__init__.py,sha256=SQPtGr_9dZNyXzxaZQcP3oVLKnbbs4UqV6sg8XUQHxQ,535
77
73
  pixeltable/func/aggregate_function.py,sha256=5_MgqHAlMaacX2sPIHv_auTvYXtqR5MIZy_WqYQSdho,13264
78
74
  pixeltable/func/callable_function.py,sha256=g_pA-g631YcFGLix9PpHYfgjOeS2qF0Csm1VxX8fah0,9278
@@ -85,7 +81,7 @@ pixeltable/func/query_template_function.py,sha256=aX6GgANSdDTQwrObEV-B_la_oVRVky
85
81
  pixeltable/func/signature.py,sha256=LdHbdim14Zu7Xt1pMhOCzl6Xn2fq5CQQpwSXmu28umw,14988
86
82
  pixeltable/func/tools.py,sha256=2_M_u0Jiy5-uToZziB4O54aTuJeaytPmh71q3I2ydNw,6062
87
83
  pixeltable/func/udf.py,sha256=6tKpMt37t3BmXwRyA5fFAd6OM4D5EPEd2KuAr7DQhr0,13231
88
- pixeltable/functions/__init__.py,sha256=Akk6Nk-rpz2D_V4kJTfyP56xnNbCz3EtxVAuwLoiysA,588
84
+ pixeltable/functions/__init__.py,sha256=ZeRB7ksbzjdrvePXtd_mNxyP2RhjvN0ayl5nv7TdWcQ,613
89
85
  pixeltable/functions/anthropic.py,sha256=2Ja-pryC_3Yd1sXW-pibRuvKjgyfYqOhhl6nBWNOBt0,10504
90
86
  pixeltable/functions/audio.py,sha256=6_tUhSZgxhOQQJemvZYNEoKNjWdr3SgJsvLkKCSmtfw,1633
91
87
  pixeltable/functions/bedrock.py,sha256=lTCFHjYunF3minBGWcjXR90yJ8resFjXr4niyKhfxms,4217
@@ -111,6 +107,8 @@ pixeltable/functions/util.py,sha256=uQNkyBSkTVMe1wbUI2Q0nz-mM3qPVTF86yK8c9OFIcE,
111
107
  pixeltable/functions/video.py,sha256=Z0X0Z-oCS-c4cqjlfCPLUxvTUAkQdxDZ-tL-jAIKKA0,10590
112
108
  pixeltable/functions/vision.py,sha256=17h9bOm3NJyQzFMBwXDHMqnkcuCspyQJgHdBOXV1Ip8,15380
113
109
  pixeltable/functions/whisper.py,sha256=c9E6trhc2UcShVaGaEBCUEpArke1ql3MV5We0qSgmuU,2960
110
+ pixeltable/functions/whisperx.py,sha256=BT9gwXEf5V1lgDxynkrrH6gsuCLqjCzfMJKj5DaOtSM,7661
111
+ pixeltable/functions/yolox.py,sha256=ZdYr6WIqTCHOJoZSoXe4CbME54dYeeeOhkOi1I7VtcE,3518
114
112
  pixeltable/index/__init__.py,sha256=97aFuxiP_oz1ldn5iq8IWApkOV8XG6ZIBW5-9rkS0vM,122
115
113
  pixeltable/index/base.py,sha256=200s7v3Zy810bRlbSAYzxxaEjVssl6r8esTHiSvWRwQ,1704
116
114
  pixeltable/index/btree.py,sha256=8B06D67ay0DFUtEBC5q4bLjxMq7ILpKyyoLAiSaamzA,2503
@@ -133,7 +131,7 @@ pixeltable/iterators/document.py,sha256=7NIN5W5jHVm4v5_FzGsH0XJigtPCm8DfXJUc3_hE
133
131
  pixeltable/iterators/image.py,sha256=RrFdf5cnFIQzWKJk4uYi1m1p2qAiz909THYhRQ27DbY,3603
134
132
  pixeltable/iterators/string.py,sha256=URj5edWp-CsorjN_8nnfWGvtIFs_Zh4VPm6htlJbFkU,1257
135
133
  pixeltable/iterators/video.py,sha256=PKztCS_FEtu-AoHR6X-wJG6UJddX195lS-9eQp5ClGc,10810
136
- pixeltable/metadata/__init__.py,sha256=iJxMsd3s5yNZ5ciIBzQCa0frXZKgvFj2_-H0Sf4N1mk,3154
134
+ pixeltable/metadata/__init__.py,sha256=oTO9kN6h4xJ2lsk4a2bq6ejAD-4wToy7b5_i3Pq1Qnc,3289
137
135
  pixeltable/metadata/notes.py,sha256=3fdZDFpL1-b194Ejv0Y0YP-vbnV-XvVP9wOmZM9XARA,1545
138
136
  pixeltable/metadata/schema.py,sha256=fs9W2SLh32Ehxc9AChVH7YCtlSSnQkgGMbEyOh0B4W0,13416
139
137
  pixeltable/metadata/utils.py,sha256=NJQXWhhK1hdOZ4H3hh9N0mqbl-I9JqMUqrfA6OWLflE,2682
@@ -167,6 +165,8 @@ pixeltable/metadata/converters/convert_37.py,sha256=IVZGtKFaaYMGBs39V_H_okWvpxxa
167
165
  pixeltable/metadata/converters/convert_38.py,sha256=YyNyocwzzdJRcI0YSCo_70Q4hSk63235iE4IxhwSEzs,1169
168
166
  pixeltable/metadata/converters/convert_39.py,sha256=YaEfgStxtYGRbuRLFw8wTAZVJRzIU6zL6nPU2zuDcEU,4658
169
167
  pixeltable/metadata/converters/util.py,sha256=QUBOj2F_6rCAdIo0lgD1IVgAM15Vmq7ikQspB4s0eQ8,7732
168
+ pixeltable/mypy/__init__.py,sha256=cD_oHXClR_bDM8qVNIfaOAgRhQjPfcWvLcinz79ua6o,54
169
+ pixeltable/mypy/mypy_plugin.py,sha256=KCjzKOeKW5CBqJOq9Ch7ZJ25ICPc4nlTB49DxtC6oDM,5460
170
170
  pixeltable/share/__init__.py,sha256=AtR4nS6YkfkFRkXA-zZXFTK5pSQjHry8MnxdVLUk5SA,68
171
171
  pixeltable/share/packager.py,sha256=5rSKnQCs3YP5h48d79bXEK4L8tLUSeTSbXaB8X9SmBI,31265
172
172
  pixeltable/share/publish.py,sha256=KS_R59AuVkHWkXHwELP74xgSHs8Z5z8SBPMcjzttt44,11469
@@ -185,12 +185,13 @@ pixeltable/utils/formatter.py,sha256=tbMxE9rBw6wdKUnJhNZ8h9uAF8dZKcihQ2KesqAag9A
185
185
  pixeltable/utils/http_server.py,sha256=6khOAtpVj1lDIm9Dx8VIECLm87cFEp4IFbAg8T92A2o,2441
186
186
  pixeltable/utils/iceberg.py,sha256=COeNqqy5RRMkDGLS8CTnaUeAccG10x2fwP3e1veuqIA,522
187
187
  pixeltable/utils/media_store.py,sha256=HVOuK5JTTvgSH_st0lsapv39Lnu29QGpkKUtZQybBTA,10560
188
+ pixeltable/utils/pydantic.py,sha256=-ztUsuRXA7B6bywb5Yy1h5pNQ2DnsT1d0oHMxqtK3WY,2011
188
189
  pixeltable/utils/pytorch.py,sha256=564VHRdDHwD9h0v5lBHEDTJ8c6zx8wuzWYx8ZYjBxlI,3621
189
190
  pixeltable/utils/s3.py,sha256=pxip2MlCqd2Qon2dzJXzfxvwtZyc-BAsjAnLL4J_OXY,587
190
191
  pixeltable/utils/sql.py,sha256=Sa4Lh-VGe8GToU5W7DRiWf2lMl9B6saPqemiT0ZdHEc,806
191
192
  pixeltable/utils/transactional_directory.py,sha256=OFKmu90oP7KwBAljwjnzP_w8euGdAXob3y4Nx9SCNHA,1357
192
- pixeltable-0.4.8.dist-info/METADATA,sha256=GQlU61_sG3nWj-sIhckeyHH06y2rruNsxEAsLp-iaCc,24247
193
- pixeltable-0.4.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
194
- pixeltable-0.4.8.dist-info/entry_points.txt,sha256=rrKugZmxDtGnXCnEQ5UJMaaSYY7-g1cLjUZ4W1moIhM,98
195
- pixeltable-0.4.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
196
- pixeltable-0.4.8.dist-info/RECORD,,
193
+ pixeltable-0.4.9.dist-info/METADATA,sha256=OvTlQgjU2P7wXoyAQhd8p4MrQU1jv5btGrtIHhRF9so,24247
194
+ pixeltable-0.4.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
195
+ pixeltable-0.4.9.dist-info/entry_points.txt,sha256=rrKugZmxDtGnXCnEQ5UJMaaSYY7-g1cLjUZ4W1moIhM,98
196
+ pixeltable-0.4.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
197
+ pixeltable-0.4.9.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- """
2
- Extended integrations for Pixeltable. This package contains experimental or demonstration features that
3
- are not intended for production use. Long-term support cannot be guaranteed, usually because the features
4
- have dependencies whose future support is unclear.
5
- """
6
-
7
- # ruff: noqa: F401
8
-
9
- from pixeltable.utils.code import local_public_names
10
-
11
- from . import functions
12
-
13
- __all__ = local_public_names(__name__)
14
-
15
-
16
- def __dir__() -> list[str]:
17
- return __all__
@@ -1,11 +0,0 @@
1
- # ruff: noqa: F401
2
-
3
- from pixeltable.utils.code import local_public_names
4
-
5
- from . import whisperx, yolox
6
-
7
- __all__ = local_public_names(__name__)
8
-
9
-
10
- def __dir__() -> list[str]:
11
- return __all__
@@ -1,77 +0,0 @@
1
- from typing import TYPE_CHECKING, Optional
2
-
3
- from pixeltable.utils.code import local_public_names
4
-
5
- if TYPE_CHECKING:
6
- from whisperx.asr import FasterWhisperPipeline # type: ignore[import-untyped]
7
-
8
- import pixeltable as pxt
9
-
10
-
11
- @pxt.udf
12
- def transcribe(
13
- audio: pxt.Audio,
14
- *,
15
- model: str,
16
- compute_type: Optional[str] = None,
17
- language: Optional[str] = None,
18
- chunk_size: int = 30,
19
- ) -> dict:
20
- """
21
- Transcribe an audio file using WhisperX.
22
-
23
- This UDF runs a transcription model _locally_ using the WhisperX library,
24
- equivalent to the WhisperX `transcribe` function, as described in the
25
- [WhisperX library documentation](https://github.com/m-bain/whisperX).
26
-
27
- WhisperX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.
28
-
29
- __Requirements:__
30
-
31
- - `pip install whisperx`
32
-
33
- Args:
34
- audio: The audio file to transcribe.
35
- model: The name of the model to use for transcription.
36
-
37
- See the [WhisperX library documentation](https://github.com/m-bain/whisperX) for details
38
- on the remaining parameters.
39
-
40
- Returns:
41
- A dictionary containing the transcription and various other metadata.
42
-
43
- Examples:
44
- Add a computed column that applies the model `tiny.en` to an existing Pixeltable column `tbl.audio`
45
- of the table `tbl`:
46
-
47
- >>> tbl.add_computed_column(result=transcribe(tbl.audio, model='tiny.en'))
48
- """
49
- import torch
50
- import whisperx # type: ignore[import-untyped]
51
-
52
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
53
- compute_type = compute_type or ('float16' if device == 'cuda' else 'int8')
54
- model = _lookup_model(model, device, compute_type)
55
- audio_array = whisperx.load_audio(audio)
56
- result = model.transcribe(audio_array, batch_size=16, language=language, chunk_size=chunk_size)
57
- return result
58
-
59
-
60
- def _lookup_model(model_id: str, device: str, compute_type: str) -> 'FasterWhisperPipeline':
61
- import whisperx
62
-
63
- key = (model_id, device, compute_type)
64
- if key not in _model_cache:
65
- model = whisperx.load_model(model_id, device, compute_type=compute_type)
66
- _model_cache[key] = model
67
- return _model_cache[key]
68
-
69
-
70
- _model_cache: dict[tuple[str, str, str], 'FasterWhisperPipeline'] = {}
71
-
72
-
73
- __all__ = local_public_names(__name__)
74
-
75
-
76
- def __dir__() -> list[str]:
77
- return __all__