pixeltable 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (50) hide show
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/catalog/catalog.py +4 -6
  3. pixeltable/catalog/insertable_table.py +125 -28
  4. pixeltable/catalog/table.py +51 -15
  5. pixeltable/catalog/table_version.py +12 -8
  6. pixeltable/catalog/table_version_path.py +6 -5
  7. pixeltable/config.py +25 -9
  8. pixeltable/dataframe.py +3 -3
  9. pixeltable/env.py +89 -20
  10. pixeltable/exec/aggregation_node.py +1 -1
  11. pixeltable/exec/cache_prefetch_node.py +4 -3
  12. pixeltable/exec/exec_node.py +0 -8
  13. pixeltable/exec/expr_eval/globals.py +1 -0
  14. pixeltable/exec/expr_eval/schedulers.py +16 -4
  15. pixeltable/exec/in_memory_data_node.py +2 -3
  16. pixeltable/exprs/data_row.py +5 -5
  17. pixeltable/exprs/function_call.py +59 -21
  18. pixeltable/exprs/row_builder.py +11 -5
  19. pixeltable/func/expr_template_function.py +6 -3
  20. pixeltable/functions/__init__.py +2 -0
  21. pixeltable/functions/anthropic.py +1 -2
  22. pixeltable/functions/deepseek.py +5 -1
  23. pixeltable/functions/gemini.py +11 -2
  24. pixeltable/functions/huggingface.py +6 -12
  25. pixeltable/functions/openai.py +2 -1
  26. pixeltable/functions/video.py +5 -5
  27. pixeltable/functions/whisperx.py +177 -0
  28. pixeltable/{ext/functions → functions}/yolox.py +0 -4
  29. pixeltable/globals.py +16 -3
  30. pixeltable/io/fiftyone.py +3 -3
  31. pixeltable/io/label_studio.py +2 -1
  32. pixeltable/iterators/audio.py +3 -2
  33. pixeltable/iterators/document.py +0 -6
  34. pixeltable/metadata/__init__.py +3 -1
  35. pixeltable/mypy/__init__.py +3 -0
  36. pixeltable/mypy/mypy_plugin.py +123 -0
  37. pixeltable/plan.py +0 -16
  38. pixeltable/share/packager.py +6 -6
  39. pixeltable/share/publish.py +134 -7
  40. pixeltable/type_system.py +20 -4
  41. pixeltable/utils/media_store.py +131 -66
  42. pixeltable/utils/pydantic.py +60 -0
  43. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/METADATA +186 -121
  44. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/RECORD +47 -46
  45. pixeltable/ext/__init__.py +0 -17
  46. pixeltable/ext/functions/__init__.py +0 -11
  47. pixeltable/ext/functions/whisperx.py +0 -77
  48. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/WHEEL +0 -0
  49. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/entry_points.txt +0 -0
  50. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/licenses/LICENSE +0 -0
@@ -63,13 +63,10 @@ def sentence_transformer(
63
63
 
64
64
  @sentence_transformer.conditional_return_type
65
65
  def _(model_id: str) -> ts.ArrayType:
66
- try:
67
- from sentence_transformers import SentenceTransformer
66
+ from sentence_transformers import SentenceTransformer
68
67
 
69
- model = _lookup_model(model_id, SentenceTransformer)
70
- return ts.ArrayType((model.get_sentence_embedding_dimension(),), dtype=ts.FloatType(), nullable=False)
71
- except ImportError:
72
- return ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False)
68
+ model = _lookup_model(model_id, SentenceTransformer)
69
+ return ts.ArrayType((model.get_sentence_embedding_dimension(),), dtype=ts.FloatType(), nullable=False)
73
70
 
74
71
 
75
72
  @pxt.udf
@@ -201,13 +198,10 @@ def _(image: Batch[PIL.Image.Image], *, model_id: str) -> Batch[pxt.Array[(None,
201
198
 
202
199
  @clip.conditional_return_type
203
200
  def _(model_id: str) -> ts.ArrayType:
204
- try:
205
- from transformers import CLIPModel
201
+ from transformers import CLIPModel
206
202
 
207
- model = _lookup_model(model_id, CLIPModel.from_pretrained)
208
- return ts.ArrayType((model.config.projection_dim,), dtype=ts.FloatType(), nullable=False)
209
- except ImportError:
210
- return ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False)
203
+ model = _lookup_model(model_id, CLIPModel.from_pretrained)
204
+ return ts.ArrayType((model.config.projection_dim,), dtype=ts.FloatType(), nullable=False)
211
205
 
212
206
 
213
207
  @pxt.udf(batch_size=4)
@@ -23,6 +23,7 @@ import pixeltable as pxt
23
23
  from pixeltable import env, exprs, type_system as ts
24
24
  from pixeltable.func import Batch, Tools
25
25
  from pixeltable.utils.code import local_public_names
26
+ from pixeltable.utils.media_store import TempStore
26
27
 
27
28
  if TYPE_CHECKING:
28
29
  import openai
@@ -216,7 +217,7 @@ async def speech(input: str, *, model: str, voice: str, model_kwargs: Optional[d
216
217
 
217
218
  content = await _openai_client().audio.speech.create(input=input, model=model, voice=voice, **model_kwargs)
218
219
  ext = model_kwargs.get('response_format', 'mp3')
219
- output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
220
+ output_filename = str(TempStore.create_path(extension=f'.{ext}'))
220
221
  content.write_to_file(output_filename)
221
222
  return output_filename
222
223
 
@@ -9,8 +9,8 @@ import numpy as np
9
9
  import PIL.Image
10
10
 
11
11
  import pixeltable as pxt
12
- from pixeltable import env
13
12
  from pixeltable.utils.code import local_public_names
13
+ from pixeltable.utils.media_store import TempStore
14
14
 
15
15
  _format_defaults: dict[str, tuple[str, str]] = { # format -> (codec, ext)
16
16
  'wav': ('pcm_s16le', 'wav'),
@@ -109,7 +109,7 @@ class make_video(pxt.Aggregator):
109
109
  if frame is None:
110
110
  return
111
111
  if self.container is None:
112
- self.out_file = env.Env.get().create_tmp_path('.mp4')
112
+ self.out_file = TempStore.create_path(extension='.mp4')
113
113
  self.container = av.open(str(self.out_file), mode='w')
114
114
  self.stream = self.container.add_stream('h264', rate=self.fps)
115
115
  self.stream.pix_fmt = 'yuv420p'
@@ -158,16 +158,16 @@ def extract_audio(
158
158
  return None
159
159
  audio_stream = container.streams.audio[stream_idx]
160
160
  # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
161
- output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
161
+ output_path = str(TempStore.create_path(extension=f'.{ext}'))
162
162
 
163
- with av.open(output_filename, 'w', format=format) as output_container:
163
+ with av.open(output_path, 'w', format=format) as output_container:
164
164
  output_stream = output_container.add_stream(codec or default_codec)
165
165
  assert isinstance(output_stream, av.audio.stream.AudioStream)
166
166
  for packet in container.demux(audio_stream):
167
167
  for frame in packet.decode():
168
168
  output_container.mux(output_stream.encode(frame)) # type: ignore[arg-type]
169
169
 
170
- return output_filename
170
+ return output_path
171
171
 
172
172
 
173
173
  @pxt.udf(is_method=True)
@@ -0,0 +1,177 @@
1
+ from typing import TYPE_CHECKING, Any, Optional
2
+
3
+ import numpy as np
4
+
5
+ import pixeltable as pxt
6
+ from pixeltable.config import Config
7
+ from pixeltable.functions.util import resolve_torch_device
8
+ from pixeltable.utils.code import local_public_names
9
+
10
+ if TYPE_CHECKING:
11
+ from transformers import Wav2Vec2Model
12
+ from whisperx.asr import FasterWhisperPipeline # type: ignore[import-untyped]
13
+ from whisperx.diarize import DiarizationPipeline # type: ignore[import-untyped]
14
+
15
+
16
+ @pxt.udf
17
+ def transcribe(
18
+ audio: pxt.Audio,
19
+ *,
20
+ model: str,
21
+ diarize: bool = False,
22
+ compute_type: Optional[str] = None,
23
+ language: Optional[str] = None,
24
+ task: Optional[str] = None,
25
+ chunk_size: Optional[int] = None,
26
+ alignment_model_name: Optional[str] = None,
27
+ interpolate_method: Optional[str] = None,
28
+ return_char_alignments: Optional[bool] = None,
29
+ diarization_model_name: Optional[str] = None,
30
+ num_speakers: Optional[int] = None,
31
+ min_speakers: Optional[int] = None,
32
+ max_speakers: Optional[int] = None,
33
+ ) -> dict:
34
+ """
35
+ Transcribe an audio file using WhisperX.
36
+
37
+ This UDF runs a transcription model _locally_ using the WhisperX library,
38
+ equivalent to the WhisperX `transcribe` function, as described in the
39
+ [WhisperX library documentation](https://github.com/m-bain/whisperX).
40
+
41
+ If `diarize=True`, then speaker diarization will also be performed. Several of the UDF parameters are only valid if
42
+ `diarize=True`, as documented in the parameters list below.
43
+
44
+ __Requirements:__
45
+
46
+ - `pip install whisperx`
47
+
48
+ Args:
49
+ audio: The audio file to transcribe.
50
+ model: The name of the model to use for transcription.
51
+ diarize: Whether to perform speaker diarization.
52
+ compute_type: The compute type to use for the model (e.g., `'int8'`, `'float16'`). If `None`,
53
+ defaults to `'float16'` on CUDA devices and `'int8'` otherwise.
54
+ language: The language code for the transcription (e.g., `'en'` for English).
55
+ task: The task to perform (e.g., `'transcribe'` or `'translate'`). Defaults to `'transcribe'`.
56
+ chunk_size: The size of the audio chunks to process, in seconds. Defaults to `30`.
57
+ alignment_model_name: The name of the alignment model to use. If `None`, uses the default model for the given
58
+ language. Only valid if `diarize=True`.
59
+ interpolate_method: The method to use for interpolation of the alignment results. If not specified, uses the
60
+ WhisperX default (`'nearest'`). Only valid if `diarize=True`.
61
+ return_char_alignments: Whether to return character-level alignments. Defaults to `False`.
62
+ Only valid if `diarize=True`.
63
+ diarization_model_name: The name of the diarization model to use. Defaults to
64
+ `pyannote/speaker-diarization-3.1`. Only valid if `diarize=True`.
65
+ num_speakers: The number of speakers to expect in the audio. By default, the model will try to detect the
66
+ number of speakers. Only valid if `diarize=True`.
67
+ min_speakers: If specified, the minimum number of speakers to expect in the audio.
68
+ Only valid if `diarize=True`.
69
+ max_speakers: If specified, the maximum number of speakers to expect in the audio.
70
+ Only valid if `diarize=True`.
71
+
72
+ Returns:
73
+ A dictionary containing the audio transcription, diarization (if enabled), and various other metadata.
74
+
75
+ Examples:
76
+ Add a computed column that applies the model `tiny.en` to an existing Pixeltable column `tbl.audio`
77
+ of the table `tbl`:
78
+
79
+ >>> tbl.add_computed_column(result=transcribe(tbl.audio, model='tiny.en'))
80
+
81
+ Add a computed column that applies the model `tiny.en` to an existing Pixeltable column `tbl.audio`
82
+ of the table `tbl`, with speaker diarization enabled, expecting at least 2 speakers:
83
+
84
+ >>> tbl.add_computed_column(
85
+ ... result=transcribe(
86
+ ... tbl.audio, model='tiny.en', diarize=True, min_speakers=2
87
+ ... )
88
+ ... )
89
+ """
90
+ import whisperx # type: ignore[import-untyped]
91
+
92
+ if not diarize:
93
+ args = locals()
94
+ for param in (
95
+ 'alignment_model_name',
96
+ 'interpolate_method',
97
+ 'return_char_alignments',
98
+ 'diarization_model_name',
99
+ 'num_speakers',
100
+ 'min_speakers',
101
+ 'max_speakers',
102
+ ):
103
+ if args[param] is not None:
104
+ raise pxt.Error(f'`{param}` can only be set if `diarize=True`')
105
+
106
+ device = resolve_torch_device('auto', allow_mps=False)
107
+ compute_type = compute_type or ('float16' if device == 'cuda' else 'int8')
108
+ transcription_model = _lookup_transcription_model(model, device, compute_type)
109
+ audio_array: np.ndarray = whisperx.load_audio(audio)
110
+ kwargs: dict[str, Any] = {'language': language, 'task': task}
111
+ if chunk_size is not None:
112
+ kwargs['chunk_size'] = chunk_size
113
+ result: dict[str, Any] = transcription_model.transcribe(audio_array, batch_size=16, **kwargs)
114
+
115
+ if diarize:
116
+ # Alignment
117
+ alignment_model, metadata = _lookup_alignment_model(result['language'], device, alignment_model_name)
118
+ kwargs = {}
119
+ if interpolate_method is not None:
120
+ kwargs['interpolate_method'] = interpolate_method
121
+ if return_char_alignments is not None:
122
+ kwargs['return_char_alignments'] = return_char_alignments
123
+ result = whisperx.align(result['segments'], alignment_model, metadata, audio_array, device, **kwargs)
124
+
125
+ # Diarization
126
+ diarization_model = _lookup_diarization_model(device, diarization_model_name)
127
+ diarization_segments = diarization_model(
128
+ audio_array, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers
129
+ )
130
+ result = whisperx.assign_word_speakers(diarization_segments, result)
131
+
132
+ return result
133
+
134
+
135
+ def _lookup_transcription_model(model: str, device: str, compute_type: str) -> 'FasterWhisperPipeline':
136
+ import whisperx
137
+
138
+ key = (model, device, compute_type)
139
+ if key not in _model_cache:
140
+ transcription_model = whisperx.load_model(model, device, compute_type=compute_type)
141
+ _model_cache[key] = transcription_model
142
+ return _model_cache[key]
143
+
144
+
145
+ def _lookup_alignment_model(language_code: str, device: str, model_name: Optional[str]) -> tuple['Wav2Vec2Model', dict]:
146
+ import whisperx
147
+
148
+ key = (language_code, device, model_name)
149
+ if key not in _alignment_model_cache:
150
+ model, metadata = whisperx.load_align_model(language_code=language_code, device=device, model_name=model_name)
151
+ _alignment_model_cache[key] = (model, metadata)
152
+ return _alignment_model_cache[key]
153
+
154
+
155
+ def _lookup_diarization_model(device: str, model_name: Optional[str]) -> 'DiarizationPipeline':
156
+ from whisperx.diarize import DiarizationPipeline
157
+
158
+ key = (device, model_name)
159
+ if key not in _diarization_model_cache:
160
+ auth_token = Config.get().get_string_value('auth_token', section='hf')
161
+ kwargs: dict[str, Any] = {'device': device, 'use_auth_token': auth_token}
162
+ if model_name is not None:
163
+ kwargs['model_name'] = model_name
164
+ _diarization_model_cache[key] = DiarizationPipeline(**kwargs)
165
+ return _diarization_model_cache[key]
166
+
167
+
168
+ _model_cache: dict[tuple[str, str, str], 'FasterWhisperPipeline'] = {}
169
+ _alignment_model_cache: dict[tuple[str, str, Optional[str]], tuple['Wav2Vec2Model', dict]] = {}
170
+ _diarization_model_cache: dict[tuple[str, Optional[str]], 'DiarizationPipeline'] = {}
171
+
172
+
173
+ __all__ = local_public_names(__name__)
174
+
175
+
176
+ def __dir__() -> list[str]:
177
+ return __all__
@@ -20,8 +20,6 @@ def yolox(images: Batch[PIL.Image.Image], *, model_id: str, threshold: float = 0
20
20
  Computes YOLOX object detections for the specified image. `model_id` should reference one of the models
21
21
  defined in the [YOLOX documentation](https://github.com/Megvii-BaseDetection/YOLOX).
22
22
 
23
- YOLOX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.
24
-
25
23
  __Requirements__:
26
24
 
27
25
  - `pip install pixeltable-yolox`
@@ -55,8 +53,6 @@ def yolo_to_coco(detections: dict) -> list:
55
53
  """
56
54
  Converts the output of a YOLOX object detection model to COCO format.
57
55
 
58
- YOLOX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.
59
-
60
56
  Args:
61
57
  detections: The output of a YOLOX object detection model, as returned by `yolox`.
62
58
 
pixeltable/globals.py CHANGED
@@ -3,9 +3,10 @@ from __future__ import annotations
3
3
  import logging
4
4
  import os
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Union
6
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Sequence, Union
7
7
 
8
8
  import pandas as pd
9
+ import pydantic
9
10
  from pandas.io.formats.style import Styler
10
11
 
11
12
  from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
@@ -25,6 +26,7 @@ if TYPE_CHECKING:
25
26
  Path, # OS paths, filenames, URLs
26
27
  Iterator[dict[str, Any]], # iterator producing dictionaries of values
27
28
  RowData, # list of dictionaries
29
+ Sequence[pydantic.BaseModel], # list of Pydantic models
28
30
  DataFrame, # Pixeltable DataFrame
29
31
  pd.DataFrame, # pandas DataFrame
30
32
  datasets.Dataset,
@@ -396,7 +398,12 @@ def create_snapshot(
396
398
  )
397
399
 
398
400
 
399
- def create_replica(destination: str, source: str | catalog.Table) -> Optional[catalog.Table]:
401
+ def create_replica(
402
+ destination: str,
403
+ source: str | catalog.Table,
404
+ bucket_name: str | None = None,
405
+ access: Literal['public', 'private'] = 'private',
406
+ ) -> Optional[catalog.Table]:
400
407
  """
401
408
  Create a replica of a table. Can be used either to create a remote replica of a local table, or to create a local
402
409
  replica of a remote table. A given table can have at most one replica per Pixeltable instance.
@@ -405,6 +412,12 @@ def create_replica(destination: str, source: str | catalog.Table) -> Optional[ca
405
412
  destination: Path where the replica will be created. Can be either a local path such as `'my_dir.my_table'`, or
406
413
  a remote URI such as `'pxt://username/mydir.my_table'`.
407
414
  source: Path to the source table, or (if the source table is a local table) a handle to the source table.
415
+ bucket_name: The name of the Pixeltable cloud-registered bucket to use to store the replica's data.
416
+ If no `bucket_name` is provided, the default Pixeltable storage bucket will be used.
417
+ access: Access control for the replica.
418
+
419
+ - `'public'`: Anyone can access this replica.
420
+ - `'private'`: Only the owner can access.
408
421
  """
409
422
  remote_dest = destination.startswith('pxt://')
410
423
  remote_source = isinstance(source, str) and source.startswith('pxt://')
@@ -414,7 +427,7 @@ def create_replica(destination: str, source: str | catalog.Table) -> Optional[ca
414
427
  if remote_dest:
415
428
  if isinstance(source, str):
416
429
  source = get_table(source)
417
- share.push_replica(destination, source)
430
+ share.push_replica(destination, source, bucket_name, access)
418
431
  return None
419
432
  else:
420
433
  assert isinstance(source, str)
pixeltable/io/fiftyone.py CHANGED
@@ -9,7 +9,7 @@ import puremagic
9
9
  import pixeltable as pxt
10
10
  import pixeltable.exceptions as excs
11
11
  from pixeltable import exprs
12
- from pixeltable.env import Env
12
+ from pixeltable.utils.media_store import TempStore
13
13
 
14
14
 
15
15
  class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
@@ -100,7 +100,7 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
100
100
  assert isinstance(file, str)
101
101
  else:
102
102
  # Write the dynamically created image to a temp file
103
- file = str(Env.get().create_tmp_path(f'.{self.__image_format}'))
103
+ file = TempStore.create_path(extension=f'.{self.__image_format}')
104
104
  img.save(file, format=self.__image_format)
105
105
 
106
106
  metadata = fo.ImageMetadata(
@@ -108,7 +108,7 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
108
108
  mime_type=puremagic.from_file(file, mime=True),
109
109
  width=img.width,
110
110
  height=img.height,
111
- filepath=file,
111
+ filepath=str(file),
112
112
  num_channels=len(img.getbands()),
113
113
  )
114
114
 
@@ -19,6 +19,7 @@ from pixeltable.config import Config
19
19
  from pixeltable.exprs import ColumnRef, DataRow, Expr
20
20
  from pixeltable.io.external_store import Project
21
21
  from pixeltable.utils import coco
22
+ from pixeltable.utils.media_store import TempStore
22
23
 
23
24
  # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
24
25
  # the import two different ways to ensure intercompatibility
@@ -215,7 +216,7 @@ class LabelStudioProject(Project):
215
216
  else:
216
217
  # No localpath; create a temp file and upload it
217
218
  assert isinstance(row[media_col_idx], PIL.Image.Image)
218
- file = env.Env.get().create_tmp_path(extension='.png')
219
+ file = TempStore.create_path(extension='.png')
219
220
  row[media_col_idx].save(file, format='png')
220
221
  task_id = self.project.import_tasks(file)[0]
221
222
  os.remove(file)
@@ -5,7 +5,8 @@ from typing import Any, ClassVar, Optional
5
5
 
6
6
  import av
7
7
 
8
- from pixeltable import env, exceptions as excs, type_system as ts
8
+ from pixeltable import exceptions as excs, type_system as ts
9
+ from pixeltable.utils.media_store import TempStore
9
10
 
10
11
  from .base import ComponentIterator
11
12
 
@@ -149,7 +150,7 @@ class AudioSplitter(ComponentIterator):
149
150
  target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
150
151
  chunk_start_pts = 0
151
152
  chunk_end_pts = 0
152
- chunk_file = str(env.Env.get().create_tmp_path(self.audio_path.suffix))
153
+ chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
153
154
  output_container = av.open(chunk_file, mode='w')
154
155
  input_stream = self.container.streams.audio[0]
155
156
  codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
@@ -213,12 +213,6 @@ class DocumentSplitter(ComponentIterator):
213
213
  if kwargs.get('limit') is None:
214
214
  raise Error('limit is required with "token_limit"/"char_limit" separators')
215
215
 
216
- # check dependencies at the end
217
- if Separator.SENTENCE in separators:
218
- _ = Env.get().spacy_nlp
219
- if Separator.TOKEN_LIMIT in separators:
220
- Env.get().require_package('tiktoken')
221
-
222
216
  return schema, []
223
217
 
224
218
  def __next__(self) -> dict[str, Any]:
@@ -25,6 +25,7 @@ def create_system_info(engine: sql.engine.Engine) -> None:
25
25
  """Create the system metadata record"""
26
26
  system_md = SystemInfoMd(schema_version=VERSION)
27
27
  record = SystemInfo(md=dataclasses.asdict(system_md))
28
+ _logger.debug(f'Creating pixeltable system info record {record}')
28
29
  with orm.Session(engine, future=True) as session:
29
30
  # Write system metadata only once for idempotency
30
31
  if session.query(SystemInfo).count() == 0:
@@ -54,7 +55,8 @@ for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/convert
54
55
  def upgrade_md(engine: sql.engine.Engine) -> None:
55
56
  """Upgrade the metadata schema to the current version"""
56
57
  with orm.Session(engine) as session:
57
- system_info = session.query(SystemInfo).one().md
58
+ # Get exclusive lock on SystemInfo row
59
+ system_info = session.query(SystemInfo).with_for_update().one().md
58
60
  md_version = system_info['schema_version']
59
61
  assert isinstance(md_version, int)
60
62
  _logger.info(f'Current database version: {md_version}, installed version: {VERSION}')
@@ -0,0 +1,3 @@
1
+ from .mypy_plugin import plugin
2
+
3
+ __all__ = ['plugin']
@@ -0,0 +1,123 @@
1
+ from typing import Callable, ClassVar, Optional
2
+
3
+ from mypy import nodes
4
+ from mypy.plugin import AnalyzeTypeContext, ClassDefContext, FunctionContext, MethodSigContext, Plugin
5
+ from mypy.plugins.common import add_attribute_to_class, add_method_to_class
6
+ from mypy.types import AnyType, FunctionLike, Instance, NoneType, Type, TypeOfAny
7
+
8
+ import pixeltable as pxt
9
+ from pixeltable import exprs
10
+
11
+
12
+ class PxtPlugin(Plugin):
13
+ __UDA_FULLNAME = f'{pxt.uda.__module__}.{pxt.uda.__name__}'
14
+ __ARRAY_GETITEM_FULLNAME = f'{pxt.Array.__module__}.{pxt.Array.__name__}.__class_getitem__'
15
+ __ADD_COLUMN_FULLNAME = f'{pxt.Table.__module__}.{pxt.Table.__name__}.{pxt.Table.add_column.__name__}'
16
+ __ADD_COMPUTED_COLUMN_FULLNAME = (
17
+ f'{pxt.Table.__module__}.{pxt.Table.__name__}.{pxt.Table.add_computed_column.__name__}'
18
+ )
19
+ __TYPE_MAP: ClassVar[dict] = {
20
+ pxt.Json: 'typing.Any',
21
+ pxt.Array: 'numpy.ndarray',
22
+ pxt.Image: 'PIL.Image.Image',
23
+ pxt.Video: 'builtins.str',
24
+ pxt.Audio: 'builtins.str',
25
+ pxt.Document: 'builtins.str',
26
+ }
27
+ __FULLNAME_MAP: ClassVar[dict] = {f'{k.__module__}.{k.__name__}': v for k, v in __TYPE_MAP.items()}
28
+
29
+ def get_function_hook(self, fullname: str) -> Optional[Callable[[FunctionContext], Type]]:
30
+ return adjust_uda_type
31
+
32
+ def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], Type]]:
33
+ if fullname in self.__FULLNAME_MAP:
34
+ subst_name = self.__FULLNAME_MAP[fullname]
35
+ return lambda ctx: adjust_pxt_type(ctx, subst_name)
36
+ return None
37
+
38
+ def get_method_signature_hook(self, fullname: str) -> Optional[Callable[[MethodSigContext], FunctionLike]]:
39
+ if fullname in (self.__ADD_COLUMN_FULLNAME, self.__ADD_COMPUTED_COLUMN_FULLNAME):
40
+ return adjust_kwargs
41
+ return None
42
+
43
+ def get_class_decorator_hook_2(self, fullname: str) -> Optional[Callable[[ClassDefContext], bool]]:
44
+ if fullname == self.__UDA_FULLNAME:
45
+ return adjust_uda_methods
46
+ return None
47
+
48
+
49
+ def plugin(version: str) -> type:
50
+ return PxtPlugin
51
+
52
+
53
+ _AGGREGATOR_FULLNAME = f'{pxt.Aggregator.__module__}.{pxt.Aggregator.__name__}'
54
+ _FN_CALL_FULLNAME = f'{exprs.Expr.__module__}.{exprs.Expr.__name__}'
55
+
56
+
57
+ def adjust_uda_type(ctx: FunctionContext) -> Type:
58
+ """
59
+ Mypy doesn't understand that a class with a @uda decorator isn't actually a class, so it assumes
60
+ that sum(expr), for example, actually returns an instance of sum. We correct this by changing the
61
+ return type of any subclass of `Aggregator` to `Any`.
62
+ """
63
+ ret_type = ctx.default_return_type
64
+ if isinstance(ret_type, Instance) and (
65
+ ret_type.type.fullname == _AGGREGATOR_FULLNAME
66
+ or any(base.type.fullname == _AGGREGATOR_FULLNAME for base in ret_type.type.bases)
67
+ ):
68
+ ret_type = AnyType(TypeOfAny.special_form)
69
+ return ret_type
70
+
71
+
72
+ def adjust_pxt_type(ctx: AnalyzeTypeContext, subst_name: str) -> Type:
73
+ """
74
+ Replaces the special Pixeltable classes (such as pxt.Array) with their standard equivalents (such as np.ndarray).
75
+ """
76
+ if subst_name == 'typing.Any':
77
+ return AnyType(TypeOfAny.special_form)
78
+ return ctx.api.named_type(subst_name, [])
79
+
80
+
81
+ def adjust_kwargs(ctx: MethodSigContext) -> FunctionLike:
82
+ """
83
+ Mypy has a "feature" where it will spit out multiple warnings if a method with signature
84
+ ```
85
+ def my_func(*, arg1: int, arg2: str, **kwargs: Expr)
86
+ ```
87
+ (for example) is called with bare kwargs:
88
+ ```
89
+ my_func(my_kwarg=value)
90
+ ```
91
+ This is a disaster for type-checking of add_column and add_computed_column. Here we adjust the signature so
92
+ that mypy thinks it is simply
93
+ ```
94
+ def my_func(**kwargs: Any)
95
+ ```
96
+ thereby avoiding any type-checking errors. For details, see: <https://github.com/python/mypy/issues/18481>
97
+ """
98
+ sig = ctx.default_signature
99
+ new_arg_names = sig.arg_names[-1:]
100
+ new_arg_types = [AnyType(TypeOfAny.special_form)]
101
+ new_arg_kinds = sig.arg_kinds[-1:]
102
+ return sig.copy_modified(arg_names=new_arg_names, arg_types=new_arg_types, arg_kinds=new_arg_kinds)
103
+
104
+
105
+ def adjust_uda_methods(ctx: ClassDefContext) -> bool:
106
+ """
107
+ Mypy does not handle the `@pxt.uda` decorator well; it continues to treat the decorated class as a class,
108
+ even though it has been replaced by an `AggregateFunction`. Here we add static methods to the class that
109
+ imitate various (instance) methods of `AggregateFunction` so that they can be properly type-checked.
110
+ """
111
+ list_type = ctx.api.named_type('builtins.list', [AnyType(TypeOfAny.special_form)])
112
+ fn_arg = nodes.Argument(nodes.Var('fn'), AnyType(TypeOfAny.special_form), None, nodes.ARG_POS)
113
+ args_arg = nodes.Argument(nodes.Var('args'), AnyType(TypeOfAny.special_form), None, nodes.ARG_STAR)
114
+ kwargs_arg = nodes.Argument(nodes.Var('kwargs'), AnyType(TypeOfAny.special_form), None, nodes.ARG_STAR2)
115
+ add_method_to_class(ctx.api, ctx.cls, '__init__', args=[args_arg, kwargs_arg], return_type=NoneType())
116
+ add_method_to_class(
117
+ ctx.api, ctx.cls, 'to_sql', args=[fn_arg], return_type=AnyType(TypeOfAny.special_form), is_staticmethod=True
118
+ )
119
+ add_method_to_class(
120
+ ctx.api, ctx.cls, 'overload', args=[fn_arg], return_type=AnyType(TypeOfAny.special_form), is_staticmethod=True
121
+ )
122
+ add_attribute_to_class(ctx.api, ctx.cls, 'signatures', typ=list_type, is_classvar=True)
123
+ return True
pixeltable/plan.py CHANGED
@@ -394,9 +394,6 @@ class Planner:
394
394
  row_builder, computed_exprs, plan.output_exprs, input=plan, maintain_input_order=False
395
395
  )
396
396
 
397
- stored_col_info = row_builder.output_slot_idxs()
398
- stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
399
- plan.set_stored_img_cols(stored_img_col_info)
400
397
  plan.set_ctx(
401
398
  exec.ExecContext(
402
399
  row_builder,
@@ -428,10 +425,6 @@ class Planner:
428
425
  col = tbl.cols_by_name[col_name]
429
426
  plan.row_builder.add_table_column(col, expr.slot_idx)
430
427
 
431
- stored_col_info = plan.row_builder.output_slot_idxs()
432
- stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
433
- plan.set_stored_img_cols(stored_img_col_info)
434
-
435
428
  plan.set_ctx(
436
429
  exec.ExecContext(
437
430
  plan.row_builder, batch_size=0, show_pbar=True, num_computed_exprs=0, ignore_errors=ignore_errors
@@ -657,10 +650,6 @@ class Planner:
657
650
  for i, col in enumerate(copied_cols + list(recomputed_cols)): # same order as select_list
658
651
  plan.row_builder.add_table_column(col, select_list[i].slot_idx)
659
652
  # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
660
- stored_img_col_info = [
661
- info for info in plan.row_builder.output_slot_idxs() if info.col.col_type.is_image_type()
662
- ]
663
- plan.set_stored_img_cols(stored_img_col_info)
664
653
  return plan
665
654
 
666
655
  @classmethod
@@ -727,8 +716,6 @@ class Planner:
727
716
  row_builder, output_exprs=view_output_exprs, input_exprs=base_output_exprs, input=plan
728
717
  )
729
718
 
730
- stored_img_col_info = [info for info in row_builder.output_slot_idxs() if info.col.col_type.is_image_type()]
731
- plan.set_stored_img_cols(stored_img_col_info)
732
719
  exec_ctx.ignore_errors = True
733
720
  plan.set_ctx(exec_ctx)
734
721
  return plan, len(row_builder.default_eval_ctx.target_exprs)
@@ -1053,7 +1040,4 @@ class Planner:
1053
1040
  computed_exprs = row_builder.output_exprs - row_builder.input_exprs
1054
1041
  plan.ctx.num_computed_exprs = len(computed_exprs) # we are adding a computed column, so we need to evaluate it
1055
1042
 
1056
- # we want to flush images
1057
- if col.is_computed and col.is_stored and col.col_type.is_image_type():
1058
- plan.set_stored_img_cols(row_builder.output_slot_idxs())
1059
1043
  return plan
@@ -24,7 +24,7 @@ from pixeltable.env import Env
24
24
  from pixeltable.metadata import schema
25
25
  from pixeltable.utils import sha256sum
26
26
  from pixeltable.utils.formatter import Formatter
27
- from pixeltable.utils.media_store import MediaStore
27
+ from pixeltable.utils.media_store import MediaStore, TempStore
28
28
 
29
29
  _logger = logging.getLogger('pixeltable')
30
30
 
@@ -57,7 +57,7 @@ class TablePackager:
57
57
 
58
58
  def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
59
59
  self.table = table
60
- self.tmp_dir = Path(Env.get().create_tmp_path())
60
+ self.tmp_dir = TempStore.create_path()
61
61
  self.media_files = {}
62
62
 
63
63
  # Load metadata
@@ -92,10 +92,10 @@ class TablePackager:
92
92
  self.bundle_path = self.__build_tarball()
93
93
 
94
94
  _logger.info('Extracting preview data.')
95
- self.md['count'] = self.table.count()
95
+ self.md['row_count'] = self.table.count()
96
96
  preview_header, preview = self.__extract_preview_data()
97
97
  self.md['preview_header'] = preview_header
98
- self.md['preview'] = preview
98
+ self.md['preview_data'] = preview
99
99
 
100
100
  _logger.info(f'Packaging complete: {self.bundle_path}')
101
101
  return self.bundle_path
@@ -335,7 +335,7 @@ class TableRestorer:
335
335
  def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
336
336
  self.tbl_path = tbl_path
337
337
  self.md = md
338
- self.tmp_dir = Path(Env.get().create_tmp_path())
338
+ self.tmp_dir = TempStore.create_path()
339
339
  self.media_files = {}
340
340
 
341
341
  def restore(self, bundle_path: Path) -> pxt.Table:
@@ -619,7 +619,7 @@ class TableRestorer:
619
619
  # in self.media_files.
620
620
  src_path = self.tmp_dir / 'media' / parsed_url.netloc
621
621
  # Move the file to the media store and update the URL.
622
- self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
622
+ self.media_files[url] = MediaStore.get().relocate_local_media_file(src_path, media_col)
623
623
  return self.media_files[url]
624
624
  # For any type of URL other than a local file, just return the URL as-is.
625
625
  return url