pixeltable 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/column.py +41 -29
- pixeltable/catalog/globals.py +18 -0
- pixeltable/catalog/insertable_table.py +30 -10
- pixeltable/catalog/table.py +198 -86
- pixeltable/catalog/table_version.py +47 -53
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +17 -18
- pixeltable/dataframe.py +27 -36
- pixeltable/env.py +7 -0
- pixeltable/exec/__init__.py +0 -1
- pixeltable/exec/aggregation_node.py +6 -3
- pixeltable/exec/cache_prefetch_node.py +189 -43
- pixeltable/exec/data_row_batch.py +5 -22
- pixeltable/exec/exec_context.py +2 -2
- pixeltable/exec/exec_node.py +3 -2
- pixeltable/exec/expr_eval_node.py +23 -16
- pixeltable/exec/in_memory_data_node.py +6 -3
- pixeltable/exec/sql_node.py +24 -25
- pixeltable/exprs/arithmetic_expr.py +12 -5
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +97 -14
- pixeltable/exprs/comparison.py +10 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +27 -18
- pixeltable/exprs/expr.py +53 -52
- pixeltable/exprs/expr_set.py +5 -0
- pixeltable/exprs/function_call.py +32 -16
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +6 -11
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +12 -11
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +7 -5
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/func/aggregate_function.py +9 -9
- pixeltable/func/expr_template_function.py +6 -5
- pixeltable/func/function.py +11 -10
- pixeltable/func/udf.py +6 -11
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/globals.py +5 -7
- pixeltable/functions/huggingface.py +155 -45
- pixeltable/functions/llama_cpp.py +107 -0
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +9 -0
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +5 -2
- pixeltable/globals.py +67 -26
- pixeltable/index/btree.py +16 -3
- pixeltable/index/embedding_index.py +4 -4
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +96 -2
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +1 -1
- pixeltable/iterators/video.py +120 -63
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +45 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/plan.py +17 -15
- pixeltable/py.typed +0 -0
- pixeltable/store.py +7 -2
- pixeltable/tool/create_test_db_dump.py +1 -1
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +28 -5
- pixeltable/type_system.py +100 -36
- pixeltable/utils/coco.py +5 -5
- pixeltable/utils/documents.py +15 -1
- pixeltable/utils/formatter.py +12 -13
- pixeltable/utils/s3.py +6 -3
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/METADATA +158 -49
- pixeltable-0.2.23.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable-0.2.21.dist-info/RECORD +0 -148
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
3
|
+
|
|
4
|
+
import pixeltable as pxt
|
|
5
|
+
import pixeltable.exceptions as excs
|
|
6
|
+
from pixeltable.env import Env
|
|
7
|
+
from pixeltable.utils.code import local_public_names
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import llama_cpp
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pxt.udf
def create_chat_completion(
    messages: list[dict],
    *,
    model_path: Optional[str] = None,
    repo_id: Optional[str] = None,
    repo_filename: Optional[str] = None,
    args: Optional[dict[str, Any]] = None,
) -> dict:
    """
    Generate a chat completion from a list of messages.

    The model can be specified either as a local path, or as a repo_id and repo_filename that reference a pretrained
    model on the Hugging Face model hub. Exactly one of `model_path` or `repo_id` must be provided; if `repo_id`
    is provided, then an optional `repo_filename` can also be specified.

    For additional details, see the
    [llama_cpp create_chat_completion documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).

    Args:
        messages: A list of messages to generate a response for.
        model_path: Path to the model (if using a local model).
        repo_id: The Hugging Face model repo id (if using a pretrained model).
        repo_filename: A filename or glob pattern to match the model file in the repo (optional, if using a
            pretrained model).
        args: Additional arguments to pass to the `create_chat_completion` call, such as `max_tokens`, `temperature`,
            `top_p`, and `top_k`. For details, see the
            [llama_cpp create_chat_completion documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).

    Returns:
        The result of the model's `create_chat_completion` call.

    Raises:
        Error: if both or neither of `model_path` and `repo_id` are provided, or if `repo_filename` is provided
            without `repo_id`.
    """
    Env.get().require_package('llama_cpp', min_version=[0, 3, 1])

    if args is None:
        args = {}

    # exactly one of the two model sources must be given
    if (model_path is None) == (repo_id is None):
        raise excs.Error('Exactly one of `model_path` or `repo_id` must be provided.')
    # a repo filename is meaningless without a repo to look it up in
    if (repo_id is None) and (repo_filename is not None):
        raise excs.Error('`repo_filename` can only be provided along with `repo_id`.')

    n_gpu_layers = -1 if _is_gpu_available() else 0  # 0 = CPU only, -1 = offload all layers to GPU

    if model_path is not None:
        llm = _lookup_local_model(model_path, n_gpu_layers)
    else:
        # pretrained models are resolved via the Hugging Face hub
        Env.get().require_package('huggingface_hub')
        llm = _lookup_pretrained_model(repo_id, repo_filename, n_gpu_layers)
    return llm.create_chat_completion(messages, **args)  # type: ignore
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _is_gpu_available() -> bool:
    """
    Report whether the installed llama_cpp shared library supports GPU offload.

    The shared library is loaded and queried on the first call only; the result is stored in the module-level
    `_IS_GPU_AVAILABLE` and returned directly on subsequent calls.
    """
    import llama_cpp

    global _IS_GPU_AVAILABLE
    if _IS_GPU_AVAILABLE is not None:
        # already probed
        return _IS_GPU_AVAILABLE

    # the shared library is loaded from the `lib` directory next to the llama_cpp package
    package_dir = Path(llama_cpp.__file__).parent
    shared_lib = llama_cpp.llama_cpp.load_shared_library('llama', package_dir / 'lib')
    _IS_GPU_AVAILABLE = bool(shared_lib.llama_supports_gpu_offload())
    return _IS_GPU_AVAILABLE
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _lookup_local_model(model_path: str, n_gpu_layers: int) -> 'llama_cpp.Llama':
    """
    Return the `llama_cpp.Llama` instance for a local model file, creating and caching it on first use.

    Instances are cached in `_model_cache` under the key `(model_path, None, n_gpu_layers)`.
    """
    import llama_cpp

    cache_key = (model_path, None, n_gpu_layers)
    try:
        return _model_cache[cache_key]
    except KeyError:
        pass

    # not cached yet: load the model and remember it
    model = llama_cpp.Llama(model_path, n_gpu_layers=n_gpu_layers, verbose=False)
    _model_cache[cache_key] = model
    return model
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _lookup_pretrained_model(repo_id: str, filename: Optional[str], n_gpu_layers: int) -> 'llama_cpp.Llama':
    """
    Return the `llama_cpp.Llama` instance for a pretrained model, creating and caching it on first use.

    The model is obtained via `llama_cpp.Llama.from_pretrained()`. Instances are cached in `_model_cache` under
    the key `(repo_id, filename, n_gpu_layers)`.
    """
    import llama_cpp

    cache_key = (repo_id, filename, n_gpu_layers)
    try:
        return _model_cache[cache_key]
    except KeyError:
        pass

    # not cached yet: load the model and remember it
    model = llama_cpp.Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=n_gpu_layers,
        verbose=False,
    )
    _model_cache[cache_key] = model
    return model
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
_model_cache: dict[tuple[str, str, int], Any] = {}
|
|
100
|
+
_IS_GPU_AVAILABLE: Optional[bool] = None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
__all__ = local_public_names(__name__)


def __dir__():
    """Module-level `__dir__` hook: `dir()` on this module reports the contents of `__all__`."""
    return __all__
|
|
@@ -141,7 +141,7 @@ _embedding_dimensions_cache: dict[str, int] = {
|
|
|
141
141
|
|
|
142
142
|
|
|
143
143
|
@pxt.udf(batch_size=16)
|
|
144
|
-
def embeddings(input: Batch[str], *, model: str) -> Batch[pxt.Array[(None,),
|
|
144
|
+
def embeddings(input: Batch[str], *, model: str) -> Batch[pxt.Array[(None,), pxt.Float]]:
|
|
145
145
|
"""
|
|
146
146
|
Embeddings API.
|
|
147
147
|
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Optional
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
import pixeltable as pxt
|
|
6
|
+
from pixeltable import env
|
|
7
|
+
from pixeltable.func import Batch
|
|
8
|
+
from pixeltable.utils.code import local_public_names
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import ollama
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@env.register_client('ollama')
def _(host: str) -> 'ollama.Client':
    """Construct an `ollama.Client` for the given host; registered under the client name `'ollama'`."""
    import ollama

    client = ollama.Client(host=host)
    return client
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _ollama_client() -> Optional['ollama.Client']:
    """
    Return the `'ollama'` client obtained from the Pixeltable environment, or `None` if obtaining it fails.

    Any exception raised while looking up the client is deliberately swallowed, so that callers can fall back
    to another way of reaching Ollama.
    """
    try:
        client = env.Env.get().get_client('ollama')
    except Exception:
        client = None
    return client
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@pxt.udf
def generate(
    prompt: str,
    *,
    model: str,
    suffix: str = '',
    system: str = '',
    template: str = '',
    context: Optional[list[int]] = None,
    raw: bool = False,
    format: str = '',
    options: Optional[dict] = None,
) -> dict:
    """
    Generate a response for a given prompt with a provided model.

    Args:
        prompt: The prompt to generate a response for.
        model: The model name.
        suffix: The text after the model response.
        system: System message.
        template: Prompt template to use.
        context: The context parameter returned from a previous call to `generate()`.
        raw: If `True`, no formatting will be applied to the prompt.
        format: The format of the response; must be one of `'json'` or `''` (the empty string).
        options: Additional options to pass to the `generate` call, such as `max_tokens`, `temperature`, `top_p`,
            and `top_k`. For details, see the
            [Valid Parameters and Values](https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values)
            section of the Ollama documentation.

    Returns:
        The result of the Ollama `generate` call.
    """
    env.Env.get().require_package('ollama')
    import ollama

    # use the client obtained from the Pixeltable environment if there is one;
    # otherwise call the module-level functions of the ollama package
    client = _ollama_client() or ollama
    return client.generate(
        model=model,
        prompt=prompt,
        suffix=suffix,
        system=system,
        template=template,
        context=context,
        raw=raw,
        format=format,
        options=options,
    )  # type: ignore[call-overload]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@pxt.udf
def chat(
    messages: list[dict],
    *,
    model: str,
    tools: Optional[list[dict]] = None,
    format: str = '',
    options: Optional[dict] = None,
) -> dict:
    """
    Produce the next message of a chat conversation using the given model.

    Args:
        messages: The messages of the chat.
        model: The model name.
        tools: Tools for the model to use.
        format: The format of the response; must be one of `'json'` or `''` (the empty string).
        options: Additional options to pass to the `chat` call, such as `max_tokens`, `temperature`, `top_p`,
            and `top_k`. For details, see the
            [Valid Parameters and Values](https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values)
            section of the Ollama documentation.
    """
    env.Env.get().require_package('ollama')
    import ollama

    # prefer the client obtained from the Pixeltable environment; otherwise go through the ollama module itself
    backend = _ollama_client() or ollama
    response = backend.chat(
        model=model,
        messages=messages,
        tools=tools,
        format=format,
        options=options,
    )  # type: ignore[call-overload]
    return response
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@pxt.udf(batch_size=16)
def embed(
    input: Batch[str],
    *,
    model: str,
    truncate: bool = True,
    options: Optional[dict] = None,
) -> Batch[pxt.Array[(None,), pxt.Float]]:
    """
    Compute embeddings for a batch of input strings with the given model.

    Args:
        input: The input text to generate embeddings for.
        model: The model name.
        truncate: Truncates the end of each input to fit within context length.
            Returns error if false and context length is exceeded.
        options: Additional options to pass to the `embed` call.
            For details, see the
            [Valid Parameters and Values](https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values)
            section of the Ollama documentation.
    """
    env.Env.get().require_package('ollama')
    import ollama

    # prefer the client obtained from the Pixeltable environment; otherwise go through the ollama module itself
    backend = _ollama_client() or ollama
    response = backend.embed(
        model=model,
        input=input,
        truncate=truncate,
        options=options,  # type: ignore[arg-type]
    )
    # one float64 array per entry of the response's 'embeddings' list
    vectors = response['embeddings']
    return [np.array(vec, dtype=np.float64) for vec in vectors]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
__all__ = local_public_names(__name__)


def __dir__():
    """Module-level `__dir__` hook: `dir()` on this module reports the contents of `__all__`."""
    return __all__
|
pixeltable/functions/openai.py
CHANGED
|
@@ -304,7 +304,7 @@ _embedding_dimensions_cache: dict[str, int] = {
|
|
|
304
304
|
@pxt.udf(batch_size=32)
|
|
305
305
|
def embeddings(
|
|
306
306
|
input: Batch[str], *, model: str, dimensions: Optional[int] = None, user: Optional[str] = None
|
|
307
|
-
) -> Batch[pxt.Array[(None,),
|
|
307
|
+
) -> Batch[pxt.Array[(None,), pxt.Float]]:
|
|
308
308
|
"""
|
|
309
309
|
Creates an embedding vector representing the input text.
|
|
310
310
|
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
|
|
3
|
+
that wrap various endpoints from the Replicate API. In order to use them, you must
|
|
4
|
+
first `pip install replicate` and configure your Replicate credentials, as described in
|
|
5
|
+
the [Working with Replicate](https://pixeltable.readme.io/docs/working-with-replicate) tutorial.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
import pixeltable as pxt
|
|
11
|
+
from pixeltable.env import Env, register_client
|
|
12
|
+
from pixeltable.utils.code import local_public_names
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import replicate # type: ignore[import-untyped]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@register_client('replicate')
def _(api_token: str) -> 'replicate.Client':
    """Construct a `replicate.Client` from an API token; registered under the client name `'replicate'`."""
    import replicate

    client = replicate.Client(api_token=api_token)
    return client
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _replicate_client() -> 'replicate.Client':
    """Fetch the `'replicate'` client from the Pixeltable environment."""
    environment = Env.get()
    return environment.get_client('replicate')
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@pxt.udf
def run(
    input: dict[str, Any],
    *,
    ref: str,
) -> dict[str, Any]:
    """
    Run a model on Replicate.

    For additional details, see: <https://replicate.com/docs/topics/models/run-a-model>

    __Requirements:__

    - `pip install replicate`

    Args:
        input: The input parameters for the model.
        ref: The name of the model to run.

    Returns:
        The output of the model.

    Examples:
        Add a computed column that applies the model `meta/meta-llama-3-8b-instruct`
        to an existing Pixeltable column `tbl.prompt` of the table `tbl`:

        >>> input = {'system_prompt': 'You are a helpful assistant.', 'prompt': tbl.prompt}
        ... tbl['response'] = run(input, ref='meta/meta-llama-3-8b-instruct')

        Add a computed column that uses the model `black-forest-labs/flux-schnell`
        to generate images from an existing Pixeltable column `tbl.prompt`:

        >>> input = {'prompt': tbl.prompt, 'go_fast': True, 'megapixels': '1'}
        ... tbl['response'] = run(input, ref='black-forest-labs/flux-schnell')
        ... tbl['image'] = tbl.response.output[0].astype(pxt.Image)
    """
    Env.get().require_package('replicate')
    client = _replicate_client()
    # the client is called with use_file_output=False
    output = client.run(ref, input, use_file_output=False)
    return output
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
__all__ = local_public_names(__name__)


def __dir__():
    """Module-level `__dir__` hook: `dir()` on this module reports the contents of `__all__`."""
    return __all__
|
pixeltable/functions/string.py
CHANGED
|
@@ -283,6 +283,15 @@ def isspace(self: str) -> bool:
|
|
|
283
283
|
"""
|
|
284
284
|
return self.isspace()
|
|
285
285
|
|
|
286
|
+
@pxt.udf
def join(sep: str, elements: list) -> str:
    """
    Concatenate the strings in `elements` into a single string, using `sep` as the separator.

    Equivalent to [`str.join()`](https://docs.python.org/3/library/stdtypes.html#str.join)

    Args:
        sep: The separator string placed between consecutive elements.
        elements: The strings to concatenate.
    """
    joined = sep.join(elements)
    return joined
|
|
294
|
+
|
|
286
295
|
@pxt.udf(is_method=True)
|
|
287
296
|
def len(self: str) -> int:
|
|
288
297
|
"""
|
pixeltable/functions/together.py
CHANGED
|
@@ -186,7 +186,7 @@ _embedding_dimensions_cache = {
|
|
|
186
186
|
|
|
187
187
|
|
|
188
188
|
@pxt.udf(batch_size=32)
|
|
189
|
-
def embeddings(input: Batch[str], *, model: str) -> Batch[pxt.Array[(None,),
|
|
189
|
+
def embeddings(input: Batch[str], *, model: str) -> Batch[pxt.Array[(None,), pxt.Float]]:
|
|
190
190
|
"""
|
|
191
191
|
Query an embedding model for a given string of text.
|
|
192
192
|
|
pixeltable/functions/util.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
import PIL.Image
|
|
2
2
|
|
|
3
|
+
from pixeltable.env import Env
|
|
3
4
|
|
|
4
|
-
|
|
5
|
+
|
|
6
|
+
def resolve_torch_device(device: str, allow_mps: bool = True) -> str:
|
|
7
|
+
Env.get().require_package('torch')
|
|
5
8
|
import torch
|
|
6
9
|
|
|
7
10
|
if device == 'auto':
|
|
8
11
|
if torch.cuda.is_available():
|
|
9
12
|
return 'cuda'
|
|
10
|
-
if torch.backends.mps.is_available():
|
|
13
|
+
if allow_mps and torch.backends.mps.is_available():
|
|
11
14
|
return 'mps'
|
|
12
15
|
return 'cpu'
|
|
13
16
|
return device
|
pixeltable/globals.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Any, Iterable, Optional, Union
|
|
3
|
+
from typing import Any, Iterable, Optional, Union, Literal
|
|
4
4
|
from uuid import UUID
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
@@ -33,6 +33,7 @@ def create_table(
|
|
|
33
33
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
34
34
|
num_retained_versions: int = 10,
|
|
35
35
|
comment: str = '',
|
|
36
|
+
media_validation: Literal['on_read', 'on_write'] = 'on_write'
|
|
36
37
|
) -> catalog.Table:
|
|
37
38
|
"""Create a new base table.
|
|
38
39
|
|
|
@@ -44,6 +45,9 @@ def create_table(
|
|
|
44
45
|
table.
|
|
45
46
|
num_retained_versions: Number of versions of the table to retain.
|
|
46
47
|
comment: An optional comment; its meaning is user-defined.
|
|
48
|
+
media_validation: Media validation policy for the table.
|
|
49
|
+
- `'on_read'`: validate media files at query time
|
|
50
|
+
- `'on_write'`: validate media files during insert/update operations
|
|
47
51
|
|
|
48
52
|
Returns:
|
|
49
53
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
@@ -89,14 +93,8 @@ def create_table(
|
|
|
89
93
|
raise excs.Error('primary_key must be a single column name or a list of column names')
|
|
90
94
|
|
|
91
95
|
tbl = catalog.InsertableTable._create(
|
|
92
|
-
dir._id,
|
|
93
|
-
|
|
94
|
-
schema,
|
|
95
|
-
df,
|
|
96
|
-
primary_key=primary_key,
|
|
97
|
-
num_retained_versions=num_retained_versions,
|
|
98
|
-
comment=comment,
|
|
99
|
-
)
|
|
96
|
+
dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
|
|
97
|
+
comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
100
98
|
Catalog.get().paths[path] = tbl
|
|
101
99
|
|
|
102
100
|
_logger.info(f'Created table `{path_str}`.')
|
|
@@ -112,6 +110,7 @@ def create_view(
|
|
|
112
110
|
iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
|
|
113
111
|
num_retained_versions: int = 10,
|
|
114
112
|
comment: str = '',
|
|
113
|
+
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
115
114
|
ignore_errors: bool = False,
|
|
116
115
|
) -> Optional[catalog.Table]:
|
|
117
116
|
"""Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
|
|
@@ -124,7 +123,8 @@ def create_view(
|
|
|
124
123
|
additional_columns: If specified, will add these columns to the view once it is created. The format
|
|
125
124
|
of the `additional_columns` parameter is identical to the format of the `schema_or_df` parameter in
|
|
126
125
|
[`create_table`][pixeltable.create_table].
|
|
127
|
-
is_snapshot: Whether the view is a snapshot.
|
|
126
|
+
is_snapshot: Whether the view is a snapshot. Setting this to `True` is equivalent to calling
|
|
127
|
+
[`create_snapshot`][pixeltable.create_snapshot].
|
|
128
128
|
iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
|
|
129
129
|
the base table.
|
|
130
130
|
num_retained_versions: Number of versions of the view to retain.
|
|
@@ -143,11 +143,6 @@ def create_view(
|
|
|
143
143
|
|
|
144
144
|
>>> tbl = pxt.get_table('my_table')
|
|
145
145
|
... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10))
|
|
146
|
-
|
|
147
|
-
Create a snapshot of `my_table`:
|
|
148
|
-
|
|
149
|
-
>>> tbl = pxt.get_table('my_table')
|
|
150
|
-
... snapshot_view = pxt.create_view('my_snapshot_view', tbl, is_snapshot=True)
|
|
151
146
|
"""
|
|
152
147
|
where: Optional[exprs.Expr] = None
|
|
153
148
|
if isinstance(base, catalog.Table):
|
|
@@ -177,23 +172,69 @@ def create_view(
|
|
|
177
172
|
iterator_class, iterator_args = iterator
|
|
178
173
|
|
|
179
174
|
view = catalog.View._create(
|
|
180
|
-
dir._id,
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
predicate=where,
|
|
185
|
-
is_snapshot=is_snapshot,
|
|
186
|
-
iterator_cls=iterator_class,
|
|
187
|
-
iterator_args=iterator_args,
|
|
188
|
-
num_retained_versions=num_retained_versions,
|
|
189
|
-
comment=comment,
|
|
190
|
-
)
|
|
175
|
+
dir._id, path.name, base=tbl_version_path, additional_columns=additional_columns, predicate=where,
|
|
176
|
+
is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
|
|
177
|
+
num_retained_versions=num_retained_versions, comment=comment,
|
|
178
|
+
media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
191
179
|
Catalog.get().paths[path] = view
|
|
192
180
|
_logger.info(f'Created view `{path_str}`.')
|
|
193
181
|
FileCache.get().emit_eviction_warnings()
|
|
194
182
|
return view
|
|
195
183
|
|
|
196
184
|
|
|
185
|
+
def create_snapshot(
    path_str: str,
    base: Union[catalog.Table, DataFrame],
    *,
    additional_columns: Optional[dict[str, Any]] = None,
    iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
    num_retained_versions: int = 10,
    comment: str = '',
    media_validation: Literal['on_read', 'on_write'] = 'on_write',
    ignore_errors: bool = False,
) -> Optional[catalog.Table]:
    """Create a snapshot of an existing table object (which itself can be a view or a snapshot or a base table).

    This is a thin wrapper that calls [`create_view`][pixeltable.create_view] with `is_snapshot=True` and
    forwards all other arguments unchanged.

    Args:
        path_str: A name for the snapshot; can be either a simple name such as `my_snapshot`, or a pathname such as
            `dir1.my_snapshot`.
        base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
            base the snapshot on.
        additional_columns: If specified, will add these columns to the snapshot once it is created. The format
            of the `additional_columns` parameter is identical to the format of the `schema_or_df` parameter in
            [`create_table`][pixeltable.create_table].
        iterator: The iterator to use for this snapshot. If specified, then this snapshot will be a one-to-many view of
            the base table.
        num_retained_versions: Number of versions of the snapshot to retain.
        comment: Optional comment for the snapshot.
        media_validation: Media validation policy for the snapshot.

            - `'on_read'`: validate media files at query time
            - `'on_write'`: validate media files during insert/update operations
        ignore_errors: if True, fail silently if the path already exists or is invalid.

    Returns:
        A handle to the [`Table`][pixeltable.Table] representing the newly created snapshot. If the path already
            exists or is invalid and `ignore_errors=True`, returns `None`.

    Raises:
        Error: if the path already exists or is invalid and `ignore_errors=False`.

    Examples:
        Create a snapshot of `my_table`:

        >>> tbl = pxt.get_table('my_table')
        ... snapshot = pxt.create_snapshot('my_snapshot', tbl)
    """
    return create_view(
        path_str,
        base,
        additional_columns=additional_columns,
        iterator=iterator,
        is_snapshot=True,
        num_retained_versions=num_retained_versions,
        comment=comment,
        media_validation=media_validation,
        ignore_errors=ignore_errors,
    )
|
|
236
|
+
|
|
237
|
+
|
|
197
238
|
def get_table(path: str) -> catalog.Table:
|
|
198
239
|
"""Get a handle to an existing table, view, or snapshot.
|
|
199
240
|
|
pixeltable/index/btree.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
|
-
from typing import Optional
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
5
|
+
# TODO: why does this import result in a circular import, but the one in embedding_index.py doesn't?
|
|
6
|
+
# import pixeltable.catalog as catalog
|
|
5
7
|
import pixeltable.exceptions as excs
|
|
6
8
|
from pixeltable import catalog, exprs
|
|
7
9
|
from pixeltable.func.udf import udf
|
|
8
|
-
|
|
9
10
|
from .base import IndexBase
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
import pixeltable.exprs
|
|
11
14
|
|
|
12
15
|
class BtreeIndex(IndexBase):
|
|
13
16
|
"""
|
|
@@ -15,6 +18,8 @@ class BtreeIndex(IndexBase):
|
|
|
15
18
|
"""
|
|
16
19
|
MAX_STRING_LEN = 256
|
|
17
20
|
|
|
21
|
+
value_expr: 'pixeltable.exprs.Expr'
|
|
22
|
+
|
|
18
23
|
@staticmethod
|
|
19
24
|
@udf
|
|
20
25
|
def str_filter(s: Optional[str]) -> Optional[str]:
|
|
@@ -25,7 +30,14 @@ class BtreeIndex(IndexBase):
|
|
|
25
30
|
def __init__(self, c: 'catalog.Column'):
|
|
26
31
|
if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
|
|
27
32
|
raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
|
|
28
|
-
|
|
33
|
+
if c.col_type.is_media_type():
|
|
34
|
+
# an index on a media column is an index on the file url
|
|
35
|
+
# no validation for media columns: we're only interested in the string value
|
|
36
|
+
self.value_expr = exprs.ColumnRef(c, perform_validation=False)
|
|
37
|
+
else:
|
|
38
|
+
self.value_expr = (
|
|
39
|
+
BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
|
|
40
|
+
)
|
|
29
41
|
|
|
30
42
|
def index_value_expr(self) -> 'exprs.Expr':
|
|
31
43
|
return self.value_expr
|
|
@@ -52,3 +64,4 @@ class BtreeIndex(IndexBase):
|
|
|
52
64
|
@classmethod
|
|
53
65
|
def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
|
|
54
66
|
return cls(c)
|
|
67
|
+
|
|
@@ -86,8 +86,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
86
86
|
)
|
|
87
87
|
idx.create(bind=conn)
|
|
88
88
|
|
|
89
|
-
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.
|
|
90
|
-
"""Create a
|
|
89
|
+
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
|
|
90
|
+
"""Create a ColumnElement that represents '<val_column> <op> <item>'"""
|
|
91
91
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
92
92
|
if isinstance(item, str):
|
|
93
93
|
assert self.string_embed is not None
|
|
@@ -104,8 +104,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
104
104
|
assert self.metric == self.Metric.L2
|
|
105
105
|
return val_column.sa_col.l2_distance(embedding)
|
|
106
106
|
|
|
107
|
-
def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.
|
|
108
|
-
"""Create a
|
|
107
|
+
def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
|
|
108
|
+
"""Create a ColumnElement that is used in an ORDER BY clause"""
|
|
109
109
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
110
110
|
embedding: Optional[np.ndarray] = None
|
|
111
111
|
if isinstance(item, str):
|
pixeltable/io/__init__.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from .external_store import ExternalStore, SyncStatus
|
|
2
|
-
from .globals import create_label_studio_project,
|
|
2
|
+
from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
|
|
3
3
|
from .hf_datasets import import_huggingface_dataset
|
|
4
4
|
from .pandas import import_csv, import_excel, import_pandas
|
|
5
5
|
from .parquet import import_parquet
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
__default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
|
|
9
8
|
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
|
|
10
9
|
__all__ = sorted(list(__default_dir - __removed_symbols))
|