pixeltable 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +3 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +14 -2
- pixeltable/catalog/insertable_table.py +32 -17
- pixeltable/catalog/table.py +194 -12
- pixeltable/catalog/table_version.py +270 -110
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +156 -73
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +11 -12
- pixeltable/exprs/function_call.py +0 -3
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/similarity_expr.py +5 -3
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/aggregate_function.py +2 -2
- pixeltable/func/expr_template_function.py +3 -1
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +25 -1
- pixeltable/functions/openai.py +15 -10
- pixeltable/functions/together.py +11 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +20 -2
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +6 -1
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +4 -1
- pixeltable/io/__init__.py +1 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/iterators/base.py +4 -4
- pixeltable/iterators/document.py +26 -15
- pixeltable/iterators/video.py +9 -1
- pixeltable/metadata/__init__.py +2 -2
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +9 -5
- pixeltable/store.py +14 -21
- pixeltable/tool/create_test_db_dump.py +16 -0
- pixeltable/type_system.py +14 -4
- pixeltable/utils/coco.py +94 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/RECORD +53 -46
- pixeltable/func/nos_function.py +0 -202
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.6.dist-info/METADATA +0 -131
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +0 -0
pixeltable/functions/fireworks.py
CHANGED

@@ -6,8 +6,13 @@ import pixeltable as pxt
 from pixeltable import env
 
 
-
-
+@env.register_client('fireworks')
+def _(api_key: str) -> fireworks.client.Fireworks:
+    return fireworks.client.Fireworks(api_key=api_key)
+
+
+def _fireworks_client() -> fireworks.client.Fireworks:
+    return env.Env.get().get_client('fireworks')
 
 
 @pxt.udf

@@ -26,8 +31,8 @@ def chat_completions(
         'top_p': top_p,
         'temperature': temperature
     }
-    kwargs_not_none =
-    return
+    kwargs_not_none = {k: v for k, v in kwargs.items() if v is not None}
+    return _fireworks_client().chat.completions.create(
         model=model,
         messages=messages,
         **kwargs_not_none
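The pattern in this hunk — a decorated factory registered under a provider name, plus a private accessor that fetches the client from `Env` — recurs verbatim in the openai.py and together.py diffs below. A minimal sketch of how such a registry could work, assuming hypothetical names (`_client_factories`, `_clients`); this illustrates the pattern only, not pixeltable's actual `Env` implementation:

```python
from typing import Any, Callable

# Illustrative registry; pixeltable's actual Env implementation may differ.
_client_factories: dict[str, Callable[..., Any]] = {}
_clients: dict[str, Any] = {}

def register_client(name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorator that records a factory for lazily constructing a named API client."""
    def decorator(factory: Callable[..., Any]) -> Callable[..., Any]:
        _client_factories[name] = factory
        return factory
    return decorator

def get_client(name: str, api_key: str) -> Any:
    """Construct the client on first use, so credentials are only needed when a UDF runs."""
    if name not in _clients:
        _clients[name] = _client_factories[name](api_key)
    return _clients[name]
```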
pixeltable/functions/huggingface.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import Callable, TypeVar, Optional
+from typing import Callable, TypeVar, Optional, Any
 
 import PIL.Image
 import numpy as np

@@ -14,6 +14,7 @@ from pixeltable.functions.util import resolve_torch_device
 def sentence_transformer(
     sentences: Batch[str], *, model_id: str, normalize_embeddings: bool = False
 ) -> Batch[np.ndarray]:
+    """Runs the specified sentence transformer model."""
     env.Env.get().require_package('sentence_transformers')
     from sentence_transformers import SentenceTransformer
 

@@ -46,6 +47,7 @@ def sentence_transformer_list(sentences: list, *, model_id: str, normalize_embed
 
 @pxt.udf(batch_size=32)
 def cross_encoder(sentences1: Batch[str], sentences2: Batch[str], *, model_id: str) -> Batch[float]:
+    """Runs the specified cross-encoder model."""
     env.Env.get().require_package('sentence_transformers')
     from sentence_transformers import CrossEncoder
 

@@ -68,6 +70,7 @@ def cross_encoder_list(sentence1: str, sentences2: list, *, model_id: str) -> li
 
 @pxt.udf(batch_size=32, return_type=ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False))
 def clip_text(text: Batch[str], *, model_id: str) -> Batch[np.ndarray]:
+    """Runs the specified CLIP model on text."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch

@@ -85,6 +88,7 @@ def clip_text(text: Batch[str], *, model_id: str) -> Batch[np.ndarray]:
 
 @pxt.udf(batch_size=32, return_type=ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False))
 def clip_image(image: Batch[PIL.Image.Image], *, model_id: str) -> Batch[np.ndarray]:
+    """Runs the specified CLIP model on images."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch

@@ -113,6 +117,7 @@ def _(model_id: str) -> ts.ArrayType:
 
 @pxt.udf(batch_size=4)
 def detr_for_object_detection(image: Batch[PIL.Image.Image], *, model_id: str, threshold: float = 0.5) -> Batch[dict]:
+    """Runs the specified DETR model."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch

@@ -140,6 +145,25 @@ def detr_for_object_detection(image: Batch[PIL.Image.Image], *, model_id: str, t
     ]
 
 
+@pxt.udf
+def detr_to_coco(image: PIL.Image.Image, detr_info: dict[str, Any]) -> dict[str, Any]:
+    bboxes, labels = detr_info['boxes'], detr_info['labels']
+    annotations = [
+        {
+            'bbox': [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]],
+            'category': label
+        }
+        for bbox, label in zip(bboxes, labels)
+    ]
+    return {
+        'image': {
+            'width': image.width,
+            'height': image.height
+        },
+        'annotations': annotations
+    }
+
+
 T = TypeVar('T')
 
 
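The new `detr_to_coco` UDF converts DETR's corner-format boxes (`[x1, y1, x2, y2]`) into COCO's `[x, y, width, height]` format. A worked example with hypothetical values, grounded in the arithmetic shown in the hunk:

```python
# Hypothetical DETR output for one image (the UDF receives this as detr_info).
detr_info = {'boxes': [[10.0, 20.0, 110.0, 220.0]], 'labels': [17]}

bbox = detr_info['boxes'][0]
# COCO wants top-left corner plus width/height, so subtract the corners.
coco_bbox = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]
print(coco_bbox)  # [10.0, 20.0, 100.0, 200.0]
```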
pixeltable/functions/openai.py
CHANGED
@@ -16,8 +16,13 @@ from pixeltable import env
 from pixeltable.func import Batch
 
 
-
-
+@env.register_client('openai')
+def _(api_key: str) -> openai.OpenAI:
+    return openai.OpenAI(api_key=api_key)
+
+
+def _openai_client() -> openai.OpenAI:
+    return env.Env.get().get_client('openai')
 
 
 # Exponential backoff decorator using tenacity.

@@ -44,7 +49,7 @@ def speech(
     response_format: Optional[str] = None,
     speed: Optional[float] = None
 ) -> str:
-    content =
+    content = _openai_client().audio.speech.create(
         input=input,
         model=model,
         voice=voice,

@@ -71,7 +76,7 @@ def transcriptions(
     temperature: Optional[float] = None
 ) -> dict:
     file = pathlib.Path(audio)
-    transcription =
+    transcription = _openai_client().audio.transcriptions.create(
         file=file,
         model=model,
         language=_opt(language),

@@ -93,7 +98,7 @@ def translations(
     temperature: Optional[float] = None
 ) -> dict:
     file = pathlib.Path(audio)
-    translation =
+    translation = _openai_client().audio.translations.create(
         file=file,
         model=model,
         prompt=_opt(prompt),

@@ -127,7 +132,7 @@ def chat_completions(
     tool_choice: Optional[dict] = None,
     user: Optional[str] = None
 ) -> dict:
-    result =
+    result = _openai_client().chat.completions.create(
         messages=messages,
         model=model,
         frequency_penalty=_opt(frequency_penalty),

@@ -171,7 +176,7 @@ def vision(
         }}
         ]}
     ]
-    result =
+    result = _openai_client().chat.completions.create(
         messages=messages,
         model=model
     )

@@ -197,7 +202,7 @@ def embeddings(
     dimensions: Optional[int] = None,
     user: Optional[str] = None
 ) -> Batch[np.ndarray]:
-    result =
+    result = _openai_client().embeddings.create(
         input=input,
         model=model,
         dimensions=_opt(dimensions),

@@ -235,7 +240,7 @@ def image_generations(
     user: Optional[str] = None
 ) -> PIL.Image.Image:
     # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
-    result =
+    result = _openai_client().images.generate(
         prompt=prompt,
         model=_opt(model),
         quality=_opt(quality),

@@ -275,7 +280,7 @@ def moderations(
     *,
     model: Optional[str] = None
 ) -> dict:
-    result =
+    result = _openai_client().moderations.create(
         input=input,
         model=_opt(model)
     )
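For context, these UDFs are typically attached to tables as computed columns. A hedged usage sketch, assuming a hypothetical `demo.recordings` table and using the `t[...] = expr` computed-column syntax that appears elsewhere in this release (see io/globals.py below):

```python
import pixeltable as pxt
from pixeltable.functions import openai

# Hypothetical table; the path and column names are illustrative.
t = pxt.create_table('demo.recordings', {'audio': pxt.AudioType()})
# Each row's audio file is transcribed via the transcriptions UDF above.
t['transcript'] = openai.transcriptions(t.audio, model='whisper-1')
```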
pixeltable/functions/together.py
CHANGED
@@ -11,8 +11,13 @@ from pixeltable import env
 from pixeltable.func import Batch
 
 
-
-
+@env.register_client('together')
+def _(api_key: str) -> together.Together:
+    return together.Together(api_key=api_key)
+
+
+def _together_client() -> together.Together:
+    return env.Env.get().get_client('together')
 
 
 @pxt.udf

@@ -31,7 +36,7 @@ def completions(
     n: Optional[int] = None,
     safety_model: Optional[str] = None
 ) -> dict:
-    return
+    return _together_client().completions.create(
         prompt=prompt,
         model=model,
         max_tokens=max_tokens,

@@ -66,7 +71,7 @@ def chat_completions(
     tools: Optional[dict] = None,
     tool_choice: Optional[dict] = None
 ) -> dict:
-    return
+    return _together_client().chat.completions.create(
         messages=messages,
         model=model,
         max_tokens=max_tokens,

@@ -99,7 +104,7 @@ _embedding_dimensions_cache = {
 
 @pxt.udf(batch_size=32, return_type=pxt.ArrayType((None,), dtype=pxt.FloatType()))
 def embeddings(input: Batch[str], *, model: str) -> Batch[np.ndarray]:
-    result =
+    result = _together_client().embeddings.create(input=input, model=model)
     return [
         np.array(data.embedding, dtype=np.float64)
         for data in result.data

@@ -127,7 +132,7 @@ def image_generations(
     negative_prompt: Optional[str] = None,
 ) -> PIL.Image.Image:
     # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
-    result =
+    result = _together_client().images.generate(
         prompt=prompt,
         model=model,
         steps=steps,
pixeltable/functions/util.py
CHANGED
@@ -1,46 +1,3 @@
-from typing import Tuple, List, Optional
-import types
-import sys
-
-import pixeltable.func as func
-import pixeltable.type_system as ts
-import pixeltable.env as env
-
-
-def create_nos_modules() -> List[types.ModuleType]:
-    """Create module pixeltable.functions.nos with one submodule per task and return the submodules"""
-    models = env.Env.get().nos_client.ListModels()
-    model_info = [env.Env.get().nos_client.GetModelInfo(model) for model in models]
-    model_info.sort(key=lambda info: info.task.value)
-
-    module_name = 'pixeltable.functions.nos'
-    nos_module = types.ModuleType(module_name)
-    nos_module.__package__ = 'pixeltable.functions'
-    sys.modules[module_name] = nos_module
-
-    prev_task = ''
-    new_modules: List[types.ModuleType] = []
-    sub_module: Optional[types.ModuleType] = None
-    for info in model_info:
-        if info.task.value != prev_task:
-            # we construct one submodule per task
-            namespace = info.task.name.lower()
-            submodule_name = f'{module_name}.{namespace}'
-            sub_module = types.ModuleType(submodule_name)
-            sub_module.__package__ = module_name
-            setattr(nos_module, namespace, sub_module)
-            new_modules.append(sub_module)
-            sys.modules[submodule_name] = sub_module
-            prev_task = info.task.value
-
-        # add a Function for this model to the module
-        model_id = info.name.replace("/", "_").replace("-", "_")
-        pt_func = func.NOSFunction(info, f'{submodule_name}.{model_id}')
-        setattr(sub_module, model_id, pt_func)
-
-    return new_modules
-
-
 def resolve_torch_device(device: str) -> str:
     import torch
     if device == 'auto':
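The hunk above cuts off inside `resolve_torch_device`; the rest of the function is not visible in this diff. A plausible completion, under the assumption that `'auto'` falls back through CUDA, then Apple MPS, then CPU (the actual fallback order in the released code is not shown here):

```python
def resolve_torch_device(device: str) -> str:
    # Sketch of an assumed completion; only the first three lines appear in the diff.
    import torch
    if device == 'auto':
        if torch.cuda.is_available():
            return 'cuda'
        if torch.backends.mps.is_available():
            return 'mps'
        return 'cpu'
    return device
```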
pixeltable/functions/video.py
CHANGED
@@ -1,14 +1,13 @@
-from typing import Optional
 import uuid
+from typing import Optional
+
 import av
-import sys
 
 import pixeltable.env as env
 import pixeltable.func as func
 import pixeltable.type_system as ts
 
-
-_format_defaults = { # format -> (codec, ext)
+_format_defaults = {  # format -> (codec, ext)
     'wav': ('pcm_s16le', 'wav'),
     'mp3': ('libmp3lame', 'mp3'),
     'flac': ('flac', 'flac'),

@@ -35,11 +34,13 @@ _extract_audio_param_types = [
     ts.VideoType(nullable=False),
     ts.IntType(nullable=False),
     ts.StringType(nullable=False),
-    ts.StringType(nullable=
+    ts.StringType(nullable=True),
 ]
+
+
 @func.udf(return_type=ts.AudioType(nullable=True), param_types=_extract_audio_param_types)
 def extract_audio(
-
+    video_path: str, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
 ) -> Optional[str]:
     """Extract an audio stream from a video file, save it as a media file and return its path"""
     if format not in _format_defaults:

@@ -51,12 +52,49 @@ def extract_audio(
         return None
     audio_stream = container.streams.audio[stream_idx]
     # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
-    output_filename = str(env.Env.get().tmp_dir / f
+    output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')
 
-    with av.open(output_filename,
+    with av.open(output_filename, 'w', format=format) as output_container:
         output_stream = output_container.add_stream(codec or default_codec)
         for packet in container.demux(audio_stream):
             for frame in packet.decode():
                 output_container.mux(output_stream.encode(frame))
 
     return output_filename
+
+
+@func.udf(return_type=ts.JsonType(nullable=False), param_types=[ts.VideoType(nullable=False)])
+def get_metadata(video: str) -> dict:
+    """Gets various metadata associated with a video file.
+
+    Args:
+        video (str): Path to the video file.
+
+    Returns:
+        A dictionary containing the associated metadata.
+    """
+    with av.open(video) as container:
+        assert isinstance(container, av.container.InputContainer)
+        video_streams_info = [
+            {
+                'duration': stream.duration,
+                'frames': stream.frames,
+                'language': stream.language,
+                'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
+                'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
+                'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
+                'pix_fmt': getattr(stream.codec_context, 'pix_fmt', None),
+                'width': stream.width,
+                'height': stream.height,
+            }
+            for stream in container.streams
+            if isinstance(stream, av.video.stream.VideoStream)
+        ]
+        result = {
+            'bit_exact': container.bit_exact,
+            'bit_rate': container.bit_rate,
+            'size': container.size,
+            'metadata': container.metadata,
+            'streams': video_streams_info,  # TODO: Audio streams?
+        }
+        return result
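A hedged sketch of the new `get_metadata` UDF in use, assuming a hypothetical `demo.videos` table; the returned dict carries the container-level fields (`bit_rate`, `size`, `metadata`, `streams`) shown in the hunk:

```python
import pixeltable as pxt
from pixeltable.functions.video import get_metadata

# Hypothetical table; each row's metadata is computed once and stored as JSON.
t = pxt.create_table('demo.videos', {'video': pxt.VideoType()})
t['video_metadata'] = get_metadata(t.video)
```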
pixeltable/globals.py
CHANGED
@@ -96,8 +96,8 @@ def create_view(
         schema: dictionary mapping column names to column types, value expressions, or to column specifications.
         filter: Predicate to filter rows of the base table.
         is_snapshot: Whether the view is a snapshot.
-
-
+        iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
+            the base table.
         num_retained_versions: Number of versions of the view to retain.
         ignore_errors: if True, fail silently if the path already exists or is invalid.

@@ -423,3 +423,21 @@ def get_path(schema_obj: catalog.SchemaObject) -> str:
         dir_id = dir._dir_id
     path_elements.append(schema_obj._name)
     return '.'.join(path_elements)
+
+
+def configure_logging(
+    *,
+    to_stdout: Optional[bool] = None,
+    level: Optional[int] = None,
+    add: Optional[str] = None,
+    remove: Optional[str] = None,
+) -> None:
+    """Configure logging.
+
+    Args:
+        to_stdout: if True, also log to stdout
+        level: default log level
+        add: comma-separated list of 'module name:log level' pairs; ex.: add='video:10'
+        remove: comma-separated list of module names
+    """
+    return Env.get().configure_logging(to_stdout=to_stdout, level=level, add=add, remove=remove)
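A short usage sketch for the new `configure_logging` entry point, assuming it is re-exported at the package top level (consistent with the `pixeltable/__init__.py +3 -1` entry in the file list above):

```python
import logging
import pixeltable as pxt

# Per the docstring above, 'video:10' sets the 'video' module logger to DEBUG (level 10).
pxt.configure_logging(to_stdout=True, level=logging.INFO, add='video:10')
```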
pixeltable/index/__init__.py
CHANGED
pixeltable/index/base.py
CHANGED
@@ -27,7 +27,12 @@ class IndexBase(abc.ABC):
         pass
 
     @abc.abstractmethod
-    def
+    def records_value_errors(self) -> bool:
+        """True if index_value_expr() can raise errors"""
+        pass
+
+    @abc.abstractmethod
+    def index_sa_type(self) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
         pass
 

pixeltable/index/btree.py
ADDED

@@ -0,0 +1,54 @@
+from typing import Optional
+
+import sqlalchemy as sql
+
+# TODO: why does this import result in a circular import, but the one in embedding_index.py doesn't?
+#import pixeltable.catalog as catalog
+import pixeltable.exceptions as excs
+import pixeltable.func as func
+from .base import IndexBase
+
+
+class BtreeIndex(IndexBase):
+    """
+    Interface to B-tree indices in Postgres.
+    """
+    MAX_STRING_LEN = 256
+
+    @func.udf
+    def str_filter(s: Optional[str]) -> Optional[str]:
+        if s is None:
+            return None
+        return s[:BtreeIndex.MAX_STRING_LEN]
+
+    def __init__(self, c: 'catalog.Column'):
+        if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
+            raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
+        from pixeltable.exprs import ColumnRef
+        self.value_expr = self.str_filter(ColumnRef(c)) if c.col_type.is_string_type() else ColumnRef(c)
+
+    def index_value_expr(self) -> 'pixeltable.exprs.Expr':
+        return self.value_expr
+
+    def records_value_errors(self) -> bool:
+        return False
+
+    def index_sa_type(self) -> sql.types.TypeEngine:
+        """Return the sqlalchemy type of the index value column"""
+        return self.value_expr.col_type.to_sa_type()
+
+    def create_index(self, index_name: str, index_value_col: 'catalog.Column', conn: sql.engine.Connection) -> None:
+        """Create the index on the index value column"""
+        idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
+        idx.create(bind=conn)
+
+    @classmethod
+    def display_name(cls) -> str:
+        return 'btree'
+
+    def as_dict(self) -> dict:
+        return {}
+
+    @classmethod
+    def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
+        return cls(c)

pixeltable/index/embedding_index.py
CHANGED

@@ -70,7 +70,10 @@ class EmbeddingIndex(IndexBase):
         """Return expression that computes the value that goes into the index"""
         return self.value_expr
 
-    def
+    def records_value_errors(self) -> bool:
+        return True
+
+    def index_sa_type(self) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
         return self.index_col_type
 
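One design choice worth noting: `BtreeIndex` never indexes more than `MAX_STRING_LEN` characters of a string column, routing string values through the `str_filter` UDF first. A plausible motivation (an assumption; the diff does not state it) is Postgres's size limit on B-tree index entries, which long strings would otherwise exceed. The truncation behavior itself is straightforward:

```python
MAX_STRING_LEN = 256

def str_filter(s):
    # Mirrors BtreeIndex.str_filter above: None passes through, long strings
    # are cut to the first MAX_STRING_LEN characters before indexing.
    return None if s is None else s[:MAX_STRING_LEN]

print(str_filter(None))             # None
print(len(str_filter('x' * 1000)))  # 256
```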
pixeltable/io/__init__.py
CHANGED
pixeltable/io/globals.py
ADDED
@@ -0,0 +1,59 @@
+from typing import Any, Optional, Literal
+
+import pixeltable as pxt
+from pixeltable import Table
+
+
+def create_label_studio_project(
+    t: Table,
+    label_config: str,
+    col_mapping: Optional[dict[str, str]] = None,
+    title: Optional[str] = None,
+    media_import_method: Literal['post', 'file'] = 'file',
+    sync_immediately: bool = True,
+    **kwargs: Any
+) -> None:
+    """
+    Creates a new Label Studio project and links it to the specified `Table`.
+
+    The required parameter `label_config` specifies the Label Studio project configuration,
+    in XML format, as described in the Label Studio documentation. The linked project will
+    have one column for each data field in the configuration; for example, if the
+    configuration has an entry
+    ```
+    <Image name="image_obj" value="$image"/>
+    ```
+    then the linked project will have a column named `image`. In addition, the linked project
+    will always have a JSON-typed column `annotations` representing the output.
+
+    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    with the same name. If any of the data fields are missing, an exception will be thrown. If
+    the `annotations` column is missing, it will be created. The default names can be overridden
+    by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
+    Studio field names as values.
+
+    Args:
+        t: The Table to link to.
+        label_config: The Label Studio project configuration, in XML format.
+        col_mapping: An optional mapping of local column names to remote column names.
+        title: An optional title for the Label Studio project. If not specified, the
+            name of the `Table` will be used as a default.
+        sync_immediately: If `True`, immediately perform an initial synchronization by
+            importing all rows of the `Table` as Label Studio tasks.
+    """
+    from pixeltable.datatransfer.label_studio import LabelStudioProject, ANNOTATIONS_COLUMN
+
+    ls_project = LabelStudioProject.create(title or t.get_name(), label_config, media_import_method, **kwargs)
+
+    # Create a column to hold the annotations, if one does not yet exist.
+    if col_mapping is not None and ANNOTATIONS_COLUMN in col_mapping.values():
+        local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
+    else:
+        local_annotations_column = ANNOTATIONS_COLUMN
+    if local_annotations_column not in t.column_names():
+        t[local_annotations_column] = pxt.JsonType(nullable=True)
+
+    # Link the project to `t`, and sync if appropriate.
+    t._link(ls_project, col_mapping)
+    if sync_immediately:
+        t.sync()
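A hedged end-to-end sketch, assuming `create_label_studio_project` is re-exported from `pixeltable.io` (its `__init__.py` gains one line in this release) and using a hypothetical `demo.images` table; the XML is a standard Label Studio labeling config:

```python
import pixeltable as pxt
from pixeltable.io import create_label_studio_project

t = pxt.create_table('demo.images', {'image': pxt.ImageType()})

label_config = '''
<View>
  <Image name="image_obj" value="$image"/>
  <Choices name="class" toName="image_obj">
    <Choice value="cat"/>
    <Choice value="dog"/>
  </Choices>
</View>
'''

# Creates the project, adds an 'annotations' column to t if missing,
# links the project to t, and imports all rows as tasks (sync_immediately=True).
create_label_studio_project(t, label_config)
```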
pixeltable/iterators/base.py
CHANGED
@@ -6,11 +6,11 @@ from pixeltable.type_system import ColumnType
 
 
 class ComponentIterator(ABC):
-    """Base class for iterators."""
+    """Base class for Pixeltable iterators."""
 
     @classmethod
     @abstractmethod
-    def input_schema(cls) ->
+    def input_schema(cls) -> dict[str, ColumnType]:
         """Provide the Pixeltable types of the init() parameters
 
         The keys need to match the names of the init() parameters. This is equivalent to the parameters_types

@@ -20,7 +20,7 @@ class ComponentIterator(ABC):
 
     @classmethod
     @abstractmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) ->
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         """Specify the dictionary returned by next() and a list of unstored column names
 
         Returns:

@@ -33,7 +33,7 @@ class ComponentIterator(ABC):
         return self
 
     @abstractmethod
-    def __next__(self) ->
+    def __next__(self) -> dict[str, Any]:
         """Return the next element of the iterator as a dictionary or raise StopIteration"""
         raise NotImplementedError
 
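With the return annotations now pinned down, the subclass contract is clear: `input_schema` names the `__init__` parameters, `output_schema` declares the per-row fields plus any unstored columns, and `__next__` yields one dict per output row. A minimal hypothetical iterator, sketched against just the methods visible in these hunks (the full ABC may declare additional abstract methods, e.g. `close` and `set_pos`, which are stubbed here on that assumption):

```python
from typing import Any

from pixeltable.iterators.base import ComponentIterator
from pixeltable.type_system import ColumnType, IntType, StringType

class LineSplitter(ComponentIterator):
    """Hypothetical iterator yielding one row per line of an input string."""

    def __init__(self, text: str):
        self._lines = iter(enumerate(text.splitlines()))

    @classmethod
    def input_schema(cls) -> dict[str, ColumnType]:
        # Keys must match the __init__ parameter names.
        return {'text': StringType()}

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
        # Per-row output fields, plus a list of unstored column names (none here).
        return {'line_no': IntType(), 'line': StringType()}, []

    def __next__(self) -> dict[str, Any]:
        line_no, line = next(self._lines)
        return {'line_no': line_no, 'line': line}

    def close(self) -> None:  # assumed abstract in the full class
        pass

    def set_pos(self, pos: int) -> None:  # assumed abstract in the full class
        pass
```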
pixeltable/iterators/document.py
CHANGED
@@ -13,6 +13,7 @@ from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
+
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2

@@ -20,6 +21,7 @@ class ChunkMetadata(enum.Enum):
     PAGE = 4
     BOUNDING_BOX = 5
 
+
 class Separator(enum.Enum):
     HEADING = 1
     PARAGRAPH = 2

@@ -28,6 +30,7 @@ class Separator(enum.Enum):
     CHAR_LIMIT = 5
     PAGE = 6
 
+
 @dataclasses.dataclass
 class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""

@@ -42,6 +45,7 @@ class DocumentSectionMetadata:
     # bounding box as an {x1, y1, x2, y2} dictionary
     bounding_box: Optional[Dict[str, float]] = None
 
+
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""

@@ -79,20 +83,14 @@ def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
 
 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 
+
 class DocumentSplitter(ComponentIterator):
-    """Iterator over
-
-
-
-
-    `metadata`: which additional metadata fields to include in the output schema:
-    'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
-    The input can be a comma-separated string of these values eg. 'title,heading,sourceline'.
-    `separators`: which separators to use to split the document into rows. Options are:
-    'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with metadata, this is can be a
-    comma-separated string eg. 'heading, token_limit'.
-    `limit`: the maximum number of tokens or characters in each chunk if 'token_limit' or 'char_limit' is specified.
+    """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
+
+    The iterator yields a `text` field containing the text of the chunk, and it may also
+    include additional metadata fields if specified in the `metadata` parameter, as explained below.
+
+    Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
     """
     METADATA_COLUMN_TYPES = {
         ChunkMetadata.TITLE: StringType(nullable=True),

@@ -103,10 +101,23 @@ class DocumentSplitter(ComponentIterator):
     }
 
     def __init__(
-        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
-
+        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
+        metadata: str = '',
+        html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None
     ):
+        """Init method for `DocumentSplitter` class.
+
+        Args:
+            separators: separators to use to chunk the document. Options are:
+                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+                This may be a comma-separated string, e.g., `'heading,token_limit'`.
+            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+                or `'char_limit'` is specified.
+            metadata: additional metadata fields to include in the output. Options are:
+                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        """
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
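
A hedged sketch of the splitter in use, assuming the usual pixeltable pattern of passing an iterator to `create_view` (the `create_view` hunk above documents the `iterator` parameter) and a hypothetical `demo.docs` base table:

```python
import pixeltable as pxt
from pixeltable.iterators.document import DocumentSplitter

docs = pxt.create_table('demo.docs', {'doc': pxt.DocumentType()})

# One output row per chunk; 'title' and 'heading' become extra metadata columns.
chunks = pxt.create_view(
    'demo.chunks',
    docs,
    iterator=DocumentSplitter.create(
        document=docs.doc,
        separators='heading,token_limit',
        limit=512,
        metadata='title,heading',
    ),
)
```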
|