pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/env.py
CHANGED
@@ -5,39 +5,48 @@ import glob
 import http.server
 import importlib
 import importlib.util
+import inspect
 import logging
 import os
-import socketserver
 import sys
 import threading
 import uuid
 import warnings
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, Dict, Any, List

 import pgserver
 import sqlalchemy as sql
 import yaml
-from sqlalchemy_utils.functions import database_exists, create_database, drop_database
 from tqdm import TqdmWarning

 import pixeltable.exceptions as excs
 from pixeltable import metadata
+from pixeltable.utils.http_server import make_server


 class Env:
     """
     Store for runtime globals.
     """
+
     _instance: Optional[Env] = None
     _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'

     @classmethod
     def get(cls) -> Env:
         if cls._instance is None:
-            cls.…
+            cls._init_env()
         return cls._instance

+    @classmethod
+    def _init_env(cls, reinit_db: bool = False) -> None:
+        env = Env()
+        env._set_up(reinit_db=reinit_db)
+        env._upgrade_metadata()
+        cls._instance = env
+
     def __init__(self):
         self._home: Optional[Path] = None
         self._media_dir: Optional[Path] = None  # computed media files
@@ -46,7 +55,7 @@ class Env:
         self._log_dir: Optional[Path] = None  # log files
         self._tmp_dir: Optional[Path] = None  # any tmp files
         self._sa_engine: Optional[sql.engine.base.Engine] = None
-        self._pgdata_dir…
+        self._pgdata_dir: Optional[Path] = None
         self._db_name: Optional[str] = None
         self._db_server: Optional[pgserver.PostgresServer] = None
         self._db_url: Optional[str] = None
@@ -54,12 +63,11 @@ class Env:
         # info about installed packages that are utilized by some parts of the code;
         # package name -> version; version == []: package is installed, but we haven't determined the version yet
         self._installed_packages: Dict[str, Optional[List[int]]] = {}
-        self._nos_client: Optional[Any] = None
         self._spacy_nlp: Optional[Any] = None  # spacy.Language
-        self._httpd: Optional[…
+        self._httpd: Optional[http.server.HTTPServer] = None
         self._http_address: Optional[str] = None

-        self._registered_clients: dict[str,…
+        self._registered_clients: dict[str, ApiClient] = {}

         # logging-related state
         self._logger = logging.getLogger('pixeltable')
@@ -94,13 +102,43 @@ class Env:
         assert self._http_address is not None
         return self._http_address

+    def configure_logging(
+        self,
+        *,
+        to_stdout: Optional[bool] = None,
+        level: Optional[int] = None,
+        add: Optional[str] = None,
+        remove: Optional[str] = None,
+    ) -> None:
+        """Configure logging.
+
+        Args:
+            to_stdout: if True, also log to stdout
+            level: default log level
+            add: comma-separated list of 'module name:log level' pairs; ex.: add='video:10'
+            remove: comma-separated list of module names
+        """
+        if to_stdout is not None:
+            self.log_to_stdout(to_stdout)
+        if level is not None:
+            self.set_log_level(level)
+        if add is not None:
+            for module, level_str in [t.split(':') for t in add.split(',')]:
+                self.set_module_log_level(module, int(level_str))
+        if remove is not None:
+            for module in remove.split(','):
+                self.set_module_log_level(module, None)
+        if to_stdout is None and level is None and add is None and remove is None:
+            self.print_log_config()
+
     def print_log_config(self) -> None:
         print(f'logging to {self._logfilename}')
         print(f'{"" if self._log_to_stdout else "not "}logging to stdout')
         print(f'default log level: {logging.getLevelName(self._default_log_level)}')
         print(
             f'module log levels: '
-            f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}'…
+            f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}'
+        )

     def log_to_stdout(self, enable: bool = True) -> None:
         self._log_to_stdout = enable
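The new configure_logging() entry point wraps the existing log_to_stdout(), set_log_level() and set_module_log_level() calls behind keyword-only arguments. A minimal usage sketch based only on the signature and docstring above (the module name 'video' is the docstring's own example; everything else appears in this diff):

import logging

from pixeltable.env import Env

env = Env.get()
# send log output to stdout as well, and lower the default level to DEBUG
env.configure_logging(to_stdout=True, level=logging.DEBUG)
# per-module override: log the 'video' module at level 10 (DEBUG), then drop the override again
env.configure_logging(add='video:10')
env.configure_logging(remove='video')
# with no arguments, the current logging configuration is printed
env.configure_logging()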
@@ -135,10 +173,14 @@ class Env:
         else:
             return False

-    def …
+    def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
         if self._initialized:
             return

+        # Disable spurious warnings
+        warnings.simplefilter('ignore', category=TqdmWarning)
+        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
         self._initialized = True
         home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
         assert self._home is None or self._home == home
@@ -204,6 +246,14 @@ class Env:
         av_logger.addHandler(av_fh)
         av_logger.propagate = False

+        # configure web-server logging
+        http_logfilename = self._logfilename.replace('.log', '_http.log')
+        http_fh = logging.FileHandler(self._log_dir / http_logfilename, mode='w')
+        http_fh.setFormatter(logging.Formatter(self._log_fmt_str))
+        http_logger = logging.getLogger('pixeltable.http.server')
+        http_logger.addHandler(http_fh)
+        http_logger.propagate = False
+
         # empty tmp dir
         for path in glob.glob(f'{self._tmp_dir}/*'):
             os.remove(path)
@@ -216,23 +266,19 @@ class Env:
         self._db_url = self._db_server.get_uri(database=self._db_name)

         if reinit_db:
-            if …
-            …
+            if self._store_db_exists():
+                self._drop_store_db()

-        if not …
+        if not self._store_db_exists():
             self._logger.info(f'creating database at {self.db_url}')
-            …
-            self.…
+            self._create_store_db()
+            self._create_engine(echo=echo)
             from pixeltable.metadata import schema
             schema.Base.metadata.create_all(self._sa_engine)
             metadata.create_system_info(self._sa_engine)
-            # enable pgvector
-            with self._sa_engine.begin() as conn:
-                conn.execute(sql.text('CREATE EXTENSION vector'))
         else:
             self._logger.info(f'found database {self.db_url}')
-
-            self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
+            self._create_engine(echo=echo)

         print(f'Connected to Pixeltable database at: {self.db_url}')

@@ -240,72 +286,122 @@ class Env:
         self._set_up_runtime()
         self.log_to_stdout(False)

-
-
+    def _create_engine(self, echo: bool = False) -> None:
+        self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True, isolation_level='AUTOCOMMIT')

-    def …
+    def _store_db_exists(self) -> bool:
+        assert self._db_name is not None
+        # don't try to connect to self.db_name, it may not exist
+        db_url = self._db_server.get_uri(database='postgres')
+        engine = sql.create_engine(db_url, future=True)
+        try:
+            with engine.begin() as conn:
+                stmt = f"SELECT COUNT(*) FROM pg_database WHERE datname = '{self._db_name}'"
+                result = conn.scalar(sql.text(stmt))
+                assert result <= 1
+                return result == 1
+        finally:
+            engine.dispose()
+
+
+    def _create_store_db(self) -> None:
+        assert self._db_name is not None
+        # create the db
+        pg_db_url = self._db_server.get_uri(database='postgres')
+        engine = sql.create_engine(pg_db_url, future=True, isolation_level='AUTOCOMMIT')
+        preparer = engine.dialect.identifier_preparer
+        try:
+            with engine.begin() as conn:
+                # use C collation to get standard C/Python-style sorting
+                stmt = (
+                    f"CREATE DATABASE {preparer.quote(self._db_name)} "
+                    "ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
+                )
+                conn.execute(sql.text(stmt))
+        finally:
+            engine.dispose()
+
+        # enable pgvector
+        store_db_url = self._db_server.get_uri(database=self._db_name)
+        engine = sql.create_engine(store_db_url, future=True, isolation_level='AUTOCOMMIT')
+        try:
+            with engine.begin() as conn:
+                conn.execute(sql.text('CREATE EXTENSION vector'))
+        finally:
+            engine.dispose()
+
+    def _drop_store_db(self) -> None:
+        assert self._db_name is not None
+        db_url = self._db_server.get_uri(database='postgres')
+        engine = sql.create_engine(db_url, future=True, isolation_level='AUTOCOMMIT')
+        preparer = engine.dialect.identifier_preparer
+        try:
+            with engine.begin() as conn:
+                # terminate active connections
+                stmt = (f"""
+                    SELECT pg_terminate_backend(pg_stat_activity.pid)
+                    FROM pg_stat_activity
+                    WHERE pg_stat_activity.datname = '{self._db_name}'
+                    AND pid <> pg_backend_pid()
+                """)
+                conn.execute(sql.text(stmt))
+                # drop db
+                stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
+                conn.execute(sql.text(stmt))
+        finally:
+            engine.dispose()
+
+    def _upgrade_metadata(self) -> None:
         metadata.upgrade_md(self._sa_engine)

-    def …
-        …
-        self._nos_client = nos.client.InferenceClient()
-        self._logger.info('waiting for NOS')
-        self._nos_client.WaitForServer()
-
-        # now that we have a client, we can create the module
-        import importlib
-        try:
-            importlib.import_module('pixeltable.functions.nos')
-            # it's already been created
-            return
-        except ImportError:
-            pass
-        from pixeltable.functions.util import create_nos_modules
-        _ = create_nos_modules()
+    def _register_client(self, name: str, init_fn: Callable) -> None:
+        sig = inspect.signature(init_fn)
+        param_names = list(sig.parameters.keys())
+        self._registered_clients[name] = ApiClient(init_fn=init_fn, param_names=param_names)

-    def get_client(self, name: str…
+    def get_client(self, name: str) -> Any:
         """
-        Gets the client with the specified name,…
+        Gets the client with the specified name, initializing it if necessary.

-        …
-        - environ: The name of the environment variable to use for the API key, if no API key is found in config
-            (defaults to f'{name.upper()}_API_KEY')
+        Args:
+        - name: The name of the client
         """
-        …
+        cl = self._registered_clients[name]
+        if cl.client_obj is not None:
+            return cl.client_obj  # Already initialized
+
+        # Construct a client. For each client parameter, first check if the parameter is in the environment;
+        # if not, look in Pixeltable config from `config.yaml`.
+
+        init_kwargs: dict[str, str] = {}
+        for param in cl.param_names:
+            environ = f'{name.upper()}_{param.upper()}'
+            if environ in os.environ:
+                init_kwargs[param] = os.environ[environ]
+            elif name.lower() in self._config and param in self._config[name.lower()]:
+                init_kwargs[param] = self._config[name.lower()][param.lower()]
+            if param not in init_kwargs or init_kwargs[param] == '':
+                raise excs.Error(
+                    f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
+                    f'To fix this, specify the `{environ}` environment variable, or put `{param.lower()}` in '
+                    f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.yaml.'
+                )
+
+        cl.client_obj = cl.init_fn(**init_kwargs)
         self._logger.info(f'Initialized `{name}` client.')
-        return …
+        return cl.client_obj

     def _start_web_server(self) -> None:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
+        in windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
         This arrangement enables serving media hosted within _home,
         as well as external media inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
         """
         # Port 0 means OS picks one for us.
-
-        class FixedRootHandler(http.server.SimpleHTTPRequestHandler):
-            def __init__(self, *args, **kwargs):
-                super().__init__(*args, directory='/', **kwargs)
-        self._httpd = socketserver.TCPServer(address, FixedRootHandler)
+        self._httpd = make_server('127.0.0.1', 0)
         port = self._httpd.server_address[1]
         self._http_address = f'http://127.0.0.1:{port}'
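get_client() resolves every parameter of a registered client's init function from the environment first and from config.yaml second. A hedged sketch of the two ways to supply an api_key parameter for a client registered under the name 'openai' (the client name and parameter follow the examples in this diff; the key value is a placeholder):

import os

from pixeltable.env import Env

# Option 1: environment variable named f'{NAME}_{PARAM}', both parts upper-cased.
os.environ['OPENAI_API_KEY'] = 'sk-placeholder'

# Option 2: put the parameter in the client's section of $PIXELTABLE_HOME/config.yaml, e.g.
#   openai:
#     api_key: sk-placeholder

# The client is constructed lazily on first use and cached on its ApiClient record.
client = Env.get().get_client('openai')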
@@ -335,19 +431,21 @@ class Env:
         check('transformers')
         check('sentence_transformers')
         check('yolox')
+        check('whisperx')
         check('boto3')
+        check('fitz')  # pymupdf
         check('pyarrow')
         check('spacy')  # TODO: deal with en-core-web-sm
         if self.is_installed_package('spacy'):
             import spacy
+
             self._spacy_nlp = spacy.load('en_core_web_sm')
         check('tiktoken')
         check('openai')
         check('together')
         check('fireworks')
-        check('…
-        …
-        self._create_nos_client()
+        check('label_studio_sdk')
+        check('openpyxl')

     def require_package(self, package: str, min_version: Optional[List[int]] = None) -> None:
         assert package in self._installed_packages
@@ -357,7 +455,7 @@ class Env:
             return

         # check whether we have a version >= the required one
-        if self._installed_packages[package]…
+        if not self._installed_packages[package]:
             m = importlib.import_module(package)
             module_version = [int(x) for x in m.__version__.split('.')]
             self._installed_packages[package] = module_version
@@ -365,9 +463,12 @@ class Env:
         if len(min_version) < len(installed_version):
            normalized_min_version = min_version + [0] * (len(installed_version) - len(min_version))
         if any([a < b for a, b in zip(installed_version, normalized_min_version)]):
-            raise excs.Error(…
-            …
-            …
+            raise excs.Error(
+                (
+                    f'The installed version of package {package} is {".".join(str(v) for v in installed_version)}, '
+                    f'but version >={".".join(str(v) for v in min_version)} is required'
+                )
+            )

     def num_tmp_files(self) -> int:
         return len(glob.glob(f'{self._tmp_dir}/*'))
@@ -405,11 +506,44 @@ class Env:
         assert self._sa_engine is not None
         return self._sa_engine

-    @property
-    def nos_client(self) -> Any:
-        return self._nos_client
-
     @property
     def spacy_nlp(self) -> Any:
         assert self._spacy_nlp is not None
-        return self._spacy_nlp
+        return self._spacy_nlp
+
+
+def register_client(name: str) -> Callable:
+    """Decorator that registers a third-party API client for use by Pixeltable.
+
+    The decorated function is an initialization wrapper for the client, and can have
+    any number of string parameters, with a signature such as:
+
+    ```
+    def my_client(api_key: str, url: str) -> my_client_sdk.Client:
+        return my_client_sdk.Client(api_key=api_key, url=url)
+    ```
+
+    The initialization wrapper will not be called immediately; initialization will
+    be deferred until the first time the client is used. At initialization time,
+    Pixeltable will attempt to load the client parameters from config. For each
+    config parameter:
+    - If an environment variable named MY_CLIENT_API_KEY (for example) is set, use it;
+    - Otherwise, look for 'api_key' in the 'my_client' section of config.yaml.
+
+    If all config parameters are found, Pixeltable calls the initialization function;
+    otherwise it throws an exception.
+
+    Args:
+    - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
+    """
+    def decorator(fn: Callable) -> None:
+        Env.get()._register_client(name, fn)
+
+    return decorator
+
+
+@dataclass
+class ApiClient:
+    init_fn: Callable
+    param_names: list[str]
+    client_obj: Optional[Any] = None
pixeltable/exec/__init__.py
CHANGED
@@ -6,4 +6,5 @@ from .exec_node import ExecNode
 from .expr_eval_node import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
 from .sql_scan_node import SqlScanNode
-from .media_validation_node import MediaValidationNode
+from .media_validation_node import MediaValidationNode
+from .data_row_batch import DataRowBatch
pixeltable/exec/data_row_batch.py
CHANGED

@@ -14,9 +14,8 @@ class DataRowBatch:

     Contains the metadata needed to initialize DataRows.
     """
-    def __init__(self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, len: int = 0):
-        self.…
-        self.tbl_version = tbl.version
+    def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
+        self.tbl = tbl
         self.row_builder = row_builder
         self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
         # non-image media slots
@@ -42,9 +41,10 @@ class DataRowBatch:

     def set_row_ids(self, row_ids: List[int]) -> None:
         """Sets pks for rows in batch"""
+        assert self.tbl is not None
         assert len(row_ids) == len(self.rows)
         for row, row_id in zip(self.rows, row_ids):
-            row.set_pk((row_id, self.…
+            row.set_pk((row_id, self.tbl…))

     def __len__(self) -> int:
         return len(self.rows)
@@ -57,6 +57,7 @@ class DataRowBatch:
         flushed_slot_idxs: Optional[List[int]] = None
     ) -> None:
         """Flushes images in the given range of rows."""
+        assert self.tbl is not None
         if stored_img_info is None:
             stored_img_info = []
         if flushed_slot_idxs is None:
@@ -67,12 +68,10 @@ class DataRowBatch:
             idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-                filepath = str(MediaStore.prepare_media_path(self.…
+                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
                 row.flush_img(info.slot_idx, filepath)
             for slot_idx in flushed_slot_idxs:
                 row.flush_img(slot_idx)
-        #_logger.debug(
-            #f'flushed images in range {idx_range}: slot_idxs={flushed_slot_idxs} stored_img_info={stored_img_info}')

     def __iter__(self) -> Iterator[exprs.DataRow]:
         return DataRowBatchIterator(self)
pixeltable/exec/expr_eval_node.py
CHANGED

@@ -1,20 +1,20 @@
-import sys
-import warnings
-from typing import List, Optional, Tuple
-from dataclasses import dataclass, field
 import logging
+import sys
 import time
+import warnings
+from dataclasses import dataclass
+from typing import List, Optional

 from tqdm import tqdm, TqdmWarning

+import pixeltable.exprs as exprs
+from pixeltable.func import CallableFunction
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
-import pixeltable.exprs as exprs
-import pixeltable.func as func
-

 _logger = logging.getLogger('pixeltable')

+
 class ExprEvalNode(ExecNode):
     """Materializes expressions
     """
@@ -22,7 +22,7 @@ class ExprEvalNode(ExecNode):
     class Cohort:
         """List of exprs that form an evaluation context and contain calls to at most one external function"""
         exprs: List[exprs.Expr]
-        …
+        batched_fn: Optional[CallableFunction]
         segment_ctxs: List[exprs.RowBuilder.EvalCtx]
         target_slot_idxs: List[int]
         batch_size: int = 8
@@ -63,12 +63,12 @@ class ExprEvalNode(ExecNode):
         if self.pbar is not None:
             self.pbar.close()

-    def _get_batched_fn(self, expr: exprs.Expr) -> Optional[…
-        if …
-            return …
-        return …
+    def _get_batched_fn(self, expr: exprs.Expr) -> Optional[CallableFunction]:
+        if isinstance(expr, exprs.FunctionCall) and isinstance(expr.fn, CallableFunction) and expr.fn.is_batched:
+            return expr.fn
+        return None

-    def …
+    def _is_batched_fn_call(self, expr: exprs.Expr) -> bool:
         return self._get_batched_fn(expr) is not None

     def _create_cohorts(self) -> None:
@@ -76,14 +76,14 @@ class ExprEvalNode(ExecNode):
         # break up all_exprs into cohorts such that each cohort contains calls to at most one external function;
         # seed the cohorts with only the ext fn calls
         cohorts: List[List[exprs.Expr]] = []
-        …
+        current_batched_fn: Optional[CallableFunction] = None
         for e in all_exprs:
-            if not self.…
+            if not self._is_batched_fn_call(e):
                 continue
-            if …
+            if current_batched_fn is None or current_batched_fn != e.fn:
                 # create a new cohort
                 cohorts.append([])
-                …
+                current_batched_fn = e.fn
             cohorts[-1].append(e)

         # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
@@ -115,18 +115,18 @@ class ExprEvalNode(ExecNode):
         assert len(cohort) > 0
         # create the first segment here, so we can avoid checking for an empty list in the loop
         segments = [[cohort[0]]]
-        …
-        …
+        is_batched_segment = self._is_batched_fn_call(cohort[0])
+        batched_fn: Optional[CallableFunction] = self._get_batched_fn(cohort[0])
         for e in cohort[1:]:
-            if self.…
+            if self._is_batched_fn_call(e):
                 segments.append([e])
-                …
-                …
+                is_batched_segment = True
+                batched_fn = self._get_batched_fn(e)
             else:
-                if …
+                if is_batched_segment:
                     # start a new segment
                     segments.append([])
-                    …
+                    is_batched_segment = False
                 segments[-1].append(e)

         # we create the EvalCtxs manually because create_eval_ctx() would repeat the dependencies of each segment
@@ -135,21 +135,21 @@ class ExprEvalNode(ExecNode):
                 slot_idxs=[e.slot_idx for e in s], exprs=s, target_slot_idxs=[], target_exprs=[])
             for s in segments
         ]
-        cohort_info = self.Cohort(cohort,…
+        cohort_info = self.Cohort(cohort, batched_fn, segment_ctxs, target_slot_idxs[i])
         self.cohorts.append(cohort_info)

     def _exec_cohort(self, cohort: Cohort, rows: DataRowBatch) -> None:
         """Compute the cohort for the entire input batch by dividing it up into sub-batches"""
         batch_start_idx = 0  # start row of the current sub-batch
         # for multi-resolution models, we re-assess the correct ext fn batch size for each input batch
-        ext_batch_size = cohort.…
+        ext_batch_size = cohort.batched_fn.get_batch_size() if cohort.batched_fn is not None else None
         if ext_batch_size is not None:
             cohort.batch_size = ext_batch_size

         while batch_start_idx < len(rows):
             num_batch_rows = min(cohort.batch_size, len(rows) - batch_start_idx)
             for segment_ctx in cohort.segment_ctxs:
-                if not self.…
+                if not self._is_batched_fn_call(segment_ctx.exprs[0]):
                     # compute batch row-wise
                     for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
                         self.row_builder.eval(
@@ -193,7 +193,7 @@ class ExprEvalNode(ExecNode):
                 for k in kwarg_batches.keys()
             }
             start_ts = time.perf_counter()
-            result_batch = fn_call.fn.…
+            result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
             self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
             self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
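With func.BatchedFunction and func.NosFunction removed, ExprEvalNode now keys its batching decisions off CallableFunction. The only surface it relies on in this diff is is_batched, get_batch_size() and exec_batch(); a hypothetical duck-typed stand-in illustrating that contract (this is not pixeltable's actual CallableFunction):

from typing import Any, List

class FakeBatchedFn:
    """Illustrative stand-in; the attribute and method names follow the calls made in ExprEvalNode above."""
    is_batched = True

    def get_batch_size(self) -> int:
        # the real implementation may pick a size per input batch
        # (see the 'multi-resolution models' comment in _exec_cohort)
        return 8

    def exec_batch(self, *arg_batches: List[Any], **kwarg_batches: List[Any]) -> List[Any]:
        # one list per parameter in, one result per row out
        return [args for args in zip(*arg_batches)]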
pixeltable/exec/sql_scan_node.py
CHANGED
@@ -21,7 +21,6 @@ class SqlScanNode(ExecNode):
         select_list: Iterable[exprs.Expr],
         where_clause: Optional[exprs.Expr] = None, filter: Optional[exprs.Predicate] = None,
         order_by_items: Optional[List[Tuple[exprs.Expr, bool]]] = None,
-        similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None,
         limit: int = 0, set_pk: bool = False, exact_version_only: Optional[List[catalog.TableVersion]] = None
     ):
         """
@@ -77,15 +76,17 @@ class SqlScanNode(ExecNode):
         # the number of tables that need to be joined to the target table
         for rowid_ref in [e for e, _ in order_by_items if isinstance(e, exprs.RowidRef)]:
             rowid_ref.set_tbl(tbl)
-        order_by_clause…
+        order_by_clause: List[sql.ClauseElement] = []
+        for e, asc in order_by_items:
+            if isinstance(e, exprs.SimilarityExpr):
+                order_by_clause.append(e.as_order_by_clause(asc))
+            else:
+                order_by_clause.append(e.sql_expr().desc() if not asc else e.sql_expr())

         if where_clause is not None:
             sql_where_clause = where_clause.sql_expr()
             assert sql_where_clause is not None
             self.stmt = self.stmt.where(sql_where_clause)
-        if similarity_clause is not None:
-            self.stmt = self.stmt.order_by(
-                similarity_clause.img_col_ref.col.sa_idx_col.l2_distance(similarity_clause.embedding()))
         if len(order_by_clause) > 0:
             self.stmt = self.stmt.order_by(*order_by_clause)
         elif target.id in row_builder.unstored_iter_args:
@@ -201,7 +202,7 @@ class SqlScanNode(ExecNode):
                         self.row_builder.eval(output_row, self.filter_eval_ctx, profile=self.ctx.profile)
                         if output_row[self.filter.slot_idx]:
                             needs_row = True
-                    if self.limit…
+                    if self.limit > 0 and len(output_batch) >= self.limit:
                         self.has_more_rows = False
                         break
                     else: