pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (110)
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/env.py CHANGED
@@ -5,39 +5,48 @@ import glob
5
5
  import http.server
6
6
  import importlib
7
7
  import importlib.util
8
+ import inspect
8
9
  import logging
9
10
  import os
10
- import socketserver
11
11
  import sys
12
12
  import threading
13
13
  import uuid
14
14
  import warnings
15
+ from dataclasses import dataclass
15
16
  from pathlib import Path
16
17
  from typing import Callable, Optional, Dict, Any, List
17
18
 
18
19
  import pgserver
19
20
  import sqlalchemy as sql
20
21
  import yaml
21
- from sqlalchemy_utils.functions import database_exists, create_database, drop_database
22
22
  from tqdm import TqdmWarning
23
23
 
24
24
  import pixeltable.exceptions as excs
25
25
  from pixeltable import metadata
26
+ from pixeltable.utils.http_server import make_server
26
27
 
27
28
 
28
29
  class Env:
29
30
  """
30
31
  Store for runtime globals.
31
32
  """
33
+
32
34
  _instance: Optional[Env] = None
33
35
  _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
34
36
 
35
37
  @classmethod
36
38
  def get(cls) -> Env:
37
39
  if cls._instance is None:
38
- cls._instance = Env()
40
+ cls._init_env()
39
41
  return cls._instance
40
42
 
43
+ @classmethod
44
+ def _init_env(cls, reinit_db: bool = False) -> None:
45
+ env = Env()
46
+ env._set_up(reinit_db=reinit_db)
47
+ env._upgrade_metadata()
48
+ cls._instance = env
49
+
41
50
  def __init__(self):
42
51
  self._home: Optional[Path] = None
43
52
  self._media_dir: Optional[Path] = None # computed media files
@@ -46,7 +55,7 @@ class Env:
46
55
  self._log_dir: Optional[Path] = None # log files
47
56
  self._tmp_dir: Optional[Path] = None # any tmp files
48
57
  self._sa_engine: Optional[sql.engine.base.Engine] = None
49
- self._pgdata_dir : Optional[Path] = None
58
+ self._pgdata_dir: Optional[Path] = None
50
59
  self._db_name: Optional[str] = None
51
60
  self._db_server: Optional[pgserver.PostgresServer] = None
52
61
  self._db_url: Optional[str] = None
@@ -54,12 +63,11 @@ class Env:
54
63
  # info about installed packages that are utilized by some parts of the code;
55
64
  # package name -> version; version == []: package is installed, but we haven't determined the version yet
56
65
  self._installed_packages: Dict[str, Optional[List[int]]] = {}
57
- self._nos_client: Optional[Any] = None
58
66
  self._spacy_nlp: Optional[Any] = None # spacy.Language
59
- self._httpd: Optional[socketserver.TCPServer] = None
67
+ self._httpd: Optional[http.server.HTTPServer] = None
60
68
  self._http_address: Optional[str] = None
61
69
 
62
- self._registered_clients: dict[str, Any] = {}
70
+ self._registered_clients: dict[str, ApiClient] = {}
63
71
 
64
72
  # logging-related state
65
73
  self._logger = logging.getLogger('pixeltable')
@@ -94,13 +102,43 @@ class Env:
94
102
  assert self._http_address is not None
95
103
  return self._http_address
96
104
 
105
+ def configure_logging(
106
+ self,
107
+ *,
108
+ to_stdout: Optional[bool] = None,
109
+ level: Optional[int] = None,
110
+ add: Optional[str] = None,
111
+ remove: Optional[str] = None,
112
+ ) -> None:
113
+ """Configure logging.
114
+
115
+ Args:
116
+ to_stdout: if True, also log to stdout
117
+ level: default log level
118
+ add: comma-separated list of 'module name:log level' pairs; ex.: add='video:10'
119
+ remove: comma-separated list of module names
120
+ """
121
+ if to_stdout is not None:
122
+ self.log_to_stdout(to_stdout)
123
+ if level is not None:
124
+ self.set_log_level(level)
125
+ if add is not None:
126
+ for module, level_str in [t.split(':') for t in add.split(',')]:
127
+ self.set_module_log_level(module, int(level_str))
128
+ if remove is not None:
129
+ for module in remove.split(','):
130
+ self.set_module_log_level(module, None)
131
+ if to_stdout is None and level is None and add is None and remove is None:
132
+ self.print_log_config()
133
+
97
134
  def print_log_config(self) -> None:
98
135
  print(f'logging to {self._logfilename}')
99
136
  print(f'{"" if self._log_to_stdout else "not "}logging to stdout')
100
137
  print(f'default log level: {logging.getLevelName(self._default_log_level)}')
101
138
  print(
102
139
  f'module log levels: '
103
- f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}')
140
+ f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}'
141
+ )
104
142
 
105
143
  def log_to_stdout(self, enable: bool = True) -> None:
106
144
  self._log_to_stdout = enable
@@ -135,10 +173,14 @@ class Env:
135
173
  else:
136
174
  return False
137
175
 
138
- def set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
176
+ def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
139
177
  if self._initialized:
140
178
  return
141
179
 
180
+ # Disable spurious warnings
181
+ warnings.simplefilter('ignore', category=TqdmWarning)
182
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
183
+
142
184
  self._initialized = True
143
185
  home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
144
186
  assert self._home is None or self._home == home
@@ -204,6 +246,14 @@ class Env:
204
246
  av_logger.addHandler(av_fh)
205
247
  av_logger.propagate = False
206
248
 
249
+ # configure web-server logging
250
+ http_logfilename = self._logfilename.replace('.log', '_http.log')
251
+ http_fh = logging.FileHandler(self._log_dir / http_logfilename, mode='w')
252
+ http_fh.setFormatter(logging.Formatter(self._log_fmt_str))
253
+ http_logger = logging.getLogger('pixeltable.http.server')
254
+ http_logger.addHandler(http_fh)
255
+ http_logger.propagate = False
256
+
207
257
  # empty tmp dir
208
258
  for path in glob.glob(f'{self._tmp_dir}/*'):
209
259
  os.remove(path)
@@ -216,23 +266,19 @@ class Env:
216
266
  self._db_url = self._db_server.get_uri(database=self._db_name)
217
267
 
218
268
  if reinit_db:
219
- if database_exists(self.db_url):
220
- drop_database(self.db_url)
269
+ if self._store_db_exists():
270
+ self._drop_store_db()
221
271
 
222
- if not database_exists(self.db_url):
272
+ if not self._store_db_exists():
223
273
  self._logger.info(f'creating database at {self.db_url}')
224
- create_database(self.db_url)
225
- self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
274
+ self._create_store_db()
275
+ self._create_engine(echo=echo)
226
276
  from pixeltable.metadata import schema
227
277
  schema.Base.metadata.create_all(self._sa_engine)
228
278
  metadata.create_system_info(self._sa_engine)
229
- # enable pgvector
230
- with self._sa_engine.begin() as conn:
231
- conn.execute(sql.text('CREATE EXTENSION vector'))
232
279
  else:
233
280
  self._logger.info(f'found database {self.db_url}')
234
- if self._sa_engine is None:
235
- self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
281
+ self._create_engine(echo=echo)
236
282
 
237
283
  print(f'Connected to Pixeltable database at: {self.db_url}')
238
284
 
@@ -240,72 +286,122 @@ class Env:
240
286
  self._set_up_runtime()
241
287
  self.log_to_stdout(False)
242
288
 
243
- # Disable spurious warnings
244
- warnings.simplefilter("ignore", category=TqdmWarning)
289
+ def _create_engine(self, echo: bool = False) -> None:
290
+ self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True, isolation_level='AUTOCOMMIT')
245
291
 
246
- def upgrade_metadata(self) -> None:
292
+ def _store_db_exists(self) -> bool:
293
+ assert self._db_name is not None
294
+ # don't try to connect to self.db_name, it may not exist
295
+ db_url = self._db_server.get_uri(database='postgres')
296
+ engine = sql.create_engine(db_url, future=True)
297
+ try:
298
+ with engine.begin() as conn:
299
+ stmt = f"SELECT COUNT(*) FROM pg_database WHERE datname = '{self._db_name}'"
300
+ result = conn.scalar(sql.text(stmt))
301
+ assert result <= 1
302
+ return result == 1
303
+ finally:
304
+ engine.dispose()
305
+
306
+
307
+ def _create_store_db(self) -> None:
308
+ assert self._db_name is not None
309
+ # create the db
310
+ pg_db_url = self._db_server.get_uri(database='postgres')
311
+ engine = sql.create_engine(pg_db_url, future=True, isolation_level='AUTOCOMMIT')
312
+ preparer = engine.dialect.identifier_preparer
313
+ try:
314
+ with engine.begin() as conn:
315
+ # use C collation to get standard C/Python-style sorting
316
+ stmt = (
317
+ f"CREATE DATABASE {preparer.quote(self._db_name)} "
318
+ "ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
319
+ )
320
+ conn.execute(sql.text(stmt))
321
+ finally:
322
+ engine.dispose()
323
+
324
+ # enable pgvector
325
+ store_db_url = self._db_server.get_uri(database=self._db_name)
326
+ engine = sql.create_engine(store_db_url, future=True, isolation_level='AUTOCOMMIT')
327
+ try:
328
+ with engine.begin() as conn:
329
+ conn.execute(sql.text('CREATE EXTENSION vector'))
330
+ finally:
331
+ engine.dispose()
332
+
333
+ def _drop_store_db(self) -> None:
334
+ assert self._db_name is not None
335
+ db_url = self._db_server.get_uri(database='postgres')
336
+ engine = sql.create_engine(db_url, future=True, isolation_level='AUTOCOMMIT')
337
+ preparer = engine.dialect.identifier_preparer
338
+ try:
339
+ with engine.begin() as conn:
340
+ # terminate active connections
341
+ stmt = (f"""
342
+ SELECT pg_terminate_backend(pg_stat_activity.pid)
343
+ FROM pg_stat_activity
344
+ WHERE pg_stat_activity.datname = '{self._db_name}'
345
+ AND pid <> pg_backend_pid()
346
+ """)
347
+ conn.execute(sql.text(stmt))
348
+ # drop db
349
+ stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
350
+ conn.execute(sql.text(stmt))
351
+ finally:
352
+ engine.dispose()
353
+
354
+ def _upgrade_metadata(self) -> None:
247
355
  metadata.upgrade_md(self._sa_engine)
248
356
 
249
- def _create_nos_client(self) -> None:
250
- import nos
251
- self._logger.info('connecting to NOS')
252
- nos.init(logging_level=logging.DEBUG)
253
- self._nos_client = nos.client.InferenceClient()
254
- self._logger.info('waiting for NOS')
255
- self._nos_client.WaitForServer()
256
-
257
- # now that we have a client, we can create the module
258
- import importlib
259
- try:
260
- importlib.import_module('pixeltable.functions.nos')
261
- # it's already been created
262
- return
263
- except ImportError:
264
- pass
265
- from pixeltable.functions.util import create_nos_modules
266
- _ = create_nos_modules()
357
+ def _register_client(self, name: str, init_fn: Callable) -> None:
358
+ sig = inspect.signature(init_fn)
359
+ param_names = list(sig.parameters.keys())
360
+ self._registered_clients[name] = ApiClient(init_fn=init_fn, param_names=param_names)
267
361
 
268
- def get_client(self, name: str, init: Callable, environ: Optional[str] = None) -> Any:
362
+ def get_client(self, name: str) -> Any:
269
363
  """
270
- Gets the client with the specified name, using `init` to construct one if necessary.
364
+ Gets the client with the specified name, initializing it if necessary.
271
365
 
272
- - name: The name of the client
273
- - init: A `Callable` with signature `fn(api_key: str) -> Any` that constructs a client object
274
- - environ: The name of the environment variable to use for the API key, if no API key is found in config
275
- (defaults to f'{name.upper()}_API_KEY')
366
+ Args:
367
+ - name: The name of the client
276
368
  """
277
- if name in self._registered_clients:
278
- return self._registered_clients[name]
279
-
280
- if environ is None:
281
- environ = f'{name.upper()}_API_KEY'
282
-
283
- if name in self._config and 'api_key' in self._config[name]:
284
- api_key = self._config[name]['api_key']
285
- else:
286
- api_key = os.environ.get(environ)
287
- if api_key is None or api_key == '':
288
- raise excs.Error(f'`{name}` client not initialized (no API key configured).')
289
-
290
- client = init(api_key)
291
- self._registered_clients[name] = client
369
+ cl = self._registered_clients[name]
370
+ if cl.client_obj is not None:
371
+ return cl.client_obj # Already initialized
372
+
373
+ # Construct a client. For each client parameter, first check if the parameter is in the environment;
374
+ # if not, look in Pixeltable config from `config.yaml`.
375
+
376
+ init_kwargs: dict[str, str] = {}
377
+ for param in cl.param_names:
378
+ environ = f'{name.upper()}_{param.upper()}'
379
+ if environ in os.environ:
380
+ init_kwargs[param] = os.environ[environ]
381
+ elif name.lower() in self._config and param in self._config[name.lower()]:
382
+ init_kwargs[param] = self._config[name.lower()][param.lower()]
383
+ if param not in init_kwargs or init_kwargs[param] == '':
384
+ raise excs.Error(
385
+ f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
386
+ f'To fix this, specify the `{environ}` environment variable, or put `{param.lower()}` in '
387
+ f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.yaml.'
388
+ )
389
+
390
+ cl.client_obj = cl.init_fn(**init_kwargs)
292
391
  self._logger.info(f'Initialized `{name}` client.')
293
- return client
392
+ return cl.client_obj
294
393
 
295
394
  def _start_web_server(self) -> None:
296
395
  """
297
396
  The http server root is the file system root.
298
397
  eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
398
+ in windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
299
399
  This arrangement enables serving media hosted within _home,
300
400
  as well as external media inserted into pixeltable or produced by pixeltable.
301
401
  The port is chosen dynamically to prevent conflicts.
302
402
  """
303
403
  # Port 0 means OS picks one for us.
304
- address = ("127.0.0.1", 0)
305
- class FixedRootHandler(http.server.SimpleHTTPRequestHandler):
306
- def __init__(self, *args, **kwargs):
307
- super().__init__(*args, directory='/', **kwargs)
308
- self._httpd = socketserver.TCPServer(address, FixedRootHandler)
404
+ self._httpd = make_server('127.0.0.1', 0)
309
405
  port = self._httpd.server_address[1]
310
406
  self._http_address = f'http://127.0.0.1:{port}'
311
407
 
@@ -335,19 +431,21 @@ class Env:
335
431
  check('transformers')
336
432
  check('sentence_transformers')
337
433
  check('yolox')
434
+ check('whisperx')
338
435
  check('boto3')
436
+ check('fitz') # pymupdf
339
437
  check('pyarrow')
340
438
  check('spacy') # TODO: deal with en-core-web-sm
341
439
  if self.is_installed_package('spacy'):
342
440
  import spacy
441
+
343
442
  self._spacy_nlp = spacy.load('en_core_web_sm')
344
443
  check('tiktoken')
345
444
  check('openai')
346
445
  check('together')
347
446
  check('fireworks')
348
- check('nos')
349
- if self.is_installed_package('nos'):
350
- self._create_nos_client()
447
+ check('label_studio_sdk')
448
+ check('openpyxl')
351
449
 
352
450
  def require_package(self, package: str, min_version: Optional[List[int]] = None) -> None:
353
451
  assert package in self._installed_packages
@@ -357,7 +455,7 @@ class Env:
357
455
  return
358
456
 
359
457
  # check whether we have a version >= the required one
360
- if self._installed_packages[package] == []:
458
+ if not self._installed_packages[package]:
361
459
  m = importlib.import_module(package)
362
460
  module_version = [int(x) for x in m.__version__.split('.')]
363
461
  self._installed_packages[package] = module_version
@@ -365,9 +463,12 @@ class Env:
365
463
  if len(min_version) < len(installed_version):
366
464
  normalized_min_version = min_version + [0] * (len(installed_version) - len(min_version))
367
465
  if any([a < b for a, b in zip(installed_version, normalized_min_version)]):
368
- raise excs.Error((
369
- f'The installed version of package {package} is {".".join([str[v] for v in installed_version])}, '
370
- f'but version >={".".join([str[v] for v in min_version])} is required'))
466
+ raise excs.Error(
467
+ (
468
+ f'The installed version of package {package} is {".".join(str(v) for v in installed_version)}, '
469
+ f'but version >={".".join(str(v) for v in min_version)} is required'
470
+ )
471
+ )
371
472
 
372
473
  def num_tmp_files(self) -> int:
373
474
  return len(glob.glob(f'{self._tmp_dir}/*'))
@@ -405,11 +506,44 @@ class Env:
405
506
  assert self._sa_engine is not None
406
507
  return self._sa_engine
407
508
 
408
- @property
409
- def nos_client(self) -> Any:
410
- return self._nos_client
411
-
412
509
  @property
413
510
  def spacy_nlp(self) -> Any:
414
511
  assert self._spacy_nlp is not None
415
- return self._spacy_nlp
512
+ return self._spacy_nlp
513
+
514
+
515
+ def register_client(name: str) -> Callable:
516
+ """Decorator that registers a third-party API client for use by Pixeltable.
517
+
518
+ The decorated function is an initialization wrapper for the client, and can have
519
+ any number of string parameters, with a signature such as:
520
+
521
+ ```
522
+ def my_client(api_key: str, url: str) -> my_client_sdk.Client:
523
+ return my_client_sdk.Client(api_key=api_key, url=url)
524
+ ```
525
+
526
+ The initialization wrapper will not be called immediately; initialization will
527
+ be deferred until the first time the client is used. At initialization time,
528
+ Pixeltable will attempt to load the client parameters from config. For each
529
+ config parameter:
530
+ - If an environment variable named MY_CLIENT_API_KEY (for example) is set, use it;
531
+ - Otherwise, look for 'api_key' in the 'my_client' section of config.yaml.
532
+
533
+ If all config parameters are found, Pixeltable calls the initialization function;
534
+ otherwise it throws an exception.
535
+
536
+ Args:
537
+ - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
538
+ """
539
+ def decorator(fn: Callable) -> None:
540
+ Env.get()._register_client(name, fn)
541
+
542
+ return decorator
543
+
544
+
545
+ @dataclass
546
+ class ApiClient:
547
+ init_fn: Callable
548
+ param_names: list[str]
549
+ client_obj: Optional[Any] = None
pixeltable/exec/__init__.py CHANGED
@@ -6,4 +6,5 @@ from .exec_node import ExecNode
6
6
  from .expr_eval_node import ExprEvalNode
7
7
  from .in_memory_data_node import InMemoryDataNode
8
8
  from .sql_scan_node import SqlScanNode
9
- from .media_validation_node import MediaValidationNode
9
+ from .media_validation_node import MediaValidationNode
10
+ from .data_row_batch import DataRowBatch
pixeltable/exec/data_row_batch.py CHANGED
@@ -14,9 +14,8 @@ class DataRowBatch:
14
14
 
15
15
  Contains the metadata needed to initialize DataRows.
16
16
  """
17
- def __init__(self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, len: int = 0):
18
- self.tbl_id = tbl.id
19
- self.tbl_version = tbl.version
17
+ def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
18
+ self.tbl = tbl
20
19
  self.row_builder = row_builder
21
20
  self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
22
21
  # non-image media slots
@@ -42,9 +41,10 @@ class DataRowBatch:
42
41
 
43
42
  def set_row_ids(self, row_ids: List[int]) -> None:
44
43
  """Sets pks for rows in batch"""
44
+ assert self.tbl is not None
45
45
  assert len(row_ids) == len(self.rows)
46
46
  for row, row_id in zip(self.rows, row_ids):
47
- row.set_pk((row_id, self.tbl_version))
47
+ row.set_pk((row_id, self.tbl))
48
48
 
49
49
  def __len__(self) -> int:
50
50
  return len(self.rows)
@@ -57,6 +57,7 @@ class DataRowBatch:
57
57
  flushed_slot_idxs: Optional[List[int]] = None
58
58
  ) -> None:
59
59
  """Flushes images in the given range of rows."""
60
+ assert self.tbl is not None
60
61
  if stored_img_info is None:
61
62
  stored_img_info = []
62
63
  if flushed_slot_idxs is None:
@@ -67,12 +68,10 @@ class DataRowBatch:
67
68
  idx_range = slice(0, len(self.rows))
68
69
  for row in self.rows[idx_range]:
69
70
  for info in stored_img_info:
70
- filepath = str(MediaStore.prepare_media_path(self.tbl_id, info.col.id, self.tbl_version))
71
+ filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
71
72
  row.flush_img(info.slot_idx, filepath)
72
73
  for slot_idx in flushed_slot_idxs:
73
74
  row.flush_img(slot_idx)
74
- #_logger.debug(
75
- #f'flushed images in range {idx_range}: slot_idxs={flushed_slot_idxs} stored_img_info={stored_img_info}')
76
75
 
77
76
  def __iter__(self) -> Iterator[exprs.DataRow]:
78
77
  return DataRowBatchIterator(self)
pixeltable/exec/expr_eval_node.py CHANGED
@@ -1,20 +1,20 @@
1
- import sys
2
- import warnings
3
- from typing import List, Optional, Tuple
4
- from dataclasses import dataclass, field
5
1
  import logging
2
+ import sys
6
3
  import time
4
+ import warnings
5
+ from dataclasses import dataclass
6
+ from typing import List, Optional
7
7
 
8
8
  from tqdm import tqdm, TqdmWarning
9
9
 
10
+ import pixeltable.exprs as exprs
11
+ from pixeltable.func import CallableFunction
10
12
  from .data_row_batch import DataRowBatch
11
13
  from .exec_node import ExecNode
12
- import pixeltable.exprs as exprs
13
- import pixeltable.func as func
14
-
15
14
 
16
15
  _logger = logging.getLogger('pixeltable')
17
16
 
17
+
18
18
  class ExprEvalNode(ExecNode):
19
19
  """Materializes expressions
20
20
  """
@@ -22,7 +22,7 @@ class ExprEvalNode(ExecNode):
22
22
  class Cohort:
23
23
  """List of exprs that form an evaluation context and contain calls to at most one external function"""
24
24
  exprs: List[exprs.Expr]
25
- ext_function: Optional[func.BatchedFunction]
25
+ batched_fn: Optional[CallableFunction]
26
26
  segment_ctxs: List[exprs.RowBuilder.EvalCtx]
27
27
  target_slot_idxs: List[int]
28
28
  batch_size: int = 8
@@ -63,12 +63,12 @@ class ExprEvalNode(ExecNode):
63
63
  if self.pbar is not None:
64
64
  self.pbar.close()
65
65
 
66
- def _get_batched_fn(self, expr: exprs.Expr) -> Optional[func.BatchedFunction]:
67
- if not isinstance(expr, exprs.FunctionCall):
68
- return None
69
- return expr.fn if isinstance(expr.fn, func.BatchedFunction) else None
66
+ def _get_batched_fn(self, expr: exprs.Expr) -> Optional[CallableFunction]:
67
+ if isinstance(expr, exprs.FunctionCall) and isinstance(expr.fn, CallableFunction) and expr.fn.is_batched:
68
+ return expr.fn
69
+ return None
70
70
 
71
- def _is_ext_call(self, expr: exprs.Expr) -> bool:
71
+ def _is_batched_fn_call(self, expr: exprs.Expr) -> bool:
72
72
  return self._get_batched_fn(expr) is not None
73
73
 
74
74
  def _create_cohorts(self) -> None:
@@ -76,14 +76,14 @@ class ExprEvalNode(ExecNode):
76
76
  # break up all_exprs into cohorts such that each cohort contains calls to at most one external function;
77
77
  # seed the cohorts with only the ext fn calls
78
78
  cohorts: List[List[exprs.Expr]] = []
79
- current_ext_function: Optional[func.BatchedFunction] = None
79
+ current_batched_fn: Optional[CallableFunction] = None
80
80
  for e in all_exprs:
81
- if not self._is_ext_call(e):
81
+ if not self._is_batched_fn_call(e):
82
82
  continue
83
- if current_ext_function is None or current_ext_function != e.fn:
83
+ if current_batched_fn is None or current_batched_fn != e.fn:
84
84
  # create a new cohort
85
85
  cohorts.append([])
86
- current_ext_function = e.fn
86
+ current_batched_fn = e.fn
87
87
  cohorts[-1].append(e)
88
88
 
89
89
  # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
@@ -115,18 +115,18 @@ class ExprEvalNode(ExecNode):
115
115
  assert len(cohort) > 0
116
116
  # create the first segment here, so we can avoid checking for an empty list in the loop
117
117
  segments = [[cohort[0]]]
118
- is_ext_segment = self._is_ext_call(cohort[0])
119
- ext_fn: Optional[func.BatchedFunction] = self._get_batched_fn(cohort[0])
118
+ is_batched_segment = self._is_batched_fn_call(cohort[0])
119
+ batched_fn: Optional[CallableFunction] = self._get_batched_fn(cohort[0])
120
120
  for e in cohort[1:]:
121
- if self._is_ext_call(e):
121
+ if self._is_batched_fn_call(e):
122
122
  segments.append([e])
123
- is_ext_segment = True
124
- ext_fn = self._get_batched_fn(e)
123
+ is_batched_segment = True
124
+ batched_fn = self._get_batched_fn(e)
125
125
  else:
126
- if is_ext_segment:
126
+ if is_batched_segment:
127
127
  # start a new segment
128
128
  segments.append([])
129
- is_ext_segment = False
129
+ is_batched_segment = False
130
130
  segments[-1].append(e)
131
131
 
132
132
  # we create the EvalCtxs manually because create_eval_ctx() would repeat the dependencies of each segment
@@ -135,21 +135,21 @@ class ExprEvalNode(ExecNode):
135
135
  slot_idxs=[e.slot_idx for e in s], exprs=s, target_slot_idxs=[], target_exprs=[])
136
136
  for s in segments
137
137
  ]
138
- cohort_info = self.Cohort(cohort, ext_fn, segment_ctxs, target_slot_idxs[i])
138
+ cohort_info = self.Cohort(cohort, batched_fn, segment_ctxs, target_slot_idxs[i])
139
139
  self.cohorts.append(cohort_info)
140
140
 
141
141
  def _exec_cohort(self, cohort: Cohort, rows: DataRowBatch) -> None:
142
142
  """Compute the cohort for the entire input batch by dividing it up into sub-batches"""
143
143
  batch_start_idx = 0 # start row of the current sub-batch
144
144
  # for multi-resolution models, we re-assess the correct ext fn batch size for each input batch
145
- ext_batch_size = cohort.ext_function.get_batch_size() if cohort.ext_function is not None else None
145
+ ext_batch_size = cohort.batched_fn.get_batch_size() if cohort.batched_fn is not None else None
146
146
  if ext_batch_size is not None:
147
147
  cohort.batch_size = ext_batch_size
148
148
 
149
149
  while batch_start_idx < len(rows):
150
150
  num_batch_rows = min(cohort.batch_size, len(rows) - batch_start_idx)
151
151
  for segment_ctx in cohort.segment_ctxs:
152
- if not self._is_ext_call(segment_ctx.exprs[0]):
152
+ if not self._is_batched_fn_call(segment_ctx.exprs[0]):
153
153
  # compute batch row-wise
154
154
  for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
155
155
  self.row_builder.eval(
@@ -193,7 +193,7 @@ class ExprEvalNode(ExecNode):
193
193
  for k in kwarg_batches.keys()
194
194
  }
195
195
  start_ts = time.perf_counter()
196
- result_batch = fn_call.fn.invoke(call_args, call_kwargs)
196
+ result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
197
197
  self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
198
198
  self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
199
199
 
pixeltable/exec/sql_scan_node.py CHANGED
@@ -21,7 +21,6 @@ class SqlScanNode(ExecNode):
21
21
  select_list: Iterable[exprs.Expr],
22
22
  where_clause: Optional[exprs.Expr] = None, filter: Optional[exprs.Predicate] = None,
23
23
  order_by_items: Optional[List[Tuple[exprs.Expr, bool]]] = None,
24
- similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None,
25
24
  limit: int = 0, set_pk: bool = False, exact_version_only: Optional[List[catalog.TableVersion]] = None
26
25
  ):
27
26
  """
@@ -77,15 +76,17 @@ class SqlScanNode(ExecNode):
77
76
  # the number of tables that need to be joined to the target table
78
77
  for rowid_ref in [e for e, _ in order_by_items if isinstance(e, exprs.RowidRef)]:
79
78
  rowid_ref.set_tbl(tbl)
80
- order_by_clause = [e.sql_expr().desc() if not asc else e.sql_expr() for e, asc in order_by_items]
79
+ order_by_clause: List[sql.ClauseElement] = []
80
+ for e, asc in order_by_items:
81
+ if isinstance(e, exprs.SimilarityExpr):
82
+ order_by_clause.append(e.as_order_by_clause(asc))
83
+ else:
84
+ order_by_clause.append(e.sql_expr().desc() if not asc else e.sql_expr())
81
85
 
82
86
  if where_clause is not None:
83
87
  sql_where_clause = where_clause.sql_expr()
84
88
  assert sql_where_clause is not None
85
89
  self.stmt = self.stmt.where(sql_where_clause)
86
- if similarity_clause is not None:
87
- self.stmt = self.stmt.order_by(
88
- similarity_clause.img_col_ref.col.sa_idx_col.l2_distance(similarity_clause.embedding()))
89
90
  if len(order_by_clause) > 0:
90
91
  self.stmt = self.stmt.order_by(*order_by_clause)
91
92
  elif target.id in row_builder.unstored_iter_args:
@@ -201,7 +202,7 @@ class SqlScanNode(ExecNode):
201
202
  self.row_builder.eval(output_row, self.filter_eval_ctx, profile=self.ctx.profile)
202
203
  if output_row[self.filter.slot_idx]:
203
204
  needs_row = True
204
- if self.limit is not None and len(output_batch) >= self.limit:
205
+ if self.limit > 0 and len(output_batch) >= self.limit:
205
206
  self.has_more_rows = False
206
207
  break
207
208
  else: