pixeltable 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +34 -6
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +520 -30
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +373 -45
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +113 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +187 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +61 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +88 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +27 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +413 -182
- pixeltable/tests/conftest.py +143 -87
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +372 -0
- pixeltable/tests/test_dataframe.py +433 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +117 -0
- pixeltable/tests/test_exprs.py +591 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_functions.py +283 -1
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1085 -262
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +149 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +186 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/type_system.py +490 -126
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +126 -0
- pixeltable/utils/pytorch.py +172 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.0.dist-info/LICENSE +18 -0
- pixeltable-0.2.0.dist-info/METADATA +117 -0
- pixeltable-0.2.0.dist-info/RECORD +125 -0
- {pixeltable-0.1.1.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.1.dist-info/METADATA +0 -31
- pixeltable-0.1.1.dist-info/RECORD +0 -36
pixeltable/env.py
CHANGED
```diff
@@ -1,86 +1,414 @@
-from
+from __future__ import annotations
+import datetime
+import os
+from typing import Optional, Dict, Any, List
 from pathlib import Path
 import sqlalchemy as sql
+import uuid
+import importlib
+import importlib.util
+
+import http.server
+import socketserver
+import threading
+import typing
+import uuid
+from pathlib import Path
+from typing import Optional, Dict, Any, List
+
+import yaml
 from sqlalchemy_utils.functions import database_exists, create_database, drop_database
+import pgserver
+import logging
+import sys
+import glob

+from pixeltable import metadata
+import pixeltable.exceptions as excs
+
+if typing.TYPE_CHECKING:
+    import openai

 class Env:
     """
     Store for runtime globals.
     """
-    _instance: Optional[
+    _instance: Optional[Env] = None
+    _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'

     @classmethod
-    def get(cls) ->
+    def get(cls) -> Env:
         if cls._instance is None:
             cls._instance = Env()
         return cls._instance

     def __init__(self):
         self._home: Optional[Path] = None
-        self.
-        self.
-        self.
-        self.
+        self._media_dir: Optional[Path] = None  # computed media files
+        self._file_cache_dir: Optional[Path] = None  # cached media files with external URL
+        self._dataset_cache_dir: Optional[Path] = None  # cached datasets (eg, pytorch or COCO)
+        self._log_dir: Optional[Path] = None  # log files
+        self._tmp_dir: Optional[Path] = None  # any tmp files
         self._sa_engine: Optional[sql.engine.base.Engine] = None
+        self._pgdata_dir : Optional[Path] = None
         self._db_name: Optional[str] = None
+        self._db_server: Optional[pgserver.PostgresServer] = None
+        self._db_url: Optional[str] = None
+
+        # info about installed packages that are utilized by some parts of the code;
+        # package name -> version; version == []: package is installed, but we haven't determined the version yet
+        self._installed_packages: Dict[str, Optional[List[int]]] = {}
+        self._nos_client: Optional[Any] = None
+        self._openai_client: Optional['openai.OpenAI'] = None
+        self._has_together_client: bool = False
+        self._spacy_nlp: Optional[Any] = None  # spacy.Language
+        self._httpd: Optional[socketserver.TCPServer] = None
+        self._http_address: Optional[str] = None
+
+        # logging-related state
+        self._logger = logging.getLogger('pixeltable')
+        self._logger.setLevel(logging.DEBUG)  # allow everything to pass, we filter in _log_filter()
+        self._logger.propagate = False
+        self._logger.addFilter(self._log_filter)
+        self._default_log_level = logging.INFO
+        self._logfilename: Optional[str] = None
+        self._log_to_stdout = False
+        self._module_log_level: Dict[str, int] = {}  # module name -> log level
+
+        # config
+        self._config_file: Optional[Path] = None
+        self._config: Optional[Dict[str, Any]] = None
+
+        # create logging handler to also log to stdout
+        self._stdout_handler = logging.StreamHandler(stream=sys.stdout)
+        self._stdout_handler.setFormatter(logging.Formatter(self._log_fmt_str))
+        self._initialized = False
+
+    @property
+    def config(self):
+        return self._config
+
+    @property
+    def db_url(self) -> str:
+        assert self._db_url is not None
+        return self._db_url
+
+    @property
+    def http_address(self) -> str:
+        assert self._http_address is not None
+        return self._http_address
+
+    def print_log_config(self) -> None:
+        print(f'logging to {self._logfilename}')
+        print(f'{"" if self._log_to_stdout else "not "}logging to stdout')
+        print(f'default log level: {logging.getLevelName(self._default_log_level)}')
+        print(
+            f'module log levels: '
+            f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}')
+
+    def log_to_stdout(self, enable: bool = True) -> None:
+        self._log_to_stdout = enable
+        if enable:
+            self._logger.addHandler(self._stdout_handler)
+        else:
+            self._logger.removeHandler(self._stdout_handler)
+
+    def set_log_level(self, level: int) -> None:
+        self._default_log_level = level
+
+    def set_module_log_level(self, module: str, level: Optional[int]) -> None:
+        if level is None:
+            self._module_log_level.pop(module, None)
+        else:
+            self._module_log_level[module] = level
+
+    def is_installed_package(self, package_name: str) -> bool:
+        return self._installed_packages[package_name] is not None
+
+    def _log_filter(self, record: logging.LogRecord) -> bool:
+        if record.name == 'pixeltable':
+            # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
+            path_parts = list(Path(record.pathname).parts)
+            path_parts.reverse()
+            max_idx = path_parts.index('pixeltable')
+            for module_name in path_parts[:max_idx]:
+                if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
+                    return True
+        if record.levelno >= self._default_log_level:
+            return True
+        else:
+            return False
+
+    def set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
+        if self._initialized:
+            return
+
+        self._initialized = True
+        home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
+        assert self._home is None or self._home == home
+        self._home = home
+        self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.yaml')))
+        self._media_dir = self._home / 'media'
+        self._file_cache_dir = self._home / 'file_cache'
+        self._dataset_cache_dir = self._home / 'dataset_cache'
+        self._log_dir = self._home / 'logs'
+        self._tmp_dir = self._home / 'tmp'
+        self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
+
+        # Read in the config
+        if os.path.isfile(self._config_file):
+            with open(self._config_file, 'r') as stream:
+                try:
+                    self._config = yaml.safe_load(stream)
+                except yaml.YAMLError as exc:
+                    self._logger.error(f'Could not read config file: {self._config_file}')
+                    self._config = {}
+        else:
+            self._config = {}

-    def set_up(
-        self, home_parent: Optional[Path] = Path.home(), db_name: str = 'pixeltable', echo: bool = False
-    ) -> None:
-        self.set_home(home_parent / '.pixeltable')
         if self._home.exists() and not self._home.is_dir():
             raise RuntimeError(f'{self._home} is not a directory')

-        self._db_name = db_name
-        db_url = f'postgresql:///{self._db_name}'
-
         if not self._home.exists():
-
+            # we don't have our logger set up yet, so print to stdout
+            print(f'Creating a Pixeltable instance at: {self._home}')
             self._home.mkdir()
-
-            self.
-
-
-
-
-            self.
-
-
+            # TODO (aaron-siegel) This is the existing behavior, but it seems scary. If something happens to
+            # self._home, it will cause the DB to be destroyed even if pgdata is in an alternate location.
+            # PROPOSAL: require `reinit_db` to be set explicitly to destroy the DB.
+            reinit_db = True
+
+        if not self._media_dir.exists():
+            self._media_dir.mkdir()
+        if not self._file_cache_dir.exists():
+            self._file_cache_dir.mkdir()
+        if not self._dataset_cache_dir.exists():
+            self._dataset_cache_dir.mkdir()
+        if not self._log_dir.exists():
+            self._log_dir.mkdir()
+        if not self._tmp_dir.exists():
+            self._tmp_dir.mkdir()
+
+        # configure _logger to log to a file
+        self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
+        fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
+        fh.setFormatter(logging.Formatter(self._log_fmt_str))
+        self._logger.addHandler(fh)
+        sql_logger = logging.getLogger('sqlalchemy.engine')
+        sql_logger.setLevel(logging.INFO)
+        sql_logger.addHandler(fh)
+        sql_logger.propagate = False
+
+        # empty tmp dir
+        for path in glob.glob(f'{self._tmp_dir}/*'):
+            os.remove(path)
+
+        self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
+
+        # cleanup_mode=None will leave db on for debugging purposes
+        self._db_server = pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
+        self._db_url = self._db_server.get_uri(database=self._db_name)
+
+        if reinit_db:
+            if database_exists(self.db_url):
+                drop_database(self.db_url)
+
+        if not database_exists(self.db_url):
+            self._logger.info(f'creating database at {self.db_url}')
+            create_database(self.db_url)
+            self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
+            from pixeltable.metadata import schema
+            schema.Base.metadata.create_all(self._sa_engine)
+            metadata.create_system_info(self._sa_engine)
+            # enable pgvector
+            with self._sa_engine.begin() as conn:
+                conn.execute(sql.text('CREATE EXTENSION vector'))
         else:
+            self._logger.info(f'found database {self.db_url}')
             if self._sa_engine is None:
-                self._sa_engine = sql.create_engine(db_url, echo=echo, future=True)
+                self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
+
+        print(f'Connected to Pixeltable database at: {self.db_url}')
+
+        # we now have a home directory and db; start other services
+        self._set_up_runtime()
+        self.log_to_stdout(False)

-    def
-
-        if database_exists(db_url):
-            drop_database(db_url)
+    def upgrade_metadata(self) -> None:
+        metadata.upgrade_md(self._sa_engine)

-    def
-
+    def _create_nos_client(self) -> None:
+        import nos
+        self._logger.info('connecting to NOS')
+        nos.init(logging_level=logging.DEBUG)
+        self._nos_client = nos.client.InferenceClient()
+        self._logger.info('waiting for NOS')
+        self._nos_client.WaitForServer()
+
+        # now that we have a client, we can create the module
+        import importlib
+        try:
+            importlib.import_module('pixeltable.functions.nos')
+            # it's already been created
             return
-
-
-
-
-
+        except ImportError:
+            pass
+        from pixeltable.functions.util import create_nos_modules
+        _ = create_nos_modules()
+
+    def _create_openai_client(self) -> None:
+        if 'openai' in self._config and 'api_key' in self._config['openai']:
+            api_key = self._config['openai']['api_key']
+        else:
+            api_key = os.environ.get('OPENAI_API_KEY')
+        if api_key is None or api_key == '':
+            self._logger.info("OpenAI client not initialized (no API key configured).")
+            return
+        import openai
+        self._logger.info('Initializing OpenAI client.')
+        self._openai_client = openai.OpenAI(api_key=api_key)
+
+    def _create_together_client(self) -> None:
+        if 'together' in self._config and 'api_key' in self._config['together']:
+            api_key = self._config['together']['api_key']
+        else:
+            api_key = os.environ.get('TOGETHER_API_KEY')
+        if api_key is None or api_key == '':
+            self._logger.info('Together client not initialized (no API key configured).')
+            return
+        import together
+        self._logger.info('Initializing Together client.')
+        together.api_key = api_key
+        self._has_together_client = True
+
+    def _start_web_server(self) -> None:
+        """
+        The http server root is the file system root.
+        eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
+        This arrangement enables serving media hosted within _home,
+        as well as external media inserted into pixeltable or produced by pixeltable.
+        The port is chosen dynamically to prevent conflicts.
+        """
+        # Port 0 means OS picks one for us.
+        address = ("127.0.0.1", 0)
+        class FixedRootHandler(http.server.SimpleHTTPRequestHandler):
+            def __init__(self, *args, **kwargs):
+                super().__init__(*args, directory='/', **kwargs)
+        self._httpd = socketserver.TCPServer(address, FixedRootHandler)
+        port = self._httpd.server_address[1]
+        self._http_address = f'http://127.0.0.1:{port}'
+
+        def run_server():
+            logging.log(logging.INFO, f'running web server at {self._http_address}')
+            self._httpd.serve_forever()
+
+        # Run the server in a separate thread
+        thread = threading.Thread(target=run_server, daemon=True)
+        thread.start()
+
+    def _set_up_runtime(self) -> None:
+        """Check for and start runtime services"""
+        self._start_web_server()
+        self._check_installed_packages()
+
+    def _check_installed_packages(self) -> None:
+        def check(package: str) -> None:
+            if importlib.util.find_spec(package) is not None:
+                self._installed_packages[package] = []
+            else:
+                self._installed_packages[package] = None
+
+        check('torch')
+        check('torchvision')
+        check('transformers')
+        check('sentence_transformers')
+        check('boto3')
+        check('pyarrow')
+        check('spacy')  # TODO: deal with en-core-web-sm
+        if self.is_installed_package('spacy'):
+            import spacy
+            self._spacy_nlp = spacy.load('en_core_web_sm')
+        check('tiktoken')
+        check('openai')
+        if self.is_installed_package('openai'):
+            self._create_openai_client()
+        check('together')
+        if self.is_installed_package('together'):
+            self._create_together_client()
+        check('fireworks')
+        check('nos')
+        if self.is_installed_package('nos'):
+            self._create_nos_client()
+
+    def require_package(self, package: str, min_version: Optional[List[int]] = None) -> None:
+        assert package in self._installed_packages
+        if self._installed_packages[package] is None:
+            raise excs.Error(f'Package {package} is not installed')
+        if min_version is None:
+            return
+
+        # check whether we have a version >= the required one
+        if self._installed_packages[package] == []:
+            m = importlib.import_module(package)
+            module_version = [int(x) for x in m.__version__.split('.')]
+            self._installed_packages[package] = module_version
+        installed_version = self._installed_packages[package]
+        if len(min_version) < len(installed_version):
+            normalized_min_version = min_version + [0] * (len(installed_version) - len(min_version))
+        if any([a < b for a, b in zip(installed_version, normalized_min_version)]):
+            raise excs.Error((
+                f'The installed version of package {package} is {".".join([str[v] for v in installed_version])}, '
+                f'but version >={".".join([str[v] for v in min_version])} is required'))
+
+    def num_tmp_files(self) -> int:
+        return len(glob.glob(f'{self._tmp_dir}/*'))
+
+    def create_tmp_path(self, extension: str = '') -> Path:
+        return self._tmp_dir / f'{uuid.uuid4()}{extension}'
+
+    @property
+    def home(self) -> Path:
+        assert self._home is not None
+        return self._home
+
+    @property
+    def media_dir(self) -> Path:
+        assert self._media_dir is not None
+        return self._media_dir

     @property
-    def
-        assert self.
-        return self.
+    def file_cache_dir(self) -> Path:
+        assert self._file_cache_dir is not None
+        return self._file_cache_dir

     @property
-    def
-        assert self.
-        return self.
+    def dataset_cache_dir(self) -> Path:
+        assert self._dataset_cache_dir is not None
+        return self._dataset_cache_dir

     @property
-    def
-        assert self.
-        return self.
+    def tmp_dir(self) -> Path:
+        assert self._tmp_dir is not None
+        return self._tmp_dir

     @property
     def engine(self) -> sql.engine.base.Engine:
         assert self._sa_engine is not None
         return self._sa_engine
+
+    @property
+    def nos_client(self) -> Any:
+        return self._nos_client
+
+    @property
+    def openai_client(self) -> Optional['openai.OpenAI']:
+        return self._openai_client
+
+    @property
+    def has_together_client(self) -> bool:
+        return self._has_together_client
+
+    @property
+    def spacy_nlp(self) -> Any:
+        assert self._spacy_nlp is not None
+        return self._spacy_nlp
```
pixeltable/exceptions.py
CHANGED
```diff
@@ -1,26 +1,17 @@
-
-
-
-
-class DuplicateNameError(Exception):
-    pass
-
-
-class UnknownEntityError(Exception):
-    pass
+from typing import List, Any
+from types import TracebackType
+from dataclasses import dataclass


-class
-    pass
-
-
-class DirectoryNotEmptyError(Exception):
-    pass
-
-
-class InsertError(Exception):
+class Error(Exception):
     pass


-
-
+@dataclass
+class ExprEvalError(Exception):
+    expr: Any  # exprs.Expr, but we're not importing pixeltable.exprs to avoid circular imports
+    expr_msg: str
+    exc: Exception
+    exc_tb: TracebackType
+    input_vals: List[Any]
+    row_num: int
```
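The fine-grained 0.1.x exceptions (`DuplicateNameError`, `UnknownEntityError`, `DirectoryNotEmptyError`, `InsertError`, ...) are replaced by a single catch-all `Error` plus an `ExprEvalError` dataclass that carries the failing expression, original exception/traceback, input values, and row number. A hedged sketch of how calling code might handle these (the operation being wrapped is a placeholder, not a documented workflow; note that `ExprEvalError` subclasses `Exception`, not `Error`, so it is caught separately):

```python
# Illustrative error handling for the consolidated exceptions shown in the diff above.
import pixeltable.exceptions as excs

try:
    run_some_pixeltable_operation()  # hypothetical operation that evaluates expressions
except excs.ExprEvalError as e:
    # carries the failing expression, the underlying exception, and the offending row
    print(f'row {e.row_num}: {e.expr_msg} failed with {e.exc!r}; inputs={e.input_vals}')
except excs.Error as e:
    # catch-all for pixeltable-raised errors (replaces the removed per-case exceptions)
    print(f'pixeltable error: {e}')
```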
pixeltable/exec/__init__.py
ADDED
```diff
@@ -0,0 +1,9 @@
+from .aggregation_node import AggregationNode
+from .cache_prefetch_node import CachePrefetchNode
+from .component_iteration_node import ComponentIterationNode
+from .exec_context import ExecContext
+from .exec_node import ExecNode
+from .expr_eval_node import ExprEvalNode
+from .in_memory_data_node import InMemoryDataNode
+from .sql_scan_node import SqlScanNode
+from .media_validation_node import MediaValidationNode
```
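The new `exec` package re-exports its execution-node classes at the package root, so plan-building code can import them directly; for example (illustrative, using only the names re-exported above):

```python
# Illustrative: these names are exactly the ones re-exported by pixeltable/exec/__init__.py.
from pixeltable.exec import ExecNode, SqlScanNode, ExprEvalNode, AggregationNode
```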
pixeltable/exec/aggregation_node.py
ADDED
```diff
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import logging
+import sys
+from typing import List, Optional, Any
+
+import pixeltable.catalog as catalog
+import pixeltable.exceptions as excs
+import pixeltable.exprs as exprs
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+
+_logger = logging.getLogger('pixeltable')
+
+class AggregationNode(ExecNode):
+    def __init__(
+        self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: List[exprs.Expr],
+        agg_fn_calls: List[exprs.FunctionCall], input_exprs: List[exprs.Expr], input: ExecNode
+    ):
+        super().__init__(row_builder, group_by + agg_fn_calls, input_exprs, input)
+        self.input = input
+        self.group_by = group_by
+        self.input_exprs = input_exprs
+        self.agg_fn_calls = agg_fn_calls
+        self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=input_exprs)
+        self.output_batch = DataRowBatch(tbl, row_builder, 0)
+
+    def _reset_agg_state(self, row_num: int) -> None:
+        for fn_call in self.agg_fn_calls:
+            try:
+                fn_call.reset_agg()
+            except Exception as e:
+                _, _, exc_tb = sys.exc_info()
+                expr_msg = f'init() function of the aggregate {fn_call}'
+                raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, [], row_num)
+
+    def _update_agg_state(self, row: exprs.DataRow, row_num: int) -> None:
+        for fn_call in self.agg_fn_calls:
+            try:
+                fn_call.update(row)
+            except Exception as e:
+                _, _, exc_tb = sys.exc_info()
+                expr_msg = f'update() function of the aggregate {fn_call}'
+                input_vals = [row[d.slot_idx] for d in fn_call.dependencies()]
+                raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, input_vals, row_num)
+
+    def __next__(self) -> DataRowBatch:
+        if self.output_batch is None:
+            raise StopIteration
+
+        prev_row: Optional[exprs.DataRow] = None
+        current_group: Optional[List[Any]] = None  # the values of the group-by exprs
+        num_input_rows = 0
+        for row_batch in self.input:
+            num_input_rows += len(row_batch)
+            for row in row_batch:
+                group = [row[e.slot_idx] for e in self.group_by]
+                if current_group is None:
+                    current_group = group
+                    self._reset_agg_state(0)
+                if group != current_group:
+                    # we're entering a new group, emit a row for the previous one
+                    self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
+                    self.output_batch.add_row(prev_row)
+                    current_group = group
+                    self._reset_agg_state(0)
+                self._update_agg_state(row, 0)
+                prev_row = row
+        # emit the last group
+        self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
+        self.output_batch.add_row(prev_row)
+
+        result = self.output_batch
+        result.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
+        self.output_batch = None
+        _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(result.rows)} rows')
+        return result
+
```
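`AggregationNode` implements a streaming group-by over input that arrives already sorted by the group-by values: it resets the aggregate state at each group boundary, updates it per row, and emits one output row per group. The following standalone sketch illustrates that pattern in plain Python; it is a simplified illustration of the control flow, not pixeltable code:

```python
# Simplified illustration of the sorted-input streaming group-by pattern used above.
from typing import Any, Callable, Iterable, Iterator, Tuple

def stream_group_by(
    rows: Iterable[dict],
    key: Callable[[dict], Any],        # analogous to the group-by exprs
    init: Callable[[], Any],           # analogous to reset_agg()
    update: Callable[[Any, dict], Any],  # analogous to fn_call.update(row)
    finalize: Callable[[Any], Any],    # analogous to evaluating the agg FunctionCalls
) -> Iterator[Tuple[Any, Any]]:
    current_key: Any = None
    state: Any = None
    started = False
    for row in rows:
        k = key(row)
        if not started:
            current_key, state, started = k, init(), True
        elif k != current_key:
            yield current_key, finalize(state)  # group boundary: emit the previous group
            current_key, state = k, init()
        state = update(state, row)
    if started:
        yield current_key, finalize(state)      # emit the last group

# e.g. summing 'v' per 'g' over rows already sorted by 'g':
rows = [{'g': 'a', 'v': 1}, {'g': 'a', 'v': 2}, {'g': 'b', 'v': 5}]
print(list(stream_group_by(rows, lambda r: r['g'], lambda: 0,
                           lambda s, r: s + r['v'], lambda s: s)))
# [('a', 3), ('b', 5)]
```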