pixeltable 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (140) hide show
  1. pixeltable/__init__.py +21 -4
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +520 -31
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +373 -48
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +113 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +187 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +61 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +88 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +27 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +413 -182
  88. pixeltable/tests/conftest.py +143 -86
  89. pixeltable/tests/test_audio.py +65 -0
  90. pixeltable/tests/test_catalog.py +27 -0
  91. pixeltable/tests/test_client.py +14 -14
  92. pixeltable/tests/test_component_view.py +372 -0
  93. pixeltable/tests/test_dataframe.py +433 -0
  94. pixeltable/tests/test_dirs.py +78 -62
  95. pixeltable/tests/test_document.py +117 -0
  96. pixeltable/tests/test_exprs.py +591 -135
  97. pixeltable/tests/test_function.py +297 -67
  98. pixeltable/tests/test_functions.py +283 -1
  99. pixeltable/tests/test_migration.py +43 -0
  100. pixeltable/tests/test_nos.py +54 -0
  101. pixeltable/tests/test_snapshot.py +208 -0
  102. pixeltable/tests/test_table.py +1086 -258
  103. pixeltable/tests/test_transactional_directory.py +42 -0
  104. pixeltable/tests/test_types.py +5 -11
  105. pixeltable/tests/test_video.py +149 -34
  106. pixeltable/tests/test_view.py +530 -0
  107. pixeltable/tests/utils.py +186 -45
  108. pixeltable/tool/create_test_db_dump.py +149 -0
  109. pixeltable/type_system.py +490 -133
  110. pixeltable/utils/__init__.py +17 -46
  111. pixeltable/utils/clip.py +12 -15
  112. pixeltable/utils/coco.py +136 -0
  113. pixeltable/utils/documents.py +39 -0
  114. pixeltable/utils/filecache.py +195 -0
  115. pixeltable/utils/help.py +11 -0
  116. pixeltable/utils/media_store.py +76 -0
  117. pixeltable/utils/parquet.py +126 -0
  118. pixeltable/utils/pytorch.py +172 -0
  119. pixeltable/utils/s3.py +13 -0
  120. pixeltable/utils/sql.py +17 -0
  121. pixeltable/utils/transactional_directory.py +35 -0
  122. pixeltable-0.2.0.dist-info/LICENSE +18 -0
  123. pixeltable-0.2.0.dist-info/METADATA +117 -0
  124. pixeltable-0.2.0.dist-info/RECORD +125 -0
  125. {pixeltable-0.1.2.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
  126. pixeltable/catalog.py +0 -1421
  127. pixeltable/exprs.py +0 -1745
  128. pixeltable/function.py +0 -269
  129. pixeltable/functions/clip.py +0 -10
  130. pixeltable/functions/pil/__init__.py +0 -23
  131. pixeltable/functions/tf.py +0 -21
  132. pixeltable/index.py +0 -57
  133. pixeltable/tests/test_dict.py +0 -24
  134. pixeltable/tests/test_tf.py +0 -69
  135. pixeltable/tf.py +0 -33
  136. pixeltable/utils/tf.py +0 -33
  137. pixeltable/utils/video.py +0 -32
  138. pixeltable-0.1.2.dist-info/LICENSE +0 -201
  139. pixeltable-0.1.2.dist-info/METADATA +0 -89
  140. pixeltable-0.1.2.dist-info/RECORD +0 -37
pixeltable/env.py CHANGED
@@ -1,89 +1,414 @@
1
- from typing import Optional
1
+ from __future__ import annotations
2
+ import datetime
3
+ import os
4
+ from typing import Optional, Dict, Any, List
2
5
  from pathlib import Path
3
6
  import sqlalchemy as sql
7
+ import uuid
8
+ import importlib
9
+ import importlib.util
10
+
11
+ import http.server
12
+ import socketserver
13
+ import threading
14
+ import typing
15
+ import uuid
16
+ from pathlib import Path
17
+ from typing import Optional, Dict, Any, List
18
+
19
+ import yaml
4
20
  from sqlalchemy_utils.functions import database_exists, create_database, drop_database
21
+ import pgserver
22
+ import logging
23
+ import sys
24
+ import glob
5
25
 
26
+ from pixeltable import metadata
27
+ import pixeltable.exceptions as excs
28
+
29
+ if typing.TYPE_CHECKING:
30
+ import openai
6
31
 
7
32
  class Env:
8
33
  """
9
34
  Store for runtime globals.
10
35
  """
11
- _instance: Optional['Env'] = None
36
+ _instance: Optional[Env] = None
37
+ _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
12
38
 
13
39
  @classmethod
14
- def get(cls) -> 'Env':
40
+ def get(cls) -> Env:
15
41
  if cls._instance is None:
16
42
  cls._instance = Env()
17
43
  return cls._instance
18
44
 
19
45
  def __init__(self):
20
46
  self._home: Optional[Path] = None
21
- self._db_path: Optional[Path] = None
22
- self._img_dir: Optional[Path] = None
23
- self._nnidx_dir: Optional[Path] = None
24
- self._tmp_video_dir: Optional[Path] = None
47
+ self._media_dir: Optional[Path] = None # computed media files
48
+ self._file_cache_dir: Optional[Path] = None # cached media files with external URL
49
+ self._dataset_cache_dir: Optional[Path] = None # cached datasets (eg, pytorch or COCO)
50
+ self._log_dir: Optional[Path] = None # log files
51
+ self._tmp_dir: Optional[Path] = None # any tmp files
25
52
  self._sa_engine: Optional[sql.engine.base.Engine] = None
53
+ self._pgdata_dir : Optional[Path] = None
26
54
  self._db_name: Optional[str] = None
55
+ self._db_server: Optional[pgserver.PostgresServer] = None
56
+ self._db_url: Optional[str] = None
57
+
58
+ # info about installed packages that are utilized by some parts of the code;
59
+ # package name -> version; version == []: package is installed, but we haven't determined the version yet
60
+ self._installed_packages: Dict[str, Optional[List[int]]] = {}
61
+ self._nos_client: Optional[Any] = None
62
+ self._openai_client: Optional['openai.OpenAI'] = None
63
+ self._has_together_client: bool = False
64
+ self._spacy_nlp: Optional[Any] = None # spacy.Language
65
+ self._httpd: Optional[socketserver.TCPServer] = None
66
+ self._http_address: Optional[str] = None
67
+
68
+ # logging-related state
69
+ self._logger = logging.getLogger('pixeltable')
70
+ self._logger.setLevel(logging.DEBUG) # allow everything to pass, we filter in _log_filter()
71
+ self._logger.propagate = False
72
+ self._logger.addFilter(self._log_filter)
73
+ self._default_log_level = logging.INFO
74
+ self._logfilename: Optional[str] = None
75
+ self._log_to_stdout = False
76
+ self._module_log_level: Dict[str, int] = {} # module name -> log level
77
+
78
+ # config
79
+ self._config_file: Optional[Path] = None
80
+ self._config: Optional[Dict[str, Any]] = None
81
+
82
+ # create logging handler to also log to stdout
83
+ self._stdout_handler = logging.StreamHandler(stream=sys.stdout)
84
+ self._stdout_handler.setFormatter(logging.Formatter(self._log_fmt_str))
85
+ self._initialized = False
86
+
87
@property
def config(self):
    """Return whatever is currently held in ``_config`` (``None`` until a configuration has been assigned)."""
    current_config = self._config
    return current_config
90
+
91
@property
def db_url(self) -> str:
    """Return the database URL stored in ``_db_url``; asserts that it has been set."""
    url = self._db_url
    assert url is not None
    return url
95
+
96
@property
def http_address(self) -> str:
    """Return the address stored in ``_http_address``; asserts that it has been set."""
    address = self._http_address
    assert address is not None
    return address
100
+
101
def print_log_config(self) -> None:
    """
    Print the current logging configuration to stdout, one setting per line:
    the log file name, whether stdout logging is on, the default level, and the
    per-module levels as a comma-separated list of ``name:LEVEL`` pairs.
    """
    stdout_prefix = '' if self._log_to_stdout else 'not '
    default_level_name = logging.getLevelName(self._default_log_level)
    print(f'logging to {self._logfilename}')
    print(f'{stdout_prefix}logging to stdout')
    print(f'default log level: {default_level_name}')
    module_levels = ','.join(
        ':'.join((name, logging.getLevelName(level))) for name, level in self._module_log_level.items())
    print(f'module log levels: {module_levels}')
108
+
109
def log_to_stdout(self, enable: bool = True) -> None:
    """
    Record the requested stdout-logging state and attach the stdout handler to
    (``enable`` is true) or detach it from (``enable`` is false) the logger.
    """
    self._log_to_stdout = enable
    # pick the logger operation matching the requested state, then apply it to the stdout handler
    apply_to_handler = self._logger.addHandler if enable else self._logger.removeHandler
    apply_to_handler(self._stdout_handler)
115
+
116
def set_log_level(self, level: int) -> None:
    """Store ``level`` as the default log level (the threshold ``_log_filter`` compares records against)."""
    self._default_log_level = level
118
+
119
def set_module_log_level(self, module: str, level: Optional[int]) -> None:
    """
    Set the log level override for ``module``.

    Passing ``level=None`` removes an existing override; removing a module that
    has no override is a no-op.
    """
    if level is not None:
        self._module_log_level[module] = level
        return
    self._module_log_level.pop(module, None)
124
+
125
def is_installed_package(self, package_name: str) -> bool:
    """
    Return True if ``package_name`` was recorded as installed.

    The lookup is a plain dict access, so a name that was never recorded in
    ``_installed_packages`` raises KeyError.
    """
    recorded_version = self._installed_packages[package_name]
    return recorded_version is not None
127
+
128
def _log_filter(self, record: logging.LogRecord) -> bool:
    """
    Logging filter: decide whether ``record`` should be emitted.

    For records named 'pixeltable', every path component of ``record.pathname``
    located below the innermost 'pixeltable' component is looked up in
    ``_module_log_level``; the record is accepted as soon as one of those
    components has a configured level that the record meets. Otherwise the
    record is accepted iff it meets ``_default_log_level``.

    Records whose path contains no 'pixeltable' component (e.g. code outside the
    package logging through the 'pixeltable' logger) are judged by the default
    level only, rather than raising ValueError out of the logging call.
    """
    if record.name == 'pixeltable':
        # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
        path_parts = list(Path(record.pathname).parts)
        path_parts.reverse()
        try:
            max_idx = path_parts.index('pixeltable')
        except ValueError:
            # not emitted from a file under a 'pixeltable' directory: no module overrides apply
            max_idx = 0
        # NOTE(review): the first component is the file name including its extension (e.g. 'env.py'),
        # so only directory names match unless overrides are registered with the extension -- confirm intent
        for module_name in path_parts[:max_idx]:
            if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
                return True
    return record.levelno >= self._default_log_level
141
+
142
+ def set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
143
+ if self._initialized:
144
+ return
145
+
146
+ self._initialized = True
147
+ home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
148
+ assert self._home is None or self._home == home
149
+ self._home = home
150
+ self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.yaml')))
151
+ self._media_dir = self._home / 'media'
152
+ self._file_cache_dir = self._home / 'file_cache'
153
+ self._dataset_cache_dir = self._home / 'dataset_cache'
154
+ self._log_dir = self._home / 'logs'
155
+ self._tmp_dir = self._home / 'tmp'
156
+ self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
157
+
158
+ # Read in the config
159
+ if os.path.isfile(self._config_file):
160
+ with open(self._config_file, 'r') as stream:
161
+ try:
162
+ self._config = yaml.safe_load(stream)
163
+ except yaml.YAMLError as exc:
164
+ self._logger.error(f'Could not read config file: {self._config_file}')
165
+ self._config = {}
166
+ else:
167
+ self._config = {}
27
168
 
28
- def set_up(self, home_str: Optional[str], db_name: Optional[str], echo: bool = False) -> None:
29
- home = Path.home() / '.pixeltable' if home_str is None else Path(home_str)
30
- if db_name is None:
31
- db_name = 'pixeltable'
32
- self.set_home(home)
33
169
  if self._home.exists() and not self._home.is_dir():
34
170
  raise RuntimeError(f'{self._home} is not a directory')
35
171
 
36
- self._db_name = db_name
37
- db_url = f'postgresql:///{self._db_name}'
38
-
39
172
  if not self._home.exists():
40
- print(f'setting up Pixeltable at {self._home}, db at {db_url}')
173
+ # we don't have our logger set up yet, so print to stdout
174
+ print(f'Creating a Pixeltable instance at: {self._home}')
41
175
  self._home.mkdir()
42
- self._img_dir.mkdir()
43
- self._nnidx_dir.mkdir()
44
- self._tmp_video_dir.mkdir()
45
- self.tear_down()
46
- if not database_exists(db_url):
47
- create_database(db_url)
48
- self._sa_engine = sql.create_engine(db_url, echo=echo, future=True)
49
- from pixeltable import store
50
- store.Base.metadata.create_all(self._sa_engine)
176
+ # TODO (aaron-siegel) This is the existing behavior, but it seems scary. If something happens to
177
+ # self._home, it will cause the DB to be destroyed even if pgdata is in an alternate location.
178
+ # PROPOSAL: require `reinit_db` to be set explicitly to destroy the DB.
179
+ reinit_db = True
180
+
181
+ if not self._media_dir.exists():
182
+ self._media_dir.mkdir()
183
+ if not self._file_cache_dir.exists():
184
+ self._file_cache_dir.mkdir()
185
+ if not self._dataset_cache_dir.exists():
186
+ self._dataset_cache_dir.mkdir()
187
+ if not self._log_dir.exists():
188
+ self._log_dir.mkdir()
189
+ if not self._tmp_dir.exists():
190
+ self._tmp_dir.mkdir()
191
+
192
+ # configure _logger to log to a file
193
+ self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
194
+ fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
195
+ fh.setFormatter(logging.Formatter(self._log_fmt_str))
196
+ self._logger.addHandler(fh)
197
+ sql_logger = logging.getLogger('sqlalchemy.engine')
198
+ sql_logger.setLevel(logging.INFO)
199
+ sql_logger.addHandler(fh)
200
+ sql_logger.propagate = False
201
+
202
+ # empty tmp dir
203
+ for path in glob.glob(f'{self._tmp_dir}/*'):
204
+ os.remove(path)
205
+
206
+ self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
207
+
208
+ # cleanup_mode=None will leave db on for debugging purposes
209
+ self._db_server = pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
210
+ self._db_url = self._db_server.get_uri(database=self._db_name)
211
+
212
+ if reinit_db:
213
+ if database_exists(self.db_url):
214
+ drop_database(self.db_url)
215
+
216
+ if not database_exists(self.db_url):
217
+ self._logger.info(f'creating database at {self.db_url}')
218
+ create_database(self.db_url)
219
+ self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
220
+ from pixeltable.metadata import schema
221
+ schema.Base.metadata.create_all(self._sa_engine)
222
+ metadata.create_system_info(self._sa_engine)
223
+ # enable pgvector
224
+ with self._sa_engine.begin() as conn:
225
+ conn.execute(sql.text('CREATE EXTENSION vector'))
51
226
  else:
52
- if not database_exists(db_url):
53
- raise RuntimeError(f'Database not found: {db_url}')
227
+ self._logger.info(f'found database {self.db_url}')
54
228
  if self._sa_engine is None:
55
- self._sa_engine = sql.create_engine(db_url, echo=echo, future=True)
229
+ self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
230
+
231
+ print(f'Connected to Pixeltable database at: {self.db_url}')
232
+
233
+ # we now have a home directory and db; start other services
234
+ self._set_up_runtime()
235
+ self.log_to_stdout(False)
56
236
 
57
- def tear_down(self) -> None:
58
- db_url = f'postgresql:///{self._db_name}'
59
- if database_exists(db_url):
60
- drop_database(db_url)
237
def upgrade_metadata(self) -> None:
    """Call ``metadata.upgrade_md`` with this environment's SQLAlchemy engine."""
    engine = self._sa_engine
    metadata.upgrade_md(engine)
61
239
 
62
- def set_home(self, home: Path) -> None:
63
- if self._home is not None:
240
+ def _create_nos_client(self) -> None:
241
+ import nos
242
+ self._logger.info('connecting to NOS')
243
+ nos.init(logging_level=logging.DEBUG)
244
+ self._nos_client = nos.client.InferenceClient()
245
+ self._logger.info('waiting for NOS')
246
+ self._nos_client.WaitForServer()
247
+
248
+ # now that we have a client, we can create the module
249
+ import importlib
250
+ try:
251
+ importlib.import_module('pixeltable.functions.nos')
252
+ # it's already been created
64
253
  return
65
- self._home = home
66
- self._db_path = self._home / 'db.sqlite3'
67
- self._img_dir = self._home / 'images'
68
- self._nnidx_dir = self._home / 'nnidxs'
69
- self._tmp_video_dir = self._home / 'tmp_videos'
254
+ except ImportError:
255
+ pass
256
+ from pixeltable.functions.util import create_nos_modules
257
+ _ = create_nos_modules()
258
+
259
def _create_openai_client(self) -> None:
    """
    Create the OpenAI client if an API key is configured.

    The key is taken from the 'openai' -> 'api_key' entry of the config when that
    entry exists, otherwise from the OPENAI_API_KEY environment variable. If the
    resulting key is missing or empty, no client is created.

    A config that is None (e.g. an empty YAML file) or whose 'openai' section is
    not a mapping is treated as "no key in the config" instead of raising TypeError.
    """
    config = self._config if isinstance(self._config, dict) else {}
    section = config.get('openai')
    if isinstance(section, dict) and 'api_key' in section:
        api_key = section['api_key']
    else:
        api_key = os.environ.get('OPENAI_API_KEY')
    if api_key is None or api_key == '':
        self._logger.info("OpenAI client not initialized (no API key configured).")
        return
    # imported lazily: openai is an optional dependency
    import openai
    self._logger.info('Initializing OpenAI client.')
    self._openai_client = openai.OpenAI(api_key=api_key)
270
+
271
def _create_together_client(self) -> None:
    """
    Configure the Together client if an API key is configured.

    The key is taken from the 'together' -> 'api_key' entry of the config when
    that entry exists, otherwise from the TOGETHER_API_KEY environment variable.
    If the resulting key is missing or empty, nothing is configured.

    A config that is None (e.g. an empty YAML file) or whose 'together' section is
    not a mapping is treated as "no key in the config" instead of raising TypeError.
    """
    config = self._config if isinstance(self._config, dict) else {}
    section = config.get('together')
    if isinstance(section, dict) and 'api_key' in section:
        api_key = section['api_key']
    else:
        api_key = os.environ.get('TOGETHER_API_KEY')
    if api_key is None or api_key == '':
        self._logger.info('Together client not initialized (no API key configured).')
        return
    # imported lazily: together is an optional dependency
    import together
    self._logger.info('Initializing Together client.')
    # the key is set on the together module itself; we only remember that it has been configured
    together.api_key = api_key
    self._has_together_client = True
283
+
284
def _start_web_server(self) -> None:
    """
    Start a local http server in a daemon thread.

    The http server root is the file system root.
    eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
    This arrangement enables serving media hosted within _home,
    as well as external media inserted into pixeltable or produced by pixeltable.
    The port is chosen dynamically to prevent conflicts.
    """
    # NOTE(review): every file readable by this process becomes reachable by any client that can
    # connect to 127.0.0.1 on the chosen port -- confirm this exposure is acceptable
    # Port 0 means OS picks one for us.
    address = ("127.0.0.1", 0)

    class FixedRootHandler(http.server.SimpleHTTPRequestHandler):
        """Request handler that always serves from '/', independent of the current working directory."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, directory='/', **kwargs)

    self._httpd = socketserver.TCPServer(address, FixedRootHandler)
    port = self._httpd.server_address[1]
    self._http_address = f'http://127.0.0.1:{port}'

    def run_server():
        # log through the pixeltable logger: the module-level logging.log() targets the root logger
        # and implicitly calls basicConfig() on it when it has no handlers
        self._logger.info(f'running web server at {self._http_address}')
        self._httpd.serve_forever()

    # Run the server in a separate thread
    thread = threading.Thread(target=run_server, daemon=True)
    thread.start()
308
+
309
def _set_up_runtime(self) -> None:
    """
    Check for and start runtime services.

    Starts the web server first, then records which optional packages are installed.
    """
    self._start_web_server()
    self._check_installed_packages()
313
+
314
def _check_installed_packages(self) -> None:
    """
    Record which optional packages are importable and initialize the clients that depend on them.

    Each package is entered into ``_installed_packages`` as ``[]`` (installed, version not
    determined yet) or ``None`` (not installed). For spacy, openai, together and nos the
    corresponding model/client is set up right after the package is found.
    """
    def check(package: str) -> bool:
        """Record the install status of `package` and return True if it is installed."""
        installed = importlib.util.find_spec(package) is not None
        self._installed_packages[package] = [] if installed else None
        return installed

    for package in ('torch', 'torchvision', 'transformers', 'sentence_transformers', 'boto3', 'pyarrow'):
        check(package)
    if check('spacy'):  # TODO: deal with en-core-web-sm
        import spacy
        try:
            self._spacy_nlp = spacy.load('en_core_web_sm')
        except OSError as exc:
            # spacy is installed but the model is not; don't abort the whole environment setup over it
            self._logger.warning(f'spacy is installed, but the model en_core_web_sm could not be loaded: {exc}')
    check('tiktoken')
    if check('openai'):
        self._create_openai_client()
    if check('together'):
        self._create_together_client()
    check('fireworks')
    if check('nos'):
        self._create_nos_client()
342
+
343
def require_package(self, package: str, min_version: Optional[List[int]] = None) -> None:
    """
    Verify that `package` is installed, optionally in at least version `min_version`.

    Args:
        package: name of a package that has already been recorded in ``_installed_packages``
        min_version: minimum required version as a list of ints, eg, [1, 2, 0]; None: any version

    Raises:
        excs.Error: if the package is not installed or its version is older than `min_version`
    """
    assert package in self._installed_packages
    if self._installed_packages[package] is None:
        raise excs.Error(f'Package {package} is not installed')
    if min_version is None:
        return

    # check whether we have a version >= the required one
    if self._installed_packages[package] == []:
        # the version hasn't been determined yet: import the package and parse its __version__
        m = importlib.import_module(package)
        module_version: List[int] = []
        for component in m.__version__.split('.'):
            # keep only the leading digits, so that components like '0+cu118' or '0rc1' don't break parsing
            digits = ''
            for ch in component:
                if ch not in '0123456789':
                    break
                digits += ch
            if digits == '':
                break
            module_version.append(int(digits))
            if len(digits) != len(component):
                # a non-numeric suffix ends the numeric part of the version
                break
        self._installed_packages[package] = module_version
    installed_version = self._installed_packages[package]

    # pad both versions with zeros to the same length and compare them component by component,
    # most significant component first (list comparison is lexicographic)
    width = max(len(installed_version), len(min_version))
    padded_installed = list(installed_version) + [0] * (width - len(installed_version))
    padded_min = list(min_version) + [0] * (width - len(min_version))
    if padded_installed < padded_min:
        raise excs.Error((
            f'The installed version of package {package} is {".".join([str(v) for v in installed_version])}, '
            f'but version >={".".join([str(v) for v in min_version])} is required'))
362
+
363
def num_tmp_files(self) -> int:
    """Return the number of entries in the tmp directory that match the glob pattern ``*``."""
    pattern = f'{self._tmp_dir}/*'
    matches = glob.glob(pattern)
    return len(matches)
365
+
366
def create_tmp_path(self, extension: str = '') -> Path:
    """
    Return a new path inside the tmp directory, named with a random UUID followed by `extension`.

    Only the path is built here; no file is created.
    """
    tmp_dir = self._tmp_dir
    file_name = f'{uuid.uuid4()}{extension}'
    return tmp_dir / file_name
368
+
369
@property
def home(self) -> Path:
    """Return the home directory stored in ``_home``; asserts that it has been set."""
    home_dir = self._home
    assert home_dir is not None
    return home_dir
373
+
374
@property
def media_dir(self) -> Path:
    """Return the media directory stored in ``_media_dir``; asserts that it has been set."""
    directory = self._media_dir
    assert directory is not None
    return directory
70
378
 
71
379
  @property
72
- def img_dir(self) -> Path:
73
- assert self._img_dir is not None
74
- return self._img_dir
380
+ def file_cache_dir(self) -> Path:
381
+ assert self._file_cache_dir is not None
382
+ return self._file_cache_dir
75
383
 
76
384
  @property
77
- def nnidx_dir(self) -> Path:
78
- assert self._nnidx_dir is not None
79
- return self._nnidx_dir
385
+ def dataset_cache_dir(self) -> Path:
386
+ assert self._dataset_cache_dir is not None
387
+ return self._dataset_cache_dir
80
388
 
81
389
  @property
82
- def tmp_video_dir(self) -> Path:
83
- assert self._tmp_video_dir is not None
84
- return self._tmp_video_dir
390
+ def tmp_dir(self) -> Path:
391
+ assert self._tmp_dir is not None
392
+ return self._tmp_dir
85
393
 
86
394
  @property
87
395
  def engine(self) -> sql.engine.base.Engine:
88
396
  assert self._sa_engine is not None
89
397
  return self._sa_engine
398
+
399
@property
def nos_client(self) -> Any:
    """Return whatever is held in ``_nos_client``; ``None`` if no client has been assigned."""
    client = self._nos_client
    return client
402
+
403
@property
def openai_client(self) -> Optional['openai.OpenAI']:
    """Return whatever is held in ``_openai_client``; ``None`` if no client has been assigned."""
    client = self._openai_client
    return client
406
+
407
@property
def has_together_client(self) -> bool:
    """Return the ``_has_together_client`` flag."""
    flag = self._has_together_client
    return flag
410
+
411
@property
def spacy_nlp(self) -> Any:
    """Return the object stored in ``_spacy_nlp``; asserts that it has been set."""
    nlp = self._spacy_nlp
    assert nlp is not None
    return nlp
pixeltable/exceptions.py CHANGED
@@ -1,26 +1,17 @@
1
- class Error(Exception):
2
- pass
3
-
4
-
5
- class DuplicateNameError(Exception):
6
- pass
7
-
8
-
9
- class UnknownEntityError(Exception):
10
- pass
1
+ from typing import List, Any
2
+ from types import TracebackType
3
+ from dataclasses import dataclass
11
4
 
12
5
 
13
- class BadFormatError(Exception):
14
- pass
15
-
16
-
17
- class DirectoryNotEmptyError(Exception):
18
- pass
19
-
20
-
21
- class InsertError(Exception):
6
+ class Error(Exception):
22
7
  pass
23
8
 
24
9
 
25
- class OperationalError(Exception):
26
- pass
10
# Raised with the details of a failed expression evaluation. The fields are filled positionally by the
# dataclass-generated __init__, in the order in which they are declared below.
@dataclass
class ExprEvalError(Exception):
    # the expression that failed; exprs.Expr, but we're not importing pixeltable.exprs to avoid circular imports
    expr: Any
    # description of the failing expression or step
    expr_msg: str
    # the exception that was caught
    exc: Exception
    # traceback that belongs to `exc`
    exc_tb: TracebackType
    # the input values that were in use when the failure occurred
    input_vals: List[Any]
    # row number supplied by the code that raises the error
    row_num: int
@@ -0,0 +1,9 @@
1
+ from .aggregation_node import AggregationNode
2
+ from .cache_prefetch_node import CachePrefetchNode
3
+ from .component_iteration_node import ComponentIterationNode
4
+ from .exec_context import ExecContext
5
+ from .exec_node import ExecNode
6
+ from .expr_eval_node import ExprEvalNode
7
+ from .in_memory_data_node import InMemoryDataNode
8
+ from .sql_scan_node import SqlScanNode
9
+ from .media_validation_node import MediaValidationNode
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import sys
5
+ from typing import List, Optional, Any
6
+
7
+ import pixeltable.catalog as catalog
8
+ import pixeltable.exceptions as excs
9
+ import pixeltable.exprs as exprs
10
+ from .data_row_batch import DataRowBatch
11
+ from .exec_node import ExecNode
12
+
13
+ _logger = logging.getLogger('pixeltable')
14
+
15
class AggregationNode(ExecNode):
    """
    Exec node that folds its input rows into one output row per group.

    A new group starts whenever the values of the group-by exprs differ from those of the
    preceding row, so rows of the same group need to arrive consecutively in order to be
    aggregated together. The entire input is consumed by the first call to __next__(), which
    returns all output rows in a single batch.
    """

    def __init__(
            self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: List[exprs.Expr],
            agg_fn_calls: List[exprs.FunctionCall], input_exprs: List[exprs.Expr], input: ExecNode
    ):
        """
        Args:
            tbl: table version passed on to the output DataRowBatch
            row_builder: used to create the eval ctx for, and to evaluate, the aggregate function calls
            group_by: exprs whose values define the groups
            agg_fn_calls: the aggregate function calls to evaluate per group
            input_exprs: exprs that are excluded from the aggregates' eval ctx
            input: the node that supplies the input row batches
        """
        super().__init__(row_builder, group_by + agg_fn_calls, input_exprs, input)
        self.input = input
        self.group_by = group_by
        self.input_exprs = input_exprs
        self.agg_fn_calls = agg_fn_calls
        self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=input_exprs)
        # accumulates the output rows; set to None once it has been returned
        self.output_batch = DataRowBatch(tbl, row_builder, 0)

    def _reset_agg_state(self, row_num: int) -> None:
        """Reset every aggregate; a failure is re-raised as ExprEvalError tagged with `row_num`."""
        for fn_call in self.agg_fn_calls:
            try:
                fn_call.reset_agg()
            except Exception as e:
                _, _, exc_tb = sys.exc_info()
                expr_msg = f'init() function of the aggregate {fn_call}'
                raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, [], row_num) from e

    def _update_agg_state(self, row: exprs.DataRow, row_num: int) -> None:
        """Feed `row` to every aggregate; a failure is re-raised as ExprEvalError tagged with `row_num`."""
        for fn_call in self.agg_fn_calls:
            try:
                fn_call.update(row)
            except Exception as e:
                _, _, exc_tb = sys.exc_info()
                expr_msg = f'update() function of the aggregate {fn_call}'
                input_vals = [row[d.slot_idx] for d in fn_call.dependencies()]
                raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, input_vals, row_num) from e

    def __next__(self) -> DataRowBatch:
        """
        Consume the entire input and return a batch with one row per group.

        Raises:
            StopIteration: on every call after the output batch has been returned
        """
        if self.output_batch is None:
            raise StopIteration

        prev_row: Optional[exprs.DataRow] = None
        current_group: Optional[List[Any]] = None  # the values of the group-by exprs
        num_input_rows = 0
        # NOTE(review): row_num is always passed as 0 below, so an ExprEvalError raised here doesn't
        # identify the offending input row -- confirm whether the running row count should be used
        for row_batch in self.input:
            num_input_rows += len(row_batch)
            for row in row_batch:
                group = [row[e.slot_idx] for e in self.group_by]
                if current_group is None:
                    current_group = group
                    self._reset_agg_state(0)
                if group != current_group:
                    # we're entering a new group, emit a row for the previous one
                    self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
                    self.output_batch.add_row(prev_row)
                    current_group = group
                    self._reset_agg_state(0)
                self._update_agg_state(row, 0)
                prev_row = row
        if prev_row is not None:
            # emit the last group; without any input rows there is no group and no row to evaluate
            self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
            self.output_batch.add_row(prev_row)

        result = self.output_batch
        result.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
        self.output_batch = None
        _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(result.rows)} rows')
        return result
78
+