pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. More details are linked from the original registry diff page.

Files changed (147)
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +590 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +359 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +195 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +34 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +256 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +122 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +418 -182
  88. pixeltable/tests/conftest.py +146 -88
  89. pixeltable/tests/functions/test_fireworks.py +42 -0
  90. pixeltable/tests/functions/test_functions.py +60 -0
  91. pixeltable/tests/functions/test_huggingface.py +158 -0
  92. pixeltable/tests/functions/test_openai.py +152 -0
  93. pixeltable/tests/functions/test_together.py +111 -0
  94. pixeltable/tests/test_audio.py +65 -0
  95. pixeltable/tests/test_catalog.py +27 -0
  96. pixeltable/tests/test_client.py +14 -14
  97. pixeltable/tests/test_component_view.py +370 -0
  98. pixeltable/tests/test_dataframe.py +439 -0
  99. pixeltable/tests/test_dirs.py +78 -62
  100. pixeltable/tests/test_document.py +120 -0
  101. pixeltable/tests/test_exprs.py +592 -135
  102. pixeltable/tests/test_function.py +297 -67
  103. pixeltable/tests/test_migration.py +43 -0
  104. pixeltable/tests/test_nos.py +54 -0
  105. pixeltable/tests/test_snapshot.py +208 -0
  106. pixeltable/tests/test_table.py +1195 -263
  107. pixeltable/tests/test_transactional_directory.py +42 -0
  108. pixeltable/tests/test_types.py +5 -11
  109. pixeltable/tests/test_video.py +151 -34
  110. pixeltable/tests/test_view.py +530 -0
  111. pixeltable/tests/utils.py +320 -45
  112. pixeltable/tool/create_test_db_dump.py +149 -0
  113. pixeltable/tool/create_test_video.py +81 -0
  114. pixeltable/type_system.py +445 -124
  115. pixeltable/utils/__init__.py +17 -46
  116. pixeltable/utils/arrow.py +98 -0
  117. pixeltable/utils/clip.py +12 -15
  118. pixeltable/utils/coco.py +136 -0
  119. pixeltable/utils/documents.py +39 -0
  120. pixeltable/utils/filecache.py +195 -0
  121. pixeltable/utils/help.py +11 -0
  122. pixeltable/utils/hf_datasets.py +157 -0
  123. pixeltable/utils/media_store.py +76 -0
  124. pixeltable/utils/parquet.py +167 -0
  125. pixeltable/utils/pytorch.py +91 -0
  126. pixeltable/utils/s3.py +13 -0
  127. pixeltable/utils/sql.py +17 -0
  128. pixeltable/utils/transactional_directory.py +35 -0
  129. pixeltable-0.2.4.dist-info/LICENSE +18 -0
  130. pixeltable-0.2.4.dist-info/METADATA +127 -0
  131. pixeltable-0.2.4.dist-info/RECORD +132 -0
  132. {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
  133. pixeltable/catalog.py +0 -1421
  134. pixeltable/exprs.py +0 -1745
  135. pixeltable/function.py +0 -269
  136. pixeltable/functions/clip.py +0 -10
  137. pixeltable/functions/pil/__init__.py +0 -23
  138. pixeltable/functions/tf.py +0 -21
  139. pixeltable/index.py +0 -57
  140. pixeltable/tests/test_dict.py +0 -24
  141. pixeltable/tests/test_functions.py +0 -11
  142. pixeltable/tests/test_tf.py +0 -69
  143. pixeltable/tf.py +0 -33
  144. pixeltable/utils/tf.py +0 -33
  145. pixeltable/utils/video.py +0 -32
  146. pixeltable-0.1.0.dist-info/METADATA +0 -34
  147. pixeltable-0.1.0.dist-info/RECORD +0 -36
pixeltable/env.py CHANGED
@@ -1,86 +1,400 @@
1
- from typing import Optional
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ import glob
5
+ import http.server
6
+ import importlib
7
+ import importlib.util
8
+ import logging
9
+ import os
10
+ import socketserver
11
+ import sys
12
+ import threading
13
+ import typing
14
+ import uuid
2
15
  from pathlib import Path
16
+ from typing import Callable, Optional, Dict, Any, List
17
+
18
+ import pgserver
3
19
  import sqlalchemy as sql
20
+ import yaml
4
21
  from sqlalchemy_utils.functions import database_exists, create_database, drop_database
5
22
 
23
+ import pixeltable.exceptions as excs
24
+ from pixeltable import metadata
25
+
6
26
 
7
27
  class Env:
8
28
  """
9
29
  Store for runtime globals.
10
30
  """
11
- _instance: Optional['Env'] = None
31
+ _instance: Optional[Env] = None
32
+ _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
12
33
 
13
34
  @classmethod
14
- def get(cls) -> 'Env':
35
+ def get(cls) -> Env:
15
36
  if cls._instance is None:
16
37
  cls._instance = Env()
17
38
  return cls._instance
18
39
 
19
40
  def __init__(self):
20
41
  self._home: Optional[Path] = None
21
- self._db_path: Optional[Path] = None
22
- self._img_dir: Optional[Path] = None
23
- self._nnidx_dir: Optional[Path] = None
24
- self._tmp_video_dir: Optional[Path] = None
42
+ self._media_dir: Optional[Path] = None # computed media files
43
+ self._file_cache_dir: Optional[Path] = None # cached media files with external URL
44
+ self._dataset_cache_dir: Optional[Path] = None # cached datasets (eg, pytorch or COCO)
45
+ self._log_dir: Optional[Path] = None # log files
46
+ self._tmp_dir: Optional[Path] = None # any tmp files
25
47
  self._sa_engine: Optional[sql.engine.base.Engine] = None
48
+ self._pgdata_dir : Optional[Path] = None
26
49
  self._db_name: Optional[str] = None
50
+ self._db_server: Optional[pgserver.PostgresServer] = None
51
+ self._db_url: Optional[str] = None
52
+
53
+ # info about installed packages that are utilized by some parts of the code;
54
+ # package name -> version; version == []: package is installed, but we haven't determined the version yet
55
+ self._installed_packages: Dict[str, Optional[List[int]]] = {}
56
+ self._nos_client: Optional[Any] = None
57
+ self._spacy_nlp: Optional[Any] = None # spacy.Language
58
+ self._httpd: Optional[socketserver.TCPServer] = None
59
+ self._http_address: Optional[str] = None
60
+
61
+ self._registered_clients: dict[str, Any] = {}
62
+
63
+ # logging-related state
64
+ self._logger = logging.getLogger('pixeltable')
65
+ self._logger.setLevel(logging.DEBUG) # allow everything to pass, we filter in _log_filter()
66
+ self._logger.propagate = False
67
+ self._logger.addFilter(self._log_filter)
68
+ self._default_log_level = logging.INFO
69
+ self._logfilename: Optional[str] = None
70
+ self._log_to_stdout = False
71
+ self._module_log_level: Dict[str, int] = {} # module name -> log level
72
+
73
+ # config
74
+ self._config_file: Optional[Path] = None
75
+ self._config: Optional[Dict[str, Any]] = None
76
+
77
+ # create logging handler to also log to stdout
78
+ self._stdout_handler = logging.StreamHandler(stream=sys.stdout)
79
+ self._stdout_handler.setFormatter(logging.Formatter(self._log_fmt_str))
80
+ self._initialized = False
81
+
82
+ @property
83
+ def config(self):
84
+ return self._config
85
+
86
+ @property
87
+ def db_url(self) -> str:
88
+ assert self._db_url is not None
89
+ return self._db_url
90
+
91
+ @property
92
+ def http_address(self) -> str:
93
+ assert self._http_address is not None
94
+ return self._http_address
95
+
96
+ def print_log_config(self) -> None:
97
+ print(f'logging to {self._logfilename}')
98
+ print(f'{"" if self._log_to_stdout else "not "}logging to stdout')
99
+ print(f'default log level: {logging.getLevelName(self._default_log_level)}')
100
+ print(
101
+ f'module log levels: '
102
+ f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}')
103
+
104
+ def log_to_stdout(self, enable: bool = True) -> None:
105
+ self._log_to_stdout = enable
106
+ if enable:
107
+ self._logger.addHandler(self._stdout_handler)
108
+ else:
109
+ self._logger.removeHandler(self._stdout_handler)
110
+
111
+ def set_log_level(self, level: int) -> None:
112
+ self._default_log_level = level
113
+
114
+ def set_module_log_level(self, module: str, level: Optional[int]) -> None:
115
+ if level is None:
116
+ self._module_log_level.pop(module, None)
117
+ else:
118
+ self._module_log_level[module] = level
119
+
120
+ def is_installed_package(self, package_name: str) -> bool:
121
+ return self._installed_packages[package_name] is not None
122
+
123
+ def _log_filter(self, record: logging.LogRecord) -> bool:
124
+ if record.name == 'pixeltable':
125
+ # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
126
+ path_parts = list(Path(record.pathname).parts)
127
+ path_parts.reverse()
128
+ max_idx = path_parts.index('pixeltable')
129
+ for module_name in path_parts[:max_idx]:
130
+ if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
131
+ return True
132
+ if record.levelno >= self._default_log_level:
133
+ return True
134
+ else:
135
+ return False
136
+
137
+ def set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
138
+ if self._initialized:
139
+ return
140
+
141
+ self._initialized = True
142
+ home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
143
+ assert self._home is None or self._home == home
144
+ self._home = home
145
+ self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.yaml')))
146
+ self._media_dir = self._home / 'media'
147
+ self._file_cache_dir = self._home / 'file_cache'
148
+ self._dataset_cache_dir = self._home / 'dataset_cache'
149
+ self._log_dir = self._home / 'logs'
150
+ self._tmp_dir = self._home / 'tmp'
151
+
152
+ # Read in the config
153
+ if os.path.isfile(self._config_file):
154
+ with open(self._config_file, 'r') as stream:
155
+ try:
156
+ self._config = yaml.safe_load(stream)
157
+ except yaml.YAMLError as exc:
158
+ self._logger.error(f'Could not read config file: {self._config_file}')
159
+ self._config = {}
160
+ else:
161
+ self._config = {}
27
162
 
28
- def set_up(
29
- self, home_parent: Optional[Path] = Path.home(), db_name: str = 'pixeltable', echo: bool = False
30
- ) -> None:
31
- self.set_home(home_parent / '.pixeltable')
32
163
  if self._home.exists() and not self._home.is_dir():
33
164
  raise RuntimeError(f'{self._home} is not a directory')
34
165
 
35
- self._db_name = db_name
36
- db_url = f'postgresql:///{self._db_name}'
37
-
38
166
  if not self._home.exists():
39
- print(f'creating {self._home}')
167
+ # we don't have our logger set up yet, so print to stdout
168
+ print(f'Creating a Pixeltable instance at: {self._home}')
40
169
  self._home.mkdir()
41
- self._img_dir.mkdir()
42
- self._nnidx_dir.mkdir()
43
- self._tmp_video_dir.mkdir()
44
- self.tear_down()
45
- if not database_exists(db_url):
46
- create_database(db_url)
47
- self._sa_engine = sql.create_engine(db_url, echo=echo, future=True)
48
- from pixeltable import store
49
- store.Base.metadata.create_all(self._sa_engine)
170
+ # TODO (aaron-siegel) This is the existing behavior, but it seems scary. If something happens to
171
+ # self._home, it will cause the DB to be destroyed even if pgdata is in an alternate location.
172
+ # PROPOSAL: require `reinit_db` to be set explicitly to destroy the DB.
173
+ reinit_db = True
174
+
175
+ if not self._media_dir.exists():
176
+ self._media_dir.mkdir()
177
+ if not self._file_cache_dir.exists():
178
+ self._file_cache_dir.mkdir()
179
+ if not self._dataset_cache_dir.exists():
180
+ self._dataset_cache_dir.mkdir()
181
+ if not self._log_dir.exists():
182
+ self._log_dir.mkdir()
183
+ if not self._tmp_dir.exists():
184
+ self._tmp_dir.mkdir()
185
+
186
+ # configure _logger to log to a file
187
+ self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
188
+ fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
189
+ fh.setFormatter(logging.Formatter(self._log_fmt_str))
190
+ self._logger.addHandler(fh)
191
+ sql_logger = logging.getLogger('sqlalchemy.engine')
192
+ sql_logger.setLevel(logging.INFO)
193
+ sql_logger.addHandler(fh)
194
+ sql_logger.propagate = False
195
+
196
+ # empty tmp dir
197
+ for path in glob.glob(f'{self._tmp_dir}/*'):
198
+ os.remove(path)
199
+
200
+ self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
201
+ self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
202
+
203
+ # in pgserver.get_server(): cleanup_mode=None will leave db on for debugging purposes
204
+ self._db_server = pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
205
+ self._db_url = self._db_server.get_uri(database=self._db_name)
206
+
207
+ if reinit_db:
208
+ if database_exists(self.db_url):
209
+ drop_database(self.db_url)
210
+
211
+ if not database_exists(self.db_url):
212
+ self._logger.info(f'creating database at {self.db_url}')
213
+ create_database(self.db_url)
214
+ self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
215
+ from pixeltable.metadata import schema
216
+ schema.Base.metadata.create_all(self._sa_engine)
217
+ metadata.create_system_info(self._sa_engine)
218
+ # enable pgvector
219
+ with self._sa_engine.begin() as conn:
220
+ conn.execute(sql.text('CREATE EXTENSION vector'))
50
221
  else:
222
+ self._logger.info(f'found database {self.db_url}')
51
223
  if self._sa_engine is None:
52
- self._sa_engine = sql.create_engine(db_url, echo=echo, future=True)
224
+ self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
225
+
226
+ print(f'Connected to Pixeltable database at: {self.db_url}')
53
227
 
54
- def tear_down(self) -> None:
55
- db_url = f'postgresql:///{self._db_name}'
56
- if database_exists(db_url):
57
- drop_database(db_url)
228
+ # we now have a home directory and db; start other services
229
+ self._set_up_runtime()
230
+ self.log_to_stdout(False)
58
231
 
59
- def set_home(self, home: Path) -> None:
60
- if self._home is not None:
232
+ def upgrade_metadata(self) -> None:
233
+ metadata.upgrade_md(self._sa_engine)
234
+
235
+ def _create_nos_client(self) -> None:
236
+ import nos
237
+ self._logger.info('connecting to NOS')
238
+ nos.init(logging_level=logging.DEBUG)
239
+ self._nos_client = nos.client.InferenceClient()
240
+ self._logger.info('waiting for NOS')
241
+ self._nos_client.WaitForServer()
242
+
243
+ # now that we have a client, we can create the module
244
+ import importlib
245
+ try:
246
+ importlib.import_module('pixeltable.functions.nos')
247
+ # it's already been created
61
248
  return
62
- self._home = home
63
- self._db_path = self._home / 'db.sqlite3'
64
- self._img_dir = self._home / 'images'
65
- self._nnidx_dir = self._home / 'nnidxs'
66
- self._tmp_video_dir = self._home / 'tmp_videos'
249
+ except ImportError:
250
+ pass
251
+ from pixeltable.functions.util import create_nos_modules
252
+ _ = create_nos_modules()
253
+
254
+ def get_client(self, name: str, init: Callable, environ: Optional[str] = None) -> Any:
255
+ """
256
+ Gets the client with the specified name, using `init` to construct one if necessary.
257
+
258
+ - name: The name of the client
259
+ - init: A `Callable` with signature `fn(api_key: str) -> Any` that constructs a client object
260
+ - environ: The name of the environment variable to use for the API key, if no API key is found in config
261
+ (defaults to f'{name.upper()}_API_KEY')
262
+ """
263
+ if name in self._registered_clients:
264
+ return self._registered_clients[name]
265
+
266
+ if environ is None:
267
+ environ = f'{name.upper()}_API_KEY'
268
+
269
+ if name in self._config and 'api_key' in self._config[name]:
270
+ api_key = self._config[name]['api_key']
271
+ else:
272
+ api_key = os.environ.get(environ)
273
+ if api_key is None or api_key == '':
274
+ raise excs.Error(f'`{name}` client not initialized (no API key configured).')
275
+
276
+ client = init(api_key)
277
+ self._registered_clients[name] = client
278
+ self._logger.info(f'Initialized `{name}` client.')
279
+ return client
280
+
281
+ def _start_web_server(self) -> None:
282
+ """
283
+ The http server root is the file system root.
284
+ eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
285
+ This arrangement enables serving media hosted within _home,
286
+ as well as external media inserted into pixeltable or produced by pixeltable.
287
+ The port is chosen dynamically to prevent conflicts.
288
+ """
289
+ # Port 0 means OS picks one for us.
290
+ address = ("127.0.0.1", 0)
291
+ class FixedRootHandler(http.server.SimpleHTTPRequestHandler):
292
+ def __init__(self, *args, **kwargs):
293
+ super().__init__(*args, directory='/', **kwargs)
294
+ self._httpd = socketserver.TCPServer(address, FixedRootHandler)
295
+ port = self._httpd.server_address[1]
296
+ self._http_address = f'http://127.0.0.1:{port}'
297
+
298
+ def run_server():
299
+ logging.log(logging.INFO, f'running web server at {self._http_address}')
300
+ self._httpd.serve_forever()
301
+
302
+ # Run the server in a separate thread
303
+ thread = threading.Thread(target=run_server, daemon=True)
304
+ thread.start()
305
+
306
+ def _set_up_runtime(self) -> None:
307
+ """Check for and start runtime services"""
308
+ self._start_web_server()
309
+ self._check_installed_packages()
310
+
311
+ def _check_installed_packages(self) -> None:
312
+ def check(package: str) -> None:
313
+ if importlib.util.find_spec(package) is not None:
314
+ self._installed_packages[package] = []
315
+ else:
316
+ self._installed_packages[package] = None
317
+
318
+ check('datasets')
319
+ check('torch')
320
+ check('torchvision')
321
+ check('transformers')
322
+ check('sentence_transformers')
323
+ check('boto3')
324
+ check('pyarrow')
325
+ check('spacy') # TODO: deal with en-core-web-sm
326
+ if self.is_installed_package('spacy'):
327
+ import spacy
328
+ self._spacy_nlp = spacy.load('en_core_web_sm')
329
+ check('tiktoken')
330
+ check('openai')
331
+ check('together')
332
+ check('fireworks')
333
+ check('nos')
334
+ if self.is_installed_package('nos'):
335
+ self._create_nos_client()
336
+
337
+ def require_package(self, package: str, min_version: Optional[List[int]] = None) -> None:
338
+ assert package in self._installed_packages
339
+ if self._installed_packages[package] is None:
340
+ raise excs.Error(f'Package {package} is not installed')
341
+ if min_version is None:
342
+ return
343
+
344
+ # check whether we have a version >= the required one
345
+ if self._installed_packages[package] == []:
346
+ m = importlib.import_module(package)
347
+ module_version = [int(x) for x in m.__version__.split('.')]
348
+ self._installed_packages[package] = module_version
349
+ installed_version = self._installed_packages[package]
350
+ if len(min_version) < len(installed_version):
351
+ normalized_min_version = min_version + [0] * (len(installed_version) - len(min_version))
352
+ if any([a < b for a, b in zip(installed_version, normalized_min_version)]):
353
+ raise excs.Error((
354
+ f'The installed version of package {package} is {".".join([str[v] for v in installed_version])}, '
355
+ f'but version >={".".join([str[v] for v in min_version])} is required'))
356
+
357
+ def num_tmp_files(self) -> int:
358
+ return len(glob.glob(f'{self._tmp_dir}/*'))
359
+
360
+ def create_tmp_path(self, extension: str = '') -> Path:
361
+ return self._tmp_dir / f'{uuid.uuid4()}{extension}'
67
362
 
68
363
  @property
69
- def img_dir(self) -> Path:
70
- assert self._img_dir is not None
71
- return self._img_dir
364
+ def home(self) -> Path:
365
+ assert self._home is not None
366
+ return self._home
72
367
 
73
368
  @property
74
- def nnidx_dir(self) -> Path:
75
- assert self._nnidx_dir is not None
76
- return self._nnidx_dir
369
+ def media_dir(self) -> Path:
370
+ assert self._media_dir is not None
371
+ return self._media_dir
77
372
 
78
373
  @property
79
- def tmp_video_dir(self) -> Path:
80
- assert self._tmp_video_dir is not None
81
- return self._tmp_video_dir
374
+ def file_cache_dir(self) -> Path:
375
+ assert self._file_cache_dir is not None
376
+ return self._file_cache_dir
377
+
378
+ @property
379
+ def dataset_cache_dir(self) -> Path:
380
+ assert self._dataset_cache_dir is not None
381
+ return self._dataset_cache_dir
382
+
383
+ @property
384
+ def tmp_dir(self) -> Path:
385
+ assert self._tmp_dir is not None
386
+ return self._tmp_dir
82
387
 
83
388
  @property
84
389
  def engine(self) -> sql.engine.base.Engine:
85
390
  assert self._sa_engine is not None
86
391
  return self._sa_engine
392
+
393
+ @property
394
+ def nos_client(self) -> Any:
395
+ return self._nos_client
396
+
397
+ @property
398
+ def spacy_nlp(self) -> Any:
399
+ assert self._spacy_nlp is not None
400
+ return self._spacy_nlp
pixeltable/exceptions.py CHANGED
@@ -1,26 +1,17 @@
1
- class Error(Exception):
2
- pass
3
-
4
-
5
- class DuplicateNameError(Exception):
6
- pass
7
-
8
-
9
- class UnknownEntityError(Exception):
10
- pass
1
+ from typing import List, Any
2
+ from types import TracebackType
3
+ from dataclasses import dataclass
11
4
 
12
5
 
13
- class BadFormatError(Exception):
14
- pass
15
-
16
-
17
- class DirectoryNotEmptyError(Exception):
18
- pass
19
-
20
-
21
- class InsertError(Exception):
6
+ class Error(Exception):
22
7
  pass
23
8
 
24
9
 
25
- class OperationalError(Exception):
26
- pass
10
+ @dataclass
11
+ class ExprEvalError(Exception):
12
+ expr: Any # exprs.Expr, but we're not importing pixeltable.exprs to avoid circular imports
13
+ expr_msg: str
14
+ exc: Exception
15
+ exc_tb: TracebackType
16
+ input_vals: List[Any]
17
+ row_num: int
@@ -0,0 +1,9 @@
1
+ from .aggregation_node import AggregationNode
2
+ from .cache_prefetch_node import CachePrefetchNode
3
+ from .component_iteration_node import ComponentIterationNode
4
+ from .exec_context import ExecContext
5
+ from .exec_node import ExecNode
6
+ from .expr_eval_node import ExprEvalNode
7
+ from .in_memory_data_node import InMemoryDataNode
8
+ from .sql_scan_node import SqlScanNode
9
+ from .media_validation_node import MediaValidationNode
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import sys
5
+ from typing import List, Optional, Any
6
+
7
+ import pixeltable.catalog as catalog
8
+ import pixeltable.exceptions as excs
9
+ import pixeltable.exprs as exprs
10
+ from .data_row_batch import DataRowBatch
11
+ from .exec_node import ExecNode
12
+
13
+ _logger = logging.getLogger('pixeltable')
14
+
15
+ class AggregationNode(ExecNode):
16
+ def __init__(
17
+ self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: List[exprs.Expr],
18
+ agg_fn_calls: List[exprs.FunctionCall], input_exprs: List[exprs.Expr], input: ExecNode
19
+ ):
20
+ super().__init__(row_builder, group_by + agg_fn_calls, input_exprs, input)
21
+ self.input = input
22
+ self.group_by = group_by
23
+ self.input_exprs = input_exprs
24
+ self.agg_fn_calls = agg_fn_calls
25
+ self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=input_exprs)
26
+ self.output_batch = DataRowBatch(tbl, row_builder, 0)
27
+
28
+ def _reset_agg_state(self, row_num: int) -> None:
29
+ for fn_call in self.agg_fn_calls:
30
+ try:
31
+ fn_call.reset_agg()
32
+ except Exception as e:
33
+ _, _, exc_tb = sys.exc_info()
34
+ expr_msg = f'init() function of the aggregate {fn_call}'
35
+ raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, [], row_num)
36
+
37
+ def _update_agg_state(self, row: exprs.DataRow, row_num: int) -> None:
38
+ for fn_call in self.agg_fn_calls:
39
+ try:
40
+ fn_call.update(row)
41
+ except Exception as e:
42
+ _, _, exc_tb = sys.exc_info()
43
+ expr_msg = f'update() function of the aggregate {fn_call}'
44
+ input_vals = [row[d.slot_idx] for d in fn_call.dependencies()]
45
+ raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, input_vals, row_num)
46
+
47
+ def __next__(self) -> DataRowBatch:
48
+ if self.output_batch is None:
49
+ raise StopIteration
50
+
51
+ prev_row: Optional[exprs.DataRow] = None
52
+ current_group: Optional[List[Any]] = None # the values of the group-by exprs
53
+ num_input_rows = 0
54
+ for row_batch in self.input:
55
+ num_input_rows += len(row_batch)
56
+ for row in row_batch:
57
+ group = [row[e.slot_idx] for e in self.group_by]
58
+ if current_group is None:
59
+ current_group = group
60
+ self._reset_agg_state(0)
61
+ if group != current_group:
62
+ # we're entering a new group, emit a row for the previous one
63
+ self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
64
+ self.output_batch.add_row(prev_row)
65
+ current_group = group
66
+ self._reset_agg_state(0)
67
+ self._update_agg_state(row, 0)
68
+ prev_row = row
69
+ # emit the last group
70
+ self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
71
+ self.output_batch.add_row(prev_row)
72
+
73
+ result = self.output_batch
74
+ result.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
75
+ self.output_batch = None
76
+ _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(result.rows)} rows')
77
+ return result
78
+