pixeltable-0.2.18-py3-none-any.whl → pixeltable-0.2.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (42)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/insertable_table.py +9 -7
  4. pixeltable/catalog/table.py +18 -5
  5. pixeltable/catalog/table_version.py +1 -1
  6. pixeltable/catalog/view.py +1 -1
  7. pixeltable/dataframe.py +1 -1
  8. pixeltable/env.py +140 -40
  9. pixeltable/exceptions.py +12 -5
  10. pixeltable/exec/component_iteration_node.py +63 -42
  11. pixeltable/exprs/__init__.py +1 -2
  12. pixeltable/exprs/expr.py +5 -6
  13. pixeltable/exprs/function_call.py +8 -10
  14. pixeltable/exprs/inline_expr.py +200 -0
  15. pixeltable/exprs/json_path.py +3 -6
  16. pixeltable/ext/functions/whisperx.py +2 -0
  17. pixeltable/ext/functions/yolox.py +5 -3
  18. pixeltable/functions/huggingface.py +89 -12
  19. pixeltable/functions/image.py +3 -3
  20. pixeltable/functions/together.py +37 -16
  21. pixeltable/functions/vision.py +43 -21
  22. pixeltable/functions/whisper.py +3 -0
  23. pixeltable/globals.py +7 -1
  24. pixeltable/io/globals.py +1 -1
  25. pixeltable/io/hf_datasets.py +3 -3
  26. pixeltable/iterators/document.py +1 -1
  27. pixeltable/metadata/__init__.py +1 -1
  28. pixeltable/metadata/converters/convert_18.py +1 -1
  29. pixeltable/metadata/converters/convert_20.py +56 -0
  30. pixeltable/metadata/converters/util.py +29 -4
  31. pixeltable/metadata/notes.py +1 -0
  32. pixeltable/tool/create_test_db_dump.py +15 -4
  33. pixeltable/type_system.py +3 -1
  34. pixeltable/utils/filecache.py +126 -79
  35. pixeltable-0.2.20.dist-info/LICENSE +201 -0
  36. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/METADATA +16 -6
  37. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/RECORD +39 -39
  38. pixeltable/exprs/inline_array.py +0 -117
  39. pixeltable/exprs/inline_dict.py +0 -104
  40. pixeltable-0.2.18.dist-info/LICENSE +0 -18
  41. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/WHEEL +0 -0
  42. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/entry_points.txt +0 -0
@@ -19,12 +19,9 @@ from typing import Any, Optional, Union
19
19
  import numpy as np
20
20
  import PIL.Image
21
21
 
22
- import pixeltable.func as func
23
- import pixeltable.type_system as ts
22
+ import pixeltable as pxt
24
23
  from pixeltable.utils.code import local_public_names
25
24
 
26
- # TODO: figure out a better submodule structure
27
-
28
25
 
29
26
  # the following function has been adapted from MMEval
30
27
  # (sources at https://github.com/open-mmlab/mmeval)
@@ -161,25 +158,41 @@ def __calculate_image_tpfp(
161
158
  return tp, fp
162
159
 
163
160
 
164
- @func.udf(
165
- return_type=ts.JsonType(nullable=False),
166
- param_types=[
167
- ts.JsonType(nullable=False),
168
- ts.JsonType(nullable=False),
169
- ts.JsonType(nullable=False),
170
- ts.JsonType(nullable=False),
171
- ts.JsonType(nullable=False),
172
- ],
173
- )
161
+ @pxt.udf
174
162
  def eval_detections(
175
163
  pred_bboxes: list[list[int]],
176
164
  pred_labels: list[int],
177
165
  pred_scores: list[float],
178
166
  gt_bboxes: list[list[int]],
179
167
  gt_labels: list[int],
168
+ min_iou: float = 0.5,
180
169
  ) -> list[dict]:
181
170
  """
182
171
  Evaluates the performance of a set of predicted bounding boxes against a set of ground truth bounding boxes.
172
+
173
+ Args:
174
+ pred_bboxes: List of predicted bounding boxes, each represented as [xmin, ymin, xmax, ymax].
175
+ pred_labels: List of predicted labels.
176
+ pred_scores: List of predicted scores.
177
+ gt_bboxes: List of ground truth bounding boxes, each represented as [xmin, ymin, xmax, ymax].
178
+ gt_labels: List of ground truth labels.
179
+ min_iou: Minimum intersection-over-union (IoU) threshold for a predicted bounding box to be
180
+ considered a true positive.
181
+
182
+ Returns:
183
+ A list of dictionaries, one per label class, with the following structure:
184
+ ```python
185
+ {
186
+ 'min_iou': float, # The value of `min_iou` used for the detections
187
+ 'class': int, # The label class
188
+ 'tp': list[int], # List of 1's and 0's indicating true positives for each
189
+ # predicted bounding box of this class
190
+ 'fp': list[int], # List of 1's and 0's indicating false positives for each
191
+ # predicted bounding box of this class; `fp[n] == 1 - tp[n]`
192
+ 'scores': list[float], # List of predicted scores for each bounding box of this class
193
+ 'num_gts': int, # Number of ground truth bounding boxes of this class
194
+ }
195
+ ```
183
196
  """
184
197
  class_idxs = list(set(pred_labels + gt_labels))
185
198
  result: list[dict] = []
@@ -192,11 +205,11 @@ def eval_detections(
192
205
  pred_filter = pred_classes_arr == class_idx
193
206
  gt_filter = gt_classes_arr == class_idx
194
207
  class_pred_scores = pred_scores_arr[pred_filter]
195
- tp, fp = __calculate_image_tpfp(pred_bboxes_arr[pred_filter], class_pred_scores, gt_bboxes_arr[gt_filter], 0.5)
208
+ tp, fp = __calculate_image_tpfp(pred_bboxes_arr[pred_filter], class_pred_scores, gt_bboxes_arr[gt_filter], min_iou)
196
209
  ordered_class_pred_scores = -np.sort(-class_pred_scores)
197
210
  result.append(
198
211
  {
199
- 'min_iou': 0.5,
212
+ 'min_iou': min_iou,
200
213
  'class': class_idx,
201
214
  'tp': tp.tolist(),
202
215
  'fp': fp.tolist(),
@@ -207,11 +220,20 @@ def eval_detections(
207
220
  return result
208
221
 
209
222
 
210
- @func.uda(update_types=[ts.JsonType()], value_type=ts.JsonType(), allows_std_agg=True, allows_window=False)
211
- class mean_ap(func.Aggregator):
223
+ @pxt.uda(update_types=[pxt.JsonType()], value_type=pxt.JsonType(), allows_std_agg=True, allows_window=False)
224
+ class mean_ap(pxt.Aggregator):
212
225
  """
213
226
  Calculates the mean average precision (mAP) over
214
227
  [`eval_detections()`][pixeltable.functions.vision.eval_detections] results.
228
+
229
+ __Parameters:__
230
+
231
+ - `eval_dicts` (list[dict]): List of dictionaries as returned by
232
+ [`eval_detections()`][pixeltable.functions.vision.eval_detections].
233
+
234
+ __Returns:__
235
+
236
+ - A `dict[int, float]` mapping each label class to an average precision (AP) value for that class.
215
237
  """
216
238
  def __init__(self):
217
239
  self.class_tpfp: dict[int, list[dict]] = defaultdict(list)
@@ -246,7 +268,7 @@ class mean_ap(func.Aggregator):
246
268
  return result
247
269
 
248
270
 
249
- def _create_label_colors(labels: list[Any]) -> dict[Any, str]:
271
+ def __create_label_colors(labels: list[Any]) -> dict[Any, str]:
250
272
  """
251
273
  Create random colors for labels such that a particular label always gets the same color.
252
274
 
@@ -265,7 +287,7 @@ def _create_label_colors(labels: list[Any]) -> dict[Any, str]:
265
287
  return result
266
288
 
267
289
 
268
- @func.udf
290
+ @pxt.udf
269
291
  def draw_bounding_boxes(
270
292
  img: PIL.Image.Image,
271
293
  boxes: list[list[int]],
@@ -324,7 +346,7 @@ def draw_bounding_boxes(
324
346
  if color is not None:
325
347
  box_colors = [color] * num_boxes
326
348
  else:
327
- label_colors = _create_label_colors(labels)
349
+ label_colors = __create_label_colors(labels)
328
350
  box_colors = [label_colors[label] for label in labels]
329
351
 
330
352
  from PIL import ImageColor, ImageDraw, ImageFont
@@ -9,6 +9,7 @@ first `pip install openai-whisper`.
9
9
  from typing import TYPE_CHECKING, Optional
10
10
 
11
11
  import pixeltable as pxt
12
+ from pixeltable.env import Env
12
13
 
13
14
  if TYPE_CHECKING:
14
15
  from whisper import Whisper # type: ignore[import-untyped]
@@ -71,6 +72,8 @@ def transcribe(
71
72
 
72
73
  >>> tbl['result'] = transcribe(tbl.audio, model='base.en')
73
74
  """
75
+ Env.get().require_package('whisper')
76
+ Env.get().require_package('torch')
74
77
  import torch
75
78
 
76
79
  if decode_options is None:
pixeltable/globals.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import dataclasses
2
2
  import logging
3
- from typing import Any, Optional, Union
3
+ from typing import Any, Iterable, Optional, Union
4
4
  from uuid import UUID
5
5
 
6
6
  import pandas as pd
@@ -16,6 +16,7 @@ from pixeltable.dataframe import DataFrameResultSet
16
16
  from pixeltable.env import Env
17
17
  from pixeltable.iterators import ComponentIterator
18
18
  from pixeltable.metadata import schema
19
+ from pixeltable.utils.filecache import FileCache
19
20
 
20
21
  _logger = logging.getLogger('pixeltable')
21
22
 
@@ -193,6 +194,7 @@ def create_view(
193
194
  )
194
195
  Catalog.get().paths[path] = view
195
196
  _logger.info(f'Created view `{path_str}`.')
197
+ FileCache.get().emit_eviction_warnings()
196
198
  return view
197
199
 
198
200
 
@@ -487,3 +489,7 @@ def configure_logging(
487
489
  remove: comma-separated list of module names
488
490
  """
489
491
  return Env.get().configure_logging(to_stdout=to_stdout, level=level, add=add, remove=remove)
492
+
493
+
494
+ def array(elements: Iterable) -> exprs.Expr:
495
+ return exprs.InlineArray(elements)
pixeltable/io/globals.py CHANGED
@@ -43,7 +43,7 @@ def create_label_studio_project(
43
43
  The API key and URL for a valid Label Studio server must be specified in Pixeltable config. Either:
44
44
 
45
45
  * Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
46
- * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
46
+ * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.toml`.
47
47
 
48
48
  __Requirements:__
49
49
 
@@ -34,9 +34,7 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
34
34
  }
35
35
 
36
36
 
37
- def _to_pixeltable_type(
38
- feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
39
- ) -> Optional[ts.ColumnType]:
37
+ def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
40
38
  """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
41
39
  import datasets
42
40
 
@@ -51,6 +49,8 @@ def _to_pixeltable_type(
51
49
  dtype = _to_pixeltable_type(feature_type.feature)
52
50
  length = feature_type.length if feature_type.length != -1 else None
53
51
  return ts.ArrayType(shape=(length,), dtype=dtype)
52
+ elif isinstance(feature_type, datasets.Image):
53
+ return ts.ImageType(nullable=True)
54
54
  else:
55
55
  return None
56
56
 
@@ -166,7 +166,7 @@ class DocumentSplitter(ComponentIterator):
166
166
  return {
167
167
  'document': DocumentType(nullable=False),
168
168
  'separators': StringType(nullable=False),
169
- 'metadata': StringType(nullable=True),
169
+ 'metadata': StringType(nullable=False),
170
170
  'limit': IntType(nullable=True),
171
171
  'overlap': IntType(nullable=True),
172
172
  'skip_tags': StringType(nullable=True),
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 20
13
+ VERSION = 21
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -13,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
13
13
  )
14
14
 
15
15
 
16
- def __substitute_md(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
16
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
17
17
  # Migrate a few changed function names
18
18
  if k == 'path' and v == 'pixeltable.functions.string.str_format':
19
19
  return 'path', 'pixeltable.functions.string.format'
@@ -0,0 +1,56 @@
1
+ from typing import Any, Optional
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
+ @register_converter(version=20)
10
+ def _(engine: sql.engine.Engine) -> None:
11
+ convert_table_md(
12
+ engine,
13
+ substitution_fn=__substitute_md
14
+ )
15
+
16
+
17
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
18
+ if isinstance(v, dict) and '_classname' in v:
19
+ # The way InlineArray is represented changed in v20. Previously, literal values were stored
20
+ # directly in the Inline expr; now we store them in Literal sub-exprs. This converter
21
+ # constructs new Literal exprs for the literal values in InlineArray, interleaving them
22
+ # with non-literal exprs into the correct sequence.
23
+ if v['_classname'] == 'InlineArray':
24
+ components = v.get('components') # Might be None, but that's ok
25
+ updated_components = []
26
+ for idx, val in v['elements']:
27
+ # idx >= 0, then this is a non-literal sub-expr. Otherwise, idx could be either
28
+ # None or -1, for legacy reasons (which are now obviated).
29
+ if idx is not None and idx >= 0:
30
+ updated_components.append(components[idx])
31
+ else:
32
+ updated_components.append({'val': val, '_classname': 'Literal'})
33
+ # InlineList was split out from InlineArray in v20. If is_json=True, then this is
34
+ # actually an InlineList. If is_json=False, then we assume it's an InlineArray for now,
35
+ # but it might actually be transformed into an InlineList when it is instantiated
36
+ # (unfortunately, there is no way to disambiguate at this stage; see comments in
37
+ # InlineArray._from_dict() for more details).
38
+ updated_v = {'_classname': 'InlineList' if v.get('is_json') else 'InlineArray'}
39
+ if len(updated_components) > 0:
40
+ updated_v['components'] = updated_components
41
+ return k, updated_v
42
+ if v['_classname'] == 'InlineDict':
43
+ components = v.get('components')
44
+ keys = []
45
+ updated_components = []
46
+ for key, idx, val in v['dict_items']:
47
+ keys.append(key)
48
+ if idx is not None and idx >= 0:
49
+ updated_components.append(components[idx])
50
+ else:
51
+ updated_components.append({'val': val, '_classname': 'Literal'})
52
+ updated_v = {'keys': keys, '_classname': 'InlineDict'}
53
+ if len(updated_components) > 0:
54
+ updated_v['components'] = updated_components
55
+ return k, updated_v
56
+ return None
@@ -14,8 +14,22 @@ def convert_table_md(
14
14
  table_md_updater: Optional[Callable[[dict], None]] = None,
15
15
  column_md_updater: Optional[Callable[[dict], None]] = None,
16
16
  external_store_md_updater: Optional[Callable[[dict], None]] = None,
17
- substitution_fn: Optional[Callable[[Any, Any], Optional[tuple[Any, Any]]]] = None
17
+ substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
18
18
  ) -> None:
19
+ """
20
+ Converts table metadata based on the specified conversion functions.
21
+
22
+ Args:
23
+ engine: The SQLAlchemy engine.
24
+ table_md_updater: A function that updates the table metadata in place.
25
+ column_md_updater: A function that updates the column metadata in place.
26
+ external_store_md_updater: A function that updates the external store metadata in place.
27
+ substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
28
+ recursively, and `substitution_fn` will be called once for each metadata entry. If the entry appears in
29
+ a dict as a `(k, v)` pair, then `substitution_fn(k, v)` will be called. If the entry appears in a list,
30
+ then `substitution_fn(None, v)` will be called. If `substitution_fn` returns a tuple `(k', v')`, then
31
+ the original entry will be replaced, and the traversal will continue with `v'`.
32
+ """
19
33
  with engine.begin() as conn:
20
34
  for row in conn.execute(sql.select(Table)):
21
35
  id = row[0]
@@ -49,18 +63,29 @@ def __update_external_store_md(table_md: dict, external_store_md_updater: Callab
49
63
  external_store_md_updater(store_md)
50
64
 
51
65
 
52
- def __substitute_md_rec(md: Any, substitution_fn: Callable[[Any, Any], Optional[tuple[Any, Any]]]) -> Any:
66
+ def __substitute_md_rec(
67
+ md: Any,
68
+ substitution_fn: Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]
69
+ ) -> Any:
53
70
  if isinstance(md, dict):
54
71
  updated_md = {}
55
72
  for k, v in md.items():
56
73
  substitute = substitution_fn(k, v)
57
74
  if substitute is not None:
58
75
  updated_k, updated_v = substitute
59
- updated_md[updated_k] = updated_v
76
+ updated_md[updated_k] = __substitute_md_rec(updated_v, substitution_fn)
60
77
  else:
61
78
  updated_md[k] = __substitute_md_rec(v, substitution_fn)
62
79
  return updated_md
63
80
  elif isinstance(md, list):
64
- return [__substitute_md_rec(v, substitution_fn) for v in md]
81
+ updated_md = []
82
+ for v in md:
83
+ substitute = substitution_fn(None, v)
84
+ if substitute is not None:
85
+ _, updated_v = substitute
86
+ updated_md.append(__substitute_md_rec(updated_v, substitution_fn))
87
+ else:
88
+ updated_md.append(__substitute_md_rec(v, substitution_fn))
89
+ return updated_md
65
90
  else:
66
91
  return md
@@ -2,6 +2,7 @@
2
2
  # rather than as a comment, so that the existence of a description can be enforced by
3
3
  # the unit tests when new versions are added.
4
4
  VERSION_NOTES = {
5
+ 21: 'Separate InlineArray and InlineList',
5
6
  20: 'Store DB timestamps in UTC',
6
7
  19: 'UDF renames; ImageMemberAccess removal',
7
8
  18: 'Restructured index metadata',
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  import pathlib
6
6
  import subprocess
7
+ import sys
7
8
  from typing import Any
8
9
  from zoneinfo import ZoneInfo
9
10
 
@@ -24,12 +25,18 @@ _logger = logging.getLogger('pixeltable')
24
25
  class Dumper:
25
26
 
26
27
  def __init__(self, output_dir='target', db_name='pxtdump') -> None:
28
+ if sys.version_info >= (3, 10):
29
+ raise RuntimeError(
30
+ 'This script must be run on Python 3.9. '
31
+ 'DB dumps are incompatible across versions due to issues with pickling anonymous UDFs.'
32
+ )
33
+
27
34
  self.output_dir = pathlib.Path(output_dir)
28
35
  shared_home = pathlib.Path(os.environ.get('PIXELTABLE_HOME', '~/.pixeltable')).expanduser()
29
36
  mock_home_dir = self.output_dir / '.pixeltable'
30
37
  mock_home_dir.mkdir(parents=True, exist_ok=True)
31
38
  os.environ['PIXELTABLE_HOME'] = str(mock_home_dir)
32
- os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.yaml')
39
+ os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.toml')
33
40
  os.environ['PIXELTABLE_DB'] = db_name
34
41
  os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')
35
42
 
@@ -226,9 +233,13 @@ class Dumper:
226
233
  add_column('isin_2', t.c2.isin([1, 2, 3, 4, 5]))
227
234
  add_column('isin_3', t.c2.isin(t.c6.f5))
228
235
 
229
- # inline_array and inline_dict
230
- add_column('inline_array_1', [[1, 2, 3], [4, 5, 6]])
231
- add_column('inline_array_2', [['a', 'b', 'c'], ['d', 'e', 'f']])
236
+ # inline_array, inline_list, inline_dict
237
+ add_column('inline_array_1', pxt.array([[1, 2, 3], [4, 5, 6]]))
238
+ add_column('inline_array_2', pxt.array([['a', 'b', 'c'], ['d', 'e', 'f']]))
239
+ add_column('inline_array_exprs', pxt.array([[t.c2, t.c2 + 1], [t.c2 + 2, t.c2]]))
240
+ add_column('inline_array_mixed', pxt.array([[1, t.c2], [3, t.c2]]))
241
+ add_column('inline_list_1', [[1, 2, 3], [4, 5, 6]])
242
+ add_column('inline_list_2', [['a', 'b', 'c'], ['d', 'e', 'f']])
232
243
  add_column('inline_list_exprs', [t.c1, [t.c1n, t.c2]])
233
244
  add_column('inline_list_mixed', [1, 'a', t.c1, [1, 'a', t.c1n], 1, 'a'])
234
245
  add_column('inline_dict', {'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
pixeltable/type_system.py CHANGED
@@ -204,6 +204,8 @@ class ColumnType:
204
204
 
205
205
  @classmethod
206
206
  def infer_literal_type(cls, val: Any, nullable: bool = False) -> Optional[ColumnType]:
207
+ if val is None:
208
+ return InvalidType(nullable=True)
207
209
  if isinstance(val, str):
208
210
  return StringType(nullable=nullable)
209
211
  if isinstance(val, bool):
@@ -395,7 +397,7 @@ class InvalidType(ColumnType):
395
397
  assert False
396
398
 
397
399
  def print_value(self, val: Any) -> str:
398
- assert False
400
+ return str(val)
399
401
 
400
402
  def _validate_literal(self, val: Any) -> None:
401
403
  assert False