pixeltable 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (88)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/globals.py +3 -0
  5. pixeltable/catalog/insertable_table.py +9 -7
  6. pixeltable/catalog/table.py +220 -143
  7. pixeltable/catalog/table_version.py +36 -18
  8. pixeltable/catalog/table_version_path.py +0 -8
  9. pixeltable/catalog/view.py +3 -3
  10. pixeltable/dataframe.py +9 -24
  11. pixeltable/env.py +107 -36
  12. pixeltable/exceptions.py +7 -4
  13. pixeltable/exec/__init__.py +1 -1
  14. pixeltable/exec/aggregation_node.py +22 -15
  15. pixeltable/exec/component_iteration_node.py +62 -41
  16. pixeltable/exec/data_row_batch.py +7 -7
  17. pixeltable/exec/exec_node.py +35 -7
  18. pixeltable/exec/expr_eval_node.py +2 -1
  19. pixeltable/exec/in_memory_data_node.py +9 -9
  20. pixeltable/exec/sql_node.py +265 -136
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/data_row.py +30 -19
  23. pixeltable/exprs/expr.py +15 -14
  24. pixeltable/exprs/expr_dict.py +55 -0
  25. pixeltable/exprs/expr_set.py +21 -15
  26. pixeltable/exprs/function_call.py +21 -8
  27. pixeltable/exprs/json_path.py +3 -6
  28. pixeltable/exprs/rowid_ref.py +2 -2
  29. pixeltable/exprs/sql_element_cache.py +5 -1
  30. pixeltable/ext/functions/whisperx.py +7 -2
  31. pixeltable/func/callable_function.py +2 -2
  32. pixeltable/func/function_registry.py +6 -7
  33. pixeltable/func/query_template_function.py +11 -12
  34. pixeltable/func/signature.py +17 -15
  35. pixeltable/func/udf.py +0 -4
  36. pixeltable/functions/__init__.py +1 -1
  37. pixeltable/functions/audio.py +4 -6
  38. pixeltable/functions/globals.py +86 -42
  39. pixeltable/functions/huggingface.py +12 -14
  40. pixeltable/functions/image.py +59 -45
  41. pixeltable/functions/json.py +0 -1
  42. pixeltable/functions/mistralai.py +2 -2
  43. pixeltable/functions/openai.py +22 -25
  44. pixeltable/functions/string.py +50 -50
  45. pixeltable/functions/timestamp.py +20 -20
  46. pixeltable/functions/together.py +26 -12
  47. pixeltable/functions/video.py +11 -20
  48. pixeltable/functions/whisper.py +2 -20
  49. pixeltable/globals.py +57 -56
  50. pixeltable/index/base.py +2 -2
  51. pixeltable/index/btree.py +7 -7
  52. pixeltable/index/embedding_index.py +8 -10
  53. pixeltable/io/external_store.py +11 -5
  54. pixeltable/io/globals.py +3 -1
  55. pixeltable/io/hf_datasets.py +4 -4
  56. pixeltable/io/label_studio.py +6 -6
  57. pixeltable/io/parquet.py +14 -13
  58. pixeltable/iterators/document.py +10 -8
  59. pixeltable/iterators/video.py +10 -1
  60. pixeltable/metadata/__init__.py +3 -2
  61. pixeltable/metadata/converters/convert_14.py +4 -2
  62. pixeltable/metadata/converters/convert_15.py +1 -1
  63. pixeltable/metadata/converters/convert_19.py +1 -0
  64. pixeltable/metadata/converters/convert_20.py +1 -1
  65. pixeltable/metadata/converters/util.py +9 -8
  66. pixeltable/metadata/schema.py +32 -21
  67. pixeltable/plan.py +136 -154
  68. pixeltable/store.py +51 -36
  69. pixeltable/tool/create_test_db_dump.py +7 -7
  70. pixeltable/tool/doc_plugins/griffe.py +3 -34
  71. pixeltable/tool/mypy_plugin.py +32 -0
  72. pixeltable/type_system.py +243 -60
  73. pixeltable/utils/arrow.py +10 -9
  74. pixeltable/utils/coco.py +4 -4
  75. pixeltable/utils/documents.py +1 -1
  76. pixeltable/utils/filecache.py +131 -84
  77. pixeltable/utils/formatter.py +1 -1
  78. pixeltable/utils/http_server.py +2 -5
  79. pixeltable/utils/media_store.py +6 -6
  80. pixeltable/utils/pytorch.py +10 -11
  81. pixeltable/utils/sql.py +2 -1
  82. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/METADATA +16 -7
  83. pixeltable-0.2.21.dist-info/RECORD +148 -0
  84. pixeltable/utils/help.py +0 -11
  85. pixeltable-0.2.19.dist-info/RECORD +0 -147
  86. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/LICENSE +0 -0
  87. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/WHEEL +0 -0
  88. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/entry_points.txt +0 -0
@@ -1,10 +1,12 @@
1
- from typing import Generator, Optional
1
+ import inspect
2
+ from typing import Iterator, Optional
2
3
 
3
- from .data_row_batch import DataRowBatch
4
- from .exec_node import ExecNode
5
4
  import pixeltable.catalog as catalog
6
- import pixeltable.exprs as exprs
7
5
  import pixeltable.exceptions as excs
6
+ import pixeltable.exprs as exprs
7
+
8
+ from .data_row_batch import DataRowBatch
9
+ from .exec_node import ExecNode
8
10
 
9
11
 
10
12
  class ComponentIterationNode(ExecNode):
@@ -12,7 +14,7 @@ class ComponentIterationNode(ExecNode):
12
14
 
13
15
  Returns row batches of OUTPUT_BATCH_SIZE size.
14
16
  """
15
- OUTPUT_BATCH_SIZE = 1024
17
+ __OUTPUT_BATCH_SIZE = 1024
16
18
 
17
19
  def __init__(self, view: catalog.TableVersion, input: ExecNode):
18
20
  assert view.is_component_view()
@@ -23,57 +25,76 @@ class ComponentIterationNode(ExecNode):
23
25
  self.iterator_args = iterator_args[0]
24
26
  assert isinstance(self.iterator_args, exprs.InlineDict)
25
27
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
26
- self.iterator_output_schema, self.unstored_column_names = \
28
+ self.iterator_output_schema, self.unstored_column_names = (
27
29
  self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
30
+ )
28
31
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
29
- self.iterator_output_cols = \
30
- {field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields}
32
+ self.iterator_output_cols = {
33
+ field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
34
+ }
31
35
  # referenced iterator output fields
32
36
  self.refd_output_slot_idxs = {
33
37
  e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
34
38
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
35
39
  }
36
- self._output: Optional[Generator[DataRowBatch, None, None]] = None
40
+ self.__output: Optional[Iterator[DataRowBatch]] = None
37
41
 
38
- def _output_batches(self) -> Generator[DataRowBatch, None, None]:
42
+ def __output_batches(self) -> Iterator[DataRowBatch]:
39
43
  output_batch = DataRowBatch(self.view, self.row_builder)
40
44
  for input_batch in self.input:
41
45
  for input_row in input_batch:
42
46
  self.row_builder.eval(input_row, self.iterator_args_ctx)
43
47
  iterator_args = input_row[self.iterator_args.slot_idx]
44
- iterator = self.view.iterator_cls(**iterator_args)
45
- for pos, component_dict in enumerate(iterator):
46
- output_row = output_batch.add_row()
47
- input_row.copy(output_row)
48
- # we're expanding the input and need to add the iterator position to the pk
49
- pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
50
- output_row.set_pk(pk)
51
-
52
- # verify and copy component_dict fields to their respective slots in output_row
53
- for field_name, field_val in component_dict.items():
54
- if field_name not in self.iterator_output_fields:
55
- raise excs.Error(
56
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
57
- if field_name not in self.refd_output_slot_idxs:
58
- # we can ignore this
59
- continue
60
- output_col = self.iterator_output_cols[field_name]
61
- output_col.col_type.validate_literal(field_val)
62
- output_row[self.refd_output_slot_idxs[field_name]] = field_val
63
- if len(component_dict) != len(self.iterator_output_fields):
64
- missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
65
- raise excs.Error(
66
- f'Invalid output of {self.view.iterator_cls.__name__}: '
67
- f'missing fields {", ".join(missing_fields)}')
68
-
69
- if len(output_batch) == self.OUTPUT_BATCH_SIZE:
70
- yield output_batch
71
- output_batch = DataRowBatch(self.view, self.row_builder)
48
+ assert isinstance(iterator_args, dict)
49
+ # We need to ensure that all of the required (non-nullable) parameters of the iterator are
50
+ # specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
51
+ # output rows for this input row).
52
+ if self.__non_nullable_args_specified(iterator_args):
53
+ iterator = self.view.iterator_cls(**iterator_args)
54
+ for pos, component_dict in enumerate(iterator):
55
+ output_row = output_batch.add_row()
56
+ input_row.copy(output_row)
57
+ # we're expanding the input and need to add the iterator position to the pk
58
+ self.__populate_output_row(output_row, pos, component_dict)
59
+ if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
60
+ yield output_batch
61
+ output_batch = DataRowBatch(self.view, self.row_builder)
72
62
 
73
63
  if len(output_batch) > 0:
74
64
  yield output_batch
75
65
 
66
+ def __non_nullable_args_specified(self, iterator_args: dict) -> bool:
67
+ """
68
+ Returns true if all non-nullable iterator arguments are not `None`.
69
+ """
70
+ input_schema = self.view.iterator_cls.input_schema()
71
+ for arg_name, arg_value in iterator_args.items():
72
+ col_type = input_schema[arg_name]
73
+ if arg_value is None and not col_type.nullable:
74
+ return False
75
+ return True
76
+
77
+ def __populate_output_row(self, output_row: exprs.DataRow, pos: int, component_dict: dict) -> None:
78
+ pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
79
+ output_row.set_pk(pk)
80
+ # verify and copy component_dict fields to their respective slots in output_row
81
+ for field_name, field_val in component_dict.items():
82
+ if field_name not in self.iterator_output_fields:
83
+ raise excs.Error(
84
+ f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
85
+ if field_name not in self.refd_output_slot_idxs:
86
+ # we can ignore this
87
+ continue
88
+ output_col = self.iterator_output_cols[field_name]
89
+ output_col.col_type.validate_literal(field_val)
90
+ output_row[self.refd_output_slot_idxs[field_name]] = field_val
91
+ if len(component_dict) != len(self.iterator_output_fields):
92
+ missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
93
+ raise excs.Error(
94
+ f'Invalid output of {self.view.iterator_cls.__name__}: '
95
+ f'missing fields {", ".join(missing_fields)}')
96
+
76
97
  def __next__(self) -> DataRowBatch:
77
- if self._output is None:
78
- self._output = self._output_batches()
79
- return next(self._output)
98
+ if self.__output is None:
99
+ self.__output = self.__output_batches()
100
+ return next(self.__output)
@@ -14,6 +14,13 @@ class DataRowBatch:
14
14
 
15
15
  Contains the metadata needed to initialize DataRows.
16
16
  """
17
+ tbl: Optional[catalog.TableVersion]
18
+ row_builder: exprs.RowBuilder
19
+ img_slot_idxs: list[int]
20
+ media_slot_idxs: list[int] # non-image media slots
21
+ array_slot_idxs: list[int]
22
+ rows: list[exprs.DataRow]
23
+
17
24
  def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
18
25
  self.tbl = tbl
19
26
  self.row_builder = row_builder
@@ -39,13 +46,6 @@ class DataRowBatch:
39
46
  def pop_row(self) -> exprs.DataRow:
40
47
  return self.rows.pop()
41
48
 
42
- def set_row_ids(self, row_ids: List[int]) -> None:
43
- """Sets pks for rows in batch"""
44
- assert self.tbl is not None
45
- assert len(row_ids) == len(self.rows)
46
- for row, row_id in zip(self.rows, row_ids):
47
- row.set_pk((row_id, self.tbl))
48
-
49
49
  def __len__(self) -> int:
50
50
  return len(self.rows)
51
51
 
@@ -1,13 +1,25 @@
1
1
  from __future__ import annotations
2
- from typing import Iterable, Optional, List
2
+
3
3
  import abc
4
+ from typing import Iterable, Optional, List, TYPE_CHECKING, Iterator
4
5
 
6
+ import pixeltable.exprs as exprs
5
7
  from .data_row_batch import DataRowBatch
6
8
  from .exec_context import ExecContext
7
- import pixeltable.exprs as exprs
9
+
10
+ if TYPE_CHECKING:
11
+ from pixeltable import exec
8
12
 
9
13
  class ExecNode(abc.ABC):
10
14
  """Base class of all execution nodes"""
15
+ output_exprs: Iterable[exprs.Expr]
16
+ row_builder: exprs.RowBuilder
17
+ input: Optional[ExecNode]
18
+ flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
19
+ stored_img_cols: list[exprs.ColumnSlotIdx]
20
+ ctx: Optional[ExecContext]
21
+ __iter: Optional[Iterator[DataRowBatch]]
22
+
11
23
  def __init__(
12
24
  self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
13
25
  input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
@@ -21,8 +33,9 @@ class ExecNode(abc.ABC):
21
33
  e.slot_idx for e in output_dependencies
22
34
  if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
23
35
  ]
24
- self.stored_img_cols: List[exprs.ColumnSlotIdx] = []
25
- self.ctx: Optional[ExecContext] = None # all nodes of a tree share the same context
36
+ self.stored_img_cols = []
37
+ self.ctx = None # all nodes of a tree share the same context
38
+ self.__iter = None
26
39
 
27
40
  def set_ctx(self, ctx: ExecContext) -> None:
28
41
  self.ctx = ctx
@@ -35,12 +48,15 @@ class ExecNode(abc.ABC):
35
48
  if self.input is not None:
36
49
  self.input.set_stored_img_cols(stored_img_cols)
37
50
 
38
- def __iter__(self):
51
+ # TODO: make this an abstractmethod when __next__() is removed
52
+ def __iter__(self) -> Iterator[DataRowBatch]:
39
53
  return self
40
54
 
41
- @abc.abstractmethod
55
+ # TODO: remove this and switch every subclass over to implementing __iter__
42
56
  def __next__(self) -> DataRowBatch:
43
- pass
57
+ if self.__iter is None:
58
+ self.__iter = iter(self)
59
+ return next(self.__iter)
44
60
 
45
61
  def open(self) -> None:
46
62
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -60,3 +76,15 @@ class ExecNode(abc.ABC):
60
76
  def _close(self) -> None:
61
77
  pass
62
78
 
79
+ def get_sql_node(self) -> Optional['exec.SqlNode']:
80
+ from .sql_node import SqlNode
81
+ if isinstance(self, SqlNode):
82
+ return self
83
+ if self.input is not None:
84
+ return self.input.get_sql_node()
85
+ return None
86
+
87
+ def set_limit(self, limit: int) -> None:
88
+ """Default implementation propagates to input"""
89
+ if self.input is not None:
90
+ self.input.set_limit(limit)
@@ -5,10 +5,11 @@ import warnings
5
5
  from dataclasses import dataclass
6
6
  from typing import Iterable, List, Optional
7
7
 
8
- from tqdm import tqdm, TqdmWarning
8
+ from tqdm import TqdmWarning, tqdm
9
9
 
10
10
  import pixeltable.exprs as exprs
11
11
  from pixeltable.func import CallableFunction
12
+
12
13
  from .data_row_batch import DataRowBatch
13
14
  from .exec_node import ExecNode
14
15
 
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Any, Optional
2
+ from typing import Any, Optional, Iterator
3
3
 
4
4
  import pixeltable.catalog as catalog
5
5
  import pixeltable.exprs as exprs
@@ -18,6 +18,11 @@ class InMemoryDataNode(ExecNode):
18
18
  - with the values provided in the input rows
19
19
  - if an input row doesn't provide a value, sets the slot to the column default
20
20
  """
21
+ tbl: catalog.TableVersion
22
+ input_rows: list[dict[str, Any]]
23
+ start_row_id: int
24
+ output_rows: Optional[DataRowBatch]
25
+
21
26
  def __init__(
22
27
  self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
23
28
  row_builder: exprs.RowBuilder, start_row_id: int,
@@ -29,8 +34,7 @@ class InMemoryDataNode(ExecNode):
29
34
  self.tbl = tbl
30
35
  self.input_rows = rows
31
36
  self.start_row_id = start_row_id
32
- self.has_returned_data = False
33
- self.output_rows: Optional[DataRowBatch] = None
37
+ self.output_rows = None
34
38
 
35
39
  def _open(self) -> None:
36
40
  """Create row batch and populate with self.input_rows"""
@@ -67,12 +71,8 @@ class InMemoryDataNode(ExecNode):
67
71
  assert col_info is not None
68
72
  self.output_rows[row_idx][col_info.slot_idx] = None
69
73
 
70
- self.output_rows.set_row_ids([self.start_row_id + i for i in range(len(self.output_rows))])
71
74
  self.ctx.num_rows = len(self.output_rows)
72
75
 
73
- def __next__(self) -> DataRowBatch:
74
- if self.has_returned_data:
75
- raise StopIteration
76
- self.has_returned_data = True
76
+ def __iter__(self) -> Iterator[DataRowBatch]:
77
77
  _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
78
- return self.output_rows
78
+ yield self.output_rows