pixeltable 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (56)
  1. pixeltable/__init__.py +3 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +14 -2
  4. pixeltable/catalog/insertable_table.py +32 -17
  5. pixeltable/catalog/table.py +194 -12
  6. pixeltable/catalog/table_version.py +270 -110
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/datatransfer/__init__.py +1 -0
  9. pixeltable/datatransfer/label_studio.py +526 -0
  10. pixeltable/datatransfer/remote.py +113 -0
  11. pixeltable/env.py +156 -73
  12. pixeltable/exprs/column_ref.py +2 -2
  13. pixeltable/exprs/comparison.py +39 -1
  14. pixeltable/exprs/data_row.py +7 -0
  15. pixeltable/exprs/expr.py +11 -12
  16. pixeltable/exprs/function_call.py +0 -3
  17. pixeltable/exprs/globals.py +14 -2
  18. pixeltable/exprs/similarity_expr.py +5 -3
  19. pixeltable/ext/functions/whisperx.py +30 -0
  20. pixeltable/ext/functions/yolox.py +16 -0
  21. pixeltable/func/aggregate_function.py +2 -2
  22. pixeltable/func/expr_template_function.py +3 -1
  23. pixeltable/func/udf.py +2 -2
  24. pixeltable/functions/fireworks.py +9 -4
  25. pixeltable/functions/huggingface.py +25 -1
  26. pixeltable/functions/openai.py +15 -10
  27. pixeltable/functions/together.py +11 -6
  28. pixeltable/functions/util.py +0 -43
  29. pixeltable/functions/video.py +46 -8
  30. pixeltable/globals.py +20 -2
  31. pixeltable/index/__init__.py +1 -0
  32. pixeltable/index/base.py +6 -1
  33. pixeltable/index/btree.py +54 -0
  34. pixeltable/index/embedding_index.py +4 -1
  35. pixeltable/io/__init__.py +1 -0
  36. pixeltable/io/globals.py +59 -0
  37. pixeltable/iterators/base.py +4 -4
  38. pixeltable/iterators/document.py +26 -15
  39. pixeltable/iterators/video.py +9 -1
  40. pixeltable/metadata/__init__.py +2 -2
  41. pixeltable/metadata/converters/convert_14.py +13 -0
  42. pixeltable/metadata/converters/convert_15.py +29 -0
  43. pixeltable/metadata/converters/util.py +63 -0
  44. pixeltable/metadata/schema.py +12 -6
  45. pixeltable/plan.py +9 -5
  46. pixeltable/store.py +14 -21
  47. pixeltable/tool/create_test_db_dump.py +16 -0
  48. pixeltable/type_system.py +14 -4
  49. pixeltable/utils/coco.py +94 -0
  50. pixeltable-0.2.7.dist-info/METADATA +137 -0
  51. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/RECORD +53 -46
  52. pixeltable/func/nos_function.py +0 -202
  53. pixeltable/utils/clip.py +0 -18
  54. pixeltable-0.2.6.dist-info/METADATA +0 -131
  55. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
  56. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +0 -0
pixeltable/__init__.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from .catalog import Column, Table, InsertableTable, View
2
2
  from .dataframe import DataFrame
3
+ from .datatransfer import Remote
4
+ from .catalog import Column, Table, InsertableTable, View
3
5
  from .exceptions import Error, Error
4
6
  from .exprs import RELATIVE_PATH_ROOT
5
7
  from .func import Function, udf, uda, Aggregator, expr_udf
@@ -21,7 +23,7 @@ from .type_system import (
21
23
  from .utils.help import help
22
24
 
23
25
  # noinspection PyUnresolvedReferences
24
- from . import functions, io
26
+ from . import functions, io, iterators
25
27
  from .__version__ import __version__, __version_tuple__
26
28
 
27
29
  __all__ = [
pixeltable/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  # These version placeholders will be replaced during build.
2
- __version__ = "0.2.6"
3
- __version_tuple__ = (0, 2, 6)
2
+ __version__ = "0.2.7"
3
+ __version_tuple__ = (0, 2, 7)
pixeltable/catalog/column.py CHANGED
@@ -22,7 +22,8 @@ class Column:
22
22
  computed_with: Optional[Union['Expr', Callable]] = None,
23
23
  is_pk: bool = False, stored: Optional[bool] = None,
24
24
  col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
25
- schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None
25
+ schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None,
26
+ records_errors: Optional[bool] = None
26
27
  ):
27
28
  """Column constructor.
28
29
 
@@ -80,12 +81,19 @@ class Column:
80
81
  assert self.col_type is not None
81
82
 
82
83
  self.stored = stored
83
- self.dependent_cols: Set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
84
+ self.dependent_cols: set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
84
85
  self.id = col_id
85
86
  self.is_pk = is_pk
86
87
  self.schema_version_add = schema_version_add
87
88
  self.schema_version_drop = schema_version_drop
88
89
 
90
+ # stored_proxy may be set later if this is a non-stored column.
91
+ # if col1.stored_proxy == col2, then also col1 == col2.proxy_base.
92
+ self.stored_proxy: Optional[Column] = None
93
+ self.proxy_base: Optional[Column] = None
94
+
95
+ self._records_errors = records_errors
96
+
89
97
  # column in the stored table for the values of this Column
90
98
  self.sa_col: Optional[sql.schema.Column] = None
91
99
  self.sa_col_type = sa_col_type
@@ -93,6 +101,7 @@ class Column:
93
101
  # computed cols also have storage columns for the exception string and type
94
102
  self.sa_errormsg_col: Optional[sql.schema.Column] = None
95
103
  self.sa_errortype_col: Optional[sql.schema.Column] = None
104
+
96
105
  from .table_version import TableVersion
97
106
  self.tbl: Optional[TableVersion] = None # set by owning TableVersion
98
107
 
@@ -131,6 +140,9 @@ class Column:
131
140
  @property
132
141
  def records_errors(self) -> bool:
133
142
  """True if this column also stores error information."""
143
+ # default: record errors for computed and media columns
144
+ if self._records_errors is not None:
145
+ return self._records_errors
134
146
  return self.is_stored and (self.is_computed or self.col_type.is_media_type())
135
147
 
136
148
  def source(self) -> None:
pixeltable/catalog/insertable_table.py CHANGED
@@ -60,25 +60,29 @@ class InsertableTable(Table):
60
60
  return tbl
61
61
 
62
62
  @overload
63
- def insert(self, rows: Iterable[Dict[str, Any]], /, print_stats: bool = False, fail_on_exception: bool = True): ...
63
+ def insert(
64
+ self, rows: Iterable[Dict[str, Any]], /, *, print_stats: bool = False, fail_on_exception: bool = True
65
+ ) -> UpdateStatus: ...
64
66
 
65
67
  @overload
66
- def insert(self, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any): ...
68
+ def insert(self, *, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any) -> UpdateStatus: ...
67
69
 
68
- def insert(self, *args, **kwargs) -> UpdateStatus:
69
- """Insert rows into table.
70
+ def insert(
71
+ self, rows: Optional[Iterable[dict[str, Any]]] = None, /, *, print_stats: bool = False,
72
+ fail_on_exception: bool = True, **kwargs: Any
73
+ ) -> UpdateStatus:
74
+ """Inserts rows into this table. There are two mutually exclusive call patterns:
70
75
 
71
76
  To insert multiple rows at a time:
72
-
73
- ``insert(rows: List[Dict[str, Any]], print_stats: bool = False, fail_on_exception: bool = True)``
77
+ ``insert(rows: Iterable[dict[str, Any]], /, *, print_stats: bool = False, fail_on_exception: bool = True)``
74
78
 
75
79
  To insert just a single row, you can use the more convenient syntax:
76
- ``insert(print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any)``
80
+ ``insert(*, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any)``
77
81
 
78
82
  Args:
79
83
  rows: (if inserting multiple rows) A list of rows to insert, each of which is a dictionary mapping column
80
84
  names to values.
81
- kwargs: (if inserting a single row) keyword-argument pairs representing column names and values.
85
+ kwargs: (if inserting a single row) Keyword-argument pairs representing column names and values.
82
86
  print_stats: If ``True``, print statistics about the cost of computed columns.
83
87
  fail_on_exception:
84
88
  Determines how exceptions in computed columns and invalid media files (e.g., corrupt images)
@@ -102,16 +106,27 @@ class InsertableTable(Table):
102
106
 
103
107
  >>> tbl.insert(a=1, b=1, c=1)
104
108
  """
105
- print_stats = kwargs.pop('print_stats', False)
106
- fail_on_exception = kwargs.pop('fail_on_exception', True)
107
- if len(args) > 0:
108
- # There's a positional argument; this means `rows` is expressed as a
109
- # list of dicts (multi-insert)
110
- rows = list(args[0])
111
- else:
112
- # No positional argument; this means we're inserting a single row
113
- # using kwargs syntax
109
+ # The commented code is the intended implementation, with signature (*args, **kwargs).
110
+ # That signature cannot be used currently, due to a present limitation in mkdocs.
111
+ # See: https://github.com/mkdocstrings/mkdocstrings/issues/669
112
+
113
+ # print_stats = kwargs.pop('print_stats', False)
114
+ # fail_on_exception = kwargs.pop('fail_on_exception', True)
115
+ # if len(args) > 0:
116
+ # # There's a positional argument; this means `rows` is expressed as a
117
+ # # list of dicts (multi-insert)
118
+ # rows = list(args[0])
119
+ # else:
120
+ # # No positional argument; this means we're inserting a single row
121
+ # # using kwargs syntax
122
+ # rows = [kwargs]
123
+
124
+ if rows is None:
114
125
  rows = [kwargs]
126
+ else:
127
+ rows = list(rows)
128
+ if len(kwargs) > 0:
129
+ raise excs.Error('`kwargs` cannot be specified unless `rows is None`.')
115
130
 
116
131
  if not isinstance(rows, list):
117
132
  raise excs.Error('rows must be a list of dictionaries')
pixeltable/catalog/table.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import itertools
3
4
  import json
4
5
  import logging
5
6
  from pathlib import Path
6
- from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable
7
+ from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable, Type
7
8
  from uuid import UUID
8
9
 
9
10
  import pandas as pd
@@ -16,6 +17,7 @@ import pixeltable.exceptions as excs
16
17
  import pixeltable.exprs as exprs
17
18
  import pixeltable.metadata.schema as schema
18
19
  import pixeltable.type_system as ts
20
+ import pixeltable.index as index
19
21
  from .column import Column
20
22
  from .globals import is_valid_identifier, is_system_column_name, UpdateStatus
21
23
  from .schema_object import SchemaObject
@@ -102,27 +104,26 @@ class Table(SchemaObject):
102
104
  from pixeltable.dataframe import DataFrame
103
105
  return DataFrame(self.tbl_version_path).group_by(*items)
104
106
 
105
- def collect(self) -> 'pixeltable.dataframe.DataFrameResultSet': # type: ignore[name-defined, no-untyped-def]
106
- """Return rows from this table.
107
- """
107
+ def collect(self) -> 'pixeltable.dataframe.DataFrameResultSet':
108
+ """Return rows from this table."""
108
109
  return self.df().collect()
109
110
 
110
111
  def show(
111
112
  self, *args, **kwargs
112
- ) -> 'pixeltable.dataframe.DataFrameResultSet': # type: ignore[name-defined, no-untyped-def]
113
+ ) -> 'pixeltable.dataframe.DataFrameResultSet':
113
114
  """Return rows from this table.
114
115
  """
115
116
  return self.df().show(*args, **kwargs)
116
117
 
117
118
  def head(
118
119
  self, *args, **kwargs
119
- ) -> 'pixeltable.dataframe.DataFrameResultSet': # type: ignore[name-defined, no-untyped-def]
120
+ ) -> 'pixeltable.dataframe.DataFrameResultSet':
120
121
  """Return the first n rows inserted into this table."""
121
122
  return self.df().head(*args, **kwargs)
122
123
 
123
124
  def tail(
124
125
  self, *args, **kwargs
125
- ) -> 'pixeltable.dataframe.DataFrameResultSet': # type: ignore[name-defined, no-untyped-def]
126
+ ) -> 'pixeltable.dataframe.DataFrameResultSet':
126
127
  """Return the last n rows inserted into this table."""
127
128
  return self.df().tail(*args, **kwargs)
128
129
 
@@ -514,6 +515,24 @@ class Table(SchemaObject):
514
515
  status = self.tbl_version_path.tbl_version.add_index(col, idx_name=idx_name, idx=idx)
515
516
  # TODO: how to deal with exceptions here? drop the index and raise?
516
517
 
518
+ def drop_embedding_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
519
+ """Drop an embedding index from the table.
520
+
521
+ Args:
522
+ column_name: The name of the column whose embedding index to drop. Invalid if the column has multiple
523
+ embedding indices.
524
+ idx_name: The name of the index to drop.
525
+
526
+ Raises:
527
+ Error: If the index does not exist.
528
+
529
+ Examples:
530
+ Drop embedding index on the ``img`` column:
531
+
532
+ >>> tbl.drop_embedding_index(column_name='img')
533
+ """
534
+ self._drop_index(column_name=column_name, idx_name=idx_name, _idx_class=index.EmbeddingIndex)
535
+
517
536
  def drop_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
518
537
  """Drop an index from the table.
519
538
 
@@ -529,6 +548,12 @@ class Table(SchemaObject):
529
548
 
530
549
  >>> tbl.drop_index(column_name='img')
531
550
  """
551
+ self._drop_index(column_name=column_name, idx_name=idx_name)
552
+
553
+ def _drop_index(
554
+ self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None,
555
+ _idx_class: Optional[Type[index.IndexBase]] = None
556
+ ) -> None:
532
557
  if self.tbl_version_path.is_snapshot():
533
558
  raise excs.Error('Cannot drop an index from a snapshot')
534
559
  self._check_is_dropped()
@@ -547,12 +572,14 @@ class Table(SchemaObject):
547
572
  if col.tbl.id != tbl_version.id:
548
573
  raise excs.Error(
549
574
  f'Column {column_name}: cannot drop index from column that belongs to base ({col.tbl.name})')
550
- idx_ids = [info.id for info in tbl_version.idxs_by_name.values() if info.col.id == col.id]
551
- if len(idx_ids) == 0:
575
+ idx_info = [info for info in tbl_version.idxs_by_name.values() if info.col.id == col.id]
576
+ if _idx_class is not None:
577
+ idx_info = [info for info in idx_info if isinstance(info.idx, _idx_class)]
578
+ if len(idx_info) == 0:
552
579
  raise excs.Error(f'Column {column_name} does not have an index')
553
- if len(idx_ids) > 1:
580
+ if len(idx_info) > 1:
554
581
  raise excs.Error(f'Column {column_name} has multiple indices; specify idx_name instead')
555
- idx_id = idx_ids[0]
582
+ idx_id = idx_info[0].id
556
583
  self.tbl_version_path.tbl_version.drop_index(idx_id)
557
584
 
558
585
  def update(
@@ -682,7 +709,6 @@ class Table(SchemaObject):
682
709
 
683
710
  return update_targets
684
711
 
685
-
686
712
  def revert(self) -> None:
687
713
  """Reverts the table to the previous version.
688
714
 
@@ -693,3 +719,159 @@ class Table(SchemaObject):
693
719
  raise excs.Error('Cannot revert a snapshot')
694
720
  self._check_is_dropped()
695
721
  self.tbl_version_path.tbl_version.revert()
722
+
723
+ def _link(
724
+ self,
725
+ remote: 'pixeltable.datatransfer.Remote',
726
+ col_mapping: Optional[dict[str, str]] = None
727
+ ) -> None:
728
+ """
729
+ Links the specified `Remote` to this table. Once a remote is linked, it can be synchronized with
730
+ this `Table` by calling [`Table.sync()`]. A record of the link
731
+ is stored in table metadata and will persist across sessions.
732
+
733
+ Args:
734
+ remote (pixeltable.datatransfer.Remote): The `Remote` to link to this table.
735
+ col_mapping: An optional mapping of columns from this `Table` to columns in the `Remote`.
736
+ """
737
+ # TODO(aaron-siegel): Refactor `col_mapping`
738
+ self._check_is_dropped()
739
+ if remote in self._get_remotes():
740
+ raise excs.Error(f'That remote is already linked to table `{self.get_name()}`: {remote}')
741
+ push_cols = remote.get_export_columns()
742
+ pull_cols = remote.get_import_columns()
743
+ is_col_mapping_user_specified = col_mapping is not None
744
+ if col_mapping is None:
745
+ # Use the identity mapping by default if `col_mapping` is not specified
746
+ col_mapping = {col: col for col in itertools.chain(push_cols.keys(), pull_cols.keys())}
747
+ self._validate_remote(push_cols, pull_cols, col_mapping, is_col_mapping_user_specified)
748
+ _logger.info(f'Linking remote {remote} to table `{self.get_name()}`.')
749
+ self.tbl_version_path.tbl_version.link(remote, col_mapping)
750
+ print(f'Linked remote {remote} to table `{self.get_name()}`.')
751
+
752
+ def unlink(
753
+ self,
754
+ remotes: Optional['pixeltable.datatransfer.Remote' | list['pixeltable.datatransfer.Remote']] = None,
755
+ *,
756
+ delete_remote_data: bool = False,
757
+ ignore_errors: bool = False
758
+ ) -> None:
759
+ """
760
+ Unlinks this table's `Remote`s.
761
+
762
+ Args:
763
+ remotes: If specified, will unlink only the specified `Remote` or list of `Remote`s. If not specified,
764
+ will unlink all of this table's `Remote`s.
765
+ ignore_errors (bool): If `True`, no exception will be thrown if the specified `Remote` is not linked
766
+ to this table.
767
+ delete_remote_data (bool): If `True`, then the remote data source will also be deleted. WARNING: This
768
+ is a destructive operation that will delete data outside Pixeltable, and cannot be undone.
769
+
770
+ """
771
+ self._check_is_dropped()
772
+ all_remotes = self._get_remotes()
773
+
774
+ if remotes is None:
775
+ remotes = list(all_remotes.keys())
776
+ elif isinstance(remotes, pixeltable.datatransfer.Remote):
777
+ remotes = [remotes]
778
+
779
+ # Validation
780
+ if not ignore_errors:
781
+ for remote in remotes:
782
+ if remote not in all_remotes:
783
+ raise excs.Error(f'Remote {remote} is not linked to table `{self.get_name()}`')
784
+
785
+ for remote in remotes:
786
+ self.tbl_version_path.tbl_version.unlink(remote)
787
+ print(f'Unlinked remote {remote} from table `{self.get_name()}`.')
788
+ if delete_remote_data:
789
+ remote.delete()
790
+
791
+ def _validate_remote(
792
+ self,
793
+ export_cols: dict[str, ts.ColumnType],
794
+ import_cols: dict[str, ts.ColumnType],
795
+ col_mapping: Optional[dict[str, str]],
796
+ is_col_mapping_user_specified: bool
797
+ ):
798
+ # Validate names
799
+ t_cols = self.column_names()
800
+ for t_col, r_col in col_mapping.items():
801
+ if t_col not in t_cols:
802
+ if is_col_mapping_user_specified:
803
+ raise excs.Error(
804
+ f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{self.get_name()}` '
805
+ 'contains no such column.'
806
+ )
807
+ else:
808
+ raise excs.Error(
809
+ f'Column `{t_col}` does not exist in Table `{self.get_name()}`. Either add a column `{t_col}`, '
810
+ f'or specify a `col_mapping` to associate a different column with the remote field `{r_col}`.'
811
+ )
812
+ if r_col not in export_cols and r_col not in import_cols:
813
+ raise excs.Error(
814
+ f'Column name `{r_col}` appears as a value in `col_mapping`, but the remote '
815
+ f'configuration has no column `{r_col}`.'
816
+ )
817
+ # Validate column specs
818
+ t_col_types = self.column_types()
819
+ for t_col, r_col in col_mapping.items():
820
+ t_col_type = t_col_types[t_col]
821
+ if r_col in export_cols:
822
+ # Validate that the table column can be assigned to the remote column
823
+ r_col_type = export_cols[r_col]
824
+ if not r_col_type.is_supertype_of(t_col_type):
825
+ raise excs.Error(
826
+ f'Column `{t_col}` cannot be exported to remote column `{r_col}` (incompatible types; expecting `{r_col_type}`)'
827
+ )
828
+ if r_col in import_cols:
829
+ # Validate that the remote column can be assigned to the table column
830
+ if self.tbl_version_path.get_column(t_col).is_computed:
831
+ raise excs.Error(
832
+ f'Column `{t_col}` is a computed column, which cannot be populated from a remote column'
833
+ )
834
+ r_col_type = import_cols[r_col]
835
+ if not t_col_type.is_supertype_of(r_col_type):
836
+ raise excs.Error(
837
+ f'Column `{t_col}` cannot be imported from remote column `{r_col}` (incompatible types; expecting `{r_col_type}`)'
838
+ )
839
+
840
+ def _get_remotes(self) -> dict[pixeltable.datatransfer.Remote, dict[str, str]]:
841
+ """
842
+ Gets a `dict` of all `Remote`s linked to this table.
843
+ """
844
+ return self.tbl_version_path.tbl_version.get_remotes()
845
+
846
+ def sync(
847
+ self,
848
+ *,
849
+ export_data: bool = True,
850
+ import_data: bool = True
851
+ ):
852
+ """
853
+ Synchronizes this table with its linked `Remote`s.
854
+
855
+ Args:
856
+ export_data: If `True`, data from this table will be exported to the external store during synchronization.
857
+ import_data: If `True`, data from the external store will be imported to this table during synchronization.
858
+ """
859
+ remotes = self._get_remotes()
860
+ assert len(remotes) <= 1
861
+
862
+ # Validation
863
+ for remote in remotes:
864
+ col_mapping = remotes[remote]
865
+ r_cols = set(col_mapping.values())
866
+ # Validate export/import
867
+ if export_data and not any(col in r_cols for col in remote.get_export_columns()):
868
+ raise excs.Error(
869
+ f'Attempted to sync with export_data=True, but there are no columns to export: {remote}'
870
+ )
871
+ if import_data and not any(col in r_cols for col in remote.get_import_columns()):
872
+ raise excs.Error(
873
+ f'Attempted to sync with import_data=True, but there are no columns to import: {remote}'
874
+ )
875
+
876
+ for remote in remotes:
877
+ remote.sync(self, remotes[remote], export_data=export_data, import_data=import_data)