pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/index/base.py CHANGED
@@ -1,11 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import abc
4
- from typing import Any
5
4
 
6
5
  import sqlalchemy as sql
7
6
 
8
- from pixeltable import catalog, exprs
7
+ import pixeltable.catalog as catalog
8
+ import pixeltable.exprs as exprs
9
+ import pixeltable.type_system as ts
9
10
 
10
11
 
11
12
  class IndexBase(abc.ABC):
@@ -18,39 +19,34 @@ class IndexBase(abc.ABC):
18
19
  """
19
20
 
20
21
  @abc.abstractmethod
21
- def __init__(self, c: catalog.Column, **kwargs: Any):
22
- pass
23
-
24
- @abc.abstractmethod
25
- def index_value_expr(self) -> exprs.Expr:
26
- """Return expression that computes the value that goes into the index"""
27
- pass
22
+ def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
23
+ """
24
+ Validates that the index can be created on column c and returns an expression that computes the index value.
25
+ """
28
26
 
29
27
  @abc.abstractmethod
30
28
  def records_value_errors(self) -> bool:
31
29
  """True if index_value_expr() can raise errors"""
32
- pass
33
30
 
34
31
  @abc.abstractmethod
35
- def index_sa_type(self) -> sql.types.TypeEngine:
32
+ def get_index_sa_type(self, value_col_type: ts.ColumnType) -> sql.types.TypeEngine:
36
33
  """Return the sqlalchemy type of the index value column"""
37
- pass
38
34
 
39
35
  @abc.abstractmethod
40
- def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
41
- """Create the index on the index value column"""
42
- pass
36
+ def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
37
+ """Return a sqlalchemy statement for creating the index"""
38
+
39
+ @abc.abstractmethod
40
+ def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
41
+ """Drop the index on the index value column"""
43
42
 
44
43
  @classmethod
45
44
  @abc.abstractmethod
46
- def display_name(cls) -> str:
47
- pass
45
+ def display_name(cls) -> str: ...
48
46
 
49
47
  @abc.abstractmethod
50
- def as_dict(self) -> dict:
51
- pass
48
+ def as_dict(self) -> dict: ...
52
49
 
53
50
  @classmethod
54
51
  @abc.abstractmethod
55
- def from_dict(cls, c: catalog.Column, d: dict) -> IndexBase:
56
- pass
52
+ def from_dict(cls, d: dict) -> IndexBase: ...
pixeltable/index/btree.py CHANGED
@@ -1,18 +1,18 @@
1
- from typing import TYPE_CHECKING, Optional
1
+ from typing import TYPE_CHECKING
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
5
5
  # TODO: why does this import result in a circular import, but the one in embedding_index.py doesn't?
6
6
  # import pixeltable.catalog as catalog
7
7
  import pixeltable.exceptions as excs
8
- from pixeltable import catalog, exprs
9
- from pixeltable.env import Env
8
+ import pixeltable.exprs as exprs
9
+ import pixeltable.type_system as ts
10
10
  from pixeltable.func.udf import udf
11
11
 
12
12
  from .base import IndexBase
13
13
 
14
14
  if TYPE_CHECKING:
15
- import pixeltable.exprs
15
+ import pixeltable.catalog as catalog
16
16
 
17
17
 
18
18
  class BtreeIndex(IndexBase):
@@ -22,42 +22,48 @@ class BtreeIndex(IndexBase):
22
22
 
23
23
  MAX_STRING_LEN = 256
24
24
 
25
- value_expr: 'pixeltable.exprs.Expr'
26
-
27
25
  @staticmethod
28
26
  @udf
29
- def str_filter(s: Optional[str]) -> Optional[str]:
27
+ def str_filter(s: str | None) -> str | None:
30
28
  if s is None:
31
29
  return None
32
30
  return s[: BtreeIndex.MAX_STRING_LEN]
33
31
 
34
- def __init__(self, c: 'catalog.Column'):
32
+ def __init__(self) -> None:
33
+ pass
34
+
35
+ def create_value_expr(self, c: 'catalog.Column') -> 'exprs.Expr':
35
36
  if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
36
37
  raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
38
+ value_expr: exprs.Expr
37
39
  if c.col_type.is_media_type():
38
40
  # an index on a media column is an index on the file url
39
41
  # no validation for media columns: we're only interested in the string value
40
- self.value_expr = exprs.ColumnRef(c, perform_validation=False)
42
+ value_expr = exprs.ColumnRef(c, perform_validation=False)
41
43
  else:
42
- self.value_expr = (
44
+ value_expr = (
43
45
  BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
44
46
  )
45
-
46
- def index_value_expr(self) -> 'exprs.Expr':
47
- return self.value_expr
47
+ return value_expr
48
48
 
49
49
  def records_value_errors(self) -> bool:
50
50
  return False
51
51
 
52
- def index_sa_type(self) -> sql.types.TypeEngine:
52
+ def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
53
53
  """Return the sqlalchemy type of the index value column"""
54
- return self.value_expr.col_type.to_sa_type()
54
+ return val_col_type.to_sa_type()
55
+
56
+ def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
57
+ """Return a sqlalchemy statement for creating the index"""
58
+ from sqlalchemy.dialects import postgresql
59
+
60
+ sa_idx = sql.Index(store_index_name, sa_value_col, postgresql_using='btree')
61
+ return sql.schema.CreateIndex(sa_idx, if_not_exists=True).compile(dialect=postgresql.dialect())
55
62
 
56
- def create_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
57
- """Create the index on the index value column"""
58
- idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
59
- conn = Env.get().conn
60
- idx.create(bind=conn)
63
+ def drop_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
64
+ """Drop the index on the index value column"""
65
+ # TODO: implement
66
+ raise NotImplementedError()
61
67
 
62
68
  @classmethod
63
69
  def display_name(cls) -> str:
@@ -67,5 +73,5 @@ class BtreeIndex(IndexBase):
67
73
  return {}
68
74
 
69
75
  @classmethod
70
- def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
71
- return cls(c)
76
+ def from_dict(cls, d: dict) -> 'BtreeIndex':
77
+ return cls()
@@ -1,16 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import enum
4
- from typing import Any, ClassVar, Optional
4
+ from typing import Any, ClassVar
5
5
 
6
6
  import numpy as np
7
7
  import pgvector.sqlalchemy # type: ignore[import-untyped]
8
- import PIL.Image
9
8
  import sqlalchemy as sql
10
9
 
10
+ import pixeltable.catalog as catalog
11
11
  import pixeltable.exceptions as excs
12
+ import pixeltable.exprs as exprs
13
+ import pixeltable.func as func
12
14
  import pixeltable.type_system as ts
13
- from pixeltable import catalog, exprs, func
14
15
  from pixeltable.env import Env
15
16
 
16
17
  from .base import IndexBase
@@ -39,124 +40,105 @@ class EmbeddingIndex(IndexBase):
39
40
  }
40
41
 
41
42
  metric: Metric
42
- value_expr: exprs.FunctionCall
43
- string_embed: Optional[func.Function]
44
- image_embed: Optional[func.Function]
45
- string_embed_signature_idx: int
46
- image_embed_signature_idx: int
47
- index_col_type: pgvector.sqlalchemy.Vector
43
+ embeddings: dict[ts.ColumnType.Type, func.Function]
48
44
 
49
45
  def __init__(
50
46
  self,
51
- c: catalog.Column,
52
47
  metric: str,
53
- embed: Optional[func.Function] = None,
54
- string_embed: Optional[func.Function] = None,
55
- image_embed: Optional[func.Function] = None,
48
+ embed: func.Function | None = None,
49
+ string_embed: func.Function | None = None,
50
+ image_embed: func.Function | None = None,
51
+ audio_embed: func.Function | None = None,
52
+ video_embed: func.Function | None = None,
56
53
  ):
57
54
  if embed is None and string_embed is None and image_embed is None:
58
55
  raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
59
56
  metric_names = [m.name.lower() for m in self.Metric]
60
57
  if metric.lower() not in metric_names:
61
58
  raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
62
- if not c.col_type.is_string_type() and not c.col_type.is_image_type():
63
- raise excs.Error('Embedding index requires string or image column')
64
-
65
- self.string_embed = None
66
- self.image_embed = None
67
-
68
- # Resolve the specific embedding functions corresponding to the user-provided `string_embed`, `image_embed`,
69
- # and/or `embed`. For string embeddings, `string_embed` will be used if specified; otherwise, `embed` will
70
- # be used as a fallback, if it has a matching signature. Likewise for image embeddings.
71
-
72
- if string_embed is not None:
73
- # `string_embed` is specified; it MUST be valid.
74
- self.string_embed = self._resolve_embedding_fn(string_embed, ts.ColumnType.Type.STRING)
75
- if self.string_embed is None:
76
- raise excs.Error(
77
- f'The function `{string_embed.name}` is not a valid string embedding: '
78
- 'it must take a single string parameter'
79
- )
80
- elif embed is not None:
81
- # `embed` is specified; see if it has a string signature.
82
- self.string_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.STRING)
83
-
84
- if image_embed is not None:
85
- # `image_embed` is specified; it MUST be valid.
86
- self.image_embed = self._resolve_embedding_fn(image_embed, ts.ColumnType.Type.IMAGE)
87
- if self.image_embed is None:
88
- raise excs.Error(
89
- f'The function `{image_embed.name}` is not a valid image embedding: '
90
- 'it must take a single image parameter'
91
- )
92
- elif embed is not None:
93
- # `embed` is specified; see if it has an image signature.
94
- self.image_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.IMAGE)
95
-
96
- if self.string_embed is None and self.image_embed is None:
97
- # No string OR image signature was found. This can only happen if `embed` was specified and
98
- # contains no matching signatures.
59
+
60
+ self.embeddings = {}
61
+
62
+ # Resolve the specific embedding functions corresponding to the user-provided embedding functions.
63
+ # For string embeddings, for example, `string_embed` will be used if specified; otherwise, `embed` will
64
+ # be used as a fallback, if it has a matching signature.
65
+
66
+ for embed_type, embed_fn in (
67
+ (ts.ColumnType.Type.STRING, string_embed),
68
+ (ts.ColumnType.Type.IMAGE, image_embed),
69
+ (ts.ColumnType.Type.AUDIO, audio_embed),
70
+ (ts.ColumnType.Type.VIDEO, video_embed),
71
+ ):
72
+ if embed_fn is not None:
73
+ # Embedding function for the requisite type is specified directly; it MUST be valid.
74
+ resolved_fn = self._resolve_embedding_fn(embed_fn, embed_type)
75
+ if resolved_fn is None:
76
+ raise excs.Error(
77
+ f'The function `{embed_fn.name}` is not a valid {embed_type.name.lower()} '
78
+ f'embedding: it must take a single {embed_type.name.lower()} parameter'
79
+ )
80
+ self.embeddings[embed_type] = resolved_fn
81
+ elif embed is not None:
82
+ # General `embed` is specified; see if it has a matching signature.
83
+ resolved_fn = self._resolve_embedding_fn(embed, embed_type)
84
+ if resolved_fn is not None:
85
+ self.embeddings[embed_type] = resolved_fn
86
+
87
+ if len(self.embeddings) == 0:
88
+ # `embed` was specified and contains no matching signatures.
99
89
  assert embed is not None
100
90
  raise excs.Error(
101
- f'The function `{embed.name}` is not a valid embedding: it must take a single string or image parameter'
91
+ f'The function `{embed.name}` is not a valid embedding: '
92
+ 'it must take a single string, image, audio, or video parameter'
102
93
  )
103
94
 
104
95
  # Now validate the return types of the embedding functions.
105
-
106
- if self.string_embed is not None:
107
- self._validate_embedding_fn(self.string_embed)
108
-
109
- if self.image_embed is not None:
110
- self._validate_embedding_fn(self.image_embed)
111
-
112
- if c.col_type.is_string_type() and self.string_embed is None:
113
- raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
114
- if c.col_type.is_image_type() and self.image_embed is None:
115
- raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
96
+ for _, embed_fn in self.embeddings.items():
97
+ self._validate_embedding_fn(embed_fn)
116
98
 
117
99
  self.metric = self.Metric[metric.upper()]
118
- self.value_expr = (
119
- self.string_embed(exprs.ColumnRef(c))
120
- if c.col_type.is_string_type()
121
- else self.image_embed(exprs.ColumnRef(c))
122
- )
123
- assert isinstance(self.value_expr.col_type, ts.ArrayType)
124
- vector_size = self.value_expr.col_type.shape[0]
125
- assert vector_size is not None
126
- self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
127
100
 
128
- def index_value_expr(self) -> exprs.Expr:
129
- """Return expression that computes the value that goes into the index"""
130
- return self.value_expr
101
+ def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
102
+ if c.col_type._type not in (
103
+ ts.ColumnType.Type.STRING,
104
+ ts.ColumnType.Type.IMAGE,
105
+ ts.ColumnType.Type.AUDIO,
106
+ ts.ColumnType.Type.VIDEO,
107
+ ):
108
+ raise excs.Error(f'Type `{c.col_type}` of column {c.name!r} is not a valid type for an embedding index.')
109
+ if c.col_type._type not in self.embeddings:
110
+ raise excs.Error(
111
+ f'The specified embedding function does not support the type `{c.col_type}` of column {c.name!r}.'
112
+ )
113
+
114
+ embed_fn = self.embeddings[c.col_type._type]
115
+ return embed_fn(exprs.ColumnRef(c))
131
116
 
132
117
  def records_value_errors(self) -> bool:
133
118
  return True
134
119
 
135
- def index_sa_type(self) -> sql.types.TypeEngine:
136
- """Return the sqlalchemy type of the index value column"""
137
- return self.index_col_type
138
-
139
- def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
140
- """Create the index on the index value column"""
141
- idx = sql.Index(
142
- index_name,
143
- index_value_col.sa_col,
144
- postgresql_using='hnsw',
145
- postgresql_with={'m': 16, 'ef_construction': 64},
146
- postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
120
+ def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
121
+ assert isinstance(val_col_type, ts.ArrayType) and val_col_type.shape is not None
122
+ vector_size = val_col_type.shape[0]
123
+ assert vector_size is not None
124
+ return pgvector.sqlalchemy.Vector(vector_size)
125
+
126
+ def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
127
+ """Return a sqlalchemy statement for creating the index"""
128
+ return Env.get().dbms.create_vector_index_stmt(
129
+ store_index_name, sa_value_col, metric=self.PGVECTOR_OPS[self.metric]
147
130
  )
148
- conn = Env.get().conn
149
- idx.create(bind=conn)
150
131
 
151
- def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
132
+ def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
133
+ """Drop the index on the index value column"""
134
+ # TODO: implement
135
+ raise NotImplementedError()
136
+
137
+ def similarity_clause(self, val_column: catalog.Column, item: exprs.Literal) -> sql.ColumnElement:
152
138
  """Create a ColumnElement that represents '<val_column> <op> <item>'"""
153
- assert isinstance(item, (str, PIL.Image.Image))
154
- if isinstance(item, str):
155
- assert self.string_embed is not None
156
- embedding = self.string_embed.exec([item], {})
157
- if isinstance(item, PIL.Image.Image):
158
- assert self.image_embed is not None
159
- embedding = self.image_embed.exec([item], {})
139
+ assert item.col_type._type in self.embeddings
140
+ embedding = self.embeddings[item.col_type._type].exec([item.val], {})
141
+ assert isinstance(embedding, np.ndarray)
160
142
 
161
143
  if self.metric == self.Metric.COSINE:
162
144
  return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -166,17 +148,11 @@ class EmbeddingIndex(IndexBase):
166
148
  assert self.metric == self.Metric.L2
167
149
  return val_column.sa_col.l2_distance(embedding)
168
150
 
169
- def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
151
+ def order_by_clause(self, val_column: catalog.Column, item: exprs.Literal, is_asc: bool) -> sql.ColumnElement:
170
152
  """Create a ColumnElement that is used in an ORDER BY clause"""
171
- assert isinstance(item, (str, PIL.Image.Image))
172
- embedding: Optional[np.ndarray] = None
173
- if isinstance(item, str):
174
- assert self.string_embed is not None
175
- embedding = self.string_embed.exec([item], {})
176
- if isinstance(item, PIL.Image.Image):
177
- assert self.image_embed is not None
178
- embedding = self.image_embed.exec([item], {})
179
- assert embedding is not None
153
+ assert item.col_type._type in self.embeddings
154
+ embedding = self.embeddings[item.col_type._type].exec([item.val], {})
155
+ assert isinstance(embedding, np.ndarray)
180
156
 
181
157
  if self.metric == self.Metric.COSINE:
182
158
  result = val_column.sa_col.cosine_distance(embedding)
@@ -194,9 +170,7 @@ class EmbeddingIndex(IndexBase):
194
170
  return 'embedding'
195
171
 
196
172
  @classmethod
197
- def _resolve_embedding_fn(
198
- cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type
199
- ) -> Optional[func.Function]:
173
+ def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> func.Function | None:
200
174
  """Find an overload resolution for `embed_fn` that matches the given type."""
201
175
  assert isinstance(embed_fn, func.Function)
202
176
  for resolved_fn in embed_fn._resolved_fns:
@@ -243,14 +217,22 @@ class EmbeddingIndex(IndexBase):
243
217
  )
244
218
 
245
219
  def as_dict(self) -> dict:
246
- return {
247
- 'metric': self.metric.name.lower(),
248
- 'string_embed': None if self.string_embed is None else self.string_embed.as_dict(),
249
- 'image_embed': None if self.image_embed is None else self.image_embed.as_dict(),
250
- }
220
+ d: dict[str, Any] = {'metric': self.metric.name.lower()}
221
+ for embed_type, embed_fn in self.embeddings.items():
222
+ key = f'{embed_type.name.lower()}_embed'
223
+ d[key] = embed_fn.as_dict()
224
+ return d
251
225
 
252
226
  @classmethod
253
- def from_dict(cls, c: catalog.Column, d: dict) -> EmbeddingIndex:
254
- string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
255
- image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
256
- return cls(c, metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
227
+ def from_dict(cls, d: dict) -> EmbeddingIndex:
228
+ string_embed = func.Function.from_dict(d['string_embed']) if d.get('string_embed') is not None else None
229
+ image_embed = func.Function.from_dict(d['image_embed']) if d.get('image_embed') is not None else None
230
+ audio_embed = func.Function.from_dict(d['audio_embed']) if d.get('audio_embed') is not None else None
231
+ video_embed = func.Function.from_dict(d['video_embed']) if d.get('video_embed') is not None else None
232
+ return cls(
233
+ metric=d['metric'],
234
+ string_embed=string_embed,
235
+ image_embed=image_embed,
236
+ audio_embed=audio_embed,
237
+ video_embed=video_embed,
238
+ )
pixeltable/io/__init__.py CHANGED
@@ -1,14 +1,16 @@
1
+ """Functions for importing and exporting Pixeltable data."""
1
2
  # ruff: noqa: F401
2
3
 
3
4
  from .datarows import import_json, import_rows
4
- from .external_store import ExternalStore, SyncStatus
5
+ from .external_store import ExternalStore
5
6
  from .globals import create_label_studio_project, export_images_as_fo_dataset
6
7
  from .hf_datasets import import_huggingface_dataset
8
+ from .lancedb import export_lancedb
7
9
  from .pandas import import_csv, import_excel, import_pandas
8
10
  from .parquet import export_parquet, import_parquet
9
11
 
10
12
  __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
11
- __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
13
+ __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
12
14
  __all__ = sorted(__default_dir - __removed_symbols)
13
15
 
14
16
 
pixeltable/io/datarows.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Iterable, Optional, Union
3
+ from typing import Any, Iterable
4
4
 
5
5
  import pixeltable as pxt
6
6
  import pixeltable.type_system as ts
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
8
8
 
9
9
 
10
10
  def _infer_schema_from_rows(
11
- rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
11
+ rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
12
12
  ) -> dict[str, ts.ColumnType]:
13
13
  schema: dict[str, ts.ColumnType] = {}
14
14
  cols_with_nones: set[str] = set()
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
20
20
  # in which the column names are encountered in the input data, even if `schema_overrides`
21
21
  # is specified.
22
22
  if col_name not in schema:
23
+ assert isinstance(schema_overrides[col_name], ts.ColumnType)
23
24
  schema[col_name] = schema_overrides[col_name]
24
25
  elif value is not None:
25
26
  # If `key` is not in `schema_overrides`, then we infer its type from the data.
@@ -33,7 +34,7 @@ def _infer_schema_from_rows(
33
34
  if col_name not in schema:
34
35
  schema[col_name] = col_type
35
36
  else:
36
- supertype = schema[col_name].supertype(col_type)
37
+ supertype = schema[col_name].supertype(col_type, for_inference=True)
37
38
  if supertype is None:
38
39
  raise excs.Error(
39
40
  f'Could not infer type of column `{col_name}`; the value in row {n} '
@@ -59,8 +60,8 @@ def import_rows(
59
60
  tbl_path: str,
60
61
  rows: list[dict[str, Any]],
61
62
  *,
62
- schema_overrides: Optional[dict[str, Any]] = None,
63
- primary_key: Optional[Union[str, list[str]]] = None,
63
+ schema_overrides: dict[str, Any] | None = None,
64
+ primary_key: str | list[str] | None = None,
64
65
  num_retained_versions: int = 10,
65
66
  comment: str = '',
66
67
  ) -> pxt.Table:
@@ -103,8 +104,8 @@ def import_json(
103
104
  tbl_path: str,
104
105
  filepath_or_url: str,
105
106
  *,
106
- schema_overrides: Optional[dict[str, Any]] = None,
107
- primary_key: Optional[Union[str, list[str]]] = None,
107
+ schema_overrides: dict[str, Any] | None = None,
108
+ primary_key: str | list[str] | None = None,
108
109
  num_retained_versions: int = 10,
109
110
  comment: str = '',
110
111
  **kwargs: Any,