pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/index/__init__.py CHANGED
@@ -1,3 +1,5 @@
1
+ # ruff: noqa: F401
2
+
1
3
  from .base import IndexBase
2
- from .embedding_index import EmbeddingIndex
3
4
  from .btree import BtreeIndex
5
+ from .embedding_index import EmbeddingIndex
pixeltable/index/base.py CHANGED
@@ -1,11 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import abc
4
- from typing import Any
5
4
 
6
5
  import sqlalchemy as sql
7
6
 
8
- from pixeltable import catalog, exprs
7
+ import pixeltable.catalog as catalog
8
+ import pixeltable.exprs as exprs
9
+ import pixeltable.type_system as ts
9
10
 
10
11
 
11
12
  class IndexBase(abc.ABC):
@@ -18,39 +19,34 @@ class IndexBase(abc.ABC):
18
19
  """
19
20
 
20
21
  @abc.abstractmethod
21
- def __init__(self, c: catalog.Column, **kwargs: Any):
22
- pass
23
-
24
- @abc.abstractmethod
25
- def index_value_expr(self) -> exprs.Expr:
26
- """Return expression that computes the value that goes into the index"""
27
- pass
22
+ def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
23
+ """
24
+ Validates that the index can be created on column c and returns an expression that computes the index value.
25
+ """
28
26
 
29
27
  @abc.abstractmethod
30
28
  def records_value_errors(self) -> bool:
31
29
  """True if index_value_expr() can raise errors"""
32
- pass
33
30
 
34
31
  @abc.abstractmethod
35
- def index_sa_type(self) -> sql.types.TypeEngine:
32
+ def get_index_sa_type(self, value_col_type: ts.ColumnType) -> sql.types.TypeEngine:
36
33
  """Return the sqlalchemy type of the index value column"""
37
- pass
38
34
 
39
35
  @abc.abstractmethod
40
- def create_index(self, index_name: str, index_value_col: catalog.Column, conn: sql.engine.Connection) -> None:
41
- """Create the index on the index value column"""
42
- pass
36
+ def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
37
+ """Return a sqlalchemy statement for creating the index"""
38
+
39
+ @abc.abstractmethod
40
+ def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
41
+ """Drop the index on the index value column"""
43
42
 
44
43
  @classmethod
45
44
  @abc.abstractmethod
46
- def display_name(cls) -> str:
47
- pass
45
+ def display_name(cls) -> str: ...
48
46
 
49
47
  @abc.abstractmethod
50
- def as_dict(self) -> dict:
51
- pass
48
+ def as_dict(self) -> dict: ...
52
49
 
53
50
  @classmethod
54
51
  @abc.abstractmethod
55
- def from_dict(cls, c: catalog.Column, d: dict) -> IndexBase:
56
- pass
52
+ def from_dict(cls, d: dict) -> IndexBase: ...
pixeltable/index/btree.py CHANGED
@@ -1,58 +1,69 @@
1
- from typing import Optional, TYPE_CHECKING
1
+ from typing import TYPE_CHECKING
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
5
5
  # TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
6
6
  # import pixeltable.catalog as catalog
7
7
  import pixeltable.exceptions as excs
8
- from pixeltable import catalog, exprs
8
+ import pixeltable.exprs as exprs
9
+ import pixeltable.type_system as ts
9
10
  from pixeltable.func.udf import udf
11
+
10
12
  from .base import IndexBase
11
13
 
12
14
  if TYPE_CHECKING:
13
- import pixeltable.exprs
15
+ import pixeltable.catalog as catalog
16
+
14
17
 
15
18
  class BtreeIndex(IndexBase):
16
19
  """
17
20
  Interface to B-tree indices in Postgres.
18
21
  """
19
- MAX_STRING_LEN = 256
20
22
 
21
- value_expr: 'pixeltable.exprs.Expr'
23
+ MAX_STRING_LEN = 256
22
24
 
23
25
  @staticmethod
24
26
  @udf
25
- def str_filter(s: Optional[str]) -> Optional[str]:
27
+ def str_filter(s: str | None) -> str | None:
26
28
  if s is None:
27
29
  return None
28
- return s[:BtreeIndex.MAX_STRING_LEN]
30
+ return s[: BtreeIndex.MAX_STRING_LEN]
31
+
32
+ def __init__(self) -> None:
33
+ pass
29
34
 
30
- def __init__(self, c: 'catalog.Column'):
35
+ def create_value_expr(self, c: 'catalog.Column') -> 'exprs.Expr':
31
36
  if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
32
37
  raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
38
+ value_expr: exprs.Expr
33
39
  if c.col_type.is_media_type():
34
40
  # an index on a media column is an index on the file url
35
41
  # no validation for media columns: we're only interested in the string value
36
- self.value_expr = exprs.ColumnRef(c, perform_validation=False)
42
+ value_expr = exprs.ColumnRef(c, perform_validation=False)
37
43
  else:
38
- self.value_expr = (
44
+ value_expr = (
39
45
  BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
40
46
  )
41
-
42
- def index_value_expr(self) -> 'exprs.Expr':
43
- return self.value_expr
47
+ return value_expr
44
48
 
45
49
  def records_value_errors(self) -> bool:
46
50
  return False
47
51
 
48
- def index_sa_type(self) -> sql.types.TypeEngine:
52
+ def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
49
53
  """Return the sqlalchemy type of the index value column"""
50
- return self.value_expr.col_type.to_sa_type()
54
+ return val_col_type.to_sa_type()
55
+
56
+ def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
57
+ """Return a sqlalchemy statement for creating the index"""
58
+ from sqlalchemy.dialects import postgresql
51
59
 
52
- def create_index(self, index_name: str, index_value_col: 'catalog.Column', conn: sql.engine.Connection) -> None:
53
- """Create the index on the index value column"""
54
- idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
55
- idx.create(bind=conn)
60
+ sa_idx = sql.Index(store_index_name, sa_value_col, postgresql_using='btree')
61
+ return sql.schema.CreateIndex(sa_idx, if_not_exists=True).compile(dialect=postgresql.dialect())
62
+
63
+ def drop_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
64
+ """Drop the index on the index value column"""
65
+ # TODO: implement
66
+ raise NotImplementedError()
56
67
 
57
68
  @classmethod
58
69
  def display_name(cls) -> str:
@@ -62,6 +73,5 @@ class BtreeIndex(IndexBase):
62
73
  return {}
63
74
 
64
75
  @classmethod
65
- def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
66
- return cls(c)
67
-
76
+ def from_dict(cls, d: dict) -> 'BtreeIndex':
77
+ return cls()
pixeltable/index/embedding_index.py CHANGED
@@ -1,16 +1,18 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import enum
4
- from typing import Any, Optional
4
+ from typing import Any, ClassVar
5
5
 
6
6
  import numpy as np
7
7
  import pgvector.sqlalchemy # type: ignore[import-untyped]
8
- import PIL.Image
9
8
  import sqlalchemy as sql
10
9
 
10
+ import pixeltable.catalog as catalog
11
11
  import pixeltable.exceptions as excs
12
+ import pixeltable.exprs as exprs
13
+ import pixeltable.func as func
12
14
  import pixeltable.type_system as ts
13
- from pixeltable import catalog, exprs, func
15
+ from pixeltable.env import Env
14
16
 
15
17
  from .base import IndexBase
16
18
 
@@ -31,70 +33,112 @@ class EmbeddingIndex(IndexBase):
31
33
  IP = 2
32
34
  L2 = 3
33
35
 
34
- PGVECTOR_OPS = {
36
+ PGVECTOR_OPS: ClassVar[dict[Metric, str]] = {
35
37
  Metric.COSINE: 'vector_cosine_ops',
36
38
  Metric.IP: 'vector_ip_ops',
37
- Metric.L2: 'vector_l2_ops'
39
+ Metric.L2: 'vector_l2_ops',
38
40
  }
39
41
 
42
+ metric: Metric
43
+ embeddings: dict[ts.ColumnType.Type, func.Function]
44
+
40
45
  def __init__(
41
- self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
42
- image_embed: Optional[func.Function] = None):
46
+ self,
47
+ metric: str,
48
+ embed: func.Function | None = None,
49
+ string_embed: func.Function | None = None,
50
+ image_embed: func.Function | None = None,
51
+ audio_embed: func.Function | None = None,
52
+ video_embed: func.Function | None = None,
53
+ ):
54
+ if embed is None and string_embed is None and image_embed is None:
55
+ raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
43
56
  metric_names = [m.name.lower() for m in self.Metric]
44
57
  if metric.lower() not in metric_names:
45
58
  raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
46
- if not c.col_type.is_string_type() and not c.col_type.is_image_type():
47
- raise excs.Error(f'Embedding index requires string or image column')
48
- if c.col_type.is_string_type() and string_embed is None:
49
- raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
50
- if c.col_type.is_image_type() and image_embed is None:
51
- raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
52
- if string_embed is not None:
53
- # verify signature
54
- self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
55
- if image_embed is not None:
56
- # verify signature
57
- self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
59
+
60
+ self.embeddings = {}
61
+
62
+ # Resolve the specific embedding functions corresponding to the user-provided embedding functions.
63
+ # For string embeddings, for example, `string_embed` will be used if specified; otherwise, `embed` will
64
+ # be used as a fallback, if it has a matching signature.
65
+
66
+ for embed_type, embed_fn in (
67
+ (ts.ColumnType.Type.STRING, string_embed),
68
+ (ts.ColumnType.Type.IMAGE, image_embed),
69
+ (ts.ColumnType.Type.AUDIO, audio_embed),
70
+ (ts.ColumnType.Type.VIDEO, video_embed),
71
+ ):
72
+ if embed_fn is not None:
73
+ # Embedding function for the requisite type is specified directly; it MUST be valid.
74
+ resolved_fn = self._resolve_embedding_fn(embed_fn, embed_type)
75
+ if resolved_fn is None:
76
+ raise excs.Error(
77
+ f'The function `{embed_fn.name}` is not a valid {embed_type.name.lower()} '
78
+ f'embedding: it must take a single {embed_type.name.lower()} parameter'
79
+ )
80
+ self.embeddings[embed_type] = resolved_fn
81
+ elif embed is not None:
82
+ # General `embed` is specified; see if it has a matching signature.
83
+ resolved_fn = self._resolve_embedding_fn(embed, embed_type)
84
+ if resolved_fn is not None:
85
+ self.embeddings[embed_type] = resolved_fn
86
+
87
+ if len(self.embeddings) == 0:
88
+ # `embed` was specified and contains no matching signatures.
89
+ assert embed is not None
90
+ raise excs.Error(
91
+ f'The function `{embed.name}` is not a valid embedding: '
92
+ 'it must take a single string, image, audio, or video parameter'
93
+ )
94
+
95
+ # Now validate the return types of the embedding functions.
96
+ for _, embed_fn in self.embeddings.items():
97
+ self._validate_embedding_fn(embed_fn)
58
98
 
59
99
  self.metric = self.Metric[metric.upper()]
60
- self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
61
- assert isinstance(self.value_expr.col_type, ts.ArrayType)
62
- self.string_embed = string_embed
63
- self.image_embed = image_embed
64
- vector_size = self.value_expr.col_type.shape[0]
65
- assert vector_size is not None
66
- self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
67
100
 
68
- def index_value_expr(self) -> exprs.Expr:
69
- """Return expression that computes the value that goes into the index"""
70
- return self.value_expr
101
+ def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
102
+ if c.col_type._type not in (
103
+ ts.ColumnType.Type.STRING,
104
+ ts.ColumnType.Type.IMAGE,
105
+ ts.ColumnType.Type.AUDIO,
106
+ ts.ColumnType.Type.VIDEO,
107
+ ):
108
+ raise excs.Error(f'Type `{c.col_type}` of column {c.name!r} is not a valid type for an embedding index.')
109
+ if c.col_type._type not in self.embeddings:
110
+ raise excs.Error(
111
+ f'The specified embedding function does not support the type `{c.col_type}` of column {c.name!r}.'
112
+ )
113
+
114
+ embed_fn = self.embeddings[c.col_type._type]
115
+ return embed_fn(exprs.ColumnRef(c))
71
116
 
72
117
  def records_value_errors(self) -> bool:
73
118
  return True
74
119
 
75
- def index_sa_type(self) -> sql.types.TypeEngine:
76
- """Return the sqlalchemy type of the index value column"""
77
- return self.index_col_type
78
-
79
- def create_index(self, index_name: str, index_value_col: catalog.Column, conn: sql.engine.Connection) -> None:
80
- """Create the index on the index value column"""
81
- idx = sql.Index(
82
- index_name, index_value_col.sa_col,
83
- postgresql_using='hnsw',
84
- postgresql_with={'m': 16, 'ef_construction': 64},
85
- postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]}
120
+ def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
121
+ assert isinstance(val_col_type, ts.ArrayType) and val_col_type.shape is not None
122
+ vector_size = val_col_type.shape[0]
123
+ assert vector_size is not None
124
+ return pgvector.sqlalchemy.Vector(vector_size)
125
+
126
+ def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
127
+ """Return a sqlalchemy statement for creating the index"""
128
+ return Env.get().dbms.create_vector_index_stmt(
129
+ store_index_name, sa_value_col, metric=self.PGVECTOR_OPS[self.metric]
86
130
  )
87
- idx.create(bind=conn)
88
131
 
89
- def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
132
+ def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
133
+ """Drop the index on the index value column"""
134
+ # TODO: implement
135
+ raise NotImplementedError()
136
+
137
+ def similarity_clause(self, val_column: catalog.Column, item: exprs.Literal) -> sql.ColumnElement:
90
138
  """Create a ColumnElement that represents '<val_column> <op> <item>'"""
91
- assert isinstance(item, (str, PIL.Image.Image))
92
- if isinstance(item, str):
93
- assert self.string_embed is not None
94
- embedding = self.string_embed.exec(item)
95
- if isinstance(item, PIL.Image.Image):
96
- assert self.image_embed is not None
97
- embedding = self.image_embed.exec(item)
139
+ assert item.col_type._type in self.embeddings
140
+ embedding = self.embeddings[item.col_type._type].exec([item.val], {})
141
+ assert isinstance(embedding, np.ndarray)
98
142
 
99
143
  if self.metric == self.Metric.COSINE:
100
144
  return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -104,17 +148,11 @@ class EmbeddingIndex(IndexBase):
104
148
  assert self.metric == self.Metric.L2
105
149
  return val_column.sa_col.l2_distance(embedding)
106
150
 
107
- def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
151
+ def order_by_clause(self, val_column: catalog.Column, item: exprs.Literal, is_asc: bool) -> sql.ColumnElement:
108
152
  """Create a ColumnElement that is used in an ORDER BY clause"""
109
- assert isinstance(item, (str, PIL.Image.Image))
110
- embedding: Optional[np.ndarray] = None
111
- if isinstance(item, str):
112
- assert self.string_embed is not None
113
- embedding = self.string_embed.exec(item)
114
- if isinstance(item, PIL.Image.Image):
115
- assert self.image_embed is not None
116
- embedding = self.image_embed.exec(item)
117
- assert embedding is not None
153
+ assert item.col_type._type in self.embeddings
154
+ embedding = self.embeddings[item.col_type._type].exec([item.val], {})
155
+ assert isinstance(embedding, np.ndarray)
118
156
 
119
157
  if self.metric == self.Metric.COSINE:
120
158
  result = val_column.sa_col.cosine_distance(embedding)
@@ -132,44 +170,69 @@ class EmbeddingIndex(IndexBase):
132
170
  return 'embedding'
133
171
 
134
172
  @classmethod
135
- def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> None:
136
- """Validate the signature"""
173
+ def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> func.Function | None:
174
+ """Find an overload resolution for `embed_fn` that matches the given type."""
137
175
  assert isinstance(embed_fn, func.Function)
138
- sig = embed_fn.signature
176
+ for resolved_fn in embed_fn._resolved_fns:
177
+ # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
178
+ # has more than one parameter, as long as it has at most one *required* parameter.
179
+ sig = resolved_fn.signature
180
+ if (
181
+ len(sig.parameters) >= 1
182
+ and len(sig.required_parameters) <= 1
183
+ and sig.parameters_by_pos[0].col_type.type_enum == expected_type
184
+ ):
185
+ # We found a valid signature. Now, if it has more than one parameter, we need to transform it into a
186
+ # 1-ary function by fixing all the other parameters to their defaults. This is to ensure that
187
+ # conditional_return_type resolves correctly.
188
+ if len(sig.parameters) == 1:
189
+ unary_fn = resolved_fn
190
+ else:
191
+ assert all(sig.parameters_by_pos[i].has_default for i in range(1, len(sig.parameters)))
192
+ defaults = {param.name: param.default for param in sig.parameters_by_pos[1:]}
193
+ unary_fn = resolved_fn.using(**defaults)
194
+ assert not unary_fn.is_polymorphic
195
+ assert len(unary_fn.signature.parameters) == 1
196
+ return unary_fn
197
+ return None
139
198
 
140
- # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
141
- # has more than one parameter, as long as it has at most one *required* parameter.
142
- if (len(sig.parameters) == 0
143
- or len(sig.required_parameters) > 1
144
- or sig.parameters_by_pos[0].col_type.type_enum != expected_type):
145
- raise excs.Error(
146
- f'{name} must take a single {expected_type.name.lower()} parameter, but has signature {sig}')
199
+ @classmethod
200
+ def _validate_embedding_fn(cls, embed_fn: func.Function) -> None:
201
+ """Validate the given embedding function."""
202
+ assert not embed_fn.is_polymorphic
203
+
204
+ return_type = embed_fn.signature.return_type
147
205
 
148
- # validate return type
149
- param_name = sig.parameters_by_pos[0].name
150
- if expected_type == ts.ColumnType.Type.STRING:
151
- return_type = embed_fn.call_return_type({param_name: 'dummy'})
152
- else:
153
- assert expected_type == ts.ColumnType.Type.IMAGE
154
- img = PIL.Image.new('RGB', (512, 512))
155
- return_type = embed_fn.call_return_type({param_name: img})
156
- assert return_type is not None
157
206
  if not isinstance(return_type, ts.ArrayType):
158
- raise excs.Error(f'{name} must return an array, but returns {return_type}')
159
- else:
160
- shape = return_type.shape
161
- if len(shape) != 1 or shape[0] == None:
162
- raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
207
+ raise excs.Error(
208
+ f'The function `{embed_fn.name}` is not a valid embedding: '
209
+ f'it must return an array, but returns {return_type}'
210
+ )
211
+
212
+ shape = return_type.shape
213
+ if len(shape) != 1 or shape[0] is None:
214
+ raise excs.Error(
215
+ f'The function `{embed_fn.name}` is not a valid embedding: '
216
+ f'it must return a 1-dimensional array of a specific length, but returns {return_type}'
217
+ )
163
218
 
164
219
  def as_dict(self) -> dict:
165
- return {
166
- 'metric': self.metric.name.lower(),
167
- 'string_embed': None if self.string_embed is None else self.string_embed.as_dict(),
168
- 'image_embed': None if self.image_embed is None else self.image_embed.as_dict()
169
- }
220
+ d: dict[str, Any] = {'metric': self.metric.name.lower()}
221
+ for embed_type, embed_fn in self.embeddings.items():
222
+ key = f'{embed_type.name.lower()}_embed'
223
+ d[key] = embed_fn.as_dict()
224
+ return d
170
225
 
171
226
  @classmethod
172
- def from_dict(cls, c: catalog.Column, d: dict) -> EmbeddingIndex:
173
- string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
174
- image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
175
- return cls(c, metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
227
+ def from_dict(cls, d: dict) -> EmbeddingIndex:
228
+ string_embed = func.Function.from_dict(d['string_embed']) if d.get('string_embed') is not None else None
229
+ image_embed = func.Function.from_dict(d['image_embed']) if d.get('image_embed') is not None else None
230
+ audio_embed = func.Function.from_dict(d['audio_embed']) if d.get('audio_embed') is not None else None
231
+ video_embed = func.Function.from_dict(d['video_embed']) if d.get('video_embed') is not None else None
232
+ return cls(
233
+ metric=d['metric'],
234
+ string_embed=string_embed,
235
+ image_embed=image_embed,
236
+ audio_embed=audio_embed,
237
+ video_embed=video_embed,
238
+ )
pixeltable/io/__init__.py CHANGED
@@ -1,13 +1,18 @@
1
- from .external_store import ExternalStore, SyncStatus
2
- from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
1
+ """Functions for importing and exporting Pixeltable data."""
2
+ # ruff: noqa: F401
3
+
4
+ from .datarows import import_json, import_rows
5
+ from .external_store import ExternalStore
6
+ from .globals import create_label_studio_project, export_images_as_fo_dataset
3
7
  from .hf_datasets import import_huggingface_dataset
8
+ from .lancedb import export_lancedb
4
9
  from .pandas import import_csv, import_excel, import_pandas
5
- from .parquet import import_parquet, export_parquet
10
+ from .parquet import export_parquet, import_parquet
6
11
 
7
- __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
8
- __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
9
- __all__ = sorted(list(__default_dir - __removed_symbols))
12
# Names of submodules that get bound in this namespace as a side effect of the imports above;
# they are not part of the public API.
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
# Snapshot of every non-underscore name bound so far. This must run after the imports above,
# since `dir()` only reports what is already defined at this point.
__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
__all__ = sorted(__default_dir.difference(__removed_symbols))
10
15
 
11
16
 
12
def __dir__() -> list[str]:
    """Module-level `__dir__` hook: report only the curated public names held in `__all__`."""
    return __all__
@@ -0,0 +1,140 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Iterable
4
+
5
+ import pixeltable as pxt
6
+ import pixeltable.type_system as ts
7
+ from pixeltable import exceptions as excs
8
+
9
+
10
+ def _infer_schema_from_rows(
11
+ rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
12
+ ) -> dict[str, ts.ColumnType]:
13
+ schema: dict[str, ts.ColumnType] = {}
14
+ cols_with_nones: set[str] = set()
15
+
16
+ for n, row in enumerate(rows):
17
+ for col_name, value in row.items():
18
+ if col_name in schema_overrides:
19
+ # We do the insertion here; this will ensure that the column order matches the order
20
+ # in which the column names are encountered in the input data, even if `schema_overrides`
21
+ # is specified.
22
+ if col_name not in schema:
23
+ assert isinstance(schema_overrides[col_name], ts.ColumnType)
24
+ schema[col_name] = schema_overrides[col_name]
25
+ elif value is not None:
26
+ # If `key` is not in `schema_overrides`, then we infer its type from the data.
27
+ # The column type will always be nullable by default.
28
+ col_type = ts.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
29
+ if col_type is None:
30
+ raise excs.Error(
31
+ f'Could not infer type for column `{col_name}`; the value in row {n} '
32
+ f'has an unsupported type: {type(value)}'
33
+ )
34
+ if col_name not in schema:
35
+ schema[col_name] = col_type
36
+ else:
37
+ supertype = schema[col_name].supertype(col_type, for_inference=True)
38
+ if supertype is None:
39
+ raise excs.Error(
40
+ f'Could not infer type of column `{col_name}`; the value in row {n} '
41
+ f'does not match preceding type {schema[col_name]}: {value!r}\n'
42
+ 'Consider specifying the type explicitly in `schema_overrides`.'
43
+ )
44
+ schema[col_name] = supertype
45
+ else:
46
+ cols_with_nones.add(col_name)
47
+
48
+ entirely_none_cols = cols_with_nones - schema.keys()
49
+ if len(entirely_none_cols) > 0:
50
+ # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
51
+ # was not encountered in any row with a non-None value.
52
+ raise excs.Error(
53
+ f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
54
+ 'Consider specifying the type(s) explicitly in `schema_overrides`.'
55
+ )
56
+ return schema
57
+
58
+
59
def import_rows(
    tbl_path: str,
    rows: list[dict[str, Any]],
    *,
    schema_overrides: dict[str, Any] | None = None,
    primary_key: str | list[str] | None = None,
    num_retained_versions: int = 10,
    comment: str = '',
) -> pxt.Table:
    """
    Creates a new base table from a list of dictionaries of the form `{column_name: value, ...}`.

    Pixeltable will attempt to infer the table's schema from the supplied data, choosing for each
    column the most specific type that can represent all of its values.

    Each entry `(column_name, type)` in `schema_overrides`, if given, forces that column to the
    specified type; no type inference is attempted for such a column.

    All column types of the new table will be nullable unless explicitly specified as non-nullable in
    `schema_overrides`.

    Args:
        tbl_path: The qualified name of the table to create.
        rows: The list of dictionaries to import.
        schema_overrides: Explicit column types that take precedence over inference, as described above.
        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
        num_retained_versions: The number of retained versions of the table
            (see [`create_table()`][pixeltable.create_table]).
        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).

    Returns:
        A handle to the newly created [`Table`][pixeltable.Table].
    """
    # This is a thin wrapper: all of the work is delegated to `create_table()`, with `rows` as the source.
    create_args: dict[str, Any] = {
        'source': rows,
        'schema_overrides': schema_overrides,
        'primary_key': primary_key,
        'num_retained_versions': num_retained_versions,
        'comment': comment,
    }
    return pxt.create_table(tbl_path, **create_args)
101
+
102
+
103
def import_json(
    tbl_path: str,
    filepath_or_url: str,
    *,
    schema_overrides: dict[str, Any] | None = None,
    primary_key: str | list[str] | None = None,
    num_retained_versions: int = 10,
    comment: str = '',
    **kwargs: Any,
) -> pxt.Table:
    """
    Creates a new base table from a JSON file. This is a convenience method and is
    equivalent to calling `import_rows(tbl_path, json.loads(file_contents, **kwargs), ...)`, where
    `file_contents` is the contents of the specified `filepath_or_url`.

    Args:
        tbl_path: The qualified name of the table to create.
        filepath_or_url: The path or URL of the JSON file.
        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
            (see [`import_rows()`][pixeltable.io.import_rows]).
        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
        num_retained_versions: The number of retained versions of the table
            (see [`create_table()`][pixeltable.create_table]).
        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
        kwargs: Additional keyword arguments to pass to `json.loads`; they are handed to
            `create_table()` as `extra_args`.

    Returns:
        A handle to the newly created [`Table`][pixeltable.Table].
    """
    # The path/URL itself is passed as the source; reading and parsing happen inside `create_table()`.
    return pxt.create_table(
        tbl_path,
        source=filepath_or_url,
        schema_overrides=schema_overrides,
        primary_key=primary_key,
        num_retained_versions=num_retained_versions,
        comment=comment,
        extra_args=kwargs,
    )