pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,44 @@
1
+ from typing import Any
2
+
3
+ import numpy as np
4
+ import sqlalchemy as sql
5
+
6
+ from pixeltable import type_system as ts
7
+ from pixeltable.metadata import register_converter
8
+ from pixeltable.metadata.converters.util import convert_table_md
9
+
10
+
11
@register_converter(version=43)
def _(engine: sql.engine.Engine) -> None:
    """Migrate stored ArrayType metadata: the legacy `dtype` (a pxt Type ID) is replaced by a numpy dtype."""
    # The per-entry rewrite is delegated to _substitution_fn; convert_table_md drives it over the table metadata.
    convert_table_md(engine=engine, substitution_fn=_substitution_fn)
15
+
16
+
17
+ def _substitution_fn(key: str | None, value: Any) -> tuple[str | None, Any] | None:
18
+ if not isinstance(value, dict):
19
+ return None
20
+ if value.get('_classname', None) != 'ArrayType':
21
+ return None
22
+ if 'numpy_dtype' in value:
23
+ return None
24
+ assert 'dtype' in value
25
+
26
+ legacy_dtype_val = value['dtype']
27
+ new_dtype: np.dtype | None
28
+ if legacy_dtype_val is None:
29
+ new_dtype = None
30
+ else:
31
+ legacy_dtype = ts.ColumnType.Type(legacy_dtype_val)
32
+ new_dtype = ts.ArrayType.pxt_dtype_to_numpy_dtype.get(legacy_dtype, None)
33
+ if new_dtype is None:
34
+ raise ValueError(f'Unrecognized dtype: {legacy_dtype_val} ({legacy_dtype}) in {key}, {value}')
35
+
36
+ del value['dtype']
37
+ if new_dtype is None:
38
+ value['numpy_dtype'] = None
39
+ elif new_dtype == np.str_:
40
+ # str(np.str_) would be something like '<U'
41
+ value['numpy_dtype'] = 'str'
42
+ else:
43
+ value['numpy_dtype'] = str(new_dtype)
44
+ return key, value
@@ -1,21 +1,22 @@
1
1
  import copy
2
2
  import logging
3
- from typing import Any, Callable, Optional
3
+ from typing import Any, Callable, TypeVar
4
4
  from uuid import UUID
5
5
 
6
6
  import sqlalchemy as sql
7
7
 
8
- from pixeltable.metadata.schema import Function, Table, TableSchemaVersion, TableVersion
8
+ from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
9
9
 
10
10
  __logger = logging.getLogger('pixeltable')
11
11
 
12
12
 
13
13
  def convert_table_md(
14
14
  engine: sql.engine.Engine,
15
- table_md_updater: Optional[Callable[[dict, UUID], None]] = None,
16
- column_md_updater: Optional[Callable[[dict], None]] = None,
17
- external_store_md_updater: Optional[Callable[[dict], None]] = None,
18
- substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None,
15
+ table_md_updater: Callable[[dict, UUID], None] | None = None,
16
+ column_md_updater: Callable[[dict], None] | None = None,
17
+ external_store_md_updater: Callable[[dict], None] | None = None,
18
+ substitution_fn: Callable[[str | None, Any], tuple[str | None, Any] | None] | None = None,
19
+ table_modifier: Callable[[sql.Connection, UUID, dict, dict], None] | None = None,
19
20
  ) -> None:
20
21
  """
21
22
  Converts schema.TableMd dicts based on the specified conversion functions.
@@ -33,9 +34,10 @@ def convert_table_md(
33
34
  the original entry will be replaced, and the traversal will continue with `v'`.
34
35
  """
35
36
  with engine.begin() as conn:
36
- for row in conn.execute(sql.select(Table)):
37
+ # avoid a SELECT * here, which breaks when we add new columns to Table
38
+ for row in conn.execute(sql.select(Table.id, Table.md)):
37
39
  tbl_id = row[0]
38
- table_md = row[2]
40
+ table_md = row[1]
39
41
  assert isinstance(table_md, dict)
40
42
  updated_table_md = copy.deepcopy(table_md)
41
43
  if table_md_updater is not None:
@@ -49,6 +51,8 @@ def convert_table_md(
49
51
  if updated_table_md != table_md:
50
52
  __logger.info(f'Updating schema for table: {tbl_id}')
51
53
  conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
54
+ if table_modifier is not None:
55
+ table_modifier(conn, tbl_id, table_md, updated_table_md)
52
56
 
53
57
  for row in conn.execute(sql.select(Function)):
54
58
  fn_id = row[0]
@@ -76,9 +80,7 @@ def __update_external_store_md(table_md: dict, external_store_md_updater: Callab
76
80
  external_store_md_updater(store_md)
77
81
 
78
82
 
79
- def __substitute_md_rec(
80
- md: Any, substitution_fn: Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]
81
- ) -> Any:
83
+ def __substitute_md_rec(md: Any, substitution_fn: Callable[[str | None, Any], tuple[str | None, Any] | None]) -> Any:
82
84
  if isinstance(md, dict):
83
85
  updated_dict: dict[str, Any] = {}
84
86
  for k, v in md.items():
@@ -106,8 +108,8 @@ def __substitute_md_rec(
106
108
 
107
109
  def convert_table_schema_version_md(
108
110
  engine: sql.engine.Engine,
109
- table_schema_version_md_updater: Optional[Callable[[dict], None]] = None,
110
- schema_column_updater: Optional[Callable[[dict], None]] = None,
111
+ table_schema_version_md_updater: Callable[[dict], None] | None = None,
112
+ schema_column_updater: Callable[[dict], None] | None = None,
111
113
  ) -> None:
112
114
  """
113
115
  Converts schema.TableSchemaVersionMd dicts based on the specified conversion functions.
@@ -145,26 +147,13 @@ def __update_schema_column(table_schema_version_md: dict, schema_column_updater:
145
147
  schema_column_updater(schema_col)
146
148
 
147
149
 
148
- def convert_table_record(engine: sql.engine.Engine, table_record_updater: Optional[Callable[[Table], None]]) -> None:
149
- with sql.orm.Session(engine, future=True) as session:
150
- for record in session.query(Table).all():
151
- table_record_updater(record)
152
- session.commit()
153
-
154
-
155
- def convert_table_version_record(
156
- engine: sql.engine.Engine, table_version_record_updater: Optional[Callable[[TableVersion], None]]
157
- ) -> None:
158
- with sql.orm.Session(engine, future=True) as session:
159
- for record in session.query(TableVersion).all():
160
- table_version_record_updater(record)
161
- session.commit()
150
+ T = TypeVar('T')
162
151
 
163
152
 
164
- def convert_table_schema_version_record(
165
- engine: sql.engine.Engine, table_schema_version_record_updater: Optional[Callable[[TableSchemaVersion], None]]
153
+ def convert_sql_table_record(
154
+ schema: type[T], engine: sql.engine.Engine, record_updater: Callable[[T], None] | None
166
155
  ) -> None:
167
156
  with sql.orm.Session(engine, future=True) as session:
168
- for record in session.query(TableSchemaVersion).all():
169
- table_schema_version_record_updater(record)
157
+ for record in session.query(schema).all():
158
+ record_updater(record)
170
159
  session.commit()
@@ -2,6 +2,15 @@
2
2
  # rather than as a comment, so that the existence of a description can be enforced by
3
3
  # the unit tests when new versions are added.
4
4
  VERSION_NOTES = {
5
+ 44: 'ArrayType dtype migration from pxt types to numpy dtypes',
6
+ 43: 'Changing tables.dir_id to nullable',
7
+ 42: 'Add additional_md columns to metadata tables',
8
+ 41: 'Cellmd columns for array and json columns',
9
+ 40: 'Convert error property columns to cellmd columns',
10
+ 39: 'ColumnHandles in external stores',
11
+ 38: 'Added TableMd.view_sn',
12
+ 37: 'Add support for the sample() method on DataFrames',
13
+ 36: 'Added Table.lock_dummy',
5
14
  35: 'Track reference_tbl in ColumnRef',
6
15
  34: 'Set default value for is_pk field in column metadata to False',
7
16
  33: 'Add is_replica field to table metadata',
@@ -1,13 +1,17 @@
1
1
  import dataclasses
2
+ import types
2
3
  import typing
3
4
  import uuid
4
- from typing import Any, NamedTuple, Optional, TypeVar, Union, get_type_hints
5
+ from enum import Enum
6
+ from typing import Any, TypeVar, Union, get_type_hints
5
7
 
6
8
  import sqlalchemy as sql
7
9
  from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
8
10
  from sqlalchemy.dialects.postgresql import JSONB, UUID
9
11
  from sqlalchemy.orm.decl_api import DeclarativeMeta
10
12
 
13
+ from ..catalog.update_status import UpdateStatus
14
+
11
15
  # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
12
16
  # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
13
17
  # outside of the module in a typesafe way.
@@ -18,17 +22,17 @@ base_metadata = Base.metadata
18
22
  T = TypeVar('T')
19
23
 
20
24
 
21
- def md_from_dict(data_class_type: type[T], data: Any) -> T:
25
+ def md_from_dict(type_: type[T], data: Any) -> T:
22
26
  """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
23
- if dataclasses.is_dataclass(data_class_type):
24
- fieldtypes = get_type_hints(data_class_type)
25
- return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data}) # type: ignore[return-value]
27
+ if dataclasses.is_dataclass(type_):
28
+ fieldtypes = get_type_hints(type_)
29
+ return type_(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
26
30
 
27
- origin = typing.get_origin(data_class_type)
31
+ origin = typing.get_origin(type_)
28
32
  if origin is not None:
29
- type_args = typing.get_args(data_class_type)
30
- if origin is Union and type(None) in type_args:
31
- # Handling Optional types
33
+ type_args = typing.get_args(type_)
34
+ if (origin is Union or origin is types.UnionType) and type(None) in type_args:
35
+ # handling Optional[T] (typing.Union) as well as T | None (types.UnionType)
32
36
  non_none_args = [arg for arg in type_args if arg is not type(None)]
33
37
  assert len(non_none_args) == 1
34
38
  return md_from_dict(non_none_args[0], data) if data is not None else None
@@ -42,10 +46,18 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
42
46
  return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(type_args, data)) # type: ignore[return-value]
43
47
  else:
44
48
  raise AssertionError(origin)
49
+ elif isinstance(type_, type) and issubclass(type_, Enum):
50
+ return type_(data)
45
51
  else:
46
52
  return data
47
53
 
48
54
 
55
def _md_dict_factory(data: list[tuple[str, Any]]) -> dict:
    """dict_factory for dataclasses.asdict() on <>Md instances.

    Builds the dict from the (field name, value) pairs, storing Enum members as their `.value`.
    """
    serialized = {}
    for field_name, field_val in data:
        if isinstance(field_val, Enum):
            field_val = field_val.value
        serialized[field_name] = field_val
    return serialized
59
+
60
+
49
61
  # structure of the stored metadata:
50
62
  # - each schema entity that grows somehow proportionally to the data (# of output_rows, total insert operations,
51
63
  # number of schema changes) gets its own table
@@ -65,6 +77,7 @@ class SystemInfo(Base):
65
77
  """A single-row table that contains system-wide metadata."""
66
78
 
67
79
  __tablename__ = 'systeminfo'
80
+
68
81
  dummy = sql.Column(Integer, primary_key=True, default=0, nullable=False)
69
82
  md = sql.Column(JSONB, nullable=False) # SystemInfoMd
70
83
 
@@ -72,8 +85,8 @@ class SystemInfo(Base):
72
85
  @dataclasses.dataclass
73
86
  class DirMd:
74
87
  name: str
75
- user: Optional[str]
76
- additional_md: dict[str, Any]
88
+ user: str | None
89
+ additional_md: dict[str, Any] # deprecated
77
90
 
78
91
 
79
92
  class Dir(Base):
@@ -84,7 +97,9 @@ class Dir(Base):
84
97
  )
85
98
  parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
86
99
  md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False) # DirMd
87
- # This field is updated to synchronize database operations across multiple sessions
100
+ additional_md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False, default=dict)
101
+
102
+ # used to force acquisition of an X-lock via an Update stmt
88
103
  lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
89
104
 
90
105
 
@@ -100,17 +115,20 @@ class ColumnMd:
100
115
 
101
116
  id: int
102
117
  schema_version_add: int
103
- schema_version_drop: Optional[int]
118
+ schema_version_drop: int | None
104
119
  col_type: dict
105
120
 
106
121
  # if True, is part of the primary key
107
122
  is_pk: bool
108
123
 
109
124
  # if set, this is a computed column
110
- value_expr: Optional[dict]
125
+ value_expr: dict | None
111
126
 
112
127
  # if True, the column is present in the stored table
113
- stored: Optional[bool]
128
+ stored: bool | None
129
+
130
+ # If present, the URI for the destination for column values
131
+ destination: str | None = None
114
132
 
115
133
 
116
134
  @dataclasses.dataclass
@@ -126,13 +144,13 @@ class IndexMd:
126
144
  index_val_col_id: int # column holding the values to be indexed
127
145
  index_val_undo_col_id: int # column holding index values for deleted rows
128
146
  schema_version_add: int
129
- schema_version_drop: Optional[int]
147
+ schema_version_drop: int | None
130
148
  class_fqn: str
131
149
  init_args: dict[str, Any]
132
150
 
133
151
 
134
152
  # a stored table version path is a list of (table id as str, effective table version)
135
- TableVersionPath = list[tuple[str, Optional[int]]]
153
+ TableVersionPath = list[tuple[str, int | None]]
136
154
 
137
155
 
138
156
  @dataclasses.dataclass
@@ -144,13 +162,36 @@ class ViewMd:
144
162
  base_versions: TableVersionPath
145
163
 
146
164
  # filter predicate applied to the base table; view-only
147
- predicate: Optional[dict[str, Any]]
165
+ predicate: dict[str, Any] | None
166
+
167
+ # sampling predicate applied to the base table; view-only
168
+ sample_clause: dict[str, Any] | None
148
169
 
149
170
  # ComponentIterator subclass; only for component views
150
- iterator_class_fqn: Optional[str]
171
+ iterator_class_fqn: str | None
151
172
 
152
173
  # args to pass to the iterator class constructor; only for component views
153
- iterator_args: Optional[dict[str, Any]]
174
+ iterator_args: dict[str, Any] | None
175
+
176
+
177
class TableState(Enum):
    """Operational state of a table."""

    LIVE = 0
    # pending table ops are being finalized
    ROLLFORWARD = 1
    # pending table ops are being rolled back
    ROLLBACK = 2
183
+
184
+
185
class TableStatement(Enum):
    """
    Top-level DDL/DML operation that is currently being executed.

    This corresponds to a statement in SQL; it is not a TableOp.
    """

    # table/view lifecycle
    CREATE_TABLE = 0
    CREATE_VIEW = 1
    DROP_TABLE = 2

    # column changes
    ADD_COLUMNS = 3
    DROP_COLUMNS = 4

    # index changes
    ADD_INDEX = 5
    DROP_INDEX = 6
154
195
 
155
196
 
156
197
  @dataclasses.dataclass
@@ -159,7 +200,7 @@ class TableMd:
159
200
  name: str
160
201
  is_replica: bool
161
202
 
162
- user: Optional[str]
203
+ user: str | None
163
204
 
164
205
  # monotonically increasing w/in Table for both data and schema changes, starting at 0
165
206
  current_version: int
@@ -173,14 +214,52 @@ class TableMd:
173
214
  # - every row is assigned a unique and immutable rowid on insertion
174
215
  next_row_id: int
175
216
 
217
+ # sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
218
+ # - incremented for each add/drop of a mutable view
219
+ # - only maintained for mutable tables
220
+ # TODO: replace with mutable_views: list[UUID] to help with debugging
221
+ view_sn: int
222
+
176
223
  # Metadata format for external stores:
177
224
  # {'class': 'pixeltable.io.label_studio.LabelStudioProject', 'md': {'project_id': 3}}
178
225
  external_stores: list[dict[str, Any]]
179
226
 
180
227
  column_md: dict[int, ColumnMd] # col_id -> ColumnMd
181
228
  index_md: dict[int, IndexMd] # index_id -> IndexMd
182
- view_md: Optional[ViewMd]
183
- additional_md: dict[str, Any]
229
+ view_md: ViewMd | None
230
+ # TODO: Remove additional_md from this and other Md dataclasses (and switch to using the separate additional_md
231
+ # column in all cases)
232
+ additional_md: dict[str, Any] # deprecated
233
+
234
+ # deprecated
235
+ has_pending_ops: bool = False
236
+
237
+ tbl_state: TableState = TableState.LIVE
238
+ pending_stmt: TableStatement | None = None
239
+
240
+ @property
241
+ def is_snapshot(self) -> bool:
242
+ return self.view_md is not None and self.view_md.is_snapshot
243
+
244
+ @property
245
+ def is_mutable(self) -> bool:
246
+ return not self.is_snapshot and not self.is_replica
247
+
248
+ @property
249
+ def is_pure_snapshot(self) -> bool:
250
+ return (
251
+ self.view_md is not None
252
+ and self.view_md.is_snapshot
253
+ and self.view_md.sample_clause is None
254
+ and self.view_md.predicate is None
255
+ and len(self.column_md) == 0
256
+ )
257
+
258
+ @property
259
+ def ancestors(self) -> TableVersionPath:
260
+ if self.view_md is None:
261
+ return []
262
+ return self.view_md.base_versions
184
263
 
185
264
 
186
265
  class Table(Base):
@@ -190,6 +269,8 @@ class Table(Base):
190
269
  Views are in essence a subclass of tables, because they also store materialized columns. The differences are:
191
270
  - views have a base, which is either a (live) table or a snapshot
192
271
  - views can have a filter predicate
272
+
273
+ dir_id: NULL for dropped tables
193
274
  """
194
275
 
195
276
  __tablename__ = 'tables'
@@ -197,26 +278,37 @@ class Table(Base):
197
278
  MAX_VERSION = 9223372036854775807 # 2^63 - 1
198
279
 
199
280
  id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, nullable=False)
200
- dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
281
+ dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
201
282
  md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False) # TableMd
283
+ additional_md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False, default=dict)
284
+
285
+ # used to force acquisition of an X-lock via an Update stmt
286
+ lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
202
287
 
203
288
 
204
289
  @dataclasses.dataclass
205
- class TableVersionMd:
290
+ class VersionMd:
206
291
  tbl_id: str # uuid.UUID
207
292
  created_at: float # time.time()
208
293
  version: int
209
294
  schema_version: int
210
- additional_md: dict[str, Any]
295
+ user: str | None = None # User that created this version
296
+ update_status: UpdateStatus | None = None # UpdateStatus of the change that created this version
297
+ # A version fragment cannot be queried or instantiated via get_table(). A fragment represents a version of a
298
+ # replica table that has incomplete data, and exists only to provide base table support for a dependent view.
299
+ is_fragment: bool = False
300
+ additional_md: dict[str, Any] = dataclasses.field(default_factory=dict) # deprecated
211
301
 
212
302
 
213
303
  class TableVersion(Base):
214
304
  __tablename__ = 'tableversions'
305
+
215
306
  tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
216
307
  UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
217
308
  )
218
309
  version: orm.Mapped[int] = orm.mapped_column(BigInteger, primary_key=True, nullable=False)
219
310
  md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)
311
+ additional_md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False, default=dict)
220
312
 
221
313
 
222
314
  @dataclasses.dataclass
@@ -230,18 +322,18 @@ class SchemaColumn:
230
322
 
231
323
  # media validation strategy of this particular media column; if not set, TableMd.media_validation applies
232
324
  # stores column.MediaValidation.name.lower()
233
- media_validation: Optional[str]
325
+ media_validation: str | None
234
326
 
235
327
 
236
328
  @dataclasses.dataclass
237
- class TableSchemaVersionMd:
329
+ class SchemaVersionMd:
238
330
  """
239
331
  Records all versioned table metadata.
240
332
  """
241
333
 
242
334
  tbl_id: str # uuid.UUID
243
335
  schema_version: int
244
- preceding_schema_version: Optional[int]
336
+ preceding_schema_version: int | None
245
337
  columns: dict[int, SchemaColumn] # col_id -> SchemaColumn
246
338
  num_retained_versions: int
247
339
  comment: str
@@ -249,7 +341,7 @@ class TableSchemaVersionMd:
249
341
  # default validation strategy for any media column of this table
250
342
  # stores column.MediaValidation.name.lower()
251
343
  media_validation: str
252
- additional_md: dict[str, Any]
344
+ additional_md: dict[str, Any] # deprecated
253
345
 
254
346
 
255
347
  # versioning: each table schema change results in a new record
@@ -261,6 +353,23 @@ class TableSchemaVersion(Base):
261
353
  )
262
354
  schema_version: orm.Mapped[int] = orm.mapped_column(BigInteger, primary_key=True, nullable=False)
263
355
  md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False) # TableSchemaVersionMd
356
+ additional_md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False, default=dict)
357
+
358
+
359
class PendingTableOp(Base):
    """
    A table operation that still has to be completed before its table can be used.

    The pending operations of a table must be completed in order of increasing sequence number
    (presumably the value stored in `op_sn`).
    """

    __tablename__ = 'pendingtableops'

    # composite primary key: (table id, operation sequence number)
    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
    )
    # catalog.TableOp.op_sn
    op_sn: orm.Mapped[int] = orm.mapped_column(Integer, primary_key=True, nullable=False)
    # catalog.TableOp
    op: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)
264
373
 
265
374
 
266
375
  @dataclasses.dataclass
@@ -288,26 +397,4 @@ class Function(Base):
288
397
  )
289
398
  dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
290
399
  md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False) # FunctionMd
291
- binary_obj: orm.Mapped[Optional[bytes]] = orm.mapped_column(LargeBinary, nullable=True)
292
-
293
-
294
- class FullTableMd(NamedTuple):
295
- tbl_md: TableMd
296
- version_md: TableVersionMd
297
- schema_version_md: TableSchemaVersionMd
298
-
299
- def as_dict(self) -> dict[str, Any]:
300
- return {
301
- 'table_id': self.tbl_md.tbl_id,
302
- 'table_md': dataclasses.asdict(self.tbl_md),
303
- 'table_version_md': dataclasses.asdict(self.version_md),
304
- 'table_schema_version_md': dataclasses.asdict(self.schema_version_md),
305
- }
306
-
307
- @classmethod
308
- def from_dict(cls, data_dict: dict[str, Any]) -> 'FullTableMd':
309
- return FullTableMd(
310
- tbl_md=md_from_dict(TableMd, data_dict['table_md']),
311
- version_md=md_from_dict(TableVersionMd, data_dict['table_version_md']),
312
- schema_version_md=md_from_dict(TableSchemaVersionMd, data_dict['table_schema_version_md']),
313
- )
400
+ binary_obj: orm.Mapped[bytes | None] = orm.mapped_column(LargeBinary, nullable=True)
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from pixeltable.metadata import schema
4
+
5
+
6
class MetadataUtils:
    """Helpers that summarize how a table's column metadata changed from one version to the next."""

    @classmethod
    def _diff_md(
        cls, old_md: dict[int, schema.SchemaColumn] | None, new_md: dict[int, schema.SchemaColumn] | None
    ) -> str:
        """Return a string reporting the differences between two column-metadata dictionaries

        Both dictionaries map a column id to an object with a `name` attribute; only ids and names are compared.

        Results are formatted as follows:
        - If `old_md` is `None`, returns 'Initial Version'.
        - If `old_md` and `new_md` are the same, returns an empty string.
        - If there are additions, renames, or deletions, returns a string summarizing the changes.
        """
        assert new_md is not None
        if old_md is None:
            return 'Initial Version'
        if old_md == new_md:
            return ''

        added = [col.name for col_id, col in new_md.items() if col_id not in old_md]
        renamed = [
            f'{old_md[col_id].name!r} to {col.name!r}'
            for col_id, col in new_md.items()
            if col_id in old_md and old_md[col_id].name != col.name
        ]
        deleted = [col.name for col_id, col in old_md.items() if col_id not in new_md]

        # Sections appear in a fixed order; with no sections at all the join yields the empty string.
        sections = []
        if added:
            sections.append('Added: ' + ', '.join(added))
        if renamed:
            sections.append('Renamed: ' + ', '.join(renamed))
        if deleted:
            sections.append('Deleted: ' + ', '.join(deleted))
        return ', '.join(sections)

    @classmethod
    def _create_md_change_dict(cls, md_list: list[tuple[int, dict[int, schema.SchemaColumn]]] | None) -> dict[int, str]:
        """Return a dictionary of schema changes by version

        Args:
            md_list: a list of tuples, each containing a version number and a metadata dictionary.
                The list is sorted in place by version number.

        Returns:
            A dictionary mapping a version number to the description of what changed in that version.
            Versions without changes are omitted. If a version number occurs more than once, only its
            first occurrence (in the order supplied) is considered.
        """
        changes: dict[int, str] = {}
        if not md_list:
            return changes

        # Sort the list in place by version number only. Comparing the full tuples would fall back to comparing
        # the metadata dicts whenever two versions are equal, and dicts do not support ordering (TypeError).
        md_list.sort(key=lambda entry: entry[0])

        first_retrieved_version = md_list[0][0]
        if first_retrieved_version == 0:
            # version 0 has no predecessor and is reported as the initial version
            prev_md, prev_ver, start = None, -1, 0
        else:
            # the earliest retrieved version only serves as the baseline for the following ones
            prev_md, prev_ver, start = md_list[0][1], first_retrieved_version, 1

        for ver, curr_md in md_list[start:]:
            if ver == prev_ver:
                continue
            assert ver > prev_ver
            summary = cls._diff_md(prev_md, curr_md)
            if summary != '':
                changes[ver] = summary
            # advance both parts of the baseline so that the duplicate check above applies to every version
            prev_md, prev_ver = curr_md, ver
        return changes
@@ -0,0 +1,3 @@
1
+ from .mypy_plugin import plugin
2
+
3
+ __all__ = ['plugin']