pixeltable 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. pixeltable/__init__.py +2 -27
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -7
  4. pixeltable/catalog/column.py +6 -2
  5. pixeltable/catalog/dir.py +2 -1
  6. pixeltable/catalog/insertable_table.py +11 -0
  7. pixeltable/catalog/schema_object.py +2 -1
  8. pixeltable/catalog/table.py +27 -38
  9. pixeltable/catalog/table_version.py +19 -0
  10. pixeltable/catalog/table_version_path.py +7 -0
  11. pixeltable/catalog/view.py +31 -0
  12. pixeltable/dataframe.py +50 -7
  13. pixeltable/env.py +1 -1
  14. pixeltable/exceptions.py +20 -2
  15. pixeltable/exec/aggregation_node.py +14 -0
  16. pixeltable/exec/cache_prefetch_node.py +1 -1
  17. pixeltable/exec/expr_eval/evaluators.py +0 -4
  18. pixeltable/exec/expr_eval/expr_eval_node.py +1 -2
  19. pixeltable/exec/sql_node.py +3 -2
  20. pixeltable/exprs/column_ref.py +42 -17
  21. pixeltable/exprs/data_row.py +3 -0
  22. pixeltable/exprs/globals.py +1 -1
  23. pixeltable/exprs/literal.py +11 -1
  24. pixeltable/exprs/rowid_ref.py +4 -1
  25. pixeltable/exprs/similarity_expr.py +1 -1
  26. pixeltable/func/function.py +1 -1
  27. pixeltable/func/udf.py +1 -1
  28. pixeltable/functions/__init__.py +2 -0
  29. pixeltable/functions/anthropic.py +1 -1
  30. pixeltable/functions/bedrock.py +130 -0
  31. pixeltable/functions/date.py +185 -0
  32. pixeltable/functions/gemini.py +22 -20
  33. pixeltable/functions/globals.py +1 -16
  34. pixeltable/functions/huggingface.py +7 -6
  35. pixeltable/functions/image.py +15 -16
  36. pixeltable/functions/json.py +2 -1
  37. pixeltable/functions/math.py +40 -0
  38. pixeltable/functions/mistralai.py +3 -2
  39. pixeltable/functions/openai.py +9 -8
  40. pixeltable/functions/string.py +1 -2
  41. pixeltable/functions/together.py +4 -3
  42. pixeltable/functions/video.py +2 -2
  43. pixeltable/globals.py +26 -9
  44. pixeltable/io/datarows.py +4 -3
  45. pixeltable/io/hf_datasets.py +2 -2
  46. pixeltable/io/label_studio.py +17 -17
  47. pixeltable/io/pandas.py +29 -16
  48. pixeltable/io/parquet.py +2 -0
  49. pixeltable/io/table_data_conduit.py +8 -2
  50. pixeltable/metadata/__init__.py +1 -1
  51. pixeltable/metadata/converters/convert_19.py +2 -2
  52. pixeltable/metadata/converters/convert_34.py +21 -0
  53. pixeltable/metadata/notes.py +1 -0
  54. pixeltable/plan.py +12 -5
  55. pixeltable/share/__init__.py +1 -1
  56. pixeltable/share/packager.py +219 -119
  57. pixeltable/share/publish.py +61 -16
  58. pixeltable/store.py +45 -20
  59. pixeltable/type_system.py +46 -2
  60. pixeltable/utils/arrow.py +8 -2
  61. pixeltable/utils/pytorch.py +4 -0
  62. {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/METADATA +2 -4
  63. {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/RECORD +66 -63
  64. {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/WHEEL +1 -1
  65. {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/LICENSE +0 -0
  66. {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/entry_points.txt +0 -0
pixeltable/io/pandas.py CHANGED
@@ -8,6 +8,8 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
8
8
 
9
9
  import pixeltable as pxt
10
10
  import pixeltable.exceptions as excs
11
+ import pixeltable.type_system as ts
12
+ from pixeltable.env import Env
11
13
 
12
14
 
13
15
  def import_pandas(
@@ -119,15 +121,15 @@ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> No
119
121
 
120
122
 
121
123
  def df_infer_schema(
122
- df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
123
- ) -> dict[str, pxt.ColumnType]:
124
+ df: pd.DataFrame, schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
125
+ ) -> dict[str, ts.ColumnType]:
124
126
  """
125
127
  Infers a Pixeltable schema from a Pandas DataFrame.
126
128
 
127
129
  Returns:
128
130
  A Pixeltable schema: a dict mapping column names to Pixeltable column types.
129
131
  """
130
- pd_schema: dict[str, pxt.ColumnType] = {}
132
+ pd_schema: dict[str, ts.ColumnType] = {}
131
133
  for pd_name, pd_dtype in zip(df.columns, df.dtypes):
132
134
  if pd_name in schema_overrides:
133
135
  pxt_type = schema_overrides[pd_name]
@@ -138,7 +140,7 @@ def df_infer_schema(
138
140
  return pd_schema
139
141
 
140
142
 
141
- def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
143
+ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.ColumnType]:
142
144
  """
143
145
  Determines a pixeltable ColumnType from a pandas dtype
144
146
 
@@ -146,21 +148,21 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
146
148
  pd_dtype: A pandas dtype object
147
149
 
148
150
  Returns:
149
- pxt.ColumnType: A pixeltable ColumnType
151
+ ts.ColumnType: A pixeltable ColumnType
150
152
  """
151
153
  # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
152
154
  # compatible with NumPy dtypes
153
155
  # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
154
156
  if is_datetime64_any_dtype(pd_dtype):
155
- return pxt.TimestampType(nullable=nullable)
157
+ return ts.TimestampType(nullable=nullable)
156
158
  if is_extension_array_dtype(pd_dtype):
157
159
  return None
158
160
  # Most other pandas dtypes are directly NumPy compatible
159
161
  assert isinstance(pd_dtype, np.dtype)
160
- return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
162
+ return ts.ArrayType.from_np_dtype(pd_dtype, nullable)
161
163
 
162
164
 
163
- def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
165
+ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
164
166
  """
165
167
  Infers a Pixeltable type based on a pandas dtype.
166
168
  """
@@ -176,12 +178,12 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
176
178
 
177
179
  if len(data_col) == 0:
178
180
  # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
179
- return pxt.FloatType(nullable=nullable)
181
+ return ts.FloatType(nullable=nullable)
180
182
 
181
- inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
183
+ inferred_type = ts.ColumnType.infer_common_literal_type(data_col)
182
184
  if inferred_type is None:
183
185
  # Fallback on StringType if everything else fails
184
- return pxt.StringType(nullable=nullable)
186
+ return ts.StringType(nullable=nullable)
185
187
  else:
186
188
  return inferred_type.copy(nullable=nullable)
187
189
 
@@ -189,7 +191,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
189
191
 
190
192
 
191
193
  def _df_row_to_pxt_row(
192
- row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
194
+ row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: Optional[dict[str, str]]
193
195
  ) -> dict[str, Any]:
194
196
  """Convert a row to insertable format"""
195
197
  pxt_row: dict[str, Any] = {}
@@ -208,14 +210,25 @@ def _df_row_to_pxt_row(
208
210
  nval = bool(val)
209
211
  elif pxt_type.is_string_type():
210
212
  nval = str(val)
213
+ elif pxt_type.is_date_type():
214
+ if pd.isnull(val):
215
+ # pandas has the bespoke 'NaT' value for a missing timestamp
216
+ # This is not supported by postgres, and must be converted to None
217
+ nval = None
218
+ else:
219
+ nval = pd.Timestamp(val).date()
211
220
  elif pxt_type.is_timestamp_type():
212
221
  if pd.isnull(val):
213
- # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
214
- # much not-ok with it. (But if we convert it to None and then load out the
215
- # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
222
+ # pandas has the bespoke 'NaT' value for a missing timestamp
223
+ # This is not supported by postgres, and must be converted to None
216
224
  nval = None
217
225
  else:
218
- nval = pd.Timestamp(val).to_pydatetime()
226
+ tval = pd.Timestamp(val)
227
+ # pandas supports tz-aware and naive timestamps.
228
+ if tval.tz is None:
229
+ nval = pd.Timestamp(tval).tz_localize(tz=Env.get().default_time_zone)
230
+ else:
231
+ nval = tval.astimezone(Env.get().default_time_zone)
219
232
  else:
220
233
  nval = val
221
234
  pxt_row[pxt_name] = nval
pixeltable/io/parquet.py CHANGED
@@ -127,6 +127,8 @@ def export_parquet(
127
127
  length = 8
128
128
  elif col_type.is_bool_type():
129
129
  length = 1
130
+ elif col_type.is_date_type():
131
+ length = 4
130
132
  elif col_type.is_timestamp_type():
131
133
  val = val.astimezone(datetime.timezone.utc)
132
134
  length = 8
@@ -15,6 +15,7 @@ from pyarrow.parquet import ParquetDataset
15
15
 
16
16
  import pixeltable as pxt
17
17
  import pixeltable.exceptions as excs
18
+ import pixeltable.type_system as ts
18
19
  from pixeltable.io.pandas import _df_check_primary_key_values, _df_row_to_pxt_row, df_infer_schema
19
20
  from pixeltable.utils import parse_local_file_path
20
21
 
@@ -72,6 +73,11 @@ class TableDataConduit:
72
73
  def check_source_format(self) -> None:
73
74
  assert self.source_format is None or TableDataConduitFormat.is_valid(self.source_format)
74
75
 
76
+ def __post_init__(self) -> None:
77
+ """If no extra_fields were provided, initialize to empty dict"""
78
+ if self.extra_fields is None:
79
+ self.extra_fields = {}
80
+
75
81
  @classmethod
76
82
  def is_rowdata_structure(cls, d: TableDataSource) -> bool:
77
83
  if not isinstance(d, list) or len(d) == 0:
@@ -83,7 +89,7 @@ class TableDataConduit:
83
89
 
84
90
  def normalize_pxt_schema_types(self) -> None:
85
91
  for name, coltype in self.pxt_schema.items():
86
- self.pxt_schema[name] = pxt.ColumnType.normalize_type(coltype)
92
+ self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
87
93
 
88
94
  def infer_schema(self) -> dict[str, Any]:
89
95
  raise NotImplementedError
@@ -393,7 +399,7 @@ class HFTableDataConduit(TableDataConduit):
393
399
  f'Column name `{self.column_name_for_split}` already exists in dataset schema;'
394
400
  f'provide a different `column_name_for_split`'
395
401
  )
396
- self.src_schema[self.column_name_for_split] = pxt.StringType(nullable=True)
402
+ self.src_schema[self.column_name_for_split] = ts.StringType(nullable=True)
397
403
 
398
404
  inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
399
405
  self.src_schema, self.src_pk, self.src_schema_overrides, True
@@ -16,7 +16,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
16
16
 
17
17
 
18
18
  # current version of the metadata; this is incremented whenever the metadata schema changes
19
- VERSION = 34
19
+ VERSION = 35
20
20
 
21
21
 
22
22
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -3,7 +3,7 @@ from typing import Any, Optional
3
3
 
4
4
  import sqlalchemy as sql
5
5
 
6
- import pixeltable as pxt
6
+ import pixeltable.type_system as ts
7
7
  from pixeltable.metadata import register_converter, schema
8
8
  from pixeltable.metadata.converters.util import convert_table_md
9
9
 
@@ -34,7 +34,7 @@ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
34
34
  # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
35
35
  # We convert it to an aware datetime, stored in UTC.
36
36
  assert v['_classname'] == 'Literal'
37
- assert v['val_t'] == pxt.ColumnType.Type.TIMESTAMP.name
37
+ assert v['val_t'] == ts.ColumnType.Type.TIMESTAMP.name
38
38
  assert isinstance(v['val'], str)
39
39
  dt = datetime.datetime.fromisoformat(v['val'])
40
40
  assert dt.tzinfo is None # In version 19 all timestamps are naive
@@ -0,0 +1,21 @@
1
+ from typing import Any, Optional
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
+ @register_converter(version=34)
10
+ def _(engine: sql.engine.Engine) -> None:
11
+ convert_table_md(engine, substitution_fn=__substitute_md)
12
+
13
+
14
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
15
+ if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
16
+ # Add reference_tbl to ColumnRef; for historical metadata it is always equal to tbl
17
+ assert 'reference_tbl' not in v
18
+ v['reference_tbl'] = None
19
+ return k, v
20
+
21
+ return None
@@ -2,6 +2,7 @@
2
2
  # rather than as a comment, so that the existence of a description can be enforced by
3
3
  # the unit tests when new versions are added.
4
4
  VERSION_NOTES = {
5
+ 35: 'Track reference_tbl in ColumnRef',
5
6
  34: 'Set default value for is_pk field in column metadata to False',
6
7
  33: 'Add is_replica field to table metadata',
7
8
  32: 'Add the lock_dummy BIGINT column to the dirs table',
pixeltable/plan.py CHANGED
@@ -635,8 +635,8 @@ class Planner:
635
635
  raise excs.Error(f'Join predicate {join_clause.join_predicate} not expressible in SQL')
636
636
 
637
637
  @classmethod
638
- def _verify_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> None:
639
- """Verify that the various ordering requirements don't conflict"""
638
+ def _create_combined_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> Optional[OrderByClause]:
639
+ """Verify that the various ordering requirements don't conflict and return a combined ordering"""
640
640
  ob_clauses: list[OrderByClause] = [analyzer.order_by_clause.copy()]
641
641
 
642
642
  if verify_agg:
@@ -652,8 +652,11 @@ class Planner:
652
652
  OrderByItem(e, True) for e in fn_call.get_agg_order_by()
653
653
  ]
654
654
  ob_clauses.append(ordering)
655
- if len(ob_clauses) <= 1:
656
- return
655
+
656
+ if len(ob_clauses) == 0:
657
+ return None
658
+ elif len(ob_clauses) == 1:
659
+ return ob_clauses[0]
657
660
 
658
661
  combined_ordering = ob_clauses[0]
659
662
  for ordering in ob_clauses[1:]:
@@ -664,6 +667,7 @@ class Planner:
664
667
  f'{print_order_by_clause(combined_ordering)} vs {print_order_by_clause(ordering)}'
665
668
  )
666
669
  combined_ordering = combined
670
+ return combined_ordering
667
671
 
668
672
  @classmethod
669
673
  def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
@@ -761,7 +765,7 @@ class Planner:
761
765
  analyzer.window_fn_calls
762
766
  )
763
767
  ctx = exec.ExecContext(row_builder)
764
- cls._verify_ordering(analyzer, verify_agg=is_python_agg)
768
+ combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
765
769
  cls._verify_join_clauses(analyzer)
766
770
 
767
771
  # materialized with SQL table scans (ie, single-table SELECT statements):
@@ -859,6 +863,9 @@ class Planner:
859
863
  row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
860
864
  )
861
865
  else:
866
+ input_sql_node = plan.get_node(exec.SqlNode)
867
+ assert combined_ordering is not None
868
+ input_sql_node.set_order_by(combined_ordering)
862
869
  plan = exec.AggregationNode(
863
870
  tbl.tbl_version,
864
871
  row_builder,
@@ -1,3 +1,3 @@
1
1
  # ruff: noqa: F401
2
2
 
3
- from .publish import publish_snapshot
3
+ from .publish import pull_replica, push_replica