pixeltable 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (50) hide show
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -7
  4. pixeltable/catalog/column.py +6 -2
  5. pixeltable/catalog/dir.py +2 -1
  6. pixeltable/catalog/insertable_table.py +1 -1
  7. pixeltable/catalog/schema_object.py +2 -1
  8. pixeltable/catalog/table.py +12 -8
  9. pixeltable/catalog/table_version.py +21 -0
  10. pixeltable/catalog/view.py +3 -3
  11. pixeltable/dataframe.py +48 -5
  12. pixeltable/env.py +1 -1
  13. pixeltable/exec/aggregation_node.py +14 -0
  14. pixeltable/exec/cache_prefetch_node.py +1 -1
  15. pixeltable/exec/expr_eval/expr_eval_node.py +1 -1
  16. pixeltable/exprs/column_ref.py +42 -17
  17. pixeltable/exprs/data_row.py +3 -0
  18. pixeltable/exprs/globals.py +1 -1
  19. pixeltable/exprs/literal.py +11 -1
  20. pixeltable/exprs/rowid_ref.py +4 -1
  21. pixeltable/exprs/similarity_expr.py +1 -1
  22. pixeltable/func/function.py +1 -1
  23. pixeltable/functions/__init__.py +1 -0
  24. pixeltable/functions/date.py +185 -0
  25. pixeltable/functions/gemini.py +184 -49
  26. pixeltable/functions/globals.py +1 -16
  27. pixeltable/functions/json.py +2 -1
  28. pixeltable/functions/math.py +103 -0
  29. pixeltable/functions/string.py +1 -2
  30. pixeltable/functions/video.py +2 -2
  31. pixeltable/globals.py +26 -9
  32. pixeltable/io/hf_datasets.py +2 -2
  33. pixeltable/io/pandas.py +16 -4
  34. pixeltable/io/parquet.py +4 -2
  35. pixeltable/metadata/__init__.py +1 -1
  36. pixeltable/metadata/converters/convert_34.py +21 -0
  37. pixeltable/metadata/notes.py +1 -0
  38. pixeltable/plan.py +12 -5
  39. pixeltable/share/__init__.py +1 -1
  40. pixeltable/share/packager.py +397 -120
  41. pixeltable/share/publish.py +61 -16
  42. pixeltable/store.py +57 -20
  43. pixeltable/type_system.py +46 -2
  44. pixeltable/utils/arrow.py +8 -2
  45. pixeltable/utils/pytorch.py +4 -0
  46. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/METADATA +2 -4
  47. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/RECORD +50 -48
  48. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/LICENSE +0 -0
  49. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/WHEEL +0 -0
  50. {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/entry_points.txt +0 -0
@@ -1,3 +1,15 @@
1
+ """
2
+ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for mathematical operations.
3
+
4
+ Example:
5
+ ```python
6
+ import pixeltable as pxt
7
+
8
+ t = pxt.get_table(...)
9
+ t.select(t.float_col.floor()).collect()
10
+ ```
11
+ """
12
+
1
13
  import builtins
2
14
  import math
3
15
  from typing import Optional
@@ -10,6 +22,11 @@ from pixeltable.utils.code import local_public_names
10
22
 
11
23
  @pxt.udf(is_method=True)
12
24
  def abs(self: float) -> float:
25
+ """
26
+ Return the absolute value of the given number.
27
+
28
+ Equivalent to Python [`builtins.abs()`](https://docs.python.org/3/library/functions.html#abs).
29
+ """
13
30
  return builtins.abs(self)
14
31
 
15
32
 
@@ -20,6 +37,14 @@ def _(self: sql.ColumnElement) -> sql.ColumnElement:
20
37
 
21
38
  @pxt.udf(is_method=True)
22
39
  def ceil(self: float) -> float:
40
+ """
41
+ Return the ceiling of the given number.
42
+
43
+ Equivalent to Python [`float(math.ceil(self))`](https://docs.python.org/3/library/math.html#math.ceil) if `self`
44
+ is finite, or `self` itself if `self` is infinite. (This is slightly different from the default behavior of
45
+ `math.ceil(self)`, which always returns an `int` and raises an error if `self` is infinite. The behavior in
46
+ Pixeltable generalizes the Python operator and is chosen to align with the SQL standard.)
47
+ """
23
48
  # This ensures the same behavior as SQL
24
49
  if math.isfinite(self):
25
50
  return float(math.ceil(self))
@@ -34,6 +59,14 @@ def _(self: sql.ColumnElement) -> sql.ColumnElement:
34
59
 
35
60
  @pxt.udf(is_method=True)
36
61
  def floor(self: float) -> float:
62
+ """
63
+ Return the floor of the given number.
64
+
65
+ Equivalent to Python [`float(math.floor(self))`](https://docs.python.org/3/library/math.html#math.floor) if `self`
66
+ is finite, or `self` itself if `self` is infinite. (This is slightly different from the default behavior of
67
+ `math.floor(self)`, which always returns an `int` and raises an error if `self` is infinite. The behavior of
68
+ Pixeltable generalizes the Python operator and is chosen to align with the SQL standard.)
69
+ """
37
70
  # This ensures the same behavior as SQL
38
71
  if math.isfinite(self):
39
72
  return float(math.floor(self))
@@ -48,6 +81,13 @@ def _(self: sql.ColumnElement) -> sql.ColumnElement:
48
81
 
49
82
  @pxt.udf(is_method=True)
50
83
  def round(self: float, digits: Optional[int] = None) -> float:
84
+ """
85
+ Round a number to a given precision in decimal digits.
86
+
87
+ Equivalent to Python [`builtins.round(self, digits or 0)`](https://docs.python.org/3/library/functions.html#round).
88
+ Note that if `digits` is not specified, the behavior matches `builtins.round(self, 0)` rather than
89
+ `builtins.round(self)`; this ensures that the return type is always `float` (as in SQL) rather than `int`.
90
+ """
51
91
  # Set digits explicitly to 0 to guarantee a return type of float; this ensures the same behavior as SQL
52
92
  return builtins.round(self, digits or 0)
53
93
 
@@ -60,6 +100,69 @@ def _(self: sql.ColumnElement, digits: Optional[sql.ColumnElement] = None) -> sq
60
100
  return sql.func.round(sql.cast(self, sql.Numeric), sql.cast(digits, sql.Integer))
61
101
 
62
102
 
103
+ @pxt.udf(is_method=True)
104
+ def pow(self: int, other: int) -> float:
105
+ """
106
+ Raise `self` to the power of `other`.
107
+
108
+ Equivalent to Python [`self ** other`](https://docs.python.org/3/library/functions.html#pow).
109
+ """
110
+ return self**other
111
+
112
+
113
+ @pow.to_sql
114
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
115
+ return sql.func.pow(self, other)
116
+
117
+
118
+ @pxt.udf(is_method=True)
119
+ def bitwise_and(self: int, other: int) -> int:
120
+ """
121
+ Bitwise AND of two integers.
122
+
123
+ Equivalent to Python
124
+ [`self & other`](https://docs.python.org/3/library/stdtypes.html#bitwise-operations-on-integer-types).
125
+ """
126
+ return self & other
127
+
128
+
129
+ @bitwise_and.to_sql
130
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
131
+ return self.bitwise_and(other)
132
+
133
+
134
+ @pxt.udf(is_method=True)
135
+ def bitwise_or(self: int, other: int) -> int:
136
+ """
137
+ Bitwise OR of two integers.
138
+
139
+ Equivalent to Python
140
+ [`self | other`](https://docs.python.org/3/library/stdtypes.html#bitwise-operations-on-integer-types).
141
+ """
142
+ return self | other
143
+
144
+
145
+ @bitwise_or.to_sql
146
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
147
+ return self.bitwise_or(other)
148
+
149
+
150
+ @pxt.udf(is_method=True)
151
+ def bitwise_xor(self: int, other: int) -> int:
152
+ """
153
+ Bitwise XOR of two integers.
154
+
155
+ Equivalent to Python
156
+ [`self ^ other`](https://docs.python.org/3/library/stdtypes.html#bitwise-operations-on-integer-types).
157
+ """
158
+ return self ^ other
159
+
160
+
161
+ @bitwise_xor.to_sql
162
+ def _(self: sql.ColumnElement, other: sql.ColumnElement) -> sql.ColumnElement:
163
+ return self.bitwise_xor(other)
164
+
165
+
63
166
  __all__ = local_public_names(__name__)
64
167
 
65
168
 
@@ -5,10 +5,9 @@ It closely follows the Pandas `pandas.Series.str` API.
5
5
  Example:
6
6
  ```python
7
7
  import pixeltable as pxt
8
- from pixeltable.functions import string as pxt_str
9
8
 
10
9
  t = pxt.get_table(...)
11
- t.select(pxt_str.capitalize(t.str_col)).collect()
10
+ t.select(t.str_col.capitalize()).collect()
12
11
  ```
13
12
  """
14
13
 
@@ -4,10 +4,10 @@ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
4
4
  Example:
5
5
  ```python
6
6
  import pixeltable as pxt
7
- from pixeltable.functions import video as pxt_video
7
+ import pixeltable.functions as pxtf
8
8
 
9
9
  t = pxt.get_table(...)
10
- t.select(pxt_video.extract_audio(t.video_col)).collect()
10
+ t.select(pxtf.video.extract_audio(t.video_col)).collect()
11
11
  ```
12
12
  """
13
13
 
pixeltable/globals.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
2
 
3
3
  import logging
4
4
  import os
5
- import urllib.parse
6
5
  from pathlib import Path
7
6
  from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union
8
7
 
@@ -372,6 +371,31 @@ def create_snapshot(
372
371
  )
373
372
 
374
373
 
374
+ def create_replica(destination: str, source: Union[str, catalog.Table]) -> Optional[catalog.Table]:
375
+ """
376
+ Create a replica of a table. Can be used either to create a remote replica of a local table, or to create a local
377
+ replica of a remote table. A given table can have at most one replica per Pixeltable instance.
378
+
379
+ Args:
380
+ destination: Path where the replica will be created. Can be either a local path such as `'my_dir.my_table'`, or
381
+ a remote URI such as `'pxt://username/mydir.my_table'`.
382
+ source: Path to the source table, or (if the source table is a local table) a handle to the source table.
383
+ """
384
+ remote_dest = destination.startswith('pxt://')
385
+ remote_source = isinstance(source, str) and source.startswith('pxt://')
386
+ if remote_dest == remote_source:
387
+ raise excs.Error('Exactly one of `destination` or `source` must be a remote URI.')
388
+
389
+ if remote_dest:
390
+ if isinstance(source, str):
391
+ source = get_table(source)
392
+ share.push_replica(destination, source)
393
+ return None
394
+ else:
395
+ assert isinstance(source, str)
396
+ return share.pull_replica(destination, source)
397
+
398
+
375
399
  def get_table(path: str) -> catalog.Table:
376
400
  """Get a handle to an existing table, view, or snapshot.
377
401
 
@@ -470,7 +494,7 @@ def drop_table(
470
494
  # if we're dropping a table by handle, we first need to get the current path, then drop the S lock on
471
495
  # the Table record, and then get X locks in the correct order (first containing directory, then table)
472
496
  with Env.get().begin_xact():
473
- tbl_path = table._path()
497
+ tbl_path = table._path
474
498
  else:
475
499
  assert isinstance(table, str)
476
500
  tbl_path = table
@@ -627,13 +651,6 @@ def _extract_paths(
627
651
  return result
628
652
 
629
653
 
630
- def publish_snapshot(dest_uri: str, table: catalog.Table) -> None:
631
- parsed_uri = urllib.parse.urlparse(dest_uri)
632
- if parsed_uri.scheme != 'pxt':
633
- raise excs.Error(f'Invalid Pixeltable URI (does not start with pxt://): {dest_uri}')
634
- share.publish_snapshot(dest_uri, table)
635
-
636
-
637
654
  def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
638
655
  """List the directories in a directory.
639
656
 
@@ -31,8 +31,8 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
31
31
  'timestamp[s]': ts.TimestampType(nullable=True),
32
32
  'timestamp[ms]': ts.TimestampType(nullable=True), # HF dataset iterator converts timestamps to datetime.datetime
33
33
  'timestamp[us]': ts.TimestampType(nullable=True),
34
- 'date32': ts.StringType(nullable=True), # date32 is not supported in pixeltable, use string
35
- 'date64': ts.StringType(nullable=True), # date64 is not supported in pixeltable, use string
34
+ 'date32': ts.DateType(nullable=True),
35
+ 'date64': ts.DateType(nullable=True),
36
36
  }
37
37
 
38
38
 
pixeltable/io/pandas.py CHANGED
@@ -9,6 +9,7 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
9
9
  import pixeltable as pxt
10
10
  import pixeltable.exceptions as excs
11
11
  import pixeltable.type_system as ts
12
+ from pixeltable.env import Env
12
13
 
13
14
 
14
15
  def import_pandas(
@@ -209,14 +210,25 @@ def _df_row_to_pxt_row(
209
210
  nval = bool(val)
210
211
  elif pxt_type.is_string_type():
211
212
  nval = str(val)
213
+ elif pxt_type.is_date_type():
214
+ if pd.isnull(val):
215
+ # pandas has the bespoke 'NaT' value for a missing timestamp
216
+ # This is not supported by postgres, and must be converted to None
217
+ nval = None
218
+ else:
219
+ nval = pd.Timestamp(val).date()
212
220
  elif pxt_type.is_timestamp_type():
213
221
  if pd.isnull(val):
214
- # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
215
- # much not-ok with it. (But if we convert it to None and then load out the
216
- # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
222
+ # pandas has the bespoke 'NaT' value for a missing timestamp
223
+ # This is not supported by postgres, and must be converted to None
217
224
  nval = None
218
225
  else:
219
- nval = pd.Timestamp(val).to_pydatetime()
226
+ tval = pd.Timestamp(val)
227
+ # pandas supports tz-aware and naive timestamps.
228
+ if tval.tz is None:
229
+ nval = pd.Timestamp(tval).tz_localize(tz=Env.get().default_time_zone)
230
+ else:
231
+ nval = tval.astimezone(Env.get().default_time_zone)
220
232
  else:
221
233
  nval = val
222
234
  pxt_row[pxt_name] = nval
pixeltable/io/parquet.py CHANGED
@@ -112,11 +112,11 @@ def export_parquet(
112
112
  length = len(val)
113
113
  elif col_type.is_string_type():
114
114
  length = len(val)
115
- elif col_type.is_video_type():
115
+ elif col_type.is_video_type() or col_type.is_audio_type():
116
116
  if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
117
117
  val = data_row.file_paths[e.slot_idx]
118
118
  else:
119
- raise excs.Error(f'unknown video type {type(val)}')
119
+ raise excs.Error(f'unknown audio/video type {type(val)}')
120
120
  length = len(val)
121
121
  elif col_type.is_json_type():
122
122
  val = json.dumps(val)
@@ -127,6 +127,8 @@ def export_parquet(
127
127
  length = 8
128
128
  elif col_type.is_bool_type():
129
129
  length = 1
130
+ elif col_type.is_date_type():
131
+ length = 4
130
132
  elif col_type.is_timestamp_type():
131
133
  val = val.astimezone(datetime.timezone.utc)
132
134
  length = 8
@@ -16,7 +16,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
16
16
 
17
17
 
18
18
  # current version of the metadata; this is incremented whenever the metadata schema changes
19
- VERSION = 34
19
+ VERSION = 35
20
20
 
21
21
 
22
22
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -0,0 +1,21 @@
1
+ from typing import Any, Optional
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
+ @register_converter(version=34)
10
+ def _(engine: sql.engine.Engine) -> None:
11
+ convert_table_md(engine, substitution_fn=__substitute_md)
12
+
13
+
14
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
15
+ if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
16
+ # Add reference_tbl to ColumnRef; for historical metadata it is always equal to tbl
17
+ assert 'reference_tbl' not in v
18
+ v['reference_tbl'] = None
19
+ return k, v
20
+
21
+ return None
@@ -2,6 +2,7 @@
2
2
  # rather than as a comment, so that the existence of a description can be enforced by
3
3
  # the unit tests when new versions are added.
4
4
  VERSION_NOTES = {
5
+ 35: 'Track reference_tbl in ColumnRef',
5
6
  34: 'Set default value for is_pk field in column metadata to False',
6
7
  33: 'Add is_replica field to table metadata',
7
8
  32: 'Add the lock_dummy BIGINT column to the dirs table',
pixeltable/plan.py CHANGED
@@ -635,8 +635,8 @@ class Planner:
635
635
  raise excs.Error(f'Join predicate {join_clause.join_predicate} not expressible in SQL')
636
636
 
637
637
  @classmethod
638
- def _verify_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> None:
639
- """Verify that the various ordering requirements don't conflict"""
638
+ def _create_combined_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> Optional[OrderByClause]:
639
+ """Verify that the various ordering requirements don't conflict and return a combined ordering"""
640
640
  ob_clauses: list[OrderByClause] = [analyzer.order_by_clause.copy()]
641
641
 
642
642
  if verify_agg:
@@ -652,8 +652,11 @@ class Planner:
652
652
  OrderByItem(e, True) for e in fn_call.get_agg_order_by()
653
653
  ]
654
654
  ob_clauses.append(ordering)
655
- if len(ob_clauses) <= 1:
656
- return
655
+
656
+ if len(ob_clauses) == 0:
657
+ return None
658
+ elif len(ob_clauses) == 1:
659
+ return ob_clauses[0]
657
660
 
658
661
  combined_ordering = ob_clauses[0]
659
662
  for ordering in ob_clauses[1:]:
@@ -664,6 +667,7 @@ class Planner:
664
667
  f'{print_order_by_clause(combined_ordering)} vs {print_order_by_clause(ordering)}'
665
668
  )
666
669
  combined_ordering = combined
670
+ return combined_ordering
667
671
 
668
672
  @classmethod
669
673
  def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
@@ -761,7 +765,7 @@ class Planner:
761
765
  analyzer.window_fn_calls
762
766
  )
763
767
  ctx = exec.ExecContext(row_builder)
764
- cls._verify_ordering(analyzer, verify_agg=is_python_agg)
768
+ combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
765
769
  cls._verify_join_clauses(analyzer)
766
770
 
767
771
  # materialized with SQL table scans (ie, single-table SELECT statements):
@@ -859,6 +863,9 @@ class Planner:
859
863
  row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
860
864
  )
861
865
  else:
866
+ input_sql_node = plan.get_node(exec.SqlNode)
867
+ assert combined_ordering is not None
868
+ input_sql_node.set_order_by(combined_ordering)
862
869
  plan = exec.AggregationNode(
863
870
  tbl.tbl_version,
864
871
  row_builder,
@@ -1,3 +1,3 @@
1
1
  # ruff: noqa: F401
2
2
 
3
- from .publish import publish_snapshot
3
+ from .publish import pull_replica, push_replica