pixeltable 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (79)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +8 -7
  3. pixeltable/catalog/column.py +11 -8
  4. pixeltable/catalog/insertable_table.py +1 -1
  5. pixeltable/catalog/path_dict.py +8 -6
  6. pixeltable/catalog/table.py +20 -13
  7. pixeltable/catalog/table_version.py +91 -54
  8. pixeltable/catalog/table_version_path.py +7 -9
  9. pixeltable/catalog/view.py +2 -1
  10. pixeltable/dataframe.py +1 -1
  11. pixeltable/env.py +173 -82
  12. pixeltable/exec/aggregation_node.py +2 -1
  13. pixeltable/exec/component_iteration_node.py +1 -1
  14. pixeltable/exec/sql_node.py +11 -8
  15. pixeltable/exprs/__init__.py +1 -0
  16. pixeltable/exprs/arithmetic_expr.py +4 -4
  17. pixeltable/exprs/array_slice.py +2 -1
  18. pixeltable/exprs/column_property_ref.py +9 -7
  19. pixeltable/exprs/column_ref.py +2 -1
  20. pixeltable/exprs/comparison.py +10 -7
  21. pixeltable/exprs/compound_predicate.py +3 -2
  22. pixeltable/exprs/data_row.py +19 -4
  23. pixeltable/exprs/expr.py +46 -35
  24. pixeltable/exprs/expr_set.py +32 -9
  25. pixeltable/exprs/function_call.py +56 -32
  26. pixeltable/exprs/in_predicate.py +3 -2
  27. pixeltable/exprs/inline_array.py +2 -1
  28. pixeltable/exprs/inline_dict.py +2 -1
  29. pixeltable/exprs/is_null.py +3 -2
  30. pixeltable/exprs/json_mapper.py +5 -4
  31. pixeltable/exprs/json_path.py +7 -1
  32. pixeltable/exprs/literal.py +34 -7
  33. pixeltable/exprs/method_ref.py +3 -3
  34. pixeltable/exprs/object_ref.py +6 -5
  35. pixeltable/exprs/row_builder.py +25 -17
  36. pixeltable/exprs/rowid_ref.py +2 -1
  37. pixeltable/exprs/similarity_expr.py +2 -1
  38. pixeltable/exprs/sql_element_cache.py +30 -0
  39. pixeltable/exprs/type_cast.py +3 -3
  40. pixeltable/exprs/variable.py +2 -1
  41. pixeltable/ext/functions/whisperx.py +4 -4
  42. pixeltable/ext/functions/yolox.py +6 -6
  43. pixeltable/func/aggregate_function.py +1 -0
  44. pixeltable/func/function.py +28 -4
  45. pixeltable/functions/__init__.py +4 -2
  46. pixeltable/functions/anthropic.py +107 -0
  47. pixeltable/functions/fireworks.py +2 -2
  48. pixeltable/functions/globals.py +6 -1
  49. pixeltable/functions/huggingface.py +2 -2
  50. pixeltable/functions/image.py +17 -2
  51. pixeltable/functions/json.py +5 -5
  52. pixeltable/functions/mistralai.py +188 -0
  53. pixeltable/functions/openai.py +6 -10
  54. pixeltable/functions/string.py +3 -2
  55. pixeltable/functions/timestamp.py +95 -7
  56. pixeltable/functions/together.py +5 -5
  57. pixeltable/functions/video.py +2 -2
  58. pixeltable/functions/vision.py +27 -17
  59. pixeltable/functions/whisper.py +1 -1
  60. pixeltable/io/hf_datasets.py +17 -15
  61. pixeltable/io/pandas.py +0 -2
  62. pixeltable/io/parquet.py +15 -14
  63. pixeltable/iterators/document.py +16 -15
  64. pixeltable/metadata/__init__.py +1 -1
  65. pixeltable/metadata/converters/convert_19.py +46 -0
  66. pixeltable/metadata/notes.py +1 -0
  67. pixeltable/metadata/schema.py +5 -4
  68. pixeltable/plan.py +100 -78
  69. pixeltable/store.py +5 -1
  70. pixeltable/tool/create_test_db_dump.py +4 -3
  71. pixeltable/type_system.py +12 -14
  72. pixeltable/utils/documents.py +45 -42
  73. pixeltable/utils/formatter.py +2 -2
  74. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/METADATA +79 -21
  75. pixeltable-0.2.18.dist-info/RECORD +147 -0
  76. pixeltable-0.2.16.dist-info/RECORD +0 -143
  77. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/LICENSE +0 -0
  78. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/WHEEL +0 -0
  79. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py CHANGED
@@ -6,7 +6,7 @@ import random
  import typing
  from typing import Union, Optional, Any

- import pixeltable
+ import pixeltable as pxt
  import pixeltable.type_system as ts
  from pixeltable import exceptions as excs
@@ -81,24 +81,26 @@ def import_huggingface_dataset(
      dataset: Union[datasets.Dataset, datasets.DatasetDict],
      *,
      column_name_for_split: Optional[str] = None,
-     schema_override: Optional[dict[str, Any]] = None,
-     **kwargs,
- ) -> 'pixeltable.InsertableTable':
-     """Create a new `Table` from a Huggingface dataset, or dataset dict with multiple splits.
-     Requires datasets library to be installed.
+     schema_overrides: Optional[dict[str, Any]] = None,
+     **kwargs: Any,
+ ) -> pxt.Table:
+     """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
+     Requires `datasets` library to be installed.

      Args:
-         path_str: Path to the table.
-         dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
+         table_path: Path to the table.
+         dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
+             or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
+             to insert into the table.
          column_name_for_split: column name to use for split information. If None, no split information will be stored.
-         schema_override: Optional dictionary mapping column names to column type to override the corresponding defaults from
-             `pixeltable.utils.hf_datasets.huggingface_schema_to_pixeltable_schema`. The column type should be a pixeltable ColumnType.
-             For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
-
+         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
+             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
+             `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
+             Pixeltable identifiers).
          kwargs: Additional arguments to pass to `create_table`.

      Returns:
-         The newly created table. The table will have loaded the data from the dataset.
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      import datasets
      import pixeltable as pxt
@@ -118,8 +120,8 @@ def import_huggingface_dataset(
          dataset_dict = dataset

      pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
-     if schema_override is not None:
-         pixeltable_schema.update(schema_override)
+     if schema_overrides is not None:
+         pixeltable_schema.update(schema_overrides)

      if column_name_for_split is not None:
          if column_name_for_split in pixeltable_schema:
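Taken together, these hunks rename `schema_override` to `schema_overrides` and change the return annotation to `pxt.Table`. A minimal sketch of a call against the new signature (the dataset, table name, and overridden column below are illustrative, not taken from the diff):

```python
import pixeltable as pxt
import pixeltable.type_system as ts
from datasets import load_dataset

ds = load_dataset('rotten_tomatoes', split='train')  # any small HF dataset

# Pin one column to an explicit type; every other column's type is still
# inferred from the Dataset's features.
tbl = pxt.io.import_huggingface_dataset(
    'reviews',
    ds,
    schema_overrides={'text': ts.StringType()},
)
```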
pixeltable/io/pandas.py CHANGED
@@ -1,9 +1,7 @@
- import datetime
  from typing import Any, Optional, Union

  import numpy as np
  import pandas as pd
- import PIL.Image

  import pixeltable as pxt
  import pixeltable.exceptions as excs
pixeltable/io/parquet.py CHANGED
@@ -7,7 +7,7 @@ import random
  import typing
  from collections import deque
  from pathlib import Path
- from typing import Dict, Optional
+ from typing import Dict, Optional, Any

  import PIL.Image
  import numpy as np
@@ -142,21 +142,22 @@ def import_parquet(
      table_path: str,
      *,
      parquet_path: str,
-     schema_override: Optional[Dict[str, ts.ColumnType]] = None,
-     **kwargs,
- ) -> pxt.catalog.InsertableTable:
-     """Create a new `Table` from a Parquet file or set of files. Requires pyarrow to be installed.
+     schema_overrides: Optional[Dict[str, ts.ColumnType]] = None,
+     **kwargs: Any,
+ ) -> pxt.Table:
+     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
+
      Args:
-         path_str: Path to the table within pixeltable.
+         table_path: Path to the table.
          parquet_path: Path to an individual Parquet file or directory of Parquet files.
-         schema_override: Optional dictionary mapping column names to column type to override the default
-             schema inferred from the Parquet file. The column type should be a pixeltable ColumnType.
-             For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
-             Any fields not provided explicitly will map to types with `pixeltable.utils.parquet.parquet_schema_to_pixeltable_schema`
+         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
+             name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
+             `schema_overrides` should be the column names of the Parquet dataset (whether or not they are valid
+             Pixeltable identifiers).
          kwargs: Additional arguments to pass to `create_table`.

      Returns:
-         The newly created table. The table will have loaded the data from the Parquet file(s).
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      import pixeltable as pxt
      from pyarrow import parquet
@@ -166,10 +167,10 @@ def import_parquet(
      parquet_dataset = parquet.ParquetDataset(input_path)

      schema = parquet_schema_to_pixeltable_schema(parquet_path)
-     if schema_override is None:
-         schema_override = {}
+     if schema_overrides is None:
+         schema_overrides = {}

-     schema.update(schema_override)
+     schema.update(schema_overrides)
      for k, v in schema.items():
          if v is None:
              raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
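The Parquet changes mirror the Huggingface ones. A sketch of the renamed keyword in use (table name, file path, and column are placeholders):

```python
import pixeltable as pxt
import pixeltable.type_system as ts

# Columns listed in schema_overrides get the explicit type; everything
# else keeps the type inferred from the Parquet schema.
tbl = pxt.io.import_parquet(
    'events',
    parquet_path='/data/events.parquet',
    schema_overrides={'payload': ts.JsonType()},
)
```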
pixeltable/iterators/document.py CHANGED
@@ -1,14 +1,15 @@
  import dataclasses
  import enum
  import logging
- from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
+ from typing import Any, Iterable, Iterator, Optional

  import ftfy

  from pixeltable.env import Env
  from pixeltable.exceptions import Error
- from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
+ from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
  from pixeltable.utils.documents import get_document_handle
+
  from .base import ComponentIterator

  _logger = logging.getLogger('pixeltable')
@@ -38,12 +39,12 @@ class DocumentSectionMetadata:
      sourceline: Optional[int] = None
      # the stack of headings up to the most recently observed one;
      # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-     heading: Optional[Dict[str, str]] = None
+     heading: Optional[dict[str, str]] = None

      # pdf-specific metadata
      page: Optional[int] = None
      # bounding box as an {x1, y1, x2, y2} dictionary
-     bounding_box: Optional[Dict[str, float]] = None
+     bounding_box: Optional[dict[str, float]] = None


  @dataclasses.dataclass
@@ -53,7 +54,7 @@ class DocumentSection:
      metadata: Optional[DocumentSectionMetadata]


- def _parse_separators(separators: str) -> List[Separator]:
+ def _parse_separators(separators: str) -> list[Separator]:
      ret = []
      for s in separators.split(','):
          clean_s = s.strip().upper()
@@ -67,7 +68,7 @@ def _parse_separators(separators: str) -> List[Separator]:
      return ret


- def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
+ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
      ret = []
      for m in metadata.split(','):
          clean_m = m.strip().upper()
@@ -161,7 +162,7 @@ class DocumentSplitter(ComponentIterator):
              self._sections = self._char_chunks(self._sections)

      @classmethod
-     def input_schema(cls) -> Dict[str, ColumnType]:
+     def input_schema(cls) -> dict[str, ColumnType]:
          return {
              'document': DocumentType(nullable=False),
              'separators': StringType(nullable=False),
@@ -174,7 +175,7 @@ class DocumentSplitter(ComponentIterator):
          }

      @classmethod
-     def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
          schema = {'text': StringType()}
          md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
@@ -208,7 +209,7 @@ class DocumentSplitter(ComponentIterator):

          return schema, []

-     def __next__(self) -> Dict[str, Any]:
+     def __next__(self) -> dict[str, Any]:
          while True:
              section = next(self._sections)
              if section.text is None:
@@ -236,7 +237,7 @@ class DocumentSplitter(ComponentIterator):
          accumulated_text = []  # currently accumulated text
          # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation

-         headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
+         headings: dict[str, str] = {}  # current state of observed headings (level -> text)
          sourceline = 0  # most recently seen sourceline

          def update_metadata(el: bs4.Tag) -> None:
@@ -250,7 +251,7 @@ class DocumentSplitter(ComponentIterator):
                    del headings[l]
              headings[el.name] = el.get_text().strip()

-         def emit() -> None:
+         def emit() -> Iterator[DocumentSection]:
              nonlocal accumulated_text, headings, sourceline
              if len(accumulated_text) > 0:
                  md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
@@ -294,9 +295,9 @@ class DocumentSplitter(ComponentIterator):
          # current state
          accumulated_text = []  # currently accumulated text
          # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-         headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
+         headings: dict[str, str] = {}  # current state of observed headings (level -> text)

-         def update_headings(heading: Dict) -> None:
+         def update_headings(heading: dict) -> None:
              # update current state
              nonlocal headings
              assert 'type' in heading and heading['type'] == 'heading'
@@ -309,14 +310,14 @@ class DocumentSplitter(ComponentIterator):
                    del headings[l]
              headings[level] = text

-         def emit() -> None:
+         def emit() -> Iterator[DocumentSection]:
              nonlocal accumulated_text, headings
              if len(accumulated_text) > 0:
                  metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
                  yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
                  accumulated_text = []

-         def process_element(el: Dict) -> Iterator[DocumentSection]:
+         def process_element(el: dict) -> Iterator[DocumentSection]:
              # process the element and emit sections as necessary
              nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
              assert 'type' in el
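The two `emit` fixes correct annotations on what were always generator functions: a body containing `yield` returns an iterator when called, so `-> Iterator[DocumentSection]` is the accurate type and the old `-> None` was misleading. A self-contained illustration of the principle (not pixeltable code):

```python
from typing import Iterator


def emit_words(text: str) -> Iterator[str]:
    # The `yield` makes this a generator function: calling it returns an
    # iterator immediately instead of executing the body.
    for word in text.split():
        yield word


assert list(emit_words('a b c')) == ['a', 'b', 'c']
```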
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
  from .schema import SystemInfo, SystemInfoMd

  # current version of the metadata; this is incremented whenever the metadata schema changes
- VERSION = 19
+ VERSION = 20


  def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_19.py ADDED
@@ -0,0 +1,46 @@
+ import datetime
+ from typing import Any, Optional
+
+ import sqlalchemy as sql
+
+ import pixeltable as pxt
+ from pixeltable.metadata import register_converter, schema
+ from pixeltable.metadata.converters.util import convert_table_md
+
+
+ @register_converter(version=19)
+ def _(engine: sql.engine.Engine) -> None:
+     # Convert all timestamp literals to aware datetimes
+     convert_table_md(engine, substitution_fn=__update_timestamp_literals)
+
+     # Convert all timestamp columns to TIMESTAMPTZ. (This conversion will take place in the database
+     # default time zone, which is what we want, since in versions <= 19 they were naive timestamps.)
+     with engine.begin() as conn:
+         tables = conn.execute(sql.select(schema.Table.id, schema.Table.md))
+         for id, md in tables:
+             store_prefix = 'view' if md['view_md'] is not None else 'tbl'
+             store_name = f'{store_prefix}_{id.hex}'
+             column_md = md['column_md']
+             timestamp_cols = [
+                 col_id for col_id, col in column_md.items()
+                 if col['col_type']['_classname'] == 'TimestampType'
+             ]
+             for col_id in timestamp_cols:
+                 conn.execute(
+                     sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ')
+                 )
+
+
+ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
+     if isinstance(v, dict) and 'val_t' in v:
+         # It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
+         # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
+         # We convert it to an aware datetime, stored in UTC.
+         assert v['_classname'] == 'Literal'
+         assert v['val_t'] == pxt.ColumnType.Type.TIMESTAMP.name
+         assert isinstance(v['val'], str)
+         dt = datetime.datetime.fromisoformat(v['val'])
+         assert dt.tzinfo is None  # In version 19 all timestamps are naive
+         dt_utc = dt.astimezone(datetime.timezone.utc)
+         v['val'] = dt_utc.isoformat()
+         return k, v
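A note on the literal conversion above: `datetime.astimezone()` called on a naive datetime presumes the value is in the system's local time zone before converting, which is how naive v19 literals get localized on their way to UTC. A small demonstration of that stdlib behavior (the sample timestamp is made up):

```python
import datetime

naive = datetime.datetime.fromisoformat('2024-08-15T12:00:00')
assert naive.tzinfo is None

# A naive datetime is presumed to be in the system's local time zone;
# astimezone() converts it to the requested zone and returns an aware value.
aware_utc = naive.astimezone(datetime.timezone.utc)
assert aware_utc.tzinfo is not None
print(aware_utc.isoformat())  # e.g. '2024-08-15T19:00:00+00:00' on a UTC-7 machine
```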
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
  # rather than as a comment, so that the existence of a description can be enforced by
  # the unit tests when new versions are added.
  VERSION_NOTES = {
+     20: 'Store DB timestamps in UTC',
      19: 'UDF renames; ImageMemberAccess removal',
      18: 'Restructured index metadata',
      17: 'Renamed remotes to external_stores',
pixeltable/metadata/schema.py CHANGED
@@ -3,6 +3,7 @@ import uuid
  from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union

  import sqlalchemy as sql
+ import sqlalchemy.orm as orm
  from sqlalchemy import ForeignKey
  from sqlalchemy import Integer, BigInteger, LargeBinary
  from sqlalchemy.dialects.postgresql import UUID, JSONB
@@ -64,8 +65,8 @@ class DirMd:
  class Dir(Base):
      __tablename__ = 'dirs'

-     id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
-     parent_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
+     id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
+     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
      md = sql.Column(JSONB, nullable=False)
@@ -163,8 +164,8 @@ class Table(Base):

      MAX_VERSION = 9223372036854775807  # 2^63 - 1

-     id = sql.Column(UUID(as_uuid=True), primary_key=True, nullable=False)
-     dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
+     id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, nullable=False)
+     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
      md = sql.Column(JSONB, nullable=False)  # TableMd
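The `Dir` and `Table` changes adopt SQLAlchemy 2.0's typed declarative mappings, where an `orm.Mapped[...]` annotation plus `orm.mapped_column(...)` replaces a bare `sql.Column(...)` attribute. A standalone sketch of the pattern (the model below is illustrative, not part of pixeltable):

```python
import uuid

import sqlalchemy.orm as orm
from sqlalchemy import String
from sqlalchemy.dialects.postgresql import UUID


class Base(orm.DeclarativeBase):
    pass


class Item(Base):
    __tablename__ = 'items'

    # Mapped[...] carries the Python-side type, so type checkers see
    # Item.id as uuid.UUID rather than an untyped Column.
    id: orm.Mapped[uuid.UUID] = orm.mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False
    )
    name: orm.Mapped[str] = orm.mapped_column(String, nullable=False)
```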