pixeltable 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

pixeltable/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  # These version placeholders will be replaced during build.
2
- __version__ = '0.3.3'
3
- __version_tuple__ = (0, 3, 3)
2
+ __version__ = '0.3.4'
3
+ __version_tuple__ = (0, 3, 4)
pixeltable/dataframe.py CHANGED
@@ -578,15 +578,9 @@ class DataFrame:
578
578
  # analyze select list; wrap literals with the corresponding expressions
579
579
  select_list: list[tuple[exprs.Expr, Optional[str]]] = []
580
580
  for raw_expr, name in base_list:
581
- if isinstance(raw_expr, exprs.Expr):
582
- select_list.append((raw_expr, name))
583
- elif isinstance(raw_expr, (dict, list, tuple)):
584
- select_list.append((exprs.Expr.from_object(raw_expr), name))
585
- elif isinstance(raw_expr, np.ndarray):
586
- select_list.append((exprs.Expr.from_array(raw_expr), name))
587
- else:
588
- select_list.append((exprs.Literal(raw_expr), name))
589
- expr = select_list[-1][0]
581
+ expr = exprs.Expr.from_object(raw_expr)
582
+ if expr is None:
583
+ raise excs.Error(f'Invalid expression: {raw_expr}')
590
584
  if expr.col_type.is_invalid_type():
591
585
  raise excs.Error(f'Invalid type: {raw_expr}')
592
586
  if not expr.is_bound_by(self._from_clause.tbls):
@@ -594,6 +588,7 @@ class DataFrame:
594
588
  f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
595
589
  f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
596
590
  )
591
+ select_list.append((expr, name))
597
592
 
598
593
  # check user provided names do not conflict among themselves or with auto-generated ones
599
594
  seen: set[str] = set()
pixeltable/env.py CHANGED
@@ -333,9 +333,7 @@ class Env:
333
333
  http_logger.addHandler(http_fh)
334
334
  http_logger.propagate = False
335
335
 
336
- # empty tmp dir
337
- for path in glob.glob(f'{self._tmp_dir}/*'):
338
- os.remove(path)
336
+ self.clear_tmp_dir()
339
337
 
340
338
  self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
341
339
  self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
@@ -628,6 +626,13 @@ class Env:
628
626
  )
629
627
  self.__optional_packages['spacy'].is_installed = False
630
628
 
629
+ def clear_tmp_dir(self) -> None:
630
+ for path in glob.glob(f'{self._tmp_dir}/*'):
631
+ if os.path.isdir(path):
632
+ shutil.rmtree(path)
633
+ else:
634
+ os.remove(path)
635
+
631
636
  def num_tmp_files(self) -> int:
632
637
  return len(glob.glob(f'{self._tmp_dir}/*'))
633
638
 
pixeltable/exec/component_iteration_node.py CHANGED
@@ -1,5 +1,4 @@
1
- import inspect
2
- from typing import AsyncIterator, Iterator, Optional
1
+ from typing import AsyncIterator
3
2
 
4
3
  import pixeltable.catalog as catalog
5
4
  import pixeltable.exceptions as excs
pixeltable/exprs/expr.py CHANGED
@@ -10,6 +10,7 @@ import typing
10
10
  from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Optional, TypeVar, Union, overload
11
11
  from uuid import UUID
12
12
 
13
+ import numpy as np
13
14
  import sqlalchemy as sql
14
15
  from typing_extensions import Self, _AnnotatedAlias
15
16
 
@@ -379,6 +380,12 @@ class Expr(abc.ABC):
379
380
  @classmethod
380
381
  def from_array(cls, elements: Iterable) -> Optional[Expr]:
381
382
  from .inline_expr import InlineArray
383
+ from .literal import Literal
384
+
385
+ if isinstance(elements, np.ndarray):
386
+ pxttype = ts.ArrayType.from_literal(elements)
387
+ if pxttype is not None:
388
+ return Literal(elements, col_type=pxttype)
382
389
 
383
390
  inline_array = InlineArray(elements)
384
391
  return inline_array.maybe_literal()
pixeltable/functions/openai.py CHANGED
@@ -14,7 +14,7 @@ import math
14
14
  import pathlib
15
15
  import re
16
16
  import uuid
17
- from typing import TYPE_CHECKING, Any, Callable, Optional, Type, TypeVar, Union, cast
17
+ from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Type, TypeVar, Union, cast
18
18
 
19
19
  import httpx
20
20
  import numpy as np
@@ -324,10 +324,17 @@ async def translations(
324
324
  # Chat Endpoints
325
325
 
326
326
 
327
+ def _default_max_tokens(model: str) -> int:
328
+ if model in ('o1', 'o3-mini'):
329
+ return 65536
330
+ else:
331
+ return 1024
332
+
333
+
327
334
  def _chat_completions_get_request_resources(
328
- messages: list, max_tokens: Optional[int], n: Optional[int]
335
+ messages: list, model: str, max_completion_tokens: Optional[int], max_tokens: Optional[int], n: Optional[int]
329
336
  ) -> dict[str, int]:
330
- completion_tokens = n * max_tokens
337
+ completion_tokens = (n or 1) * (max_completion_tokens or max_tokens or _default_max_tokens(model))
331
338
 
332
339
  num_tokens = 0.0
333
340
  for message in messages:
@@ -349,16 +356,18 @@ async def chat_completions(
349
356
  logit_bias: Optional[dict[str, int]] = None,
350
357
  logprobs: Optional[bool] = None,
351
358
  top_logprobs: Optional[int] = None,
352
- max_tokens: Optional[int] = 1024,
353
- n: Optional[int] = 1,
359
+ max_completion_tokens: Optional[int] = None,
360
+ max_tokens: Optional[int] = None,
361
+ n: Optional[int] = None,
354
362
  presence_penalty: Optional[float] = None,
363
+ reasoning_effort: Optional[Literal['low', 'medium', 'high']] = None,
355
364
  response_format: Optional[dict] = None,
356
365
  seed: Optional[int] = None,
357
366
  stop: Optional[list[str]] = None,
358
367
  temperature: Optional[float] = None,
359
- top_p: Optional[float] = None,
360
368
  tools: Optional[list[dict]] = None,
361
369
  tool_choice: Optional[dict] = None,
370
+ top_p: Optional[float] = None,
362
371
  user: Optional[str] = None,
363
372
  timeout: Optional[float] = None,
364
373
  ) -> dict:
@@ -418,6 +427,9 @@ async def chat_completions(
418
427
  resource_pool, lambda: OpenAIRateLimitsInfo(_chat_completions_get_request_resources)
419
428
  )
420
429
 
430
+ if max_completion_tokens is None and max_tokens is None:
431
+ max_completion_tokens = _default_max_tokens(model)
432
+
421
433
  # cast(Any, ...): avoid mypy errors
422
434
  result = await _openai_client().chat.completions.with_raw_response.create(
423
435
  messages=messages,
@@ -426,16 +438,18 @@ async def chat_completions(
426
438
  logit_bias=_opt(logit_bias),
427
439
  logprobs=_opt(logprobs),
428
440
  top_logprobs=_opt(top_logprobs),
441
+ max_completion_tokens=_opt(max_completion_tokens),
429
442
  max_tokens=_opt(max_tokens),
430
443
  n=_opt(n),
431
444
  presence_penalty=_opt(presence_penalty),
445
+ reasoning_effort=_opt(reasoning_effort),
432
446
  response_format=_opt(cast(Any, response_format)),
433
447
  seed=_opt(seed),
434
448
  stop=_opt(stop),
435
449
  temperature=_opt(temperature),
436
- top_p=_opt(top_p),
437
450
  tools=_opt(cast(Any, tools)),
438
451
  tool_choice=_opt(cast(Any, tool_choice_)),
452
+ top_p=_opt(top_p),
439
453
  user=_opt(user),
440
454
  timeout=_opt(timeout),
441
455
  extra_body=extra_body,
@@ -448,9 +462,14 @@ async def chat_completions(
448
462
 
449
463
 
450
464
  def _vision_get_request_resources(
451
- prompt: str, image: PIL.Image.Image, max_tokens: Optional[int], n: Optional[int]
465
+ prompt: str,
466
+ image: PIL.Image.Image,
467
+ model: str,
468
+ max_completion_tokens: Optional[int],
469
+ max_tokens: Optional[int],
470
+ n: Optional[int],
452
471
  ) -> dict[str, int]:
453
- completion_tokens = n * max_tokens
472
+ completion_tokens = (n or 1) * (max_completion_tokens or max_tokens or _default_max_tokens(model))
454
473
  prompt_tokens = len(prompt) / 4
455
474
 
456
475
  # calculate image tokens based on
@@ -482,7 +501,8 @@ async def vision(
482
501
  image: PIL.Image.Image,
483
502
  *,
484
503
  model: str,
485
- max_tokens: Optional[int] = 1024,
504
+ max_completion_tokens: Optional[int] = None,
505
+ max_tokens: Optional[int] = None,
486
506
  n: Optional[int] = 1,
487
507
  timeout: Optional[float] = None,
488
508
  ) -> str:
@@ -534,9 +554,14 @@ async def vision(
534
554
  rate_limits_info = env.Env.get().get_resource_pool_info(
535
555
  resource_pool, lambda: OpenAIRateLimitsInfo(_vision_get_request_resources)
536
556
  )
557
+
558
+ if max_completion_tokens is None and max_tokens is None:
559
+ max_completion_tokens = _default_max_tokens(model)
560
+
537
561
  result = await _openai_client().chat.completions.with_raw_response.create(
538
562
  messages=messages, # type: ignore
539
563
  model=model,
564
+ max_completion_tokens=_opt(max_completion_tokens),
540
565
  max_tokens=_opt(max_tokens),
541
566
  n=_opt(n),
542
567
  timeout=_opt(timeout),
pixeltable/io/pandas.py CHANGED
@@ -185,20 +185,9 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
185
185
  """
186
186
  Infers a Pixeltable type based on a Numpy dtype.
187
187
  """
188
- if np.issubdtype(np_dtype, np.integer):
189
- return pxt.IntType(nullable=nullable)
190
-
191
- if np.issubdtype(np_dtype, np.floating):
192
- return pxt.FloatType(nullable=nullable)
193
-
194
- if np.issubdtype(np_dtype, np.bool_):
195
- return pxt.BoolType(nullable=nullable)
196
-
197
- if np.issubdtype(np_dtype, np.character):
198
- return pxt.StringType(nullable=nullable)
199
-
200
- if np.issubdtype(np_dtype, np.datetime64):
201
- return pxt.TimestampType(nullable=nullable)
188
+ pxttype = ts.ArrayType.from_np_dtype(np_dtype, nullable)
189
+ if pxttype is not None:
190
+ return pxttype
202
191
 
203
192
  if np_dtype == np.object_:
204
193
  # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
File without changes
@@ -0,0 +1,218 @@
1
+ import io
2
+ import json
3
+ import logging
4
+ import tarfile
5
+ import urllib.parse
6
+ import urllib.request
7
+ import uuid
8
+ from pathlib import Path
9
+ from typing import Any, Iterator
10
+
11
+ import more_itertools
12
+ import numpy as np
13
+ import pyarrow as pa
14
+ import pyiceberg.catalog
15
+
16
+ import pixeltable as pxt
17
+ import pixeltable.type_system as ts
18
+ from pixeltable import exprs
19
+ from pixeltable.env import Env
20
+ from pixeltable.utils.arrow import PXT_TO_PA_TYPES
21
+ from pixeltable.utils.iceberg import sqlite_catalog
22
+
23
+ _logger = logging.getLogger('pixeltable')
24
+
25
+
26
+ class TablePackager:
27
+ """
28
+ Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
29
+ is as follows:
30
+
31
+ warehouse/catalog.db # sqlite Iceberg catalog
32
+ warehouse/pxt.db/** # Iceberg metadata and data files (parquet/avro/json)
33
+ media/** # Local media files
34
+
35
+ If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
36
+ of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
37
+ recent version of the table can be exported, and only the full table contents.
38
+
39
+ If the table contains media columns, they are handled as follows:
40
+ - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
41
+ stored in the Iceberg table.
42
+ - If a media file is a local file, then it will be copied into the tarball as a file of the form
43
+ 'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
44
+ """
45
+
46
+ table: pxt.Table # The table to be packaged
47
+ tmp_dir: Path # Temporary directory where the package will reside
48
+ iceberg_catalog: pyiceberg.catalog.Catalog
49
+ media_files: dict[Path, str] # Mapping from local media file paths to their tarball names
50
+
51
+ def __init__(self, table: pxt.Table) -> None:
52
+ self.table = table
53
+ self.tmp_dir = Path(Env.get().create_tmp_path())
54
+ self.media_files = {}
55
+
56
+ def package(self) -> Path:
57
+ """
58
+ Export the table to a tarball containing Iceberg tables and media files.
59
+ """
60
+ assert not self.tmp_dir.exists() # Packaging can only be done once per TablePackager instance
61
+ _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
62
+ self.tmp_dir.mkdir()
63
+ self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
64
+ ancestors = [self.table] + self.table._bases
65
+ for t in ancestors:
66
+ _logger.info(f"Exporting table '{t._path}'.")
67
+ self.__export_table(t)
68
+ _logger.info(f'Building archive.')
69
+ bundle_path = self.__build_tarball()
70
+ _logger.info(f'Packaging complete: {bundle_path}')
71
+ return bundle_path
72
+
73
+ def __export_table(self, t: pxt.Table) -> None:
74
+ """
75
+ Exports the data from `t` into an Iceberg table.
76
+ """
77
+ # First generate a select list for the data we want to extract from `t`. This includes:
78
+ # - all stored columns, including computed columns;
79
+ # - errortype and errormsg fields whenever they're defined.
80
+ # We select only those columns that are defined in this table (columns inherited from ancestor tables will be
81
+ # handled separately).
82
+ # For media columns, we substitute `col.fileurl` so that we always get the URL (which may be a file:// URL;
83
+ # these will be specially handled later)
84
+ select_exprs: dict[str, exprs.Expr] = {}
85
+
86
+ # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
87
+ # to get the column types, since we'll be substituting `fileurl`s for media columns.
88
+ actual_col_types: list[ts.ColumnType] = []
89
+
90
+ for col_name, col in t._tbl_version.cols_by_name.items():
91
+ if not col.is_stored:
92
+ continue
93
+ if col.col_type.is_media_type():
94
+ select_exprs[col_name] = t[col_name].fileurl
95
+ else:
96
+ select_exprs[col_name] = t[col_name]
97
+ actual_col_types.append(col.col_type)
98
+ if col.records_errors:
99
+ select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
100
+ actual_col_types.append(ts.StringType())
101
+ select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
102
+ actual_col_types.append(ts.StringType())
103
+
104
+ # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
105
+ # `self.table`.
106
+ df = self.table.select(**select_exprs)
107
+ namespace = self.__iceberg_namespace(t)
108
+ self.iceberg_catalog.create_namespace_if_not_exists(namespace)
109
+ iceberg_schema = self.__to_iceberg_schema(df._schema)
110
+ iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
111
+
112
+ # Populate the Iceberg table with data.
113
+ # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
114
+ # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
115
+ for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
116
+ iceberg_tbl.append(pa_table)
117
+
118
+ @classmethod
119
+ def __iceberg_namespace(cls, table: pxt.Table) -> str:
120
+ """
121
+ Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
122
+ """
123
+ parent_path = table._parent._path
124
+ if len(parent_path) == 0:
125
+ return 'pxt'
126
+ else:
127
+ return f'pxt.{parent_path}'
128
+
129
+ # The following methods are responsible for schema and data conversion from Pixeltable to Iceberg. Some of this
130
+ # logic might be consolidated into arrow.py and unified with general Parquet export, but there are several
131
+ # major differences:
132
+ # - Iceberg has no array type; we export all arrays as binary blobs
133
+ # - We include _rowid and _v_min columns in the Iceberg table
134
+ # - Media columns are handled specially as indicated above
135
+
136
+ @classmethod
137
+ def __to_iceberg_schema(cls, pxt_schema: dict[str, ts.ColumnType]) -> pa.Schema:
138
+ entries = [(name, cls.__to_iceberg_type(col_type)) for name, col_type in pxt_schema.items()]
139
+ entries.append(('_rowid', pa.list_(pa.int64())))
140
+ entries.append(('_v_min', pa.int64()))
141
+ return pa.schema(entries) # type: ignore[arg-type]
142
+
143
+ @classmethod
144
+ def __to_iceberg_type(cls, col_type: ts.ColumnType) -> pa.DataType:
145
+ if col_type.is_array_type():
146
+ return pa.binary()
147
+ if col_type.is_media_type():
148
+ return pa.string()
149
+ return PXT_TO_PA_TYPES.get(col_type.__class__)
150
+
151
+ def __to_pa_tables(
152
+ self,
153
+ df: pxt.DataFrame,
154
+ actual_col_types: list[pxt.ColumnType],
155
+ arrow_schema: pa.Schema,
156
+ batch_size: int = 1_000,
157
+ ) -> Iterator[pa.Table]:
158
+ """
159
+ Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
160
+ to avoid excessive memory usage.
161
+ """
162
+ for rows in more_itertools.batched(self.__to_pa_rows(df, actual_col_types), batch_size):
163
+ cols = {col_name: [row[idx] for row in rows] for idx, col_name in enumerate(df._schema.keys())}
164
+ cols['_rowid'] = [row[-2] for row in rows]
165
+ cols['_v_min'] = [row[-1] for row in rows]
166
+ yield pa.Table.from_pydict(cols, schema=arrow_schema)
167
+
168
+ def __to_pa_rows(self, df: pxt.DataFrame, actual_col_types: list[pxt.ColumnType]) -> Iterator[list]:
169
+ for row in df._exec():
170
+ vals = [row[e.slot_idx] for e in df._select_list_exprs]
171
+ result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
172
+ result.append(row.rowid)
173
+ result.append(row.v_min)
174
+ yield result
175
+
176
+ def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
177
+ if val is None:
178
+ return None
179
+ if col_type.is_array_type():
180
+ # Export arrays as binary
181
+ assert isinstance(val, np.ndarray)
182
+ arr = io.BytesIO()
183
+ np.save(arr, val)
184
+ return arr.getvalue()
185
+ if col_type.is_json_type():
186
+ # Export JSON as strings
187
+ return json.dumps(val)
188
+ if col_type.is_media_type():
189
+ # Handle media files as described above
190
+ assert isinstance(val, str) # Media columns are always referenced by `fileurl`
191
+ return self.__process_media_url(val)
192
+ return val
193
+
194
+ def __process_media_url(self, url: str) -> str:
195
+ parsed_url = urllib.parse.urlparse(url)
196
+ if parsed_url.scheme == 'file':
197
+ # It's the URL of a local file. Replace it with a pxtmedia:// URI.
198
+ # (We can't use an actual pxt:// URI, because the eventual pxt:// table name might not be known at this
199
+ # time. The pxtmedia:// URI serves as a relative reference into the tarball that can be replaced with an
200
+ # actual URL when the table is reconstituted.)
201
+ path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
202
+ if path not in self.media_files:
203
+ # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
204
+ dest_name = f'{uuid.uuid4().hex}{path.suffix}'
205
+ self.media_files[path] = dest_name
206
+ return f'pxtmedia://{self.media_files[path]}'
207
+ # For any type of URL other than a local file, just return the URL as-is.
208
+ return url
209
+
210
+ def __build_tarball(self) -> Path:
211
+ bundle_path = self.tmp_dir / 'bundle.tar.bz2'
212
+ with tarfile.open(bundle_path, 'w:bz2') as tf:
213
+ # Add the Iceberg warehouse dir (including the catalog)
214
+ tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
215
+ # Add the media files
216
+ for src_file, dest_name in self.media_files.items():
217
+ tf.add(src_file, arcname=f'media/{dest_name}')
218
+ return bundle_path
pixeltable/type_system.py CHANGED
@@ -9,9 +9,7 @@ import typing
9
9
  import urllib.parse
10
10
  import urllib.request
11
11
  from pathlib import Path
12
-
13
- from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
14
- from typing import Any, Iterable, Mapping, Optional, Sequence, Union
12
+ from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union
15
13
 
16
14
  import av # type: ignore
17
15
  import jsonschema
@@ -25,6 +23,8 @@ from typing_extensions import _AnnotatedAlias
25
23
 
26
24
  import pixeltable.exceptions as excs
27
25
 
26
+ from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
27
+
28
28
 
29
29
  class ColumnType:
30
30
  @enum.unique
@@ -213,9 +213,9 @@ class ColumnType:
213
213
  return self.copy(nullable=(self.nullable or other.nullable))
214
214
 
215
215
  if self.is_invalid_type():
216
- return other
216
+ return other.copy(nullable=(self.nullable or other.nullable))
217
217
  if other.is_invalid_type():
218
- return self
218
+ return self.copy(nullable=(self.nullable or other.nullable))
219
219
 
220
220
  if self.is_scalar_type() and other.is_scalar_type():
221
221
  t = self.Type.supertype(self._type, other._type, self.common_supertypes)
@@ -292,26 +292,24 @@ class ColumnType:
292
292
  designations will be allowed regardless.
293
293
  """
294
294
  origin = typing.get_origin(t)
295
+ type_args = typing.get_args(t)
295
296
  if origin is typing.Union:
296
297
  # Check if `t` has the form Optional[T].
297
- union_args = typing.get_args(t)
298
- if len(union_args) == 2 and type(None) in union_args:
298
+ if len(type_args) == 2 and type(None) in type_args:
299
299
  # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
300
300
  # We treat it as the underlying type but with nullable=True.
301
- underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
301
+ underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
302
302
  underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
303
303
  if underlying is not None:
304
304
  return underlying.copy(nullable=True)
305
305
  elif origin is Required:
306
- required_args = typing.get_args(t)
307
- assert len(required_args) == 1
306
+ assert len(type_args) == 1
308
307
  return cls.from_python_type(
309
- required_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
310
- )
308
+ type_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
309
+ ).copy(nullable=False)
311
310
  elif origin is typing.Annotated:
312
- annotated_args = typing.get_args(t)
313
- origin = annotated_args[0]
314
- parameters = annotated_args[1]
311
+ origin = type_args[0]
312
+ parameters = type_args[1]
315
313
  if isinstance(parameters, ColumnType):
316
314
  return parameters.copy(nullable=nullable_default)
317
315
  else:
@@ -323,6 +321,11 @@ class ColumnType:
323
321
  if isinstance(t, type) and issubclass(t, _PxtType):
324
322
  return t.as_col_type(nullable=nullable_default)
325
323
  elif allow_builtin_types:
324
+ if t is Literal and len(type_args) > 0:
325
+ literal_type = cls.infer_common_literal_type(type_args)
326
+ if literal_type is None:
327
+ return None
328
+ return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
326
329
  if t is str:
327
330
  return StringType(nullable=nullable_default)
328
331
  if t is int:
@@ -335,7 +338,7 @@ class ColumnType:
335
338
  return TimestampType(nullable=nullable_default)
336
339
  if t is PIL.Image.Image:
337
340
  return ImageType(nullable=nullable_default)
338
- if issubclass(t, Sequence) or issubclass(t, Mapping) or issubclass(t, pydantic.BaseModel):
341
+ if isinstance(t, type) and issubclass(t, (Sequence, Mapping, pydantic.BaseModel)):
339
342
  return JsonType(nullable=nullable_default)
340
343
  return None
341
344
 
@@ -851,23 +854,39 @@ class ArrayType(ColumnType):
851
854
  dtype = None if d['dtype'] is None else cls.make_type(cls.Type(d['dtype']))
852
855
  return cls(shape, dtype, nullable=d['nullable'])
853
856
 
857
+ @classmethod
858
+ def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> Optional[ColumnType]:
859
+ """
860
+ Return pixeltable type corresponding to a given simple numpy dtype
861
+ """
862
+ if np.issubdtype(dtype, np.integer):
863
+ return IntType(nullable=nullable)
864
+
865
+ if np.issubdtype(dtype, np.floating):
866
+ return FloatType(nullable=nullable)
867
+
868
+ if dtype == np.bool_:
869
+ return BoolType(nullable=nullable)
870
+
871
+ if np.issubdtype(dtype, np.str_):
872
+ return StringType(nullable=nullable)
873
+
874
+ if np.issubdtype(dtype, np.character):
875
+ return StringType(nullable=nullable)
876
+
877
+ if np.issubdtype(dtype, np.datetime64):
878
+ return TimestampType(nullable=nullable)
879
+
880
+ return None
881
+
854
882
  @classmethod
855
883
  def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
856
884
  # determine our dtype
857
885
  assert isinstance(val, np.ndarray)
858
- dtype: ColumnType
859
- if np.issubdtype(val.dtype, np.integer):
860
- dtype = IntType()
861
- elif np.issubdtype(val.dtype, np.floating):
862
- dtype = FloatType()
863
- elif val.dtype == np.bool_:
864
- dtype = BoolType()
865
- elif np.issubdtype(val.dtype, np.str_):
866
- # Note that this includes NumPy types like '<U1' -- arrays of single Unicode characters
867
- dtype = StringType()
868
- else:
886
+ pxttype: Optional[ColumnType] = cls.from_np_dtype(val.dtype, nullable)
887
+ if pxttype == None:
869
888
  return None
870
- return cls(val.shape, dtype=dtype, nullable=nullable)
889
+ return cls(val.shape, dtype=pxttype, nullable=nullable)
871
890
 
872
891
  def is_valid_literal(self, val: np.ndarray) -> bool:
873
892
  if not isinstance(val, np.ndarray):
pixeltable/utils/arrow.py CHANGED
@@ -6,7 +6,7 @@ import pyarrow as pa
6
6
 
7
7
  import pixeltable.type_system as ts
8
8
 
9
- _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
9
+ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
10
10
  pa.string(): ts.StringType(nullable=True),
11
11
  pa.bool_(): ts.BoolType(nullable=True),
12
12
  pa.uint8(): ts.IntType(nullable=True),
@@ -18,7 +18,7 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
18
18
  pa.float32(): ts.FloatType(nullable=True),
19
19
  }
20
20
 
21
- _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
21
+ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
22
22
  ts.StringType: pa.string(),
23
23
  ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc), # postgres timestamp is microseconds
24
24
  ts.BoolType: pa.bool_(),
@@ -38,8 +38,8 @@ def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
38
38
  """
39
39
  if isinstance(arrow_type, pa.TimestampType):
40
40
  return ts.TimestampType(nullable=True)
41
- elif arrow_type in _pa_to_pt:
42
- return _pa_to_pt[arrow_type]
41
+ elif arrow_type in PA_TO_PXT_TYPES:
42
+ return PA_TO_PXT_TYPES[arrow_type]
43
43
  elif isinstance(arrow_type, pa.FixedShapeTensorType):
44
44
  dtype = to_pixeltable_type(arrow_type.value_type)
45
45
  if dtype is None:
@@ -53,8 +53,8 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
53
53
  """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
54
54
  Returns None if no conversion is currently implemented.
55
55
  """
56
- if pixeltable_type.__class__ in _pt_to_pa:
57
- return _pt_to_pa[pixeltable_type.__class__]
56
+ if pixeltable_type.__class__ in PXT_TO_PA_TYPES:
57
+ return PXT_TO_PA_TYPES[pixeltable_type.__class__]
58
58
  elif isinstance(pixeltable_type, ts.ArrayType):
59
59
  return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
60
60
  else:
pixeltable/utils/iceberg.py ADDED
@@ -0,0 +1,14 @@
1
+ from pathlib import Path
2
+ from typing import Union
3
+
4
+ from pyiceberg.catalog.sql import SqlCatalog
5
+
6
+
7
+ def sqlite_catalog(warehouse_path: Union[str, Path], name: str = 'pixeltable') -> SqlCatalog:
8
+ """
9
+ Instantiate a sqlite Iceberg catalog at the specified path. If no catalog exists, one will be created.
10
+ """
11
+ if isinstance(warehouse_path, str):
12
+ warehouse_path = Path(warehouse_path)
13
+ warehouse_path.mkdir(exist_ok=True)
14
+ return SqlCatalog(name, uri=f'sqlite:///{warehouse_path}/catalog.db', warehouse=f'file://{warehouse_path}')
@@ -30,7 +30,7 @@ class MediaStore:
30
30
  the environment's media_dir.
31
31
  """
32
32
  id_hex = uuid.uuid4().hex
33
- parent = Env.get().media_dir / tbl_id.hex / id_hex[0:2] / id_hex[0:4]
33
+ parent = Env.get().media_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
34
34
  parent.mkdir(parents=True, exist_ok=True)
35
35
  return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'
36
36
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pixeltable
3
- Version: 0.3.3
3
+ Version: 0.3.4
4
4
  Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
5
5
  Home-page: https://pixeltable.com/
6
6
  License: Apache-2.0
@@ -39,11 +39,13 @@ Requires-Dist: numpy (>=1.25,<2.0)
39
39
  Requires-Dist: pandas (>=2.0,<3.0)
40
40
  Requires-Dist: pgvector (>=0.2.1,<0.3.0)
41
41
  Requires-Dist: pillow (>=9.3.0)
42
- Requires-Dist: pixeltable-pgserver (==0.2.9)
42
+ Requires-Dist: pixeltable-pgserver (==0.3.1)
43
43
  Requires-Dist: psutil (>=5.9.5,<6.0.0)
44
44
  Requires-Dist: psycopg[binary] (>=3.1.18)
45
45
  Requires-Dist: puremagic (>=1.20)
46
+ Requires-Dist: pyarrow (>=13.0.0)
46
47
  Requires-Dist: pydantic (>=2.7.4)
48
+ Requires-Dist: pyiceberg (>=0.6.0)
47
49
  Requires-Dist: pymupdf (>=1.24.1,<2.0.0)
48
50
  Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
49
51
  Requires-Dist: requests (>=2.31.0,<3.0.0)
@@ -1,5 +1,5 @@
1
1
  pixeltable/__init__.py,sha256=FeL_ABFaY6QiShtTao1cfhSAwXV_2dkhL_4-qXoHbPE,1616
2
- pixeltable/__version__.py,sha256=NMCNPWfp4W0_zblLn-1M1FNbW4Fe6XSxnsm2uSwk7eA,112
2
+ pixeltable/__version__.py,sha256=a50-dZlwYU667r1CN3zUS6OONPFGlyZFnAAe8vTD1k8,112
3
3
  pixeltable/catalog/__init__.py,sha256=bACh33HpWQed86eV8t9of_ClSXqZx5blZi4y8vJ7-EA,517
4
4
  pixeltable/catalog/catalog.py,sha256=LFaOtHoGJM306jDlyyQRqCaPR6K4nrN-jPu3_vyZNvc,8267
5
5
  pixeltable/catalog/column.py,sha256=9Rm4DCP-uUCl3P44uTsD89P63jxmvv9emD2Rc7Bw_us,9684
@@ -14,13 +14,13 @@ pixeltable/catalog/table.py,sha256=qfTI7obvSanFt96-jbjSXU9PyninU3_B9K4pnaxlJdM,6
14
14
  pixeltable/catalog/table_version.py,sha256=rWBtgnIepVgq5tZ4vb9RzAL5peHnze5ZMOr-7gqMpog,60354
15
15
  pixeltable/catalog/table_version_path.py,sha256=yDU_KXriAckJqKPfKYhLVDig7glUc--_Fda9X7ekfGo,5810
16
16
  pixeltable/catalog/view.py,sha256=cTL1jBYHa3RweODoD-y_I9NjAntqJPSofP4BJdSWaBA,11226
17
- pixeltable/dataframe.py,sha256=hGYjMFE3Fwftgdsveo4eXd5SiGXl3uJOaIoH3wm61Po,49473
18
- pixeltable/env.py,sha256=8gWyNYnIufet8kbGpa-QNsVaEdTJGbCymUwq4XQpC2k,35723
17
+ pixeltable/dataframe.py,sha256=9eMkOUKYpcml6y_Nsj9nTY_UHaDyzo1GT1c6IfzWfXo,49177
18
+ pixeltable/env.py,sha256=1IN2Tju45H-ADNhMfVRDOQ11udBxo4L_euZ6gQKiRC8,35860
19
19
  pixeltable/exceptions.py,sha256=NuFY2WtkQpLfLHT_J70kOw9Tr0kEDkkgo-u7As4Gaq4,410
20
20
  pixeltable/exec/__init__.py,sha256=Qi0s2BEM8O8MPdYGQAIzclv2GNFsoCPJFvA6s5Tjc_o,489
21
21
  pixeltable/exec/aggregation_node.py,sha256=KR7OLQOfAL4KTF6_vKSuJvFC2ntwWf0NJxhQ9i340-4,4072
22
22
  pixeltable/exec/cache_prefetch_node.py,sha256=fwO-xUQfSOMWQMbrJplFXvjcKjLVjPz93O0HttSD3A8,12211
23
- pixeltable/exec/component_iteration_node.py,sha256=vYELAMtc4jKOxC0aZFjjx6UBlBcjC3LXG93epGHPJn0,4713
23
+ pixeltable/exec/component_iteration_node.py,sha256=b3tyspAuYLYHlb7BvAWqDpMGJojSeqtP-l8x72OGjvA,4678
24
24
  pixeltable/exec/data_row_batch.py,sha256=E0SVjyOBc237DopT0TwqK7JzcgFTEpE3xOS9K0-WFh8,3407
25
25
  pixeltable/exec/exec_context.py,sha256=l7GWAbt57H9VEksrDCeocmlc-MgUp8w_nDdAau8Cfqw,1115
26
26
  pixeltable/exec/exec_node.py,sha256=RbMJLDy7jwphNCEphSL0w50Dy1lrpjtEEugzyL6pqlA,4006
@@ -41,7 +41,7 @@ pixeltable/exprs/column_ref.py,sha256=MBWrNwnbRe0Hswu0q_Arerm9JoQs_0pNSsCYVxXONx
41
41
  pixeltable/exprs/comparison.py,sha256=5Bw6fEvVq-ynt3ciGLCouse7ZWFGPA-egsEkgUjUvsc,5132
42
42
  pixeltable/exprs/compound_predicate.py,sha256=ZN_JL97OZfTwdfgXF2t57EGyTYrpsBHaduZWRuBAekk,3832
43
43
  pixeltable/exprs/data_row.py,sha256=4lEyTxTw95v3ERuG9mFUBla8FfhPueoZyltcpTsWLK0,10577
44
- pixeltable/exprs/expr.py,sha256=uE8_hMVF1fCILVR4DWKR6WyC7ovp9iY1mCpsrI3eQ_U,32208
44
+ pixeltable/exprs/expr.py,sha256=r7eS6-7RCHemYBv_Ap1U9IKcZHpVqAghpxHcCpuk6uY,32463
45
45
  pixeltable/exprs/expr_dict.py,sha256=wf82K-aCPHZcM2A-VbE_0p5OzQFfVsI65uzMLp4Uwu4,1589
46
46
  pixeltable/exprs/expr_set.py,sha256=kkcG9df8fQOblNIKz2xciw9qfu2CnTWb4qwJKYVTUx8,2578
47
47
  pixeltable/exprs/function_call.py,sha256=3zjWl_vAKHpClR61-wpNNfPWYp5ccHO8CXD3Dts2bcs,28123
@@ -88,7 +88,7 @@ pixeltable/functions/llama_cpp.py,sha256=1nVXgU5ymuNblVNqRQv3iAEvlYpqzDZPAjYnAOH
88
88
  pixeltable/functions/math.py,sha256=WPoH9zD9_GdwvBs-FSC3Sqb70gOPNouhPcBZABsuLwI,1541
89
89
  pixeltable/functions/mistralai.py,sha256=H2onsnW1R_SaFN5SI_JWO0A5lJdlsnKxmtIu2m19cEg,6212
90
90
  pixeltable/functions/ollama.py,sha256=Et0l7XEMaNLxDwy3qTblljomjCkOQroY1Z7a-Ajmshk,4218
91
- pixeltable/functions/openai.py,sha256=mdeo4Y-wg-9LJAlmLxydu3VAS4NGgRQQvVb_7Gkefpc,28109
91
+ pixeltable/functions/openai.py,sha256=Oc_WApfR8M_-EgUEwV1BBuQwkmhunLUGqUVl5CWDTnA,29083
92
92
  pixeltable/functions/replicate.py,sha256=BQ5iaFJnw5MioL3X08DQiH41xQ_Pi2H5DDEasux9-fE,2454
93
93
  pixeltable/functions/string.py,sha256=1vFlbqKVm2n6jdh23BIA_8MBJJiNyxbQoFs5tJPgpy4,20433
94
94
  pixeltable/functions/timestamp.py,sha256=KKOw7l1hErYp8QQfFiWVTf7QowZszOyHJu-OJDKaXSg,9114
@@ -108,7 +108,7 @@ pixeltable/io/fiftyone.py,sha256=nviYiqDOGS5Os374Tl5knGNXpjFlgqcKnSPsBzz60vU,685
108
108
  pixeltable/io/globals.py,sha256=0X0sLpVrqPlgNna_vQX4KcBuerdUojZDTyTaX2sKV4I,17838
109
109
  pixeltable/io/hf_datasets.py,sha256=DV_bHB-LOQB8YC9FK1KYTEgaBPFelk31fYpq8h72eEE,8321
110
110
  pixeltable/io/label_studio.py,sha256=Dlq-2iVBadDnU0xOn3duLbpBJxiegY04XkWsmqQTXwk,31242
111
- pixeltable/io/pandas.py,sha256=Z-hBUbC6t-dGfJe8ksYXjp8k6T9xGvwvpbIXZLekHbw,9952
111
+ pixeltable/io/pandas.py,sha256=eKoo0tTPnKJUGOIc8VUV1gamsoeOPO6pOtXJyEV_W84,9594
112
112
  pixeltable/io/parquet.py,sha256=2i3YAQd-ZifxJv4JUU5Ysh7p6SemozBncd989bSl_qw,8745
113
113
  pixeltable/iterators/__init__.py,sha256=r5NYNF7qsepOPJnywG5N7jTz3Z1ubrbSzD19JK97cCM,431
114
114
  pixeltable/iterators/audio.py,sha256=UfWAzUAq33bqN5R7-kFK4LN2VUukhgZhAsnoHuOm2CU,9092
@@ -139,10 +139,12 @@ pixeltable/metadata/notes.py,sha256=2gQ0fAdAWOKxvzZ5DVBdmTk62q_KFGRFmv0tzi7tklE,
139
139
  pixeltable/metadata/schema.py,sha256=kv-PIMfG_NysET1k71iwIkBVlK5HwdnotXUvFeLaxaY,9470
140
140
  pixeltable/plan.py,sha256=ZTXpt10Rexvfm3_68CLQzUAS7YubZjbUJLbAN-RZDps,42385
141
141
  pixeltable/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
142
+ pixeltable/share/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
+ pixeltable/share/packager.py,sha256=QcMRI5qihNzO9Wcku-KpA8N7jUCkygrJUyyHB5XAGAA,10233
142
144
  pixeltable/store.py,sha256=uQKW9A3RWVVuP6bnQx22jhs5_WxQKx3rV0sGpdoBUzY,22555
143
- pixeltable/type_system.py,sha256=yTMSt8hljouXH3jZ0xMinhNDMCVZB0dVTZRXejBcODU,50183
145
+ pixeltable/type_system.py,sha256=c1kVcnX2Siu_V4DDn6DVF7nnDSNzFlDFw583WnWsUIc,50927
144
146
  pixeltable/utils/__init__.py,sha256=UYlrf6TIWJT0g-Hac0b34-dEk478B5Qx8dGco34YlIk,439
145
- pixeltable/utils/arrow.py,sha256=L0JFj6YQry1iHqhom6Zc9zWa8j6VCEUgQ0OfKqTiukY,3865
147
+ pixeltable/utils/arrow.py,sha256=EVFTHXt1r1b-rbvgG-TOjvl6GiAtm1hH-86A449cKTw,3901
146
148
  pixeltable/utils/coco.py,sha256=dl-IYO4VgfFly4-TvvF9Rw9XK2yY6HGTuL7LcyQk_RA,7290
147
149
  pixeltable/utils/code.py,sha256=SbG5OUF_fQAbOgGZHDuENijmbzisVqa4VS9guaZ0KtU,1231
148
150
  pixeltable/utils/console_output.py,sha256=GJ1oJWanP8_an343CEB35rtc1kcVW1FQtT3vRT4SZPs,1148
@@ -151,13 +153,14 @@ pixeltable/utils/documents.py,sha256=APFujdYq1qe2Do4KAUI0te35jh4925geR9UB8GeFQ1w
151
153
  pixeltable/utils/filecache.py,sha256=sYofh-6TwkQbwe8X64eUt27itSJ8o5rY10HYZJShbbI,10703
152
154
  pixeltable/utils/formatter.py,sha256=5E_gDg11ClFI-5SthwkiqyE3hAok3JHDj4OSK9cJklM,9257
153
155
  pixeltable/utils/http_server.py,sha256=zsESVjtG1P6hrz-d2N1m6_BChqPt8N3f-EO9sJbWnLs,2388
154
- pixeltable/utils/media_store.py,sha256=LcVTF8CW9C54mGg6OHI5u9W-gh5CkIfxbQaP9WAkmag,3093
156
+ pixeltable/utils/iceberg.py,sha256=L_s9G9NMIGMQdRHtNkks6ntTVW4DKKAw97R9gRmtw5s,553
157
+ pixeltable/utils/media_store.py,sha256=kSQ6YwQPRQzOhhCChS2hYmY9HxXX1fRq_M_FgkfsYU8,3091
155
158
  pixeltable/utils/pytorch.py,sha256=8lJT1SyP9jTMN7uLtrj9T_rGPEYRID44rWXbjBhRUrU,3422
156
159
  pixeltable/utils/s3.py,sha256=pxip2MlCqd2Qon2dzJXzfxvwtZyc-BAsjAnLL4J_OXY,587
157
160
  pixeltable/utils/sql.py,sha256=JX_fNI_SJWVUcXif5ho5qVhfJKFupOCFLLrHCMcbzLk,796
158
161
  pixeltable/utils/transactional_directory.py,sha256=4Q8UTylEyw-aZa-NVjfjGR9_JHRJTGQH1k1LNFaZukY,1349
159
- pixeltable-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
160
- pixeltable-0.3.3.dist-info/METADATA,sha256=s4trJASrbIe9hPC3MHXe0Tsvo7Fc0avMjgOpukZ7Hsw,19359
161
- pixeltable-0.3.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
162
- pixeltable-0.3.3.dist-info/entry_points.txt,sha256=ToOd-pRgG7AitEBgYoBCRRB4-KVDQ0pj_9T4a1LgwA4,97
163
- pixeltable-0.3.3.dist-info/RECORD,,
162
+ pixeltable-0.3.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
163
+ pixeltable-0.3.4.dist-info/METADATA,sha256=nM9QtJyu9ljdyn9ktpCuNLf9uaReun1Lo83BG9zR9Z4,19428
164
+ pixeltable-0.3.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
165
+ pixeltable-0.3.4.dist-info/entry_points.txt,sha256=ToOd-pRgG7AitEBgYoBCRRB4-KVDQ0pj_9T4a1LgwA4,97
166
+ pixeltable-0.3.4.dist-info/RECORD,,