pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff shows the contents of publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (63)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -2
  4. pixeltable/catalog/column.py +1 -1
  5. pixeltable/catalog/dir.py +1 -1
  6. pixeltable/catalog/table.py +3 -1
  7. pixeltable/catalog/table_version.py +12 -2
  8. pixeltable/catalog/table_version_path.py +2 -2
  9. pixeltable/catalog/view.py +64 -20
  10. pixeltable/dataframe.py +11 -6
  11. pixeltable/env.py +12 -0
  12. pixeltable/exec/expr_eval/evaluators.py +4 -2
  13. pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
  14. pixeltable/exprs/comparison.py +8 -4
  15. pixeltable/exprs/data_row.py +9 -7
  16. pixeltable/exprs/expr.py +2 -2
  17. pixeltable/exprs/function_call.py +155 -313
  18. pixeltable/exprs/json_mapper.py +25 -8
  19. pixeltable/exprs/json_path.py +6 -5
  20. pixeltable/exprs/object_ref.py +16 -5
  21. pixeltable/exprs/row_builder.py +10 -3
  22. pixeltable/func/aggregate_function.py +29 -15
  23. pixeltable/func/callable_function.py +11 -8
  24. pixeltable/func/expr_template_function.py +3 -9
  25. pixeltable/func/function.py +148 -74
  26. pixeltable/func/signature.py +65 -30
  27. pixeltable/func/tools.py +26 -26
  28. pixeltable/func/udf.py +1 -1
  29. pixeltable/functions/__init__.py +1 -0
  30. pixeltable/functions/anthropic.py +9 -3
  31. pixeltable/functions/deepseek.py +121 -0
  32. pixeltable/functions/image.py +7 -7
  33. pixeltable/functions/openai.py +30 -13
  34. pixeltable/functions/video.py +14 -7
  35. pixeltable/globals.py +14 -3
  36. pixeltable/index/embedding_index.py +4 -13
  37. pixeltable/io/globals.py +88 -77
  38. pixeltable/io/hf_datasets.py +34 -34
  39. pixeltable/io/pandas.py +75 -76
  40. pixeltable/io/parquet.py +19 -27
  41. pixeltable/io/utils.py +115 -0
  42. pixeltable/iterators/audio.py +2 -1
  43. pixeltable/iterators/video.py +1 -1
  44. pixeltable/metadata/__init__.py +2 -1
  45. pixeltable/metadata/converters/convert_15.py +18 -8
  46. pixeltable/metadata/converters/convert_27.py +31 -0
  47. pixeltable/metadata/converters/convert_28.py +15 -0
  48. pixeltable/metadata/converters/convert_29.py +111 -0
  49. pixeltable/metadata/converters/util.py +12 -1
  50. pixeltable/metadata/notes.py +3 -0
  51. pixeltable/metadata/schema.py +8 -0
  52. pixeltable/share/__init__.py +1 -0
  53. pixeltable/share/packager.py +41 -13
  54. pixeltable/share/publish.py +97 -0
  55. pixeltable/type_system.py +40 -14
  56. pixeltable/utils/__init__.py +41 -0
  57. pixeltable/utils/arrow.py +40 -7
  58. pixeltable/utils/formatter.py +1 -1
  59. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/METADATA +34 -49
  60. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/RECORD +63 -57
  61. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/WHEEL +1 -1
  62. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/LICENSE +0 -0
  63. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import io
 import json
 import logging
@@ -5,8 +6,9 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
+from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional

 import more_itertools
 import numpy as np
@@ -15,7 +17,8 @@ import pyiceberg.catalog

 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exprs
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
 from pixeltable.env import Env
 from pixeltable.utils.arrow import PXT_TO_PA_TYPES
 from pixeltable.utils.iceberg import sqlite_catalog
@@ -28,6 +31,7 @@ class TablePackager:
     Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
     is as follows:

+    metadata.json         # Pixeltable metadata for the packaged table
     warehouse/catalog.db  # sqlite Iceberg catalog
     warehouse/pxt.db/**   # Iceberg metadata and data files (parquet/avro/json)
     media/**              # Local media files
@@ -43,16 +47,40 @@ class TablePackager:
    'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
    """

-    table: pxt.Table  # The table to be packaged
+    table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
     iceberg_catalog: pyiceberg.catalog.Catalog
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]

-    def __init__(self, table: pxt.Table) -> None:
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}

+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to
+                        # catalog have been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
     def package(self) -> Path:
         """
         Export the table to a tarball containing Iceberg tables and media files.
@@ -60,8 +88,10 @@ class TablePackager:
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-        ancestors = [self.table] + self.table._bases
+        ancestors = (self.table, *self.table._bases)
         for t in ancestors:
             _logger.info(f"Exporting table '{t._path}'.")
             self.__export_table(t)
@@ -70,7 +100,7 @@ class TablePackager:
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path

-    def __export_table(self, t: pxt.Table) -> None:
+    def __export_table(self, t: catalog.Table) -> None:
         """
         Exports the data from `t` into an Iceberg table.
         """
@@ -116,7 +146,7 @@ class TablePackager:
         iceberg_tbl.append(pa_table)

     @classmethod
-    def __iceberg_namespace(cls, table: pxt.Table) -> str:
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
@@ -149,11 +179,7 @@ class TablePackager:
         return PXT_TO_PA_TYPES.get(col_type.__class__)

     def __to_pa_tables(
-        self,
-        df: pxt.DataFrame,
-        actual_col_types: list[pxt.ColumnType],
-        arrow_schema: pa.Schema,
-        batch_size: int = 1_000,
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
     ) -> Iterator[pa.Table]:
         """
         Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
@@ -165,7 +191,7 @@ class TablePackager:
             cols['_v_min'] = [row[-1] for row in rows]
             yield pa.Table.from_pydict(cols, schema=arrow_schema)

-    def __to_pa_rows(self, df: pxt.DataFrame, actual_col_types: list[pxt.ColumnType]) -> Iterator[list]:
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
         for row in df._exec():
             vals = [row[e.slot_idx] for e in df._select_list_exprs]
             result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
@@ -210,6 +236,8 @@ class TablePackager:
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
        with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
             # Add the Iceberg warehouse dir (including the catalog)
             tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
             # Add the media files
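The new `metadata.json` makes the bundle self-describing: version info plus per-table metadata for the table and each of its bases. A minimal sketch of reading it back out of a finished bundle (file names per the diff above; the bundle path is illustrative):

```python
import json
import tarfile

# Illustrative only: inspect the metadata.json that TablePackager now writes
# into the tarball (key names taken from the __init__ diff above).
with tarfile.open('bundle.tar.bz2', 'r:bz2') as tf:
    with tf.extractfile('metadata.json') as fp:
        md = json.load(fp)

print(md['pxt_version'], md['pxt_md_version'])
for tbl_md in md['md']['tables']:
    print(tbl_md['table_id'])  # one entry per table in (table, *table._bases)
```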
pixeltable/share/publish.py ADDED
@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,  # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
pixeltable/type_system.py CHANGED
@@ -8,10 +8,9 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from pathlib import Path
 from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union

-import av  # type: ignore
+import av
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators
@@ -22,6 +21,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias

 import pixeltable.exceptions as excs
+from pixeltable.utils import parse_local_file_path

 from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip

@@ -47,8 +47,8 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: 'ColumnType.Type',
-        type2: 'ColumnType.Type',
+        type1: Optional['ColumnType.Type'],
+        type2: Optional['ColumnType.Type'],
         # we need to pass this in because we can't easily append it as a class member
         common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
     ) -> Optional['ColumnType.Type']:
@@ -93,6 +93,9 @@
         self._type = t
         self._nullable = nullable

+    def has_supertype(self) -> bool:
+        return True
+
     @property
     def nullable(self) -> bool:
         return self._nullable
@@ -271,8 +274,10 @@
                 inferred_type = val_type
             else:
                 inferred_type = inferred_type.supertype(val_type)
-                if inferred_type is None:
-                    return None
+            if inferred_type is None:
+                return None
+            if not inferred_type.has_supertype():
+                return inferred_type
         return inferred_type

     @classmethod
@@ -397,12 +402,9 @@
     def _validate_file_path(self, val: Any) -> None:
         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
         if isinstance(val, str):
-            parsed = urllib.parse.urlparse(val)
-            if parsed.scheme != '' and parsed.scheme != 'file':
-                return
-            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
-            if not path.is_file():
-                raise TypeError(f'File not found: {str(path)}')
+            path = parse_local_file_path(val)
+            if path is not None and not path.is_file():
+                raise TypeError(f'File not found: {path}')
         else:
             if not isinstance(val, bytes):
                 raise TypeError(f'expected file path or bytes, got {type(val)}')
@@ -495,7 +497,7 @@ class InvalidType(ColumnType):
         super().__init__(self.Type.INVALID, nullable=nullable)

     def to_sa_type(self) -> sql.types.TypeEngine:
-        assert False
+        return sql.types.NullType()

     def print_value(self, val: Any) -> str:
         return str(val)
@@ -508,6 +510,9 @@ class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)

+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()

@@ -591,6 +596,9 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)

+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP(timezone=True)

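The new `has_supertype()` hook lets literal-type inference (the `@@ -271` hunk above) stop early once a type cannot get any more general. A small illustration, with behavior read off the overrides as written (a nullable `StringType` already admits any string or None, so nothing strictly more general exists):

```python
import pixeltable.type_system as ts

# Per the diff: the base ColumnType answers True; StringType and
# TimestampType answer `not self.nullable`.
print(ts.StringType(nullable=False).has_supertype())    # True
print(ts.StringType(nullable=True).has_supertype())     # False
print(ts.TimestampType(nullable=True).has_supertype())  # False
print(ts.IntType(nullable=True).has_supertype())        # True (base default)
```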
@@ -601,6 +609,8 @@
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
+        if isinstance(val, datetime.datetime):
+            return val
         return val


@@ -651,6 +661,10 @@ class JsonType(ColumnType):
         return val_type.print_value(val)

     def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, tuple):
+            val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            val = val.model_dump()
         if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
         if self.__validator is not None:
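JSON literal validation now coerces two extra input shapes before validating: tuples become lists, and pydantic models are dumped to plain dicts. A sketch (it calls the private method directly, purely to illustrate the coercion; the model class is made up):

```python
import pydantic

import pixeltable.type_system as ts

class Point(pydantic.BaseModel):
    x: int
    y: int

jt = ts.JsonType()
jt._validate_literal((1, 2, 3))        # tuple -> list, then validated as JSON
jt._validate_literal(Point(x=1, y=2))  # model_dump() -> {'x': 1, 'y': 2}
```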
@@ -818,14 +832,20 @@ class ArrayType(ColumnType):
         return hash((self._type, self.nullable, self.shape, self.dtype))

     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
+
         super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
         super_shape: Optional[tuple[Optional[int], ...]]
-        if len(self.shape) != len(other.shape):
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
             super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
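`ArrayType.supertype` now consults the generic `ColumnType.supertype` first and tolerates `None` shapes (fully general arrays), which previously would have failed on `len(None)`. A sketch of the shape-widening rule; the constructor arguments mirror the `ArrayType(shape=..., dtype=..., nullable=...)` call visible in the arrow.py diff below:

```python
import pixeltable.type_system as ts

a = ts.ArrayType(shape=(3, 4), dtype=ts.FloatType(), nullable=False)
b = ts.ArrayType(shape=(3, 5), dtype=ts.FloatType(), nullable=True)
# Matching dims are kept, differing dims widen to None; a None shape on
# either side now yields shape=None instead of raising.
print(a.supertype(b))
```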
@@ -1009,8 +1029,14 @@ class ImageType(ColumnType):
         return hash((self._type, self.nullable, self.size, self.mode))

     def supertype(self, other: ColumnType) -> Optional[ImageType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None
pixeltable/utils/__init__.py CHANGED
@@ -1,3 +1,10 @@
+import hashlib
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from typing import Optional, Union
+
+
 def print_perf_counter_delta(delta: float) -> str:
     """Prints a performance counter delta in a human-readable format.

@@ -15,3 +22,37 @@ def print_perf_counter_delta(delta: float) -> str:
         return f'{delta * 1e3:.2f} ms'
     else:
         return f'{delta:.2f} s'
+
+
+def sha256sum(path: Union[Path, str]) -> str:
+    """
+    Compute the SHA256 hash of a file.
+    """
+    if isinstance(path, str):
+        path = Path(path)
+
+    h = hashlib.sha256()
+    with open(path, 'rb') as file:
+        while chunk := file.read(h.block_size):
+            h.update(chunk)
+
+    return h.hexdigest()
+
+
+def parse_local_file_path(file_or_url: str) -> Optional[Path]:
+    """
+    Parses a string that may be either a URL or a local file path.
+
+    If the string is a local file path or a file-scheme URL (file://), then a Path object will be returned.
+    Otherwise, None will be returned.
+    """
+    parsed = urllib.parse.urlparse(file_or_url)
+    if len(parsed.scheme) <= 1:
+        # We're using `urlparse` to help distinguish file paths from URLs. If there is no scheme, then it's a file path.
+        # If there's a single-character scheme, we also interpret this as a file path; this ensures that drive letters
+        # on Windows pathnames are correctly handled.
+        return Path(file_or_url).absolute()
+    elif parsed.scheme == 'file':
+        return Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
+    else:
+        return None
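Both helpers are deliberately small; a quick illustration of how they behave (the paths are made up):

```python
from pixeltable.utils import parse_local_file_path, sha256sum

print(parse_local_file_path('data/clip.mp4'))              # no scheme -> absolute Path
print(parse_local_file_path('C:\\data\\clip.mp4'))         # 1-char scheme -> Windows path
print(parse_local_file_path('file:///tmp/clip.mp4'))       # file:// -> Path('/tmp/clip.mp4')
print(parse_local_file_path('https://example.com/x.mp4'))  # other scheme -> None

# sha256sum streams the file in block_size chunks, so large bundles never
# need to fit in memory:
# print(sha256sum('bundle.tar.bz2'))
```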
pixeltable/utils/arrow.py CHANGED
@@ -8,6 +8,8 @@ import pixeltable.type_system as ts

 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
+    pa.large_string(): ts.StringType(nullable=True),
+    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
@@ -16,6 +18,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.int32(): ts.IntType(nullable=True),
     pa.int64(): ts.IntType(nullable=True),
     pa.float32(): ts.FloatType(nullable=True),
+    pa.float64(): ts.FloatType(nullable=True),
 }

 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
@@ -32,19 +35,20 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
 }


-def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
+def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
     if isinstance(arrow_type, pa.TimestampType):
-        return ts.TimestampType(nullable=True)
+        return ts.TimestampType(nullable=nullable)
     elif arrow_type in PA_TO_PXT_TYPES:
-        return PA_TO_PXT_TYPES[arrow_type]
+        pt = PA_TO_PXT_TYPES[arrow_type]
+        return pt.copy(nullable=nullable)
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
-        dtype = to_pixeltable_type(arrow_type.value_type)
+        dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype)
+        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
     else:
         return None

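With nullability now an explicit argument, the caller decides whether a column admits nulls (`ar_infer_schema` below uses this to make primary-key columns non-nullable). A sketch; the unmapped `pa.date32()` case is an assumption based on the mappings shown above:

```python
import pyarrow as pa

from pixeltable.utils.arrow import to_pixeltable_type

print(to_pixeltable_type(pa.int64(), False))        # non-nullable IntType
print(to_pixeltable_type(pa.large_string(), True))  # nullable StringType (new mapping)
print(to_pixeltable_type(pa.date32(), True))        # None: no conversion defined
```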
@@ -61,8 +65,17 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     return None


-def to_pixeltable_schema(arrow_schema: pa.Schema) -> dict[str, ts.ColumnType]:
-    return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
+def ar_infer_schema(
+    arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
+    """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
+    ar_schema = {
+        field.name: to_pixeltable_type(field.type, field.name not in primary_key)
+        if field.name not in schema_overrides
+        else schema_overrides[field.name]
+        for field in arrow_schema
+    }
+    return ar_schema


 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
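`ar_infer_schema` folds the per-field nullability decision and user-supplied overrides into a single pass over the Arrow schema. A sketch (the override type is chosen arbitrarily):

```python
import pyarrow as pa

import pixeltable.type_system as ts
from pixeltable.utils.arrow import ar_infer_schema

arrow_schema = pa.schema([('id', pa.int64()), ('name', pa.large_string())])
pxt_schema = ar_infer_schema(
    arrow_schema,
    schema_overrides={'name': ts.StringType(nullable=False)},  # wins over inference
    primary_key=['id'],  # 'id' is inferred as non-nullable
)
```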
@@ -96,3 +109,23 @@ def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:

     for i in range(batch_size):
         yield {col_name: values[i] for col_name, values in pydict.items()}
+
+
+def iter_tuples2(
+    batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+) -> Iterator[dict[str, Any]]:
+    """Convert a RecordBatch to an iterator of dictionaries. Also works with pa.Table and pa.RowGroup."""
+    pydict = to_pydict(batch)
+    assert len(pydict) > 0, 'empty record batch'
+    for _, v in pydict.items():
+        batch_size = len(v)
+        break
+
+    for i in range(batch_size):
+        # Convert a row to insertable format
+        yield {
+            (pxt_name := col_name if col_mapping is None else col_mapping[col_name]): schema[pxt_name].create_literal(
+                values[i]
+            )
+            for col_name, values in pydict.items()
+        }
@@ -6,7 +6,7 @@ import logging
 import mimetypes
 from typing import Any, Callable, Optional

-import av  # type: ignore[import-untyped]
+import av
 import numpy as np
 import PIL
 import PIL.Image as Image