pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (55)
  1. pixeltable/__init__.py +11 -1
  2. pixeltable/catalog/__init__.py +2 -1
  3. pixeltable/catalog/catalog.py +179 -63
  4. pixeltable/catalog/column.py +24 -20
  5. pixeltable/catalog/table.py +96 -124
  6. pixeltable/catalog/table_metadata.py +96 -0
  7. pixeltable/catalog/table_version.py +15 -6
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +3 -2
  11. pixeltable/env.py +43 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/object_store_save_node.py +299 -0
  18. pixeltable/exec/sql_node.py +28 -33
  19. pixeltable/exprs/data_row.py +31 -25
  20. pixeltable/exprs/json_path.py +6 -5
  21. pixeltable/exprs/row_builder.py +6 -12
  22. pixeltable/functions/gemini.py +1 -1
  23. pixeltable/functions/openai.py +1 -1
  24. pixeltable/functions/video.py +5 -6
  25. pixeltable/globals.py +6 -7
  26. pixeltable/index/embedding_index.py +5 -8
  27. pixeltable/io/__init__.py +2 -1
  28. pixeltable/io/fiftyone.py +1 -1
  29. pixeltable/io/label_studio.py +4 -5
  30. pixeltable/io/lancedb.py +3 -0
  31. pixeltable/io/parquet.py +9 -89
  32. pixeltable/io/table_data_conduit.py +2 -2
  33. pixeltable/iterators/audio.py +1 -1
  34. pixeltable/iterators/document.py +10 -12
  35. pixeltable/iterators/video.py +1 -1
  36. pixeltable/metadata/schema.py +7 -0
  37. pixeltable/plan.py +26 -1
  38. pixeltable/share/packager.py +8 -2
  39. pixeltable/share/publish.py +3 -9
  40. pixeltable/type_system.py +1 -3
  41. pixeltable/utils/arrow.py +97 -2
  42. pixeltable/utils/dbms.py +31 -5
  43. pixeltable/utils/gcs_store.py +283 -0
  44. pixeltable/utils/lancedb.py +88 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
  50. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/index/embedding_index.py CHANGED
@@ -138,15 +138,12 @@ class EmbeddingIndex(IndexBase):
 
     def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Create the index on the index value column"""
-        idx = sql.Index(
-            index_name,
-            index_value_col.sa_col,
-            postgresql_using='hnsw',
-            postgresql_with={'m': 16, 'ef_construction': 64},
-            postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
+        Env.get().dbms.create_vector_index(
+            index_name=index_name,
+            index_value_sa_col=index_value_col.sa_col,
+            conn=Env.get().conn,
+            metric=self.PGVECTOR_OPS[self.metric],
         )
-        conn = Env.get().conn
-        idx.create(bind=conn)
 
     def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Drop the index on the index value column"""
pixeltable/io/__init__.py CHANGED
@@ -4,11 +4,12 @@ from .datarows import import_json, import_rows
 from .external_store import ExternalStore
 from .globals import create_label_studio_project, export_images_as_fo_dataset
 from .hf_datasets import import_huggingface_dataset
+from .lancedb import export_lancedb
 from .pandas import import_csv, import_excel, import_pandas
 from .parquet import export_parquet, import_parquet
 
 __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
-__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
+__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
 __all__ = sorted(__default_dir - __removed_symbols)
 
 
pixeltable/io/fiftyone.py CHANGED
@@ -9,7 +9,7 @@ import puremagic
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import exprs
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 
 class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
pixeltable/io/label_studio.py CHANGED
@@ -19,7 +19,7 @@ from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project
 from pixeltable.utils import coco
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
 # the import two different ways to insure intercompatibility
@@ -46,6 +46,9 @@ class LabelStudioProject(Project):
     """
     An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
     for synchronizing between a Pixeltable table and a Label Studio project.
+
+    The constructor will NOT create a new Label Studio project; it is also used when loading
+    metadata for existing projects.
     """
 
     project_id: int  # Label Studio project ID
@@ -60,10 +63,6 @@ class LabelStudioProject(Project):
         col_mapping: dict[ColumnHandle, str],
         stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
     ):
-        """
-        The constructor will NOT create a new Label Studio project; it is also used when loading
-        metadata for existing projects.
-        """
         self.project_id = project_id
         self.media_import_method = media_import_method
         self._project = None
pixeltable/io/lancedb.py ADDED
@@ -0,0 +1,3 @@
+from pixeltable.utils.lancedb import export_lancedb
+
+__all__ = ['export_lancedb']
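
This re-export makes `export_lancedb` available as `pxt.io.export_lancedb`. Its signature is defined in `pixeltable/utils/lancedb.py` (+88 lines, not shown here), so the call below is only a hypothetical shape, assuming it accepts a table or DataFrame plus a LanceDB location and table name:

```python
import pixeltable as pxt

films = pxt.get_table('films')

# Hypothetical arguments; consult pixeltable/utils/lancedb.py for the real parameter names.
pxt.io.export_lancedb(films, 'path/to/lancedb_dir', 'films')
```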
pixeltable/io/parquet.py CHANGED
@@ -1,46 +1,22 @@
 from __future__ import annotations
 
-import datetime
-import io
 import json
 import logging
 import typing
-from collections import deque
 from pathlib import Path
 from typing import Any, Optional
 
-import numpy as np
-import PIL.Image
-
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable.catalog import Catalog
 from pixeltable.utils.transactional_directory import transactional_directory
 
 if typing.TYPE_CHECKING:
-    import pyarrow as pa
-
     import pixeltable as pxt
 
 _logger = logging.getLogger('pixeltable')
 
 
-def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
-    import pyarrow as pa
-    from pyarrow import parquet
-
-    pydict = {}
-    for field in schema:
-        if isinstance(field.type, pa.FixedShapeTensorType):
-            stacked_arr = np.stack(value_batch[field.name])
-            pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
-        else:
-            pydict[field.name] = value_batch[field.name]
-
-    tab = pa.Table.from_pydict(pydict, schema=schema)
-    parquet.write_table(tab, str(output_path))
-
-
 def export_parquet(
     table_or_df: pxt.Table | pxt.DataFrame,
     parquet_path: Path,
@@ -63,7 +39,9 @@ def export_parquet(
         If False, will raise an error if the Dataframe has any image column.
         Default False.
     """
-    from pixeltable.utils.arrow import to_arrow_schema
+    import pyarrow as pa
+
+    from pixeltable.utils.arrow import to_record_batches
 
     df: pxt.DataFrame
     if isinstance(table_or_df, pxt.catalog.Table):
@@ -71,9 +49,6 @@
     else:
         df = table_or_df
 
-    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
-    arrow_schema = to_arrow_schema(df.schema)
-
     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
         raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
 
@@ -81,70 +56,15 @@
     with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        type_dict = {k: v.as_dict() for k, v in df.schema.items()}
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
-
         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
-        current_byte_estimate = 0
-
        with Catalog.get().begin_xact(for_write=False):
-            for data_row in df._exec():
-                for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
-                    val = data_row[e.slot_idx]
-                    if val is None:
-                        current_value_batch[col_name].append(val)
-                        continue
-
-                    assert val is not None
-                    if col_type.is_image_type():
-                        # images get inlined into the parquet file
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            # if there is a file, read directly to preserve information
-                            with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                                val = f.read()
-                        elif isinstance(val, PIL.Image.Image):
-                            # if no file available, eg. bc it is computed, convert to png
-                            buf = io.BytesIO()
-                            val.save(buf, format='PNG')
-                            val = buf.getvalue()
-                        else:
-                            raise excs.Error(f'unknown image type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_string_type():
-                        length = len(val)
-                    elif col_type.is_video_type() or col_type.is_audio_type():
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            val = data_row.file_paths[e.slot_idx]
-                        else:
-                            raise excs.Error(f'unknown audio/video type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_json_type():
-                        val = json.dumps(val)
-                        length = len(val)
-                    elif col_type.is_array_type():
-                        length = val.nbytes
-                    elif col_type.is_int_type() or col_type.is_float_type():
-                        length = 8
-                    elif col_type.is_bool_type():
-                        length = 1
-                    elif col_type.is_date_type():
-                        length = 4
-                    elif col_type.is_timestamp_type():
-                        val = val.astimezone(datetime.timezone.utc)
-                        length = 8
-                    else:
-                        raise excs.Error(f'unknown type {col_type} for {col_name}')
-
-                    current_value_batch[col_name].append(val)
-                    current_byte_estimate += length
-                    if current_byte_estimate > partition_size_bytes:
-                        assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-                        batch_num += 1
-                        current_value_batch = {k: deque() for k in df.schema}
-                        current_byte_estimate = 0
-
-            _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+            for record_batch in to_record_batches(df, partition_size_bytes):
+                output_path = temp_path / f'part-{batch_num:05d}.parquet'
+                arrow_tbl = pa.Table.from_batches([record_batch])  # type: ignore
+                pa.parquet.write_table(arrow_tbl, str(output_path))
+                batch_num += 1
 
 
 def import_parquet(
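
With this change, `export_parquet` is a thin wrapper that writes one `part-NNNNN.parquet` file per record batch produced by `to_record_batches`. A brief usage sketch (the table name is illustrative; `inline_images` follows the docstring fragment above):

```python
from pathlib import Path

import pixeltable as pxt

t = pxt.get_table('images')  # assumed table with an image column

# inline_images=True embeds image data as bytes in the parquet output;
# without it, exporting a table with image columns raises an error.
pxt.io.export_parquet(t, Path('/tmp/images_export'), inline_images=True)
```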
pixeltable/io/table_data_conduit.py CHANGED
@@ -469,12 +469,12 @@ class ParquetTableDataConduit(TableDataConduit):
         return t
 
     def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        from pixeltable.utils.arrow import ar_infer_schema
+        from pixeltable.utils.arrow import to_pxt_schema
 
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
                 self.src_schema_overrides = {}
-            self.src_schema = ar_infer_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
+            self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
             inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
                 self.src_schema, self.src_pk, self.src_schema_overrides
             )
pixeltable/iterators/audio.py CHANGED
@@ -6,7 +6,7 @@ from typing import Any, ClassVar, Optional
 import av
 
 from pixeltable import exceptions as excs, type_system as ts
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
 
pixeltable/iterators/document.py CHANGED
@@ -94,6 +94,16 @@ class DocumentSplitter(ComponentIterator):
     include additional metadata fields if specified in the `metadata` parameter, as explained below.
 
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
+
+    Args:
+        separators: separators to use to chunk the document. Options are:
+            `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+            This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+            or `'char_limit'` is specified.
+        metadata: additional metadata fields to include in the output. Options are:
+            `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+            (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
     """
 
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -116,18 +126,6 @@ class DocumentSplitter(ComponentIterator):
         tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None,
     ):
-        """Init method for `DocumentSplitter` class.
-
-        Args:
-            separators: separators to use to chunk the document. Options are:
-                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
-                This may be a comma-separated string, e.g., `'heading,token_limit'`.
-            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
-                or `'char_limit'` is specified.
-            metadata: additional metadata fields to include in the output. Options are:
-                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
-                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
-        """
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
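
Moving the parameter docs into the class docstring is how Pixeltable surfaces iterator arguments in generated documentation. For orientation, a usage sketch following Pixeltable's usual `create_view(..., iterator=...)` pattern, assuming a table `docs` with a `document` column (names are illustrative):

```python
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter

docs = pxt.get_table('docs')

# One output row per chunk; separators/limit follow the options listed in the docstring above.
chunks = pxt.create_view(
    'doc_chunks',
    docs,
    iterator=DocumentSplitter.create(document=docs.document, separators='token_limit', limit=300),
)
```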
pixeltable/iterators/video.py CHANGED
@@ -14,7 +14,7 @@ import pixeltable as pxt
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 import pixeltable.utils.av as av_utils
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
 
pixeltable/metadata/schema.py CHANGED
@@ -115,6 +115,9 @@ class ColumnMd:
     # if True, the column is present in the stored table
     stored: Optional[bool]
 
+    # If present, the URI for the destination for column values
+    destination: Optional[str] = None
+
 
 @dataclasses.dataclass
 class IndexMd:
@@ -244,6 +247,9 @@ class TableVersionMd:
     schema_version: int
     user: Optional[str] = None  # User that created this version
    update_status: Optional[UpdateStatus] = None  # UpdateStatus of the change that created this version
+    # A version fragment cannot be queried or instantiated via get_table(). A fragment represents a version of a
+    # replica table that has incomplete data, and exists only to provide base table support for a dependent view.
+    is_fragment: bool = False
     additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)
 
 
@@ -353,6 +359,7 @@ class FullTableMd(NamedTuple):
     def is_pure_snapshot(self) -> bool:
         return (
             self.tbl_md.view_md is not None
+            and self.tbl_md.view_md.is_snapshot
             and self.tbl_md.view_md.predicate is None
             and len(self.schema_version_md.columns) == 0
         )
pixeltable/plan.py CHANGED
@@ -403,6 +403,8 @@ class Planner:
                 ignore_errors=ignore_errors,
             )
         )
+        plan = cls._insert_save_node(tbl.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod
@@ -499,6 +501,9 @@
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         plan.ctx.num_computed_exprs = len(recomputed_exprs)
+
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
@@ -597,6 +602,7 @@
         # we're returning everything to the user, so we might as well do it in a single batch
         ctx.batch_size = 0
         plan.set_ctx(ctx)
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return (
             plan,
@@ -650,6 +656,8 @@
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod
@@ -718,6 +726,8 @@
 
         exec_ctx.ignore_errors = True
         plan.set_ctx(exec_ctx)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan, len(row_builder.default_eval_ctx.target_exprs)
 
     @classmethod
@@ -762,6 +772,17 @@
             combined_ordering = combined
         return combined_ordering
 
+    @classmethod
+    def _insert_save_node(
+        cls, tbl_id: UUID, stored_media_cols: list[exprs.ColumnSlotIdx], input_node: exec.ExecNode
+    ) -> exec.ExecNode:
+        """Return an ObjectStoreSaveNode if stored media columns are present, otherwise return input"""
+        if len(stored_media_cols) == 0:
+            return input_node
+        save_node = exec.ObjectStoreSaveNode(tbl_id, stored_media_cols, input_node)
+        save_node.set_ctx(input_node.ctx)
+        return save_node
+
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
         """Returns True if l1 is contained in l2"""
@@ -771,7 +792,7 @@
     def _insert_prefetch_node(
         cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
     ) -> exec.ExecNode:
-        """Return a CachePrefetchNode if needed, otherwise return input"""
+        """Return a node to prefetch data if needed, otherwise return input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
         # aren't explicitly captured as dependencies
@@ -989,6 +1010,7 @@
             if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
                 plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
+            plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
         else:
             if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
@@ -1034,10 +1056,13 @@
         plan = cls._create_query_plan(
             row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
         )
+
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
 
+        plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
pixeltable/share/packager.py CHANGED
@@ -24,7 +24,8 @@ from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.media_store import MediaStore, TempStore
+from pixeltable.utils.local_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps
 
 _logger = logging.getLogger('pixeltable')
 
@@ -362,6 +363,8 @@ class TableRestorer:
         for md in tbl_md:
             md.tbl_md.is_replica = True
 
+        assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment
+
         cat = catalog.Catalog.get()
 
         with cat.begin_xact(for_write=True):
@@ -369,6 +372,9 @@
             # versions that have not been seen before.
             cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
+            _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
+            _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))
+
             # Now we need to load data for replica_tbl and its ancestors, except that we skip
             # replica_tbl itself if it's a pure snapshot.
             for md in tbl_md[::-1]:  # Base table first
@@ -619,7 +625,7 @@
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
             # Move the file to the media store and update the URL.
-            self.media_files[url] = MediaStore.get().relocate_local_media_file(src_path, media_col)
+            self.media_files[url] = ObjectOps.put_file(media_col, src_path, relocate_or_delete=True)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
pixeltable/share/publish.py CHANGED
@@ -14,7 +14,7 @@ import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .packager import TablePackager, TableRestorer
 
@@ -79,16 +79,13 @@ def push_replica(
 
 
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
 
-    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     upload_args = {'ChecksumAlgorithm': 'SHA256'}
 
@@ -135,16 +132,13 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
 
 def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_filename: str) -> Path:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle_filename)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Downloading snapshot from: {bucket}:{remote_path}')
 
-    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
     bundle_size = obj['ContentLength']
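
Both helpers now obtain their S3 client from `Env.get().get_client('s3')`, which presumably centralizes the boto3 configuration that the deleted `pixeltable/utils/s3.py` spelled out. For reference, an equivalent client built directly with boto3 under the removed settings (a sketch, not necessarily what `Env.get_client` does internally):

```python
import boto3
from botocore.config import Config

# Mirrors the removed boto_config: small connection pool, 15 s connect timeout,
# and up to 3 adaptive retries.
s3_client = boto3.client(
    's3',
    config=Config(
        max_pool_connections=5,
        connect_timeout=15,
        retries={'max_attempts': 3, 'mode': 'adaptive'},
    ),
)
```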
pixeltable/type_system.py CHANGED
@@ -1081,9 +1081,7 @@ class ImageType(ColumnType):
         mode: Optional[str] = None,
         nullable: bool = False,
     ):
-        """
-        TODO: does it make sense to specify only width or height?
-        """
+        # TODO: does it make sense to specify only width or height?
         super().__init__(self.Type.IMAGE, nullable=nullable)
         assert not (width is not None and size is not None)
         assert not (height is not None and size is not None)
pixeltable/utils/arrow.py CHANGED
@@ -1,11 +1,18 @@
 import datetime
-from typing import Any, Iterator, Optional
+import io
+import json
+from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
 
 import numpy as np
+import PIL.Image
 import pyarrow as pa
 
+import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 
+if TYPE_CHECKING:
+    import pixeltable as pxt
+
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),
@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     return None
 
 
-def ar_infer_schema(
+def to_pxt_schema(
     arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
@@ -88,6 +95,94 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
     return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
 
 
+def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+    import pyarrow as pa
+
+    pa_arrays: list[pa.Array] = []
+    for field in schema:
+        if isinstance(field.type, pa.FixedShapeTensorType):
+            stacked_arr = np.stack(column_vals[field.name])
+            pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+        else:
+            pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+            pa_arrays.append(pa_array)
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)  # type: ignore
+
+
+def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+    arrow_schema = to_arrow_schema(df.schema)
+    batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
+    current_byte_estimate = 0
+    num_batch_rows = 0
+
+    # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
+    # over _exec()
+    try:
+        for data_row in df._exec():
+            num_batch_rows += 1
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+                val = data_row[e.slot_idx]
+                val_size_bytes: int
+                if val is None:
+                    batch_columns[col_name].append(val)
+                    continue
+
+                assert val is not None
+                if col_type.is_image_type():
+                    # images get inlined into the parquet file
+                    if data_row.file_paths[e.slot_idx] is not None:
+                        # if there is a file, read directly to preserve information
+                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                            val = f.read()
+                    elif isinstance(val, PIL.Image.Image):
+                        # no file available: save as png
+                        buf = io.BytesIO()
+                        val.save(buf, format='png')
+                        val = buf.getvalue()
+                    else:
+                        raise excs.Error(f'unknown image type {type(val)}')
+                    val_size_bytes = len(val)
+                elif col_type.is_string_type():
+                    val_size_bytes = len(val)
+                elif col_type.is_media_type():
+                    assert data_row.file_paths[e.slot_idx] is not None
+                    val = data_row.file_paths[e.slot_idx]
+                    val_size_bytes = len(val)
+                elif col_type.is_json_type():
+                    val = json.dumps(val)
+                    val_size_bytes = len(val)
+                elif col_type.is_array_type():
+                    val_size_bytes = val.nbytes
+                elif col_type.is_int_type() or col_type.is_float_type():
+                    val_size_bytes = 8
+                elif col_type.is_bool_type():
+                    val_size_bytes = 1
+                elif col_type.is_date_type():
+                    val_size_bytes = 4
+                elif col_type.is_timestamp_type():
+                    val = val.astimezone(datetime.timezone.utc)
+                    val_size_bytes = 8
+                else:
+                    raise excs.Error(f'unknown type {col_type} for {col_name}')
+
+                batch_columns[col_name].append(val)
+                current_byte_estimate += val_size_bytes
+
+            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+                record_batch = _to_record_batch(batch_columns, arrow_schema)
+                yield record_batch
+                batch_columns = {k: [] for k in df.schema}
+                current_byte_estimate = 0
+                num_batch_rows = 0
+
+    except excs.ExprEvalError as e:
+        df._raise_expr_eval_err(e)
+
+    if num_batch_rows > 0:
+        record_batch = _to_record_batch(batch_columns, arrow_schema)
+        yield record_batch
+
+
 def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
     """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
     this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
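
The new `to_record_batches` helper is also usable on its own wherever Arrow data is needed. A minimal sketch that materializes a query into a single in-memory Arrow table, run inside a read transaction as `export_parquet` does above (table and column names are illustrative; the 32 MB batch budget is arbitrary):

```python
import pyarrow as pa

import pixeltable as pxt
from pixeltable.catalog import Catalog
from pixeltable.utils.arrow import to_record_batches

films = pxt.get_table('films')                 # illustrative table
df = films.select(films.title, films.rating)   # illustrative columns

with Catalog.get().begin_xact(for_write=False):
    # Each yielded pa.RecordBatch stays roughly under the requested byte budget.
    arrow_tbl = pa.Table.from_batches(to_record_batches(df, 32 * 1024 * 1024))
```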