pixeltable 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of pixeltable has been flagged as potentially problematic.

@@ -41,9 +41,9 @@ def _anthropic_client() -> 'anthropic.AsyncAnthropic':
  def _get_header_info(
  headers: httpx.Headers,
  ) -> tuple[
- Optional[tuple[int, int, datetime.datetime]],
- Optional[tuple[int, int, datetime.datetime]],
- Optional[tuple[int, int, datetime.datetime]],
+ tuple[int, int, datetime.datetime] | None,
+ tuple[int, int, datetime.datetime] | None,
+ tuple[int, int, datetime.datetime] | None,
  ]:
  """Extract rate limit info from Anthropic API response headers."""
  requests_limit_str = headers.get('anthropic-ratelimit-requests-limit')
@@ -54,7 +54,9 @@ def _get_header_info(
  requests_reset = (
  datetime.datetime.fromisoformat(requests_reset_str.replace('Z', '+00:00')) if requests_reset_str else None
  )
- requests_info = (requests_limit, requests_remaining, requests_reset) if requests_reset else None
+ requests_info = (
+ (requests_limit, requests_remaining, requests_reset) if requests_reset and requests_remaining else None
+ )

  input_tokens_limit_str = headers.get('anthropic-ratelimit-input-tokens-limit')
  input_tokens_limit = int(input_tokens_limit_str) if input_tokens_limit_str is not None else None
@@ -66,7 +68,11 @@ def _get_header_info(
  if input_tokens_reset_str
  else None
  )
- input_tokens_info = (input_tokens_limit, input_tokens_remaining, input_tokens_reset) if input_tokens_reset else None
+ input_tokens_info = (
+ (input_tokens_limit, input_tokens_remaining, input_tokens_reset)
+ if input_tokens_reset and input_tokens_remaining
+ else None
+ )

  output_tokens_limit_str = headers.get('anthropic-ratelimit-output-tokens-limit')
  output_tokens_limit = int(output_tokens_limit_str) if output_tokens_limit_str is not None else None
@@ -79,9 +85,14 @@ def _get_header_info(
  else None
  )
  output_tokens_info = (
- (output_tokens_limit, output_tokens_remaining, output_tokens_reset) if output_tokens_reset else None
+ (output_tokens_limit, output_tokens_remaining, output_tokens_reset)
+ if output_tokens_reset and output_tokens_remaining
+ else None
  )

+ if requests_info is None or input_tokens_info is None or output_tokens_info is None:
+ _logger.debug(f'get_header_info(): incomplete rate limit info: {headers}')
+
  return requests_info, input_tokens_info, output_tokens_info

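A minimal sketch (not taken from the package; header values are hypothetical) of how the revised Anthropic `_get_header_info()` helper behaves: each of the three slots is populated only when its `remaining`/`reset` headers are present, otherwise it becomes `None` and the incomplete headers are logged at DEBUG level.

>>> import httpx
>>> headers = httpx.Headers({
...     'anthropic-ratelimit-requests-limit': '50',
...     'anthropic-ratelimit-requests-remaining': '49',
...     'anthropic-ratelimit-requests-reset': '2025-01-01T00:00:30Z',
... })
>>> _get_header_info(headers)  # token headers absent -> those slots are None, a debug line is logged
((50, 49, datetime.datetime(2025, 1, 1, 0, 0, 30, tzinfo=datetime.timezone.utc)), None, None)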
@@ -62,11 +62,11 @@ async def chat_completions(
  A dictionary containing the response and other metadata.

  Examples:
- Add a computed column that applies the model `llama3-8b-8192`
+ Add a computed column that applies the model `llama-3.1-8b-instant`
  to an existing Pixeltable column `tbl.prompt` of the table `tbl`:

  >>> messages = [{'role': 'user', 'content': tbl.prompt}]
- ... tbl.add_computed_column(response=chat_completions(messages, model='llama3-8b-8192'))
+ ... tbl.add_computed_column(response=chat_completions(messages, model='llama-3.1-8b-instant'))
  """
  if model_kwargs is None:
  model_kwargs = {}
@@ -113,7 +113,7 @@ def _parse_header_duration(duration_str: str) -> datetime.timedelta:

  def _get_header_info(
  headers: httpx.Headers,
- ) -> tuple[Optional[tuple[int, int, datetime.datetime]], Optional[tuple[int, int, datetime.datetime]]]:
+ ) -> tuple[tuple[int, int, datetime.datetime] | None, tuple[int, int, datetime.datetime] | None]:
  now = datetime.datetime.now(tz=datetime.timezone.utc)

  requests_limit_str = headers.get('x-ratelimit-limit-requests')
@@ -122,7 +122,7 @@ def _get_header_info(
  requests_remaining = int(requests_remaining_str) if requests_remaining_str is not None else None
  requests_reset_str = headers.get('x-ratelimit-reset-requests', '5s') # Default to 5 seconds
  requests_reset_ts = now + _parse_header_duration(requests_reset_str)
- requests_info = (requests_limit, requests_remaining, requests_reset_ts)
+ requests_info = (requests_limit, requests_remaining, requests_reset_ts) if requests_remaining is not None else None

  tokens_limit_str = headers.get('x-ratelimit-limit-tokens')
  tokens_limit = int(tokens_limit_str) if tokens_limit_str is not None else None
@@ -130,7 +130,10 @@ def _get_header_info(
  tokens_remaining = int(tokens_remaining_str) if tokens_remaining_str is not None else None
  tokens_reset_str = headers.get('x-ratelimit-reset-tokens', '5s') # Default to 5 seconds
  tokens_reset_ts = now + _parse_header_duration(tokens_reset_str)
- tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts)
+ tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts) if tokens_remaining is not None else None
+
+ if requests_info is None or tokens_info is None:
+ _logger.debug(f'get_header_info(): incomplete rate limit info: {headers}')

  return requests_info, tokens_info

pixeltable/globals.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
  import logging
  import os
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Sequence, Union
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union

  import pandas as pd
  import pydantic
@@ -24,9 +24,8 @@ if TYPE_CHECKING:
  str,
  os.PathLike,
  Path, # OS paths, filenames, URLs
- Iterator[dict[str, Any]], # iterator producing dictionaries of values
- RowData, # list of dictionaries
- Sequence[pydantic.BaseModel], # list of Pydantic models
+ Iterable[dict[str, Any]], # dictionaries of values
+ Iterable[pydantic.BaseModel], # Pydantic model instances
  DataFrame, # Pixeltable DataFrame
  pd.DataFrame, # pandas DataFrame
  datasets.Dataset,
@@ -542,9 +541,14 @@ def drop_table(
  assert isinstance(table, str)
  tbl_path = table

- path_obj = catalog.Path.parse(tbl_path)
- if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
- Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
+ if tbl_path.startswith('pxt://'):
+ # Remote table
+ share.delete_replica(tbl_path)
+ else:
+ # Local table
+ path_obj = catalog.Path.parse(tbl_path)
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
+ Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)


  def get_dir_contents(dir_path: str = '', recursive: bool = True) -> 'DirContents':
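With this change, `pxt.drop_table` also accepts a cloud-replica URI and routes it to `share.delete_replica()`; everything else goes through the local catalog as before. A hedged usage sketch (the exact `pxt://` URI format shown is illustrative only):

>>> import pixeltable as pxt
>>> pxt.drop_table('my_dir.my_table')                # local table: parsed and dropped via the catalog
>>> pxt.drop_table('pxt://my-org/my_dir.my_table')   # cloud replica: delegated to share.delete_replica()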
pixeltable/io/__init__.py CHANGED
@@ -4,11 +4,12 @@ from .datarows import import_json, import_rows
  from .external_store import ExternalStore
  from .globals import create_label_studio_project, export_images_as_fo_dataset
  from .hf_datasets import import_huggingface_dataset
+ from .lancedb import export_lancedb
  from .pandas import import_csv, import_excel, import_pandas
  from .parquet import export_parquet, import_parquet

  __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
- __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
+ __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
  __all__ = sorted(__default_dir - __removed_symbols)

pixeltable/io/lancedb.py ADDED
@@ -0,0 +1,3 @@
+ from pixeltable.utils.lancedb import export_lancedb
+
+ __all__ = ['export_lancedb']
pixeltable/io/parquet.py CHANGED
@@ -1,46 +1,22 @@
  from __future__ import annotations

- import datetime
- import io
  import json
  import logging
  import typing
- from collections import deque
  from pathlib import Path
  from typing import Any, Optional

- import numpy as np
- import PIL.Image
-
  import pixeltable as pxt
  import pixeltable.exceptions as excs
  from pixeltable.catalog import Catalog
  from pixeltable.utils.transactional_directory import transactional_directory

  if typing.TYPE_CHECKING:
- import pyarrow as pa
-
  import pixeltable as pxt

  _logger = logging.getLogger('pixeltable')


- def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
- import pyarrow as pa
- from pyarrow import parquet
-
- pydict = {}
- for field in schema:
- if isinstance(field.type, pa.FixedShapeTensorType):
- stacked_arr = np.stack(value_batch[field.name])
- pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
- else:
- pydict[field.name] = value_batch[field.name]
-
- tab = pa.Table.from_pydict(pydict, schema=schema)
- parquet.write_table(tab, str(output_path))
-
-
  def export_parquet(
  table_or_df: pxt.Table | pxt.DataFrame,
  parquet_path: Path,
@@ -63,7 +39,9 @@ def export_parquet(
  If False, will raise an error if the Dataframe has any image column.
  Default False.
  """
- from pixeltable.utils.arrow import to_arrow_schema
+ import pyarrow as pa
+
+ from pixeltable.utils.arrow import to_record_batches

  df: pxt.DataFrame
  if isinstance(table_or_df, pxt.catalog.Table):
@@ -71,9 +49,6 @@ def export_parquet(
  else:
  df = table_or_df

- type_dict = {k: v.as_dict() for k, v in df.schema.items()}
- arrow_schema = to_arrow_schema(df.schema)
-
  if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
  raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')

@@ -81,70 +56,15 @@ def export_parquet(
  with transactional_directory(parquet_path) as temp_path:
  # dump metadata json file so we can inspect what was the source of the parquet file later on.
  json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+ type_dict = {k: v.as_dict() for k, v in df.schema.items()}
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
-
  batch_num = 0
- current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
- current_byte_estimate = 0
-
  with Catalog.get().begin_xact(for_write=False):
- for data_row in df._exec():
- for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
- val = data_row[e.slot_idx]
- if val is None:
- current_value_batch[col_name].append(val)
- continue
-
- assert val is not None
- if col_type.is_image_type():
- # images get inlined into the parquet file
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
- # if there is a file, read directly to preserve information
- with open(data_row.file_paths[e.slot_idx], 'rb') as f:
- val = f.read()
- elif isinstance(val, PIL.Image.Image):
- # if no file available, eg. bc it is computed, convert to png
- buf = io.BytesIO()
- val.save(buf, format='PNG')
- val = buf.getvalue()
- else:
- raise excs.Error(f'unknown image type {type(val)}')
- length = len(val)
- elif col_type.is_string_type():
- length = len(val)
- elif col_type.is_video_type() or col_type.is_audio_type():
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
- val = data_row.file_paths[e.slot_idx]
- else:
- raise excs.Error(f'unknown audio/video type {type(val)}')
- length = len(val)
- elif col_type.is_json_type():
- val = json.dumps(val)
- length = len(val)
- elif col_type.is_array_type():
- length = val.nbytes
- elif col_type.is_int_type() or col_type.is_float_type():
- length = 8
- elif col_type.is_bool_type():
- length = 1
- elif col_type.is_date_type():
- length = 4
- elif col_type.is_timestamp_type():
- val = val.astimezone(datetime.timezone.utc)
- length = 8
- else:
- raise excs.Error(f'unknown type {col_type} for {col_name}')
-
- current_value_batch[col_name].append(val)
- current_byte_estimate += length
- if current_byte_estimate > partition_size_bytes:
- assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
- batch_num += 1
- current_value_batch = {k: deque() for k in df.schema}
- current_byte_estimate = 0
-
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+ for record_batch in to_record_batches(df, partition_size_bytes):
+ output_path = temp_path / f'part-{batch_num:05d}.parquet'
+ arrow_tbl = pa.Table.from_batches([record_batch]) # type: ignore
+ pa.parquet.write_table(arrow_tbl, str(output_path))
+ batch_num += 1


  def import_parquet(
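The rewritten `export_parquet` now streams pyarrow `RecordBatch`es from `to_record_batches()` and writes one `part-NNNNN.parquet` file per batch instead of batching rows itself. A usage sketch (table name and output path are hypothetical):

>>> from pathlib import Path
>>> import pixeltable as pxt
>>> from pixeltable.io import export_parquet
>>> tbl = pxt.get_table('my_dir.my_table')
>>> export_parquet(tbl, Path('/tmp/my_table_export'), inline_images=True)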
@@ -469,12 +469,12 @@ class ParquetTableDataConduit(TableDataConduit):
  return t

  def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
- from pixeltable.utils.arrow import ar_infer_schema
+ from pixeltable.utils.arrow import to_pxt_schema

  if self.source_column_map is None:
  if self.src_schema_overrides is None:
  self.src_schema_overrides = {}
- self.src_schema = ar_infer_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
+ self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
  inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
  self.src_schema, self.src_pk, self.src_schema_overrides
  )
@@ -1,3 +1,3 @@
  # ruff: noqa: F401

- from .publish import pull_replica, push_replica
+ from .publish import delete_replica, pull_replica, push_replica
@@ -27,15 +27,11 @@ PIXELTABLE_API_URL = os.environ.get('PIXELTABLE_API_URL', 'https://internal-api.
  def push_replica(
  dest_tbl_uri: str, src_tbl: pxt.Table, bucket: str | None = None, access: Literal['public', 'private'] = 'private'
  ) -> str:
- if not src_tbl._tbl_version_path.is_snapshot():
- raise excs.Error('Only snapshots may be published.')
-
  packager = TablePackager(
  src_tbl, additional_md={'table_uri': dest_tbl_uri, 'bucket_name': bucket, 'is_public': access == 'public'}
  )
  request_json = packager.md | {'operation_type': 'publish_snapshot'}
- headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
- response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
+ response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=_api_headers())
  if response.status_code != 200:
  raise excs.Error(f'Error publishing snapshot: {response.text}')
  response_json = response.json()
@@ -70,7 +66,7 @@ def push_replica(
  'preview_data': packager.md['preview_data'],
  }
  # TODO: Use Pydantic for validation
- finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)
+ finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=_api_headers())
  if finalize_response.status_code != 200:
  raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
  finalize_response_json = finalize_response.json()
@@ -112,9 +108,8 @@ def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult


  def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
- headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
  clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': src_tbl_uri}
- response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
+ response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=_api_headers())
  if response.status_code != 200:
  raise excs.Error(f'Error cloning snapshot: {response.text}')
  response_json = response.json()
@@ -268,11 +263,18 @@ def _download_from_presigned_url(
  # TODO: This will be replaced by drop_table with cloud table uri
  def delete_replica(dest_path: str) -> None:
  """Delete cloud replica"""
- headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
  delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
- response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=headers_json)
+ response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=_api_headers())
  if response.status_code != 200:
  raise excs.Error(f'Error deleting replica: {response.text}')
  response_json = response.json()
  if not isinstance(response_json, dict) or 'table_uri' not in response_json:
  raise excs.Error(f'Error deleting replica: unexpected response from server.\n{response_json}')
+
+
+ def _api_headers() -> dict[str, str]:
+ headers = {'Content-Type': 'application/json'}
+ api_key = Env.get().pxt_api_key
+ if api_key is not None:
+ headers['X-api-key'] = api_key
+ return headers
pixeltable/utils/arrow.py CHANGED
@@ -1,11 +1,18 @@
  import datetime
- from typing import Any, Iterator, Optional
+ import io
+ import json
+ from typing import TYPE_CHECKING, Any, Iterator, Optional, cast

  import numpy as np
+ import PIL.Image
  import pyarrow as pa

+ import pixeltable.exceptions as excs
  import pixeltable.type_system as ts

+ if TYPE_CHECKING:
+ import pixeltable as pxt
+
  PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
  pa.string(): ts.StringType(nullable=True),
  pa.large_string(): ts.StringType(nullable=True),
@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
  return None


- def ar_infer_schema(
+ def to_pxt_schema(
  arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
  ) -> dict[str, ts.ColumnType]:
  """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
@@ -88,6 +95,94 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
  return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]


+ def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+ import pyarrow as pa
+
+ pa_arrays: list[pa.Array] = []
+ for field in schema:
+ if isinstance(field.type, pa.FixedShapeTensorType):
+ stacked_arr = np.stack(column_vals[field.name])
+ pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+ else:
+ pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+ pa_arrays.append(pa_array)
+ return pa.RecordBatch.from_arrays(pa_arrays, schema=schema) # type: ignore
+
+
+ def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+ arrow_schema = to_arrow_schema(df.schema)
+ batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
+ current_byte_estimate = 0
+ num_batch_rows = 0
+
+ # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
+ # over _exec()
+ try:
+ for data_row in df._exec():
+ num_batch_rows += 1
+ for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+ val = data_row[e.slot_idx]
+ val_size_bytes: int
+ if val is None:
+ batch_columns[col_name].append(val)
+ continue
+
+ assert val is not None
+ if col_type.is_image_type():
+ # images get inlined into the parquet file
+ if data_row.file_paths[e.slot_idx] is not None:
+ # if there is a file, read directly to preserve information
+ with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+ val = f.read()
+ elif isinstance(val, PIL.Image.Image):
+ # no file available: save as png
+ buf = io.BytesIO()
+ val.save(buf, format='png')
+ val = buf.getvalue()
+ else:
+ raise excs.Error(f'unknown image type {type(val)}')
+ val_size_bytes = len(val)
+ elif col_type.is_string_type():
+ val_size_bytes = len(val)
+ elif col_type.is_media_type():
+ assert data_row.file_paths[e.slot_idx] is not None
+ val = data_row.file_paths[e.slot_idx]
+ val_size_bytes = len(val)
+ elif col_type.is_json_type():
+ val = json.dumps(val)
+ val_size_bytes = len(val)
+ elif col_type.is_array_type():
+ val_size_bytes = val.nbytes
+ elif col_type.is_int_type() or col_type.is_float_type():
+ val_size_bytes = 8
+ elif col_type.is_bool_type():
+ val_size_bytes = 1
+ elif col_type.is_date_type():
+ val_size_bytes = 4
+ elif col_type.is_timestamp_type():
+ val = val.astimezone(datetime.timezone.utc)
+ val_size_bytes = 8
+ else:
+ raise excs.Error(f'unknown type {col_type} for {col_name}')
+
+ batch_columns[col_name].append(val)
+ current_byte_estimate += val_size_bytes
+
+ if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
+ yield record_batch
+ batch_columns = {k: [] for k in df.schema}
+ current_byte_estimate = 0
+ num_batch_rows = 0
+
+ except excs.ExprEvalError as e:
+ df._raise_expr_eval_err(e)
+
+ if num_batch_rows > 0:
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
+ yield record_batch
+
+
  def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
  """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
  this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
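The new `to_record_batches()` generalizes the byte-budgeted batching that previously lived inside `export_parquet`, so any Arrow consumer can reuse it. A sketch, assuming a table named `my_dir.my_table` and following the read-transaction pattern the callers in this diff use:

>>> import pyarrow as pa
>>> import pixeltable as pxt
>>> from pixeltable.catalog import Catalog
>>> from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
>>> df = pxt.get_table('my_dir.my_table')._df()
>>> with Catalog.get().begin_xact(for_write=False):
...     batches = list(to_record_batches(df, batch_size_bytes=32 * 2**20))
>>> arrow_tbl = pa.Table.from_batches(batches, schema=to_arrow_schema(df.schema))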
pixeltable/utils/lancedb.py ADDED
@@ -0,0 +1,88 @@
+ from __future__ import annotations
+
+ import logging
+ import shutil
+ from pathlib import Path
+ from typing import Literal
+
+ import pixeltable as pxt
+ import pixeltable.exceptions as excs
+ from pixeltable.catalog import Catalog
+ from pixeltable.env import Env
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ def export_lancedb(
+ table_or_df: pxt.Table | pxt.DataFrame,
+ db_uri: Path,
+ table_name: str,
+ batch_size_bytes: int = 128 * 2**20,
+ if_exists: Literal['error', 'overwrite', 'append'] = 'error',
+ ) -> None:
+ """
+ Exports a dataframe's data to a LanceDB table.
+
+ This utilizes LanceDB's streaming interface for efficient table creation, via a sequence of in-memory pyarrow
+ `RecordBatches`, the size of which can be controlled with the `batch_size_bytes` parameter.
+
+ __Requirements:__
+
+ - `pip install lancedb`
+
+ Args:
+ table_or_df : Table or Dataframe to export.
+ db_uri: Local Path to the LanceDB database.
+ table_name : Name of the table in the LanceDB database.
+ batch_size_bytes : Maximum size in bytes for each batch.
+ if_exists: Determines the behavior if the table already exists. Must be one of the following:
+
+ - `'error'`: raise an error
+ - `'overwrite'`: overwrite the existing table
+ - `'append'`: append to the existing table
+ """
+ Env.get().require_package('lancedb')
+
+ import lancedb # type: ignore[import-untyped]
+
+ from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
+
+ if if_exists not in ('error', 'overwrite', 'append'):
+ raise excs.Error("export_lancedb(): 'if_exists' must be one of: ['error', 'overwrite', 'append']")
+
+ df: pxt.DataFrame
+ if isinstance(table_or_df, pxt.catalog.Table):
+ df = table_or_df._df()
+ else:
+ df = table_or_df
+
+ db_exists = False
+ if db_uri.exists():
+ if not db_uri.is_dir():
+ raise excs.Error(f"export_lancedb(): '{db_uri!s}' exists and is not a directory")
+ db_exists = True
+
+ try:
+ db = lancedb.connect(str(db_uri))
+ lance_tbl: lancedb.LanceTable | None = None
+ try:
+ lance_tbl = db.open_table(table_name)
+ if if_exists == 'error':
+ raise excs.Error(f'export_lancedb(): table {table_name!r} already exists in {db_uri!r}')
+ except ValueError:
+ # table doesn't exist
+ pass
+
+ with Catalog.get().begin_xact(for_write=False):
+ if lance_tbl is None or if_exists == 'overwrite':
+ mode = 'overwrite' if lance_tbl is not None else 'create'
+ arrow_schema = to_arrow_schema(df.schema)
+ _ = db.create_table(table_name, to_record_batches(df, batch_size_bytes), schema=arrow_schema, mode=mode)
+ else:
+ lance_tbl.add(to_record_batches(df, batch_size_bytes))
+
+ except Exception as e:
+ # cleanup
+ if not db_exists:
+ shutil.rmtree(db_uri)
+ raise e
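A usage sketch for the new LanceDB export (database path and table names are hypothetical; requires `pip install lancedb`):

>>> from pathlib import Path
>>> import pixeltable as pxt
>>> from pixeltable.io import export_lancedb
>>> tbl = pxt.get_table('my_dir.my_table')
>>> export_lancedb(tbl, Path('/tmp/lance_demo'), 'my_table', if_exists='overwrite')
>>> import lancedb
>>> lancedb.connect('/tmp/lance_demo').open_table('my_table').count_rows()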
@@ -189,6 +189,12 @@ class MediaStore:
  result.sort(key=lambda e: e[3], reverse=True)
  return result

+ def clear(self) -> None:
+ """Clear all files from the media store."""
+ assert self.__base_dir.exists()
+ shutil.rmtree(self.__base_dir)
+ self.__base_dir.mkdir()
+

  class TempStore:
  """
@@ -235,3 +241,8 @@ class TempStore:
  if tbl_id is not None:
  return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
  return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'
+
+ @classmethod
+ def clear(cls) -> None:
+ """Clear all files from the temporary store."""
+ MediaStore(cls._tmp_dir()).clear()