pixeltable 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic.

Files changed (44)
  1. pixeltable/__init__.py +1 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +5 -0
  4. pixeltable/catalog/globals.py +16 -0
  5. pixeltable/catalog/insertable_table.py +82 -41
  6. pixeltable/catalog/table.py +78 -55
  7. pixeltable/catalog/table_version.py +18 -3
  8. pixeltable/catalog/view.py +9 -2
  9. pixeltable/env.py +1 -1
  10. pixeltable/exec/exec_node.py +1 -1
  11. pixeltable/exprs/__init__.py +2 -1
  12. pixeltable/exprs/arithmetic_expr.py +2 -0
  13. pixeltable/exprs/column_ref.py +36 -0
  14. pixeltable/exprs/expr.py +39 -9
  15. pixeltable/exprs/globals.py +12 -0
  16. pixeltable/exprs/json_mapper.py +1 -1
  17. pixeltable/exprs/json_path.py +0 -6
  18. pixeltable/exprs/similarity_expr.py +5 -20
  19. pixeltable/exprs/string_op.py +107 -0
  20. pixeltable/ext/functions/yolox.py +21 -64
  21. pixeltable/func/tools.py +2 -2
  22. pixeltable/functions/__init__.py +1 -1
  23. pixeltable/functions/globals.py +16 -5
  24. pixeltable/globals.py +85 -33
  25. pixeltable/io/__init__.py +3 -2
  26. pixeltable/io/datarows.py +138 -0
  27. pixeltable/io/external_store.py +8 -5
  28. pixeltable/io/globals.py +7 -160
  29. pixeltable/io/hf_datasets.py +21 -98
  30. pixeltable/io/pandas.py +29 -43
  31. pixeltable/io/parquet.py +17 -42
  32. pixeltable/io/table_data_conduit.py +569 -0
  33. pixeltable/io/utils.py +6 -21
  34. pixeltable/metadata/__init__.py +1 -1
  35. pixeltable/metadata/converters/convert_30.py +50 -0
  36. pixeltable/metadata/converters/util.py +26 -1
  37. pixeltable/metadata/notes.py +1 -0
  38. pixeltable/metadata/schema.py +3 -0
  39. pixeltable/utils/arrow.py +32 -7
  40. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
  41. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/RECORD +44 -40
  42. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
  43. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
  44. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py CHANGED
@@ -1,41 +1,38 @@
  from __future__ import annotations
 
- import logging
- import math
- import random
  import typing
  from typing import Any, Optional, Union
 
  import pixeltable as pxt
  import pixeltable.type_system as ts
- from pixeltable import exceptions as excs
-
- from .utils import normalize_import_parameters, normalize_schema_names
 
  if typing.TYPE_CHECKING:
      import datasets  # type: ignore[import-untyped]
 
- _logger = logging.getLogger('pixeltable')
-
- # use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
- # The primary goal is to bound memory use, regardless of dataset size.
- # Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
- _K_BATCH_SIZE_BYTES = 100_000_000
 
- # note, there are many more types. we allow overrides in the schema_override parameter
+ # note, there are many more types. we allow overrides in the schema_overrides parameter
  # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
  # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
  _hf_to_pxt: dict[str, ts.ColumnType] = {
-     'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
-     'int64': ts.IntType(nullable=True),
      'bool': ts.BoolType(nullable=True),
+     'int8': ts.IntType(nullable=True),
+     'int16': ts.IntType(nullable=True),
+     'int32': ts.IntType(nullable=True),
+     'int64': ts.IntType(nullable=True),
+     'uint8': ts.IntType(nullable=True),
+     'uint16': ts.IntType(nullable=True),
+     'uint32': ts.IntType(nullable=True),
+     'uint64': ts.IntType(nullable=True),
+     'float16': ts.FloatType(nullable=True),
      'float32': ts.FloatType(nullable=True),
      'float64': ts.FloatType(nullable=True),
-     'large_string': ts.StringType(nullable=True),
      'string': ts.StringType(nullable=True),
+     'large_string': ts.StringType(nullable=True),
      'timestamp[s]': ts.TimestampType(nullable=True),
      'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
      'timestamp[us]': ts.TimestampType(nullable=True),
+     'date32': ts.StringType(nullable=True),  # date32 is not supported in pixeltable, use string
+     'date64': ts.StringType(nullable=True),  # date64 is not supported in pixeltable, use string
  }
 
 
@@ -88,7 +85,6 @@ def import_huggingface_dataset(
      table_path: str,
      dataset: Union[datasets.Dataset, datasets.DatasetDict],
      *,
-     column_name_for_split: Optional[str] = None,
      schema_overrides: Optional[dict[str, Any]] = None,
      primary_key: Optional[Union[str, list[str]]] = None,
      **kwargs: Any,
@@ -101,91 +97,18 @@ def import_huggingface_dataset(
          dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
              or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
              to insert into the table.
-         column_name_for_split: column name to use for split information. If None, no split information will be stored.
          schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
-             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
-             `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
-             Pixeltable identifiers).
+             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
+             The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
+             they are valid Pixeltable identifiers).
          primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
          kwargs: Additional arguments to pass to `create_table`.
+             An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
+             This column name will contain the split information. If None, no split information will be stored.
 
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     import datasets
-
-     import pixeltable as pxt
-
-     if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
-         raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-
-     # Create the pixeltable schema from the huggingface schema
-     hf_schema_source = _get_hf_schema(dataset)
-     schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-     hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
-
-     # Add the split column to the schema if requested
-     if column_name_for_split is not None:
-         if column_name_for_split in hf_schema:
-             raise excs.Error(
-                 f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
-             )
-         hf_schema[column_name_for_split] = ts.StringType(nullable=True)
-
-     schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
-
-     # Prepare to create table and insert data
-     if table_path in pxt.list_tables():
-         raise excs.Error(f'table {table_path} already exists')
-
-     if isinstance(dataset, datasets.Dataset):
-         # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-         raw_name = dataset.split._name
-         split_name = raw_name.split('[')[0] if raw_name is not None else None
-         dataset_dict = {split_name: dataset}
-     else:
-         dataset_dict = dataset
-
-     # extract all class labels from the dataset to translate category ints to strings
-     categorical_features = {
-         feature_name: feature_type.names
-         for (feature_name, feature_type) in hf_schema_source.items()
-         if isinstance(feature_type, datasets.ClassLabel)
-     }
-
-     try:
-         # random tmp name
-         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-         tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-
-         def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
-             output_row = row.copy()
-             # map all class labels to strings
-             for field, values in categorical_features.items():
-                 output_row[field] = values[row[field]]
-             # add split name to row
-             if column_name_for_split is not None:
-                 output_row[column_name_for_split] = split_name
-             return output_row
-
-         for split_name, split_dataset in dataset_dict.items():
-             num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
-             tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
-             assert tuples_per_batch > 0
-
-             batch = []
-             for row in split_dataset:
-                 batch.append(_translate_row(row, split_name))
-                 if len(batch) >= tuples_per_batch:
-                     tab.insert(batch)
-                     batch = []
-             # last batch
-             if len(batch) > 0:
-                 tab.insert(batch)
-
-     except Exception as e:
-         _logger.error(f'Error while inserting dataset into table: {tmp_name}')
-         raise e
-
-     pxt.move(tmp_name, table_path)
-     return pxt.get_table(table_path)
+     return pxt.create_table(
+         table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
+     )
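
The practical upshot of the hf_datasets.py rewrite: `import_huggingface_dataset()` no longer builds a temp table and batches inserts itself; it is now a thin wrapper over `pxt.create_table()` with the dataset passed as `source`. A minimal usage sketch, assuming `import_huggingface_dataset` remains exported from `pixeltable.io` and using an illustrative dataset name that is not part of this diff:

```python
import datasets
from pixeltable.io import import_huggingface_dataset

# Illustrative dataset/split; any datasets.Dataset or datasets.DatasetDict works.
ds = datasets.load_dataset('rotten_tomatoes', split='train[:100]')

# As of 0.3.10, schema inference, batching, and table creation all happen
# inside pxt.create_table(); this call just forwards its arguments there.
tbl = import_huggingface_dataset('films', ds)

# column_name_for_split is no longer a named parameter: per the new docstring,
# it travels through **kwargs (as extra_args) when importing a DatasetDict.
```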
pixeltable/io/pandas.py CHANGED
@@ -7,9 +7,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 
  import pixeltable as pxt
  import pixeltable.exceptions as excs
- from pixeltable import Table
-
- from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
 
 
  def import_pandas(
@@ -43,20 +40,14 @@ def import_pandas(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-     pd_schema = df_infer_schema(df, schema_overrides, primary_key)
-     schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
-
-     __check_primary_key_values(df, primary_key)
-
-     # Convert all rows to insertable format
-     tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
-
-     table = find_or_create_table(
-         tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
+     return pxt.create_table(
+         tbl_name,
+         source=df,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         num_retained_versions=num_retained_versions,
+         comment=comment,
      )
-     table.insert(tbl_rows)
-     return table
 
 
  def import_csv(
@@ -77,14 +68,14 @@ def import_csv(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     df = pd.read_csv(filepath_or_buffer, **kwargs)
-     return import_pandas(
+     return pxt.create_table(
          tbl_name,
-         df,
+         source=filepath_or_buffer,
          schema_overrides=schema_overrides,
          primary_key=primary_key,
          num_retained_versions=num_retained_versions,
          comment=comment,
+         extra_args=kwargs,
      )
 
 
@@ -107,18 +98,18 @@ def import_excel(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     df = pd.read_excel(io, *args, **kwargs)
-     return import_pandas(
+     return pxt.create_table(
          tbl_name,
-         df,
+         source=io,
          schema_overrides=schema_overrides,
          primary_key=primary_key,
          num_retained_versions=num_retained_versions,
          comment=comment,
+         extra_args=kwargs,
      )
 
 
- def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
      for pd_name in primary_key:
          # This can be faster for large DataFrames
          has_nulls = df[pd_name].count() < len(df)
@@ -146,15 +137,6 @@ def df_infer_schema(
      return pd_schema
 
 
- """
- # Check if a datetime64[ns, UTC] dtype
- def is_datetime_tz_utc(x: Any) -> bool:
-     if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
-         return True
-     return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
- """
-
-
  def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
      """
      Determines a pixeltable ColumnType from a pandas dtype
@@ -165,7 +147,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
      Returns:
          pxt.ColumnType: A pixeltable ColumnType
      """
-     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+     # compatible with NumPy dtypes
      # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
      if is_datetime64_any_dtype(pd_dtype):
          return pxt.TimestampType(nullable=nullable)
@@ -204,32 +187,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
          raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
 
 
- def __df_row_to_pxt_row(
+ def _df_row_to_pxt_row(
      row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
  ) -> dict[str, Any]:
      """Convert a row to insertable format"""
      pxt_row: dict[str, Any] = {}
      for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+         pxt_name = col_mapping.get(col_name, col_name)
+         nval: Any
          if pxt_type.is_float_type():
-             val = float(val)
+             nval = float(val)
          elif isinstance(val, float) and np.isnan(val):
              # pandas uses NaN for empty cells, even for types other than float;
              # for any type but a float, convert these to None
-             val = None
+             nval = None
          elif pxt_type.is_int_type():
-             val = int(val)
+             nval = int(val)
          elif pxt_type.is_bool_type():
-             val = bool(val)
+             nval = bool(val)
          elif pxt_type.is_string_type():
-             val = str(val)
+             nval = str(val)
          elif pxt_type.is_timestamp_type():
              if pd.isnull(val):
                  # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
                  # much not-ok with it. (But if we convert it to None and then load out the
                  # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
-                 val = None
+                 nval = None
              else:
-                 val = pd.Timestamp(val).to_pydatetime()
-         pxt_name = col_name if col_mapping is None else col_mapping[col_name]
-         pxt_row[pxt_name] = val
+                 nval = pd.Timestamp(val).to_pydatetime()
+         else:
+             nval = val
+         pxt_row[pxt_name] = nval
      return pxt_row
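
pandas.py follows the same pattern: `import_pandas()`, `import_csv()`, and `import_excel()` now forward directly to `pxt.create_table()`, with reader keyword arguments carried in `extra_args` instead of being applied via `pd.read_csv`/`pd.read_excel` inside these functions. The helpers also lose their double-underscore name mangling (`_df_check_primary_key_values`, `_df_row_to_pxt_row`), presumably so the new `table_data_conduit.py` can reuse them. A sketch of the new call shapes, with illustrative data and file names:

```python
import pandas as pd
from pixeltable.io import import_csv, import_pandas

df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})

# Now equivalent to pxt.create_table('people', source=df, primary_key='id', ...).
t1 = import_pandas('people', df, primary_key='id')

# The path/buffer itself becomes the source; reader kwargs such as sep reach
# the underlying CSV reader via extra_args rather than pd.read_csv here.
t2 = import_csv('people_csv', 'people.csv', sep=';')  # illustrative file name
```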
pixeltable/io/parquet.py CHANGED
@@ -4,7 +4,6 @@ import datetime
  import io
  import json
  import logging
- import random
  import typing
  from collections import deque
  from pathlib import Path
@@ -14,12 +13,10 @@ import numpy as np
  import PIL.Image
 
  import pixeltable as pxt
- import pixeltable.exceptions as exc
+ import pixeltable.exceptions as excs
  from pixeltable.env import Env
  from pixeltable.utils.transactional_directory import transactional_directory
 
- from .utils import normalize_import_parameters, normalize_schema_names
-
  if typing.TYPE_CHECKING:
      import pyarrow as pa
 
@@ -78,7 +75,7 @@ def export_parquet(
      arrow_schema = to_arrow_schema(df.schema)
 
      if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
-         raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+         raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
 
      # store the changes atomically
      with transactional_directory(parquet_path) as temp_path:
@@ -87,7 +84,7 @@ def export_parquet(
          json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
 
          batch_num = 0
-         current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
+         current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
          current_byte_estimate = 0
 
          with Env.get().begin_xact():
@@ -111,7 +108,7 @@ def export_parquet(
                          val.save(buf, format='PNG')
                          val = buf.getvalue()
                      else:
-                         assert False, f'unknown image type {type(val)}'
+                         raise excs.Error(f'unknown image type {type(val)}')
                      length = len(val)
                  elif col_type.is_string_type():
                      length = len(val)
@@ -119,16 +116,14 @@ def export_parquet(
                      if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
                          val = data_row.file_paths[e.slot_idx]
                      else:
-                         assert False, f'unknown video type {type(val)}'
+                         raise excs.Error(f'unknown video type {type(val)}')
                      length = len(val)
                  elif col_type.is_json_type():
                      val = json.dumps(val)
                      length = len(val)
                  elif col_type.is_array_type():
                      length = val.nbytes
-                 elif col_type.is_int_type():
-                     length = 8
-                 elif col_type.is_float_type():
+                 elif col_type.is_int_type() or col_type.is_float_type():
                      length = 8
                  elif col_type.is_bool_type():
                      length = 1
@@ -136,7 +131,7 @@ def export_parquet(
                      val = val.astimezone(datetime.timezone.utc)
                      length = 8
                  else:
-                     assert False, f'unknown type {col_type} for {col_name}'
+                     raise excs.Error(f'unknown type {col_type} for {col_name}')
 
                  current_value_batch[col_name].append(val)
                  current_byte_estimate += length
@@ -144,7 +139,7 @@ def export_parquet(
              assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
              _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
              batch_num += 1
-             current_value_batch = {k: deque() for k in df.schema.keys()}
+             current_value_batch = {k: deque() for k in df.schema}
              current_byte_estimate = 0
 
      _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -173,32 +168,12 @@ def import_parquet(
      Returns:
          A handle to the newly created table.
      """
-     from pyarrow import parquet
-
-     from pixeltable.utils.arrow import ar_infer_schema, iter_tuples2
-
-     input_path = Path(parquet_path).expanduser()
-     parquet_dataset = parquet.ParquetDataset(str(input_path))
-
-     schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-     ar_schema = ar_infer_schema(parquet_dataset.schema, schema_overrides, primary_key)
-     schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
-
-     if table in pxt.list_tables():
-         raise exc.Error(f'Table {table} already exists')
-
-     tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-     total_rows = 0
-     try:
-         tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-         for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
-             for batch in fragment.to_batches():
-                 dict_batch = list(iter_tuples2(batch, col_mapping, schema))
-                 total_rows += len(dict_batch)
-                 tab.insert(dict_batch)
-     except Exception as e:
-         _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
-         raise e
-
-     pxt.move(tmp_name, table)
-     return pxt.get_table(table)
+     value = kwargs.pop('source_format', None)
+     return pxt.create_table(
+         table,
+         source=parquet_path,
+         source_format=value,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         extra_args=kwargs,
+     )
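
`import_parquet()` gets the same treatment: the ParquetDataset iteration, random temp-table naming, and final `pxt.move()` are gone, replaced by a single `pxt.create_table()` call. Note the small wrinkle that a `source_format` entry, if present in `**kwargs`, is popped and passed as a named argument (its accepted values aren't shown in this diff). A sketch with an illustrative path:

```python
from pixeltable.io import import_parquet

# All schema inference and batched insertion now happens inside
# pxt.create_table(source=parquet_path, ...).
t = import_parquet('sensor_data', parquet_path='data/readings.parquet')

# Equivalent direct call after this change (path illustrative):
# t = pxt.create_table('sensor_data', source='data/readings.parquet')
```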