pixeltable 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable has been flagged by the registry.
Files changed (57)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -2
  4. pixeltable/catalog/column.py +1 -1
  5. pixeltable/catalog/dir.py +1 -1
  6. pixeltable/catalog/table.py +1 -1
  7. pixeltable/catalog/table_version.py +12 -2
  8. pixeltable/catalog/table_version_path.py +2 -2
  9. pixeltable/catalog/view.py +64 -20
  10. pixeltable/dataframe.py +10 -5
  11. pixeltable/env.py +12 -0
  12. pixeltable/exec/expr_eval/evaluators.py +4 -2
  13. pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
  14. pixeltable/exprs/comparison.py +8 -4
  15. pixeltable/exprs/data_row.py +5 -3
  16. pixeltable/exprs/expr.py +2 -2
  17. pixeltable/exprs/function_call.py +155 -313
  18. pixeltable/func/aggregate_function.py +29 -15
  19. pixeltable/func/callable_function.py +11 -8
  20. pixeltable/func/expr_template_function.py +3 -9
  21. pixeltable/func/function.py +148 -74
  22. pixeltable/func/signature.py +65 -30
  23. pixeltable/func/udf.py +1 -1
  24. pixeltable/functions/__init__.py +1 -0
  25. pixeltable/functions/deepseek.py +121 -0
  26. pixeltable/functions/image.py +7 -7
  27. pixeltable/functions/openai.py +23 -9
  28. pixeltable/functions/video.py +14 -7
  29. pixeltable/globals.py +14 -3
  30. pixeltable/index/embedding_index.py +4 -13
  31. pixeltable/io/globals.py +88 -77
  32. pixeltable/io/hf_datasets.py +34 -34
  33. pixeltable/io/pandas.py +75 -76
  34. pixeltable/io/parquet.py +19 -27
  35. pixeltable/io/utils.py +115 -0
  36. pixeltable/iterators/audio.py +2 -1
  37. pixeltable/iterators/video.py +1 -1
  38. pixeltable/metadata/__init__.py +2 -1
  39. pixeltable/metadata/converters/convert_15.py +18 -8
  40. pixeltable/metadata/converters/convert_27.py +31 -0
  41. pixeltable/metadata/converters/convert_28.py +15 -0
  42. pixeltable/metadata/converters/convert_29.py +111 -0
  43. pixeltable/metadata/converters/util.py +12 -1
  44. pixeltable/metadata/notes.py +3 -0
  45. pixeltable/metadata/schema.py +8 -0
  46. pixeltable/share/__init__.py +1 -0
  47. pixeltable/share/packager.py +41 -13
  48. pixeltable/share/publish.py +97 -0
  49. pixeltable/type_system.py +40 -14
  50. pixeltable/utils/__init__.py +41 -0
  51. pixeltable/utils/arrow.py +40 -7
  52. pixeltable/utils/formatter.py +1 -1
  53. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/METADATA +34 -49
  54. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/RECORD +57 -51
  55. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
  56. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py CHANGED
@@ -10,6 +10,8 @@ import pixeltable as pxt
 import pixeltable.type_system as ts
 from pixeltable import exceptions as excs
 
+from .utils import normalize_import_parameters, normalize_schema_names
+
 if typing.TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]
 
@@ -28,29 +30,33 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
     'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
     'float32': ts.FloatType(nullable=True),
+    'float64': ts.FloatType(nullable=True),
+    'large_string': ts.StringType(nullable=True),
     'string': ts.StringType(nullable=True),
     'timestamp[s]': ts.TimestampType(nullable=True),
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
+    'timestamp[us]': ts.TimestampType(nullable=True),
 }
 
 
-def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
+def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.ColumnType]:
     """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
     import datasets
 
     if isinstance(feature_type, datasets.ClassLabel):
         # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
-        return ts.StringType(nullable=True)
+        return ts.StringType(nullable=nullable)
     elif isinstance(feature_type, datasets.Value):
         # example: Value(dtype='int64', id=None)
-        return _hf_to_pxt.get(feature_type.dtype, None)
+        pt = _hf_to_pxt.get(feature_type.dtype, None)
+        return pt.copy(nullable=nullable) if pt is not None else None
    elif isinstance(feature_type, datasets.Sequence):
         # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
-        dtype = _to_pixeltable_type(feature_type.feature)
+        dtype = _to_pixeltable_type(feature_type.feature, nullable)
         length = feature_type.length if feature_type.length != -1 else None
         return ts.ArrayType(shape=(length,), dtype=dtype)
     elif isinstance(feature_type, datasets.Image):
-        return ts.ImageType(nullable=True)
+        return ts.ImageType(nullable=nullable)
     else:
         return None
 
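The new `nullable` parameter lets the importer mark primary-key columns as non-nullable, and the dtype map gains `float64`, `large_string`, and `timestamp[us]` entries. A minimal sketch of the resulting behavior (assumes `datasets` and `pixeltable` are installed; `_to_pixeltable_type` is a private helper of this module, imported here for illustration only):

```python
import datasets
from pixeltable.io.hf_datasets import _to_pixeltable_type  # private helper; sketch only

feature = datasets.Value(dtype='float64')            # dtype newly added to the mapping
print(_to_pixeltable_type(feature, nullable=True))   # FloatType, nullable
print(_to_pixeltable_type(feature, nullable=False))  # FloatType, non-nullable (used for primary-key columns)
```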
@@ -63,15 +69,17 @@ def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
     return first_dataset.features
 
 
-def huggingface_schema_to_pixeltable_schema(
-    hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
+def huggingface_schema_to_pxt_schema(
+    hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, Optional[ts.ColumnType]]:
     """Generate a pixeltable schema from a huggingface dataset schema.
     Columns without a known mapping are mapped to None
     """
-    hf_schema = _get_hf_schema(hf_dataset)
     pixeltable_schema = {
-        column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
+        column_name: _to_pixeltable_type(feature_type, column_name not in primary_key)
+        if column_name not in schema_overrides
+        else schema_overrides[column_name]
+        for column_name, feature_type in hf_schema.items()
     }
     return pixeltable_schema
 
@@ -82,6 +90,7 @@ def import_huggingface_dataset(
     *,
     column_name_for_split: Optional[str] = None,
     schema_overrides: Optional[dict[str, Any]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
     **kwargs: Any,
 ) -> pxt.Table:
     """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
@@ -97,6 +106,7 @@ def import_huggingface_dataset(
             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
             `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
             Pixeltable identifiers).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.
 
     Returns:
@@ -106,57 +116,47 @@ def import_huggingface_dataset(
 
     import pixeltable as pxt
 
-    if table_path in pxt.list_tables():
-        raise excs.Error(f'table {table_path} already exists')
-
     if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
         raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
 
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    else:
-        dataset_dict = dataset
-
-    pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
-    if schema_overrides is not None:
-        pixeltable_schema.update(schema_overrides)
+    # Create the pixeltable schema from the huggingface schema
+    hf_schema_source = _get_hf_schema(dataset)
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
 
+    # Add the split column to the schema if requested
     if column_name_for_split is not None:
-        if column_name_for_split in pixeltable_schema:
+        if column_name_for_split in hf_schema:
             raise excs.Error(
                 f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
             )
-        pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
+        hf_schema[column_name_for_split] = ts.StringType(nullable=True)
 
-    for field, column_type in pixeltable_schema.items():
-        if column_type is None:
-            raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
+    schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
+
+    # Prepare to create table and insert data
+    if table_path in pxt.list_tables():
+        raise excs.Error(f'table {table_path} already exists')
 
     if isinstance(dataset, datasets.Dataset):
         # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
         raw_name = dataset.split._name
         split_name = raw_name.split('[')[0] if raw_name is not None else None
         dataset_dict = {split_name: dataset}
-    elif isinstance(dataset, datasets.DatasetDict):
-        dataset_dict = dataset
     else:
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
+        dataset_dict = dataset
 
     # extract all class labels from the dataset to translate category ints to strings
-    hf_schema = _get_hf_schema(dataset)
     categorical_features = {
         feature_name: feature_type.names
-        for (feature_name, feature_type) in hf_schema.items()
+        for (feature_name, feature_type) in hf_schema_source.items()
         if isinstance(feature_type, datasets.ClassLabel)
     }
 
     try:
         # random tmp name
         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
+        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
 
         def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
             output_row = row.copy()
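Taken together, the rewritten import path now validates the schema and primary key before any table is created. A hedged usage sketch of the new `primary_key` parameter (dataset and table names are illustrative; assumes the function is exposed as `pxt.io.import_huggingface_dataset`):

```python
import datasets
import pixeltable as pxt

ds = datasets.load_dataset('rotten_tomatoes', split='train')
tbl = pxt.io.import_huggingface_dataset(
    'demo_reviews',
    ds,
    column_name_for_split='split',
    primary_key='text',  # new in 0.3.5; forwarded to create_table()
)
```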
pixeltable/io/pandas.py CHANGED
@@ -2,17 +2,21 @@ from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
+from pandas._typing import DtypeObj  # For pandas dtype type hints
+from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
+from pixeltable import Table
+
+from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
 
 
 def import_pandas(
     tbl_name: str,
     df: pd.DataFrame,
     *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -39,16 +43,16 @@ def import_pandas(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    if schema_overrides is None:
-        schema_overrides = {}
-    if primary_key is None:
-        primary_key = []
-    elif isinstance(primary_key, str):
-        primary_key = [primary_key]
-
-    schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
-    tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
-    table = pxt.create_table(
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    pd_schema = df_infer_schema(df, schema_overrides, primary_key)
+    schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
+
+    __check_primary_key_values(df, primary_key)
+
+    # Convert all rows to insertable format
+    tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
+
+    table = find_or_create_table(
         tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
     )
     table.insert(tbl_rows)
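The net effect for callers: overrides and the primary key are normalized up front, key columns are checked for nulls, and rows are materialized eagerly instead of lazily. An illustrative call (table name and data are hypothetical; the column `'user id'` is normalized to `user_id`):

```python
import pandas as pd
import pixeltable as pxt

df = pd.DataFrame({'user id': [1, 2, 3], 'score': [0.5, None, 0.9]})
tbl = pxt.io.import_pandas(
    'users',
    df,
    schema_overrides={'score': pxt.Float},  # plain pxt types now accepted (dict[str, Any])
    primary_key='user id',                  # checked for nulls, then mapped to 'user_id'
)
```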
@@ -58,7 +62,7 @@ def import_pandas(
 def import_csv(
     tbl_name: str,
     filepath_or_buffer,
-    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -88,7 +92,7 @@ def import_excel(
     tbl_name: str,
     io,
     *args,
-    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -114,82 +118,73 @@ def import_excel(
     )
 
 
-def __df_to_pxt_schema(
+def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+    for pd_name in primary_key:
+        # This can be faster for large DataFrames
+        has_nulls = df[pd_name].count() < len(df)
+        if has_nulls:
+            raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
+
+
+def df_infer_schema(
     df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
-) -> tuple[dict[str, pxt.ColumnType], list[str]]:
+) -> dict[str, pxt.ColumnType]:
     """
     Infers a Pixeltable schema from a Pandas DataFrame.
 
     Returns:
         A tuple containing a Pixeltable schema and a list of primary key column names.
     """
-    for pd_name in schema_overrides:
-        if pd_name not in df.columns:
-            raise excs.Error(
-                f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
-            )
-    for pd_name in primary_key:
-        if pd_name not in df.columns:
-            raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
-
-    schema: dict[str, pxt.ColumnType] = {}
-    col_mapping: dict[str, str] = {}  # Maps Pandas column names to Pixeltable column names
-
+    pd_schema: dict[str, pxt.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
         else:
-            # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
-            # general objects, so we need to check for nulls in the specific cases where we might expect them.
-            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
-            # in object columns (where Pandas uses NaN as a general null).
-            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
-            has_na = any(
-                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
-                for val in df[pd_name]
-            )
-            if has_na and pd_name in primary_key:
-                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
-            pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
-        pxt_name = __normalize_pxt_col_name(pd_name)
-        # Ensure that column names are unique by appending a distinguishing suffix
-        # to any collisions
-        if pxt_name in schema:
-            n = 2
-            while f'{pxt_name}_{n}' in schema:
-                n += 1
-            pxt_name = f'{pxt_name}_{n}'
-        schema[pxt_name] = pxt_type
-        col_mapping[pd_name] = pxt_name
-
-    pxt_pk = [col_mapping[pk] for pk in primary_key]
-    return schema, pxt_pk
-
-
-def __normalize_pxt_col_name(pd_name: str) -> str:
-    """
-    Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
-    - replacing any non-ascii or non-alphanumeric characters with an underscore _
-    - prefixing the result with the letter 'c' if it starts with an underscore or a number
+            pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
+        pd_schema[pd_name] = pxt_type
+
+    return pd_schema
+
+
+"""
+# Check if a datetime64[ns, UTC] dtype
+def is_datetime_tz_utc(x: Any) -> bool:
+    if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
+        return True
+    return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
+"""
+
+
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
     """
-    id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in pd_name)
-    if id[0].isnumeric():
-        id = f'c_{id}'
-    elif id[0] == '_':
-        id = f'c{id}'
-    assert pxt.catalog.is_valid_identifier(id), id
-    return id
+    Determines a pixeltable ColumnType from a pandas dtype
 
+    Args:
+        pd_dtype: A pandas dtype object
 
-def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
+    Returns:
+        pxt.ColumnType: A pixeltable ColumnType
+    """
+    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+    # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
+    if is_datetime64_any_dtype(pd_dtype):
+        return pxt.TimestampType(nullable=nullable)
+    if is_extension_array_dtype(pd_dtype):
+        return None
+    # Most other pandas dtypes are directly NumPy compatible
+    assert isinstance(pd_dtype, np.dtype)
+    return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
+
+
+def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
     """
-    Infers a Pixeltable type based on a Numpy dtype.
+    Infers a Pixeltable type based on a pandas dtype.
     """
-    pxttype = ts.ArrayType.from_np_dtype(np_dtype, nullable)
+    pxttype = __pd_dtype_to_pxt_type(pd_dtype, nullable)
     if pxttype is not None:
         return pxttype
 
-    if np_dtype == np.object_:
+    if pd_dtype == np.object_:
         # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
         # based on the actual data in `data_col`.
         # First drop any null values (they don't contribute to type inference).
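The new dispatch order matters: timezone-aware datetimes are pandas extension dtypes, so the datetime check must run before the extension-array check. A self-contained sketch of that logic using only pandas and NumPy:

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

for dtype in (np.dtype('int64'), pd.DatetimeTZDtype(tz='UTC'), pd.Int64Dtype()):
    if is_datetime64_any_dtype(dtype):
        print(dtype, '-> TimestampType')               # covers datetime64[ns] and datetime64[ns, tz]
    elif is_extension_array_dtype(dtype):
        print(dtype, '-> None (no NumPy equivalent)')  # e.g. nullable Int64
    else:
        print(dtype, '-> ArrayType.from_np_dtype')     # plain NumPy dtypes
```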
@@ -206,11 +201,14 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
     else:
         return inferred_type.copy(nullable=nullable)
 
-    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
 
 
-def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
-    rows = {}
+def __df_row_to_pxt_row(
+    row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
+) -> dict[str, Any]:
+    """Convert a row to insertable format"""
+    pxt_row: dict[str, Any] = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
         if pxt_type.is_float_type():
             val = float(val)
@@ -232,5 +230,6 @@ def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType])
             val = None
         else:
             val = pd.Timestamp(val).to_pydatetime()
-        rows[col_name] = val
-    return rows
+        pxt_name = col_name if col_mapping is None else col_mapping[col_name]
+        pxt_row[pxt_name] = val
+    return pxt_row
pixeltable/io/parquet.py CHANGED
@@ -15,10 +15,11 @@ import PIL.Image
 
 import pixeltable as pxt
 import pixeltable.exceptions as exc
-import pixeltable.type_system as ts
 from pixeltable.env import Env
 from pixeltable.utils.transactional_directory import transactional_directory
 
+from .utils import normalize_import_parameters, normalize_schema_names
+
 if typing.TYPE_CHECKING:
     import pyarrow as pa
 
@@ -148,19 +149,13 @@ def export_parquet(
         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
 
 
-def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional[ts.ColumnType]]:
-    """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-    from pyarrow import parquet
-
-    from pixeltable.utils.arrow import to_pixeltable_schema
-
-    input_path = Path(parquet_path).expanduser()
-    parquet_dataset = parquet.ParquetDataset(str(input_path))
-    return to_pixeltable_schema(parquet_dataset.schema)
-
-
 def import_parquet(
-    table: str, *, parquet_path: str, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs: Any
+    table: str,
+    *,
+    parquet_path: str,
+    schema_overrides: Optional[dict[str, Any]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    **kwargs: Any,
 ) -> pxt.Table:
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
 
@@ -171,6 +166,7 @@ def import_parquet(
             name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
             `schema_overrides` should be the column names of the Parquet dataset (whether or not they are valid
             Pixeltable identifiers).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.
 
     Returns:
@@ -178,33 +174,29 @@ def import_parquet(
     """
     from pyarrow import parquet
 
-    import pixeltable as pxt
-    from pixeltable.utils.arrow import iter_tuples
+    from pixeltable.utils.arrow import ar_infer_schema, iter_tuples2
 
     input_path = Path(parquet_path).expanduser()
     parquet_dataset = parquet.ParquetDataset(str(input_path))
 
-    schema = parquet_schema_to_pixeltable_schema(parquet_path)
-    if schema_overrides is None:
-        schema_overrides = {}
-
-    schema.update(schema_overrides)
-    for k, v in schema.items():
-        if v is None:
-            raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    ar_schema = ar_infer_schema(parquet_dataset.schema, schema_overrides, primary_key)
+    schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
 
     if table in pxt.list_tables():
         raise exc.Error(f'Table {table} already exists')
 
+    tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
+    total_rows = 0
     try:
-        tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, schema, **kwargs)
+        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
         for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
             for batch in fragment.to_batches():
-                dict_batch = list(iter_tuples(batch))
+                dict_batch = list(iter_tuples2(batch, col_mapping, schema))
+                total_rows += len(dict_batch)
                 tab.insert(dict_batch)
     except Exception as e:
-        _logger.error(f'Error while inserting Parquet file into table: {e}')
+        _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
        raise e
 
     pxt.move(tmp_name, table)
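An illustrative call with the new `primary_key` parameter (path and names are hypothetical; assumes the function is exposed as `pxt.io.import_parquet`):

```python
import pixeltable as pxt

tbl = pxt.io.import_parquet(
    'events',
    parquet_path='~/data/events',  # a single file or a directory of Parquet files
    primary_key='event_id',        # new in 0.3.5; names are normalized before create_table()
)
```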
pixeltable/io/utils.py ADDED
@@ -0,0 +1,115 @@
+from keyword import iskeyword as is_python_keyword
+from typing import Any, Optional, Union
+
+import pixeltable as pxt
+import pixeltable.exceptions as excs
+from pixeltable import Table
+from pixeltable.catalog.globals import is_system_column_name
+
+
+def normalize_pxt_col_name(name: str) -> str:
+    """
+    Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
+    - replacing any non-ascii or non-alphanumeric characters with an underscore _
+    - prefixing the result with the letter 'c' if it starts with an underscore or a number
+    """
+    id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in name)
+    if id[0].isnumeric():
+        id = f'c_{id}'
+    elif id[0] == '_':
+        id = f'c{id}'
+    assert pxt.catalog.is_valid_identifier(id), id
+    return id
+
+
+def normalize_import_parameters(
+    schema_overrides: Optional[dict[str, Any]] = None, primary_key: Optional[Union[str, list[str]]] = None
+) -> tuple[dict[str, Any], list[str]]:
+    if schema_overrides is None:
+        schema_overrides = {}
+    if primary_key is None:
+        primary_key = []
+    elif isinstance(primary_key, str):
+        primary_key = [primary_key]
+    return schema_overrides, primary_key
+
+
+def _is_usable_as_column_name(name: str, destination_schema: dict[str, Any]) -> bool:
+    return not (is_system_column_name(name) or is_python_keyword(name) or name in destination_schema)
+
+
+def normalize_schema_names(
+    in_schema: dict[str, Any],
+    primary_key: list[str],
+    schema_overrides: dict[str, Any],
+    require_valid_pxt_column_names: bool = False,
+) -> tuple[dict[str, Any], list[str], Optional[dict[str, str]]]:
+    """
+    Convert all names in the input schema from source names to valid Pixeltable identifiers.
+    - Ensure that all names are unique.
+    - Report an error if any types are missing.
+    - If `require_valid_pxt_column_names`, report an error if any column names are not valid Pixeltable column names.
+    - Report an error if any primary key columns are missing.
+    Returns:
+    - A new schema with normalized column names
+    - The primary key columns, mapped to the normalized names
+    - A mapping from the original names to the normalized names
+    """
+
+    # Report any untyped columns as an error
+    untyped_cols = [in_name for in_name, column_type in in_schema.items() if column_type is None]
+    if len(untyped_cols) > 0:
+        raise excs.Error(f'Could not infer pixeltable type for column(s): {", ".join(untyped_cols)}')
+
+    # Report any columns in `schema_overrides` that are not in the source
+    extraneous_overrides = schema_overrides.keys() - in_schema.keys()
+    if len(extraneous_overrides) > 0:
+        raise excs.Error(
+            f'Some column(s) specified in `schema_overrides` are not present in the source: {", ".join(extraneous_overrides)}'
+        )
+
+    schema: dict[str, Any] = {}
+    col_mapping: dict[str, str] = {}  # Maps column names to Pixeltable column names if needed
+    for in_name, pxt_type in in_schema.items():
+        pxt_name = normalize_pxt_col_name(in_name)
+        # Ensure that column names are unique by appending a distinguishing suffix
+        # to any collisions
+        pxt_fname = pxt_name
+        n = 1
+        while not _is_usable_as_column_name(pxt_fname, schema):
+            pxt_fname = f'{pxt_name}_{n}'
+            n += 1
+        schema[pxt_fname] = pxt_type
+        col_mapping[in_name] = pxt_fname
+
+    # Determine if the col_mapping is the identity mapping
+    non_identity_keys = [k for k, v in col_mapping.items() if k != v]
+    if len(non_identity_keys) > 0:
+        if require_valid_pxt_column_names:
+            raise excs.Error(
+                f'Column names must be valid pixeltable identifiers. Invalid names: {", ".join(non_identity_keys)}'
+            )
+    else:
+        col_mapping = None
+
+    # Report any primary key columns that are not in the source as an error
+    missing_pk = [pk for pk in primary_key if pk not in in_schema]
+    if len(missing_pk) > 0:
+        raise excs.Error(f'Primary key column(s) are not found in the source: {", ".join(missing_pk)}')
+
+    pxt_pk = [col_mapping[pk] for pk in primary_key] if col_mapping is not None else primary_key
+
+    return schema, pxt_pk, col_mapping
+
+
+def find_or_create_table(
+    tbl_path: str,
+    schema: dict[str, Any],
+    *,
+    primary_key: Optional[Union[str, list[str]]],
+    num_retained_versions: int,
+    comment: str,
+) -> Table:
+    return pxt.create_table(
+        tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment
+    )
pixeltable/iterators/audio.py CHANGED
@@ -5,7 +5,7 @@ from fractions import Fraction
 from pathlib import Path
 from typing import Any, Optional
 
-import av  # type: ignore[import-untyped]
+import av
 
 import pixeltable.env as env
 import pixeltable.exceptions as excs
@@ -146,6 +146,7 @@ class AudioSplitter(ComponentIterator):
         input_stream = self.container.streams.audio[0]
         codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
         output_stream = output_container.add_stream(codec_name, rate=input_stream.codec_context.sample_rate)
+        assert isinstance(output_stream, av.audio.stream.AudioStream)
         frame_count = 0
         # Since frames don't align with chunk boundaries, we may have read an extra frame in previous iteration
         # Seek to the nearest frame in stream at current chunk start time
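Dropping the `type: ignore` suggests PyAV now ships type information, and the new assert narrows `add_stream()`'s generic return type for the type checker. A hedged sketch of the pattern (output path is illustrative):

```python
import av

# add_stream() is annotated as returning a generic Stream; the assert narrows
# the type so audio-specific attributes type-check downstream.
container = av.open('chunk.mp3', mode='w')
stream = container.add_stream('mp3', rate=44100)
assert isinstance(stream, av.audio.stream.AudioStream)
container.close()
```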
pixeltable/iterators/video.py CHANGED
@@ -4,7 +4,7 @@ from fractions import Fraction
 from pathlib import Path
 from typing import Any, Optional, Sequence
 
-import av  # type: ignore[import-untyped]
+import av
 import pandas as pd
 import PIL.Image
 
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 27
+VERSION = 30
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
@@ -31,6 +31,7 @@ converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
 def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
     def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
         global converter_cbs
+        assert version not in converter_cbs
         converter_cbs[version] = fn
 
     return decorator
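The new assert guards against registering two converters for the same metadata version (this release adds converters 27, 28, and 29, bringing the schema to version 30). A sketch of how a converter module registers itself, following the decorator's signature above (version number illustrative):

```python
import sqlalchemy as sql
from pixeltable.metadata import register_converter

@register_converter(version=30)  # duplicate registration would now fail the assert
def _(engine: sql.engine.Engine) -> None:
    # migrate metadata created at the previous schema version
    ...
```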