pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. (The original page links to more details here.)

Files changed (63)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -2
  4. pixeltable/catalog/column.py +1 -1
  5. pixeltable/catalog/dir.py +1 -1
  6. pixeltable/catalog/table.py +3 -1
  7. pixeltable/catalog/table_version.py +12 -2
  8. pixeltable/catalog/table_version_path.py +2 -2
  9. pixeltable/catalog/view.py +64 -20
  10. pixeltable/dataframe.py +11 -6
  11. pixeltable/env.py +12 -0
  12. pixeltable/exec/expr_eval/evaluators.py +4 -2
  13. pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
  14. pixeltable/exprs/comparison.py +8 -4
  15. pixeltable/exprs/data_row.py +9 -7
  16. pixeltable/exprs/expr.py +2 -2
  17. pixeltable/exprs/function_call.py +155 -313
  18. pixeltable/exprs/json_mapper.py +25 -8
  19. pixeltable/exprs/json_path.py +6 -5
  20. pixeltable/exprs/object_ref.py +16 -5
  21. pixeltable/exprs/row_builder.py +10 -3
  22. pixeltable/func/aggregate_function.py +29 -15
  23. pixeltable/func/callable_function.py +11 -8
  24. pixeltable/func/expr_template_function.py +3 -9
  25. pixeltable/func/function.py +148 -74
  26. pixeltable/func/signature.py +65 -30
  27. pixeltable/func/tools.py +26 -26
  28. pixeltable/func/udf.py +1 -1
  29. pixeltable/functions/__init__.py +1 -0
  30. pixeltable/functions/anthropic.py +9 -3
  31. pixeltable/functions/deepseek.py +121 -0
  32. pixeltable/functions/image.py +7 -7
  33. pixeltable/functions/openai.py +30 -13
  34. pixeltable/functions/video.py +14 -7
  35. pixeltable/globals.py +14 -3
  36. pixeltable/index/embedding_index.py +4 -13
  37. pixeltable/io/globals.py +88 -77
  38. pixeltable/io/hf_datasets.py +34 -34
  39. pixeltable/io/pandas.py +75 -76
  40. pixeltable/io/parquet.py +19 -27
  41. pixeltable/io/utils.py +115 -0
  42. pixeltable/iterators/audio.py +2 -1
  43. pixeltable/iterators/video.py +1 -1
  44. pixeltable/metadata/__init__.py +2 -1
  45. pixeltable/metadata/converters/convert_15.py +18 -8
  46. pixeltable/metadata/converters/convert_27.py +31 -0
  47. pixeltable/metadata/converters/convert_28.py +15 -0
  48. pixeltable/metadata/converters/convert_29.py +111 -0
  49. pixeltable/metadata/converters/util.py +12 -1
  50. pixeltable/metadata/notes.py +3 -0
  51. pixeltable/metadata/schema.py +8 -0
  52. pixeltable/share/__init__.py +1 -0
  53. pixeltable/share/packager.py +41 -13
  54. pixeltable/share/publish.py +97 -0
  55. pixeltable/type_system.py +40 -14
  56. pixeltable/utils/__init__.py +41 -0
  57. pixeltable/utils/arrow.py +40 -7
  58. pixeltable/utils/formatter.py +1 -1
  59. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/METADATA +34 -49
  60. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/RECORD +63 -57
  61. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/WHEEL +1 -1
  62. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/LICENSE +0 -0
  63. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/entry_points.txt +0 -0
pixeltable/index/embedding_index.py CHANGED
@@ -99,10 +99,10 @@ class EmbeddingIndex(IndexBase):
99
99
  # Now validate the return types of the embedding functions.
100
100
 
101
101
  if self.string_embed is not None:
102
- self._validate_embedding_fn(self.string_embed, ts.ColumnType.Type.STRING)
102
+ self._validate_embedding_fn(self.string_embed)
103
103
 
104
104
  if self.image_embed is not None:
105
- self._validate_embedding_fn(self.image_embed, ts.ColumnType.Type.IMAGE)
105
+ self._validate_embedding_fn(self.image_embed)
106
106
 
107
107
  if c.col_type.is_string_type() and self.string_embed is None:
108
108
  raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
@@ -206,21 +206,12 @@ class EmbeddingIndex(IndexBase):
206
206
  return None
207
207
 
208
208
  @classmethod
209
- def _validate_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> None:
209
+ def _validate_embedding_fn(cls, embed_fn: func.Function) -> None:
210
210
  """Validate the given embedding function."""
211
211
  assert not embed_fn.is_polymorphic
212
- sig = embed_fn.signature
213
212
 
214
- # validate return type
215
- param_name = sig.parameters_by_pos[0].name
216
- if expected_type == ts.ColumnType.Type.STRING:
217
- return_type = embed_fn.call_return_type([], {param_name: 'dummy'})
218
- else:
219
- assert expected_type == ts.ColumnType.Type.IMAGE
220
- img = PIL.Image.new('RGB', (512, 512))
221
- return_type = embed_fn.call_return_type([], {param_name: img})
213
+ return_type = embed_fn.signature.return_type
222
214
 
223
- assert return_type is not None
224
215
  if not isinstance(return_type, ts.ArrayType):
225
216
  raise excs.Error(
226
217
  f'The function `{embed_fn.name}` is not a valid embedding: '
pixeltable/io/globals.py CHANGED
@@ -1,3 +1,7 @@
1
+ import json
2
+ import urllib.parse
3
+ import urllib.request
4
+ from pathlib import Path
1
5
  from typing import TYPE_CHECKING, Any, Literal, Optional, Union
2
6
 
3
7
  import pixeltable as pxt
@@ -5,11 +9,61 @@ import pixeltable.exceptions as excs
5
9
  from pixeltable import Table, exprs
6
10
  from pixeltable.env import Env
7
11
  from pixeltable.io.external_store import SyncStatus
12
+ from pixeltable.utils import parse_local_file_path
8
13
 
9
14
  if TYPE_CHECKING:
10
15
  import fiftyone as fo # type: ignore[import-untyped]
11
16
 
12
17
 
18
+ from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
19
+
20
+
21
+ def _infer_schema_from_rows(
22
+ rows: list[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
23
+ ) -> dict[str, pxt.ColumnType]:
24
+ schema: dict[str, pxt.ColumnType] = {}
25
+ cols_with_nones: set[str] = set()
26
+
27
+ for n, row in enumerate(rows):
28
+ for col_name, value in row.items():
29
+ if col_name in schema_overrides:
30
+ # We do the insertion here; this will ensure that the column order matches the order
31
+ # in which the column names are encountered in the input data, even if `schema_overrides`
32
+ # is specified.
33
+ if col_name not in schema:
34
+ schema[col_name] = schema_overrides[col_name]
35
+ elif value is not None:
36
+ # If `key` is not in `schema_overrides`, then we infer its type from the data.
37
+ # The column type will always be nullable by default.
38
+ col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
39
+ if col_type is None:
40
+ raise excs.Error(
41
+ f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
42
+ )
43
+ if col_name not in schema:
44
+ schema[col_name] = col_type
45
+ else:
46
+ supertype = schema[col_name].supertype(col_type)
47
+ if supertype is None:
48
+ raise excs.Error(
49
+ f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
50
+ 'Consider specifying the type explicitly in `schema_overrides`.'
51
+ )
52
+ schema[col_name] = supertype
53
+ else:
54
+ cols_with_nones.add(col_name)
55
+
56
+ entirely_none_cols = cols_with_nones - schema.keys()
57
+ if len(entirely_none_cols) > 0:
58
+ # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
59
+ # was not encountered in any row with a non-None value.
60
+ raise excs.Error(
61
+ f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
62
+ 'Consider specifying the type(s) explicitly in `schema_overrides`.'
63
+ )
64
+ return schema
65
+
66
+
13
67
  def create_label_studio_project(
14
68
  t: Table,
15
69
  label_config: str,
@@ -140,7 +194,7 @@ def import_rows(
140
194
  tbl_path: str,
141
195
  rows: list[dict[str, Any]],
142
196
  *,
143
- schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
197
+ schema_overrides: Optional[dict[str, Any]] = None,
144
198
  primary_key: Optional[Union[str, list[str]]] = None,
145
199
  num_retained_versions: int = 10,
146
200
  comment: str = '',
@@ -169,67 +223,22 @@ def import_rows(
169
223
  Returns:
170
224
  A handle to the newly created [`Table`][pixeltable.Table].
171
225
  """
172
- if schema_overrides is None:
173
- schema_overrides = {}
174
- schema: dict[str, pxt.ColumnType] = {}
175
- cols_with_nones: set[str] = set()
226
+ schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
227
+ row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
228
+ schema, pxt_pk, _ = normalize_schema_names(row_schema, primary_key, schema_overrides, True)
176
229
 
177
- for n, row in enumerate(rows):
178
- for col_name, value in row.items():
179
- if col_name in schema_overrides:
180
- # We do the insertion here; this will ensure that the column order matches the order
181
- # in which the column names are encountered in the input data, even if `schema_overrides`
182
- # is specified.
183
- if col_name not in schema:
184
- schema[col_name] = schema_overrides[col_name]
185
- elif value is not None:
186
- # If `key` is not in `schema_overrides`, then we infer its type from the data.
187
- # The column type will always be nullable by default.
188
- col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
189
- if col_type is None:
190
- raise excs.Error(
191
- f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
192
- )
193
- if col_name not in schema:
194
- schema[col_name] = col_type
195
- else:
196
- supertype = schema[col_name].supertype(col_type)
197
- if supertype is None:
198
- raise excs.Error(
199
- f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
200
- 'Consider specifying the type explicitly in `schema_overrides`.'
201
- )
202
- schema[col_name] = supertype
203
- else:
204
- cols_with_nones.add(col_name)
205
-
206
- extraneous_keys = schema_overrides.keys() - schema.keys()
207
- if len(extraneous_keys) > 0:
208
- raise excs.Error(
209
- f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}'
210
- )
211
-
212
- entirely_none_cols = cols_with_nones - schema.keys()
213
- if len(entirely_none_cols) > 0:
214
- # A column can only end up in `entirely_null_cols` if it was not in `schema_overrides` and
215
- # was not encountered in any row with a non-None value.
216
- raise excs.Error(
217
- f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
218
- 'Consider specifying the type(s) explicitly in `schema_overrides`.'
219
- )
220
-
221
- t = pxt.create_table(
222
- tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment
230
+ table = find_or_create_table(
231
+ tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
223
232
  )
224
- t.insert(rows)
225
- return t
233
+ table.insert(rows)
234
+ return table
226
235
 
227
236
 
228
237
  def import_json(
229
238
  tbl_path: str,
230
239
  filepath_or_url: str,
231
240
  *,
232
- schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
241
+ schema_overrides: Optional[dict[str, Any]] = None,
233
242
  primary_key: Optional[Union[str, list[str]]] = None,
234
243
  num_retained_versions: int = 10,
235
244
  comment: str = '',
@@ -253,33 +262,35 @@ def import_json(
253
262
  Returns:
254
263
  A handle to the newly created [`Table`][pixeltable.Table].
255
264
  """
256
- import json
257
- import urllib.parse
258
- import urllib.request
259
-
260
- # TODO Consolidate this logic with other places where files/URLs are parsed
261
- parsed = urllib.parse.urlparse(filepath_or_url)
262
- if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
263
- # local file path
264
- if len(parsed.scheme) <= 1:
265
- filepath = filepath_or_url
266
- else:
267
- filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
268
- with open(filepath) as fp:
265
+ path = parse_local_file_path(filepath_or_url)
266
+ if path is None: # it's a URL
267
+ # TODO: This should read from S3 as well.
268
+ contents = urllib.request.urlopen(filepath_or_url).read()
269
+ else:
270
+ with open(path) as fp:
269
271
  contents = fp.read()
272
+
273
+ rows = json.loads(contents, **kwargs)
274
+
275
+ schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
276
+ row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
277
+ schema, pxt_pk, col_mapping = normalize_schema_names(row_schema, primary_key, schema_overrides, False)
278
+
279
+ # Convert all rows to insertable format - not needed, misnamed columns and types are errors in the incoming row format
280
+ if col_mapping is not None:
281
+ tbl_rows = [
282
+ {field if col_mapping is None else col_mapping[field]: val for field, val in row.items()} for row in rows
283
+ ]
270
284
  else:
271
- # URL
272
- contents = urllib.request.urlopen(filepath_or_url).read()
273
- data = json.loads(contents, **kwargs)
274
- return import_rows(
275
- tbl_path,
276
- data,
277
- schema_overrides=schema_overrides,
278
- primary_key=primary_key,
279
- num_retained_versions=num_retained_versions,
280
- comment=comment,
285
+ tbl_rows = rows
286
+
287
+ table = find_or_create_table(
288
+ tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
281
289
  )
282
290
 
291
+ table.insert(tbl_rows)
292
+ return table
293
+
283
294
 
284
295
  def export_images_as_fo_dataset(
285
296
  tbl: pxt.Table,
pixeltable/io/hf_datasets.py CHANGED
@@ -10,6 +10,8 @@ import pixeltable as pxt
10
10
  import pixeltable.type_system as ts
11
11
  from pixeltable import exceptions as excs
12
12
 
13
+ from .utils import normalize_import_parameters, normalize_schema_names
14
+
13
15
  if typing.TYPE_CHECKING:
14
16
  import datasets # type: ignore[import-untyped]
15
17
 
@@ -28,29 +30,33 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
28
30
  'int64': ts.IntType(nullable=True),
29
31
  'bool': ts.BoolType(nullable=True),
30
32
  'float32': ts.FloatType(nullable=True),
33
+ 'float64': ts.FloatType(nullable=True),
34
+ 'large_string': ts.StringType(nullable=True),
31
35
  'string': ts.StringType(nullable=True),
32
36
  'timestamp[s]': ts.TimestampType(nullable=True),
33
37
  'timestamp[ms]': ts.TimestampType(nullable=True), # HF dataset iterator converts timestamps to datetime.datetime
38
+ 'timestamp[us]': ts.TimestampType(nullable=True),
34
39
  }
35
40
 
36
41
 
37
- def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
42
+ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.ColumnType]:
38
43
  """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
39
44
  import datasets
40
45
 
41
46
  if isinstance(feature_type, datasets.ClassLabel):
42
47
  # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
43
- return ts.StringType(nullable=True)
48
+ return ts.StringType(nullable=nullable)
44
49
  elif isinstance(feature_type, datasets.Value):
45
50
  # example: Value(dtype='int64', id=None)
46
- return _hf_to_pxt.get(feature_type.dtype, None)
51
+ pt = _hf_to_pxt.get(feature_type.dtype, None)
52
+ return pt.copy(nullable=nullable) if pt is not None else None
47
53
  elif isinstance(feature_type, datasets.Sequence):
48
54
  # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
49
- dtype = _to_pixeltable_type(feature_type.feature)
55
+ dtype = _to_pixeltable_type(feature_type.feature, nullable)
50
56
  length = feature_type.length if feature_type.length != -1 else None
51
57
  return ts.ArrayType(shape=(length,), dtype=dtype)
52
58
  elif isinstance(feature_type, datasets.Image):
53
- return ts.ImageType(nullable=True)
59
+ return ts.ImageType(nullable=nullable)
54
60
  else:
55
61
  return None
56
62
 
@@ -63,15 +69,17 @@ def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> da
63
69
  return first_dataset.features
64
70
 
65
71
 
66
- def huggingface_schema_to_pixeltable_schema(
67
- hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
72
+ def huggingface_schema_to_pxt_schema(
73
+ hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
68
74
  ) -> dict[str, Optional[ts.ColumnType]]:
69
75
  """Generate a pixeltable schema from a huggingface dataset schema.
70
76
  Columns without a known mapping are mapped to None
71
77
  """
72
- hf_schema = _get_hf_schema(hf_dataset)
73
78
  pixeltable_schema = {
74
- column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
79
+ column_name: _to_pixeltable_type(feature_type, column_name not in primary_key)
80
+ if column_name not in schema_overrides
81
+ else schema_overrides[column_name]
82
+ for column_name, feature_type in hf_schema.items()
75
83
  }
76
84
  return pixeltable_schema
77
85
 
@@ -82,6 +90,7 @@ def import_huggingface_dataset(
82
90
  *,
83
91
  column_name_for_split: Optional[str] = None,
84
92
  schema_overrides: Optional[dict[str, Any]] = None,
93
+ primary_key: Optional[Union[str, list[str]]] = None,
85
94
  **kwargs: Any,
86
95
  ) -> pxt.Table:
87
96
  """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
@@ -97,6 +106,7 @@ def import_huggingface_dataset(
97
106
  name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
98
107
  `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
99
108
  Pixeltable identifiers).
109
+ primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
100
110
  kwargs: Additional arguments to pass to `create_table`.
101
111
 
102
112
  Returns:
@@ -106,57 +116,47 @@ def import_huggingface_dataset(
106
116
 
107
117
  import pixeltable as pxt
108
118
 
109
- if table_path in pxt.list_tables():
110
- raise excs.Error(f'table {table_path} already exists')
111
-
112
119
  if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
113
120
  raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
114
121
 
115
- if isinstance(dataset, datasets.Dataset):
116
- # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
117
- raw_name = dataset.split._name
118
- split_name = raw_name.split('[')[0] if raw_name is not None else None
119
- dataset_dict = {split_name: dataset}
120
- else:
121
- dataset_dict = dataset
122
-
123
- pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
124
- if schema_overrides is not None:
125
- pixeltable_schema.update(schema_overrides)
122
+ # Create the pixeltable schema from the huggingface schema
123
+ hf_schema_source = _get_hf_schema(dataset)
124
+ schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
125
+ hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
126
126
 
127
+ # Add the split column to the schema if requested
127
128
  if column_name_for_split is not None:
128
- if column_name_for_split in pixeltable_schema:
129
+ if column_name_for_split in hf_schema:
129
130
  raise excs.Error(
130
131
  f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
131
132
  )
132
- pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
133
+ hf_schema[column_name_for_split] = ts.StringType(nullable=True)
133
134
 
134
- for field, column_type in pixeltable_schema.items():
135
- if column_type is None:
136
- raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
135
+ schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
136
+
137
+ # Prepare to create table and insert data
138
+ if table_path in pxt.list_tables():
139
+ raise excs.Error(f'table {table_path} already exists')
137
140
 
138
141
  if isinstance(dataset, datasets.Dataset):
139
142
  # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
140
143
  raw_name = dataset.split._name
141
144
  split_name = raw_name.split('[')[0] if raw_name is not None else None
142
145
  dataset_dict = {split_name: dataset}
143
- elif isinstance(dataset, datasets.DatasetDict):
144
- dataset_dict = dataset
145
146
  else:
146
- raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
147
+ dataset_dict = dataset
147
148
 
148
149
  # extract all class labels from the dataset to translate category ints to strings
149
- hf_schema = _get_hf_schema(dataset)
150
150
  categorical_features = {
151
151
  feature_name: feature_type.names
152
- for (feature_name, feature_type) in hf_schema.items()
152
+ for (feature_name, feature_type) in hf_schema_source.items()
153
153
  if isinstance(feature_type, datasets.ClassLabel)
154
154
  }
155
155
 
156
156
  try:
157
157
  # random tmp name
158
158
  tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
159
- tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
159
+ tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
160
160
 
161
161
  def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
162
162
  output_row = row.copy()
pixeltable/io/pandas.py CHANGED
@@ -2,17 +2,21 @@ from typing import Any, Optional, Union
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
5
+ from pandas._typing import DtypeObj # For pandas dtype type hints
6
+ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
5
7
 
6
8
  import pixeltable as pxt
7
9
  import pixeltable.exceptions as excs
8
- import pixeltable.type_system as ts
10
+ from pixeltable import Table
11
+
12
+ from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
9
13
 
10
14
 
11
15
  def import_pandas(
12
16
  tbl_name: str,
13
17
  df: pd.DataFrame,
14
18
  *,
15
- schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
19
+ schema_overrides: Optional[dict[str, Any]] = None,
16
20
  primary_key: Optional[Union[str, list[str]]] = None,
17
21
  num_retained_versions: int = 10,
18
22
  comment: str = '',
@@ -39,16 +43,16 @@ def import_pandas(
39
43
  Returns:
40
44
  A handle to the newly created [`Table`][pixeltable.Table].
41
45
  """
42
- if schema_overrides is None:
43
- schema_overrides = {}
44
- if primary_key is None:
45
- primary_key = []
46
- elif isinstance(primary_key, str):
47
- primary_key = [primary_key]
48
-
49
- schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
50
- tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
51
- table = pxt.create_table(
46
+ schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
47
+ pd_schema = df_infer_schema(df, schema_overrides, primary_key)
48
+ schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
49
+
50
+ __check_primary_key_values(df, primary_key)
51
+
52
+ # Convert all rows to insertable format
53
+ tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
54
+
55
+ table = find_or_create_table(
52
56
  tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
53
57
  )
54
58
  table.insert(tbl_rows)
@@ -58,7 +62,7 @@ def import_pandas(
58
62
  def import_csv(
59
63
  tbl_name: str,
60
64
  filepath_or_buffer,
61
- schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
65
+ schema_overrides: Optional[dict[str, Any]] = None,
62
66
  primary_key: Optional[Union[str, list[str]]] = None,
63
67
  num_retained_versions: int = 10,
64
68
  comment: str = '',
@@ -88,7 +92,7 @@ def import_excel(
88
92
  tbl_name: str,
89
93
  io,
90
94
  *args,
91
- schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
95
+ schema_overrides: Optional[dict[str, Any]] = None,
92
96
  primary_key: Optional[Union[str, list[str]]] = None,
93
97
  num_retained_versions: int = 10,
94
98
  comment: str = '',
@@ -114,82 +118,73 @@ def import_excel(
114
118
  )
115
119
 
116
120
 
117
- def __df_to_pxt_schema(
121
+ def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
122
+ for pd_name in primary_key:
123
+ # This can be faster for large DataFrames
124
+ has_nulls = df[pd_name].count() < len(df)
125
+ if has_nulls:
126
+ raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
127
+
128
+
129
+ def df_infer_schema(
118
130
  df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
119
- ) -> tuple[dict[str, pxt.ColumnType], list[str]]:
131
+ ) -> dict[str, pxt.ColumnType]:
120
132
  """
121
133
  Infers a Pixeltable schema from a Pandas DataFrame.
122
134
 
123
135
  Returns:
124
136
  A tuple containing a Pixeltable schema and a list of primary key column names.
125
137
  """
126
- for pd_name in schema_overrides:
127
- if pd_name not in df.columns:
128
- raise excs.Error(
129
- f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
130
- )
131
- for pd_name in primary_key:
132
- if pd_name not in df.columns:
133
- raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
134
-
135
- schema: dict[str, pxt.ColumnType] = {}
136
- col_mapping: dict[str, str] = {} # Maps Pandas column names to Pixeltable column names
137
-
138
+ pd_schema: dict[str, pxt.ColumnType] = {}
138
139
  for pd_name, pd_dtype in zip(df.columns, df.dtypes):
139
140
  if pd_name in schema_overrides:
140
141
  pxt_type = schema_overrides[pd_name]
141
142
  else:
142
- # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
143
- # general objects, so we need to check for nulls in the specific cases where we might expect them.
144
- # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
145
- # in object columns (where Pandas uses NaN as a general null).
146
- # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
147
- has_na = any(
148
- (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
149
- for val in df[pd_name]
150
- )
151
- if has_na and pd_name in primary_key:
152
- raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
153
- pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
154
- pxt_name = __normalize_pxt_col_name(pd_name)
155
- # Ensure that column names are unique by appending a distinguishing suffix
156
- # to any collisions
157
- if pxt_name in schema:
158
- n = 2
159
- while f'{pxt_name}_{n}' in schema:
160
- n += 1
161
- pxt_name = f'{pxt_name}_{n}'
162
- schema[pxt_name] = pxt_type
163
- col_mapping[pd_name] = pxt_name
164
-
165
- pxt_pk = [col_mapping[pk] for pk in primary_key]
166
- return schema, pxt_pk
167
-
168
-
169
- def __normalize_pxt_col_name(pd_name: str) -> str:
170
- """
171
- Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
172
- - replacing any non-ascii or non-alphanumeric characters with an underscore _
173
- - prefixing the result with the letter 'c' if it starts with an underscore or a number
143
+ pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
144
+ pd_schema[pd_name] = pxt_type
145
+
146
+ return pd_schema
147
+
148
+
149
+ """
150
+ # Check if a datetime64[ns, UTC] dtype
151
+ def is_datetime_tz_utc(x: Any) -> bool:
152
+ if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
153
+ return True
154
+ return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
155
+ """
156
+
157
+
158
+ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
174
159
  """
175
- id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in pd_name)
176
- if id[0].isnumeric():
177
- id = f'c_{id}'
178
- elif id[0] == '_':
179
- id = f'c{id}'
180
- assert pxt.catalog.is_valid_identifier(id), id
181
- return id
160
+ Determines a pixeltable ColumnType from a pandas dtype
182
161
 
162
+ Args:
163
+ pd_dtype: A pandas dtype object
183
164
 
184
- def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
165
+ Returns:
166
+ pxt.ColumnType: A pixeltable ColumnType
167
+ """
168
+ # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
169
+ # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
170
+ if is_datetime64_any_dtype(pd_dtype):
171
+ return pxt.TimestampType(nullable=nullable)
172
+ if is_extension_array_dtype(pd_dtype):
173
+ return None
174
+ # Most other pandas dtypes are directly NumPy compatible
175
+ assert isinstance(pd_dtype, np.dtype)
176
+ return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
177
+
178
+
179
+ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
185
180
  """
186
- Infers a Pixeltable type based on a Numpy dtype.
181
+ Infers a Pixeltable type based on a pandas dtype.
187
182
  """
188
- pxttype = ts.ArrayType.from_np_dtype(np_dtype, nullable)
183
+ pxttype = __pd_dtype_to_pxt_type(pd_dtype, nullable)
189
184
  if pxttype is not None:
190
185
  return pxttype
191
186
 
192
- if np_dtype == np.object_:
187
+ if pd_dtype == np.object_:
193
188
  # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
194
189
  # based on the actual data in `data_col`.
195
190
  # First drop any null values (they don't contribute to type inference).
@@ -206,11 +201,14 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
206
201
  else:
207
202
  return inferred_type.copy(nullable=nullable)
208
203
 
209
- raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
204
+ raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
210
205
 
211
206
 
212
- def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
213
- rows = {}
207
+ def __df_row_to_pxt_row(
208
+ row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
209
+ ) -> dict[str, Any]:
210
+ """Convert a row to insertable format"""
211
+ pxt_row: dict[str, Any] = {}
214
212
  for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
215
213
  if pxt_type.is_float_type():
216
214
  val = float(val)
@@ -232,5 +230,6 @@ def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType])
232
230
  val = None
233
231
  else:
234
232
  val = pd.Timestamp(val).to_pydatetime()
235
- rows[col_name] = val
236
- return rows
233
+ pxt_name = col_name if col_mapping is None else col_mapping[col_name]
234
+ pxt_row[pxt_name] = val
235
+ return pxt_row