pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (52)
  1. pixeltable/__init__.py +1 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +509 -103
  4. pixeltable/catalog/column.py +5 -0
  5. pixeltable/catalog/dir.py +15 -6
  6. pixeltable/catalog/globals.py +16 -0
  7. pixeltable/catalog/insertable_table.py +82 -41
  8. pixeltable/catalog/path.py +15 -0
  9. pixeltable/catalog/schema_object.py +7 -12
  10. pixeltable/catalog/table.py +81 -67
  11. pixeltable/catalog/table_version.py +23 -7
  12. pixeltable/catalog/view.py +9 -6
  13. pixeltable/env.py +15 -9
  14. pixeltable/exec/exec_node.py +1 -1
  15. pixeltable/exprs/__init__.py +2 -1
  16. pixeltable/exprs/arithmetic_expr.py +2 -0
  17. pixeltable/exprs/column_ref.py +38 -2
  18. pixeltable/exprs/expr.py +61 -12
  19. pixeltable/exprs/function_call.py +1 -4
  20. pixeltable/exprs/globals.py +12 -0
  21. pixeltable/exprs/json_mapper.py +4 -4
  22. pixeltable/exprs/json_path.py +10 -11
  23. pixeltable/exprs/similarity_expr.py +5 -20
  24. pixeltable/exprs/string_op.py +107 -0
  25. pixeltable/ext/functions/yolox.py +21 -64
  26. pixeltable/func/callable_function.py +5 -2
  27. pixeltable/func/query_template_function.py +6 -18
  28. pixeltable/func/tools.py +2 -2
  29. pixeltable/functions/__init__.py +1 -1
  30. pixeltable/functions/globals.py +16 -5
  31. pixeltable/globals.py +172 -262
  32. pixeltable/io/__init__.py +3 -2
  33. pixeltable/io/datarows.py +138 -0
  34. pixeltable/io/external_store.py +8 -5
  35. pixeltable/io/globals.py +7 -160
  36. pixeltable/io/hf_datasets.py +21 -98
  37. pixeltable/io/pandas.py +29 -43
  38. pixeltable/io/parquet.py +17 -42
  39. pixeltable/io/table_data_conduit.py +569 -0
  40. pixeltable/io/utils.py +6 -21
  41. pixeltable/metadata/__init__.py +1 -1
  42. pixeltable/metadata/converters/convert_30.py +50 -0
  43. pixeltable/metadata/converters/util.py +26 -1
  44. pixeltable/metadata/notes.py +1 -0
  45. pixeltable/metadata/schema.py +3 -0
  46. pixeltable/utils/arrow.py +32 -7
  47. pixeltable/utils/coroutine.py +41 -0
  48. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
  49. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/RECORD +52 -47
  50. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
  51. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
  52. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0
pixeltable/io/pandas.py CHANGED
@@ -7,9 +7,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
7
7
 
8
8
  import pixeltable as pxt
9
9
  import pixeltable.exceptions as excs
10
- from pixeltable import Table
11
-
12
- from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
13
10
 
14
11
 
15
12
  def import_pandas(
@@ -43,20 +40,14 @@ def import_pandas(
43
40
  Returns:
44
41
  A handle to the newly created [`Table`][pixeltable.Table].
45
42
  """
46
- schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
47
- pd_schema = df_infer_schema(df, schema_overrides, primary_key)
48
- schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
49
-
50
- __check_primary_key_values(df, primary_key)
51
-
52
- # Convert all rows to insertable format
53
- tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
54
-
55
- table = find_or_create_table(
56
- tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
43
+ return pxt.create_table(
44
+ tbl_name,
45
+ source=df,
46
+ schema_overrides=schema_overrides,
47
+ primary_key=primary_key,
48
+ num_retained_versions=num_retained_versions,
49
+ comment=comment,
57
50
  )
58
- table.insert(tbl_rows)
59
- return table
60
51
 
61
52
 
62
53
  def import_csv(
@@ -77,14 +68,14 @@ def import_csv(
77
68
  Returns:
78
69
  A handle to the newly created [`Table`][pixeltable.Table].
79
70
  """
80
- df = pd.read_csv(filepath_or_buffer, **kwargs)
81
- return import_pandas(
71
+ return pxt.create_table(
82
72
  tbl_name,
83
- df,
73
+ source=filepath_or_buffer,
84
74
  schema_overrides=schema_overrides,
85
75
  primary_key=primary_key,
86
76
  num_retained_versions=num_retained_versions,
87
77
  comment=comment,
78
+ extra_args=kwargs,
88
79
  )
89
80
 
90
81
 
@@ -107,18 +98,18 @@ def import_excel(
107
98
  Returns:
108
99
  A handle to the newly created [`Table`][pixeltable.Table].
109
100
  """
110
- df = pd.read_excel(io, *args, **kwargs)
111
- return import_pandas(
101
+ return pxt.create_table(
112
102
  tbl_name,
113
- df,
103
+ source=io,
114
104
  schema_overrides=schema_overrides,
115
105
  primary_key=primary_key,
116
106
  num_retained_versions=num_retained_versions,
117
107
  comment=comment,
108
+ extra_args=kwargs,
118
109
  )
119
110
 
120
111
 
121
- def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
112
+ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
122
113
  for pd_name in primary_key:
123
114
  # This can be faster for large DataFrames
124
115
  has_nulls = df[pd_name].count() < len(df)
@@ -146,15 +137,6 @@ def df_infer_schema(
146
137
  return pd_schema
147
138
 
148
139
 
149
- """
150
- # Check if a datetime64[ns, UTC] dtype
151
- def is_datetime_tz_utc(x: Any) -> bool:
152
- if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
153
- return True
154
- return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
155
- """
156
-
157
-
158
140
  def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
159
141
  """
160
142
  Determines a pixeltable ColumnType from a pandas dtype
@@ -165,7 +147,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
165
147
  Returns:
166
148
  pxt.ColumnType: A pixeltable ColumnType
167
149
  """
168
- # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
150
+ # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
151
+ # compatible with NumPy dtypes
169
152
  # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
170
153
  if is_datetime64_any_dtype(pd_dtype):
171
154
  return pxt.TimestampType(nullable=nullable)
@@ -204,32 +187,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
204
187
  raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
205
188
 
206
189
 
207
- def __df_row_to_pxt_row(
190
+ def _df_row_to_pxt_row(
208
191
  row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
209
192
  ) -> dict[str, Any]:
210
193
  """Convert a row to insertable format"""
211
194
  pxt_row: dict[str, Any] = {}
212
195
  for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
196
+ pxt_name = col_mapping.get(col_name, col_name)
197
+ nval: Any
213
198
  if pxt_type.is_float_type():
214
- val = float(val)
199
+ nval = float(val)
215
200
  elif isinstance(val, float) and np.isnan(val):
216
201
  # pandas uses NaN for empty cells, even for types other than float;
217
202
  # for any type but a float, convert these to None
218
- val = None
203
+ nval = None
219
204
  elif pxt_type.is_int_type():
220
- val = int(val)
205
+ nval = int(val)
221
206
  elif pxt_type.is_bool_type():
222
- val = bool(val)
207
+ nval = bool(val)
223
208
  elif pxt_type.is_string_type():
224
- val = str(val)
209
+ nval = str(val)
225
210
  elif pxt_type.is_timestamp_type():
226
211
  if pd.isnull(val):
227
212
  # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
228
213
  # much not-ok with it. (But if we convert it to None and then load out the
229
214
  # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
230
- val = None
215
+ nval = None
231
216
  else:
232
- val = pd.Timestamp(val).to_pydatetime()
233
- pxt_name = col_name if col_mapping is None else col_mapping[col_name]
234
- pxt_row[pxt_name] = val
217
+ nval = pd.Timestamp(val).to_pydatetime()
218
+ else:
219
+ nval = val
220
+ pxt_row[pxt_name] = nval
235
221
  return pxt_row
pixeltable/io/parquet.py CHANGED
@@ -4,7 +4,6 @@ import datetime
4
4
  import io
5
5
  import json
6
6
  import logging
7
- import random
8
7
  import typing
9
8
  from collections import deque
10
9
  from pathlib import Path
@@ -14,12 +13,10 @@ import numpy as np
14
13
  import PIL.Image
15
14
 
16
15
  import pixeltable as pxt
17
- import pixeltable.exceptions as exc
16
+ import pixeltable.exceptions as excs
18
17
  from pixeltable.env import Env
19
18
  from pixeltable.utils.transactional_directory import transactional_directory
20
19
 
21
- from .utils import normalize_import_parameters, normalize_schema_names
22
-
23
20
  if typing.TYPE_CHECKING:
24
21
  import pyarrow as pa
25
22
 
@@ -78,7 +75,7 @@ def export_parquet(
78
75
  arrow_schema = to_arrow_schema(df.schema)
79
76
 
80
77
  if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
81
- raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
78
+ raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
82
79
 
83
80
  # store the changes atomically
84
81
  with transactional_directory(parquet_path) as temp_path:
@@ -87,7 +84,7 @@ def export_parquet(
87
84
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
88
85
 
89
86
  batch_num = 0
90
- current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
87
+ current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
91
88
  current_byte_estimate = 0
92
89
 
93
90
  with Env.get().begin_xact():
@@ -111,7 +108,7 @@ def export_parquet(
111
108
  val.save(buf, format='PNG')
112
109
  val = buf.getvalue()
113
110
  else:
114
- assert False, f'unknown image type {type(val)}'
111
+ raise excs.Error(f'unknown image type {type(val)}')
115
112
  length = len(val)
116
113
  elif col_type.is_string_type():
117
114
  length = len(val)
@@ -119,16 +116,14 @@ def export_parquet(
119
116
  if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
120
117
  val = data_row.file_paths[e.slot_idx]
121
118
  else:
122
- assert False, f'unknown video type {type(val)}'
119
+ raise excs.Error(f'unknown video type {type(val)}')
123
120
  length = len(val)
124
121
  elif col_type.is_json_type():
125
122
  val = json.dumps(val)
126
123
  length = len(val)
127
124
  elif col_type.is_array_type():
128
125
  length = val.nbytes
129
- elif col_type.is_int_type():
130
- length = 8
131
- elif col_type.is_float_type():
126
+ elif col_type.is_int_type() or col_type.is_float_type():
132
127
  length = 8
133
128
  elif col_type.is_bool_type():
134
129
  length = 1
@@ -136,7 +131,7 @@ def export_parquet(
136
131
  val = val.astimezone(datetime.timezone.utc)
137
132
  length = 8
138
133
  else:
139
- assert False, f'unknown type {col_type} for {col_name}'
134
+ raise excs.Error(f'unknown type {col_type} for {col_name}')
140
135
 
141
136
  current_value_batch[col_name].append(val)
142
137
  current_byte_estimate += length
@@ -144,7 +139,7 @@ def export_parquet(
144
139
  assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
145
140
  _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
146
141
  batch_num += 1
147
- current_value_batch = {k: deque() for k in df.schema.keys()}
142
+ current_value_batch = {k: deque() for k in df.schema}
148
143
  current_byte_estimate = 0
149
144
 
150
145
  _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -173,32 +168,12 @@ def import_parquet(
173
168
  Returns:
174
169
  A handle to the newly created table.
175
170
  """
176
- from pyarrow import parquet
177
-
178
- from pixeltable.utils.arrow import ar_infer_schema, iter_tuples2
179
-
180
- input_path = Path(parquet_path).expanduser()
181
- parquet_dataset = parquet.ParquetDataset(str(input_path))
182
-
183
- schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
184
- ar_schema = ar_infer_schema(parquet_dataset.schema, schema_overrides, primary_key)
185
- schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
186
-
187
- if table in pxt.list_tables():
188
- raise exc.Error(f'Table {table} already exists')
189
-
190
- tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
191
- total_rows = 0
192
- try:
193
- tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
194
- for fragment in parquet_dataset.fragments: # type: ignore[attr-defined]
195
- for batch in fragment.to_batches():
196
- dict_batch = list(iter_tuples2(batch, col_mapping, schema))
197
- total_rows += len(dict_batch)
198
- tab.insert(dict_batch)
199
- except Exception as e:
200
- _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
201
- raise e
202
-
203
- pxt.move(tmp_name, table)
204
- return pxt.get_table(table)
171
+ value = kwargs.pop('source_format', None)
172
+ return pxt.create_table(
173
+ table,
174
+ source=parquet_path,
175
+ source_format=value,
176
+ schema_overrides=schema_overrides,
177
+ primary_key=primary_key,
178
+ extra_args=kwargs,
179
+ )