pixeltable 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (55)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/column.py +3 -0
  3. pixeltable/catalog/dir.py +1 -1
  4. pixeltable/catalog/globals.py +15 -6
  5. pixeltable/catalog/insertable_table.py +23 -8
  6. pixeltable/catalog/named_function.py +1 -1
  7. pixeltable/catalog/path_dict.py +4 -4
  8. pixeltable/catalog/schema_object.py +30 -18
  9. pixeltable/catalog/table.py +84 -99
  10. pixeltable/catalog/table_version.py +35 -24
  11. pixeltable/catalog/table_version_path.py +2 -2
  12. pixeltable/catalog/view.py +15 -8
  13. pixeltable/dataframe.py +56 -56
  14. pixeltable/env.py +7 -5
  15. pixeltable/exec/__init__.py +3 -3
  16. pixeltable/exec/aggregation_node.py +3 -3
  17. pixeltable/exec/expr_eval_node.py +3 -3
  18. pixeltable/exec/in_memory_data_node.py +4 -4
  19. pixeltable/exec/sql_node.py +4 -1
  20. pixeltable/exprs/array_slice.py +3 -4
  21. pixeltable/exprs/column_ref.py +20 -4
  22. pixeltable/exprs/comparison.py +11 -6
  23. pixeltable/exprs/data_row.py +3 -0
  24. pixeltable/exprs/expr.py +51 -23
  25. pixeltable/exprs/function_call.py +8 -1
  26. pixeltable/exprs/inline_array.py +2 -2
  27. pixeltable/exprs/json_path.py +36 -20
  28. pixeltable/exprs/row_builder.py +4 -4
  29. pixeltable/exprs/rowid_ref.py +1 -1
  30. pixeltable/functions/__init__.py +1 -2
  31. pixeltable/functions/anthropic.py +97 -0
  32. pixeltable/functions/audio.py +32 -0
  33. pixeltable/functions/fireworks.py +1 -1
  34. pixeltable/functions/huggingface.py +4 -4
  35. pixeltable/functions/image.py +1 -1
  36. pixeltable/functions/together.py +1 -1
  37. pixeltable/functions/video.py +5 -1
  38. pixeltable/functions/vision.py +2 -6
  39. pixeltable/globals.py +57 -28
  40. pixeltable/io/external_store.py +4 -4
  41. pixeltable/io/globals.py +12 -13
  42. pixeltable/io/label_studio.py +6 -6
  43. pixeltable/io/pandas.py +27 -12
  44. pixeltable/io/parquet.py +14 -14
  45. pixeltable/iterators/document.py +7 -7
  46. pixeltable/plan.py +58 -29
  47. pixeltable/store.py +32 -31
  48. pixeltable/tool/create_test_db_dump.py +12 -6
  49. pixeltable/type_system.py +89 -97
  50. pixeltable/utils/pytorch.py +12 -10
  51. {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/METADATA +10 -10
  52. {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/RECORD +55 -53
  53. {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/LICENSE +0 -0
  54. {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/WHEEL +0 -0
  55. {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED
@@ -1,15 +1,18 @@
 import dataclasses
 import logging
 from typing import Any, Optional, Union
+from uuid import UUID

 import pandas as pd
 import sqlalchemy as sql
+from pandas.io.formats.style import Styler
 from sqlalchemy.util.preloaded import orm

 import pixeltable.exceptions as excs
 import pixeltable.exprs as exprs
-from pixeltable import catalog, func, DataFrame
+from pixeltable import DataFrame, catalog, func
 from pixeltable.catalog import Catalog
+from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
 from pixeltable.metadata import schema
@@ -24,21 +27,25 @@ def init() -> None:

 def create_table(
     path_str: str,
-    schema: dict[str, Any],
+    schema_or_df: Union[dict[str, Any], DataFrame],
     *,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-) -> catalog.InsertableTable:
-    """Create a new `InsertableTable`.
+) -> catalog.Table:
+    """Create a new base table.

     Args:
         path_str: Path to the table.
-        schema: dictionary mapping column names to column types, value expressions, or to column specifications.
+        schema_or_df: Either a dictionary that maps column names to column types, or a
+            [`DataFrame`][pixeltable.DataFrame] whose contents and schema will be used to pre-populate the table.
+        primary_key: An optional column name or list of column names to use as the primary key(s) of the
+            table.
         num_retained_versions: Number of versions of the table to retain.
+        comment: An optional comment; its meaning is user-defined.

     Returns:
-        The newly created table.
+        A handle to the newly created [`Table`][pixeltable.Table].

     Raises:
         Error: if the path already exists or is invalid.
@@ -46,12 +53,27 @@ def create_table(
     Examples:
         Create a table with an int and a string column:

-        >>> table = cl.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
+        >>> table = pxt.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
+
+        Create a table from a select statement over an existing table `tbl`:
+
+        >>> table = pxt.create_table('my_table', tbl.where(tbl.col1 < 10).select(tbl.col2))
     """
     path = catalog.Path(path_str)
     Catalog.get().paths.check_is_valid(path, expected=None)
     dir = Catalog.get().paths[path.parent]

+    df: Optional[DataFrame] = None
+    if isinstance(schema_or_df, dict):
+        schema = schema_or_df
+    elif isinstance(schema_or_df, DataFrame):
+        df = schema_or_df
+        schema = df.schema
+    elif isinstance(schema_or_df, DataFrameResultSet):
+        raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame. (Is there an extraneous call to `collect()`?)')
+    else:
+        raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame.')
+
     if len(schema) == 0:
         raise excs.Error(f'Table schema is empty: `{path_str}`')

@@ -63,15 +85,17 @@ def create_table(
     if not isinstance(primary_key, list) or not all(isinstance(pk, str) for pk in primary_key):
         raise excs.Error('primary_key must be a single column name or a list of column names')

-    tbl = catalog.InsertableTable.create(
+    tbl = catalog.InsertableTable._create(
         dir._id,
         path.name,
         schema,
+        df,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
     )
     Catalog.get().paths[path] = tbl
+
     _logger.info(f'Created table `{path_str}`.')
     return tbl
 
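For orientation, a usage sketch of the new `schema_or_df` parameter, following the signature and docstring above (table and column names are illustrative):

```python
import pixeltable as pxt

# Schema-dictionary form (the pre-0.2.16 `schema` parameter, renamed):
films = pxt.create_table('films', {'title': pxt.StringType(), 'year': pxt.IntType()})

# New: pass a DataFrame to create the table and pre-populate it with the query result.
recent = pxt.create_table('recent_films', films.where(films.year >= 2020).select(films.title))
```

Passing a result set (e.g. from an extraneous `collect()`) now raises a pointed error, per the new `DataFrameResultSet` branch above.
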
@@ -87,25 +111,28 @@ def create_view(
     num_retained_versions: int = 10,
     comment: str = '',
     ignore_errors: bool = False,
-) -> catalog.View:
-    """Create a new `View`.
+) -> Optional[catalog.Table]:
+    """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).

     Args:
         path_str: Path to the view.
-        base: Table (i.e., table or view or snapshot) or DataFrame to base the view on.
+        base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
+            base the view on.
         schema: dictionary mapping column names to column types, value expressions, or to column specifications.
         filter: predicate to filter rows of the base table.
         is_snapshot: Whether the view is a snapshot.
         iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
             the base table.
         num_retained_versions: Number of versions of the view to retain.
+        comment: Optional comment for the view.
         ignore_errors: if True, fail silently if the path already exists or is invalid.

     Returns:
-        The newly created view.
+        A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
+            exists or is invalid and `ignore_errors=True`, returns `None`.

     Raises:
-        Error: if the path already exists or is invalid.
+        Error: if the path already exists or is invalid and `ignore_errors=False`.

     Examples:
         Create a view with an additional int and a string column and a filter:
@@ -140,7 +167,7 @@ def create_view(
         Catalog.get().paths.check_is_valid(path, expected=None)
     except Exception as e:
         if ignore_errors:
-            return
+            return None
         else:
             raise e
     dir = Catalog.get().paths[path.parent]
@@ -152,7 +179,7 @@ def create_view(
     else:
         iterator_class, iterator_args = iterator

-    view = catalog.View.create(
+    view = catalog.View._create(
         dir._id,
         path.name,
         base=tbl_version_path,
@@ -170,16 +197,16 @@


 def get_table(path: str) -> catalog.Table:
-    """Get a handle to a table (including views and snapshots).
+    """Get a handle to an existing table or view or snapshot.

     Args:
         path: Path to the table.

     Returns:
-        A `InsertableTable` or `View` object.
+        A handle to the [`Table`][pixeltable.Table].

     Raises:
-        Error: If the path does not exist or does not designate a table.
+        Error: If the path does not exist or does not designate a table object.

     Examples:
         Get handle for a table in the top-level directory:
@@ -197,6 +224,7 @@ def get_table(path: str) -> catalog.Table:
     p = catalog.Path(path)
     Catalog.get().paths.check_is_valid(p, expected=catalog.Table)
     obj = Catalog.get().paths[p]
+    assert isinstance(obj, catalog.Table)
     return obj

@@ -230,15 +258,15 @@ def move(path: str, new_path: str) -> None:


 def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
-    """Drop a table.
+    """Drop a table or view or snapshot.

     Args:
-        path: Path to the table.
+        path: Path to the [`Table`][pixeltable.Table].
         force: If `True`, will also drop all views or sub-views of this table.
         ignore_errors: Whether to ignore errors if the table does not exist.

     Raises:
-        Error: If the path does not exist or does not designate a table and ignore_errors is False.
+        Error: If the path does not exist or does not designate a table object and ignore_errors is False.

     Examples:
         >>> cl.drop_table('my_table')
@@ -256,7 +284,7 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
     tbl = cat.paths[path_obj]
     assert isinstance(tbl, catalog.Table)
     if len(cat.tbl_dependents[tbl._id]) > 0:
-        dependent_paths = [dep.path for dep in cat.tbl_dependents[tbl._id]]
+        dependent_paths = [dep._path for dep in cat.tbl_dependents[tbl._id]]
         if force:
            for dependent_path in dependent_paths:
                drop_table(dependent_path, force=True)
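
The loop above collects dependent view paths (now via the underscored `_path` accessor) and, with `force=True`, drops them recursively before the base table. A minimal sketch:

```python
import pixeltable as pxt

# Raises an Error listing the dependent views unless force=True:
pxt.drop_table('films', force=True)  # also drops all views and sub-views of 'films'
```
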
@@ -268,14 +296,14 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:


 def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
-    """List the tables in a directory.
+    """List the [`Table`][pixeltable.Table]s in a directory.

     Args:
         dir_path: Path to the directory. Defaults to the root directory.
         recursive: Whether to list tables in subdirectories as well.

     Returns:
-        A list of table paths.
+        A list of [`Table`][pixeltable.Table] paths.

     Raises:
         Error: If the path does not exist or does not designate a directory.
@@ -297,7 +325,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
     return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]


-def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
+def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.Dir]:
     """Create a directory.

     Args:
@@ -325,6 +353,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
             session.add(dir_record)
             session.flush()
             assert dir_record.id is not None
+            assert isinstance(dir_record.id, UUID)
             dir = catalog.Dir(dir_record.id, parent._id, path.name)
             Catalog.get().paths[path] = dir
             session.commit()
@@ -333,7 +362,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
             return dir
     except excs.Error as e:
         if ignore_errors:
-            return
+            return None
         else:
             raise e

@@ -415,7 +444,7 @@ def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
     return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Dir, recursive=recursive)]


-def list_functions() -> pd.DataFrame:
+def list_functions() -> Styler:
     """Returns information about all registered functions.

     Returns:
@@ -436,7 +465,7 @@ def list_functions() -> pd.DataFrame:
             'Return Type': [str(f.signature.get_return_type()) for f in functions],
         }
     )
-    pd_df = pd_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
+    pd_df = pd_df.style.set_properties(None, **{'text-align': 'left'}).set_table_styles(
         [dict(selector='th', props=[('text-align', 'center')])]
     )  # center-align headings
     return pd_df.hide(axis='index')
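
`list_functions()` always returned a styled object; the annotation now matches. The `set_properties(None, ...)` tweak passes the `subset` argument explicitly (where `None` means all cells), likely a typing fix. A sketch of consuming the new return type:

```python
import pixeltable as pxt
from pandas.io.formats.style import Styler

styled = pxt.list_functions()
assert isinstance(styled, Styler)  # previously annotated as pd.DataFrame
html = styled.to_html()            # a Styler renders via HTML, e.g. in notebooks
```
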
pixeltable/io/external_store.py CHANGED
@@ -217,17 +217,17 @@ class Project(ExternalStore, abc.ABC):
         resolved_col_mapping: dict[Column, str] = {}

         # Validate names
-        t_cols = table.column_names()
+        t_cols = set(table._schema.keys())
         for t_col, ext_col in col_mapping.items():
             if t_col not in t_cols:
                 if is_user_specified_col_mapping:
                     raise excs.Error(
-                        f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.name}` '
+                        f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table._name}` '
                         'contains no such column.'
                     )
                 else:
                     raise excs.Error(
-                        f'Column `{t_col}` does not exist in Table `{table.name}`. Either add a column `{t_col}`, '
+                        f'Column `{t_col}` does not exist in Table `{table._name}`. Either add a column `{t_col}`, '
                         f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
                     )
             if ext_col not in export_cols and ext_col not in import_cols:
@@ -238,7 +238,7 @@ class Project(ExternalStore, abc.ABC):
             col = table[t_col].col
             resolved_col_mapping[col] = ext_col
         # Validate column specs
-        t_col_types = table.column_types()
+        t_col_types = table._schema
         for t_col, ext_col in col_mapping.items():
             t_col_type = t_col_types[t_col]
             if ext_col in export_cols:
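
These hunks follow a broader rename in this release: public `Table` accessors such as `name`, `path`, `column_names()`, and `column_types()` move to underscore-prefixed forms (`_name`, `_path`, `_schema`), presumably to keep the attribute namespace clear for column references like `tbl.name`. A hypothetical before/after sketch:

```python
import pixeltable as pxt

tbl = pxt.get_table('films')
col_names = set(tbl._schema.keys())  # 0.2.15: tbl.column_names()
col_types = tbl._schema              # 0.2.15: dict via tbl.column_types()
print(tbl._name, col_names)          # 0.2.15: tbl.name
```
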
pixeltable/io/globals.py CHANGED
@@ -1,5 +1,4 @@
 from typing import Any, Literal, Optional, Union
-import urllib.request

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -19,7 +18,7 @@ def create_label_studio_project(
     **kwargs: Any
 ) -> SyncStatus:
     """
-    Create a new Label Studio project and link it to the specified `Table`.
+    Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].

     - A tutorial notebook with fully worked examples can be found here:
       [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
@@ -34,7 +33,7 @@ def create_label_studio_project(
     then the linked project will have a column named `image`. In addition, the linked project
     will always have a JSON-typed column `annotations` representing the output.

-    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    By default, Pixeltable will link each of these columns to a column of the specified [`Table`][pixeltable.Table]
     with the same name. If any of the data fields are missing, an exception will be raised. If
     the `annotations` column is missing, it will be created. The default names can be overridden
     by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
@@ -52,7 +51,7 @@ def create_label_studio_project(
     - `pip install boto3` (if using S3 import storage)

     Args:
-        t: The Table to link to.
+        t: The table to link to.
         label_config: The Label Studio project configuration, in XML format.
         name: An optional name for the new project in Pixeltable. If specified, must be a valid
             Pixeltable identifier and must not be the name of any other external data store
@@ -73,7 +72,7 @@ def create_label_studio_project(
             The default is `post`.
         col_mapping: An optional mapping of local column names to Label Studio fields.
         sync_immediately: If `True`, immediately perform an initial synchronization by
-            exporting all rows of the `Table` as Label Studio tasks.
+            exporting all rows of the table as Label Studio tasks.
         s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
             be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
             referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
@@ -148,15 +147,15 @@ def import_rows(
     comment: str = ''
 ) -> Table:
     """
-    Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
-    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    Creates a new base table from a list of dictionaries. The dictionaries must be of the
+    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
     supplied data, using the most specific type that can represent all the values in a column.

     If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
     Pixeltable will force the specified column to the specified type (and will not attempt any type inference
     for that column).

-    All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
+    All column types of the new table will be nullable unless explicitly specified as non-nullable in
     `schema_overrides`.

     Args:
@@ -169,7 +168,7 @@ def import_rows(
         comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).

     Returns:
-        The newly created `Table`.
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -187,7 +186,7 @@ def import_rows(
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
             # The column type will always be nullable by default.
-            col_type = pxt.ColumnType.infer_literal_type(value).copy(nullable=True)
+            col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
             if col_name not in schema:
                 schema[col_name] = col_type
             else:
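
A sketch of the inference rules described above, assuming `import_rows` is importable from `pixeltable.io` as this file's path suggests (names are illustrative):

```python
import pixeltable as pxt
from pixeltable.io import import_rows

rows = [{'name': 'alice', 'score': 1.5}, {'name': 'bob', 'score': 2.0}]
# 'name' is inferred as a nullable StringType; the override pins 'score' as non-nullable.
t = import_rows('people', rows, schema_overrides={'score': pxt.FloatType(nullable=False)})
```
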
@@ -230,8 +229,8 @@ def import_json(
     **kwargs: Any
 ) -> Table:
     """
-    Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
-    to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    Creates a new base table from a JSON file. This is a convenience method and is
+    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
     is the contents of the specified `filepath_or_url`.

     Args:
@@ -245,7 +244,7 @@ def import_json(
         kwargs: Additional keyword arguments to pass to `json.loads`.

     Returns:
-        The newly created `Table`.
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     import json
     import urllib.parse
pixeltable/io/label_studio.py CHANGED
@@ -105,7 +105,7 @@ class LabelStudioProject(Project):
         return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}

     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
-        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
+        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
                      f' (export: {export_data}, import: {import_data}).')
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -396,15 +396,15 @@ class LabelStudioProject(Project):
         updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
         if len(updates) > 0:
             _logger.info(
-                f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
+                f'Updating table `{t._name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
             )
             # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
             # batch_update on the actual ancestor table that holds the annotations column.
             # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
             ancestor = t
             while local_annotations_col not in ancestor._tbl_version.cols:
-                assert ancestor.base is not None
-                ancestor = ancestor.base
+                assert ancestor._base is not None
+                ancestor = ancestor._base
             update_status = ancestor.batch_update(updates)
             print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
             return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -565,7 +565,7 @@ class LabelStudioProject(Project):

         if title is None:
             # `title` defaults to table name
-            title = t.name
+            title = t._name

         # Create a column to hold the annotations, if one does not yet exist
         if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -573,7 +573,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t.column_names():
+        if local_annotations_column not in t._schema.keys():
             t[local_annotations_column] = pxt.JsonType(nullable=True)

         resolved_col_mapping = cls.validate_columns(
pixeltable/io/pandas.py CHANGED
@@ -15,11 +15,12 @@ def import_pandas(
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = ''
-) -> pxt.catalog.InsertableTable:
-    """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
-    will be inferred from the `DataFrame`.
+) -> pxt.Table:
+    """Creates a new base table from a Pandas
+    [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
+    specified name. The schema of the table will be inferred from the DataFrame.

-    The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
+    The column names of the new table will be identical to those in the DataFrame, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
     the following procedure:
     - first replace any non-alphanumeric characters with underscores;
@@ -33,6 +34,9 @@ def import_pandas(
     name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
     `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
     Pixeltable identifiers).
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -54,11 +58,15 @@ def import_csv(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.catalog.InsertableTable:
+) -> pxt.Table:
     """
-    Creates a new `Table` from a csv file. This is a convenience method and is equivalent
+    Creates a new base table from a csv file. This is a convenience method and is equivalent
     to calling `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_csv` for more details.
+    See the Pandas documentation for [`read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_csv(filepath_or_buffer, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -70,11 +78,15 @@ def import_excel(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.catalog.InsertableTable:
+) -> pxt.Table:
     """
-    Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
-    to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_excel` for more details.
+    Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
+    equivalent to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
+    See the Pandas documentation for [`read_excel`](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_excel(io, *args, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
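
Both wrappers now advertise the generic `pxt.Table` handle rather than the concrete `InsertableTable`. A sketch of the csv path, assuming the `pixeltable.io` re-export implied by this package layout (file names illustrative):

```python
import pixeltable as pxt
from pixeltable.io import import_csv

t = import_csv(
    'films_csv',
    'films.csv',
    schema_overrides={'year': pxt.IntType(nullable=False)},
    sep=',',  # extra keyword arguments are forwarded to pd.read_csv
)
```
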
@@ -177,7 +189,10 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool
         return pxt.FloatType(nullable=nullable)

     inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
-    if inferred_type is not None:
+    if inferred_type is None:
+        # Fallback on StringType if everything else fails
+        return pxt.StringType(nullable=nullable)
+    else:
         return inferred_type.copy(nullable=nullable)

     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
pixeltable/io/parquet.py CHANGED
@@ -19,12 +19,14 @@ from pixeltable.utils.transactional_directory import transactional_directory
 if typing.TYPE_CHECKING:
     import pixeltable as pxt
     import pyarrow as pa
+    from pyarrow import parquet

 _logger = logging.getLogger(__name__)


 def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
     import pyarrow as pa
+    from pyarrow import parquet

     pydict = {}
     for field in schema:
@@ -35,7 +37,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
         pydict[field.name] = value_batch[field.name]

     tab = pa.Table.from_pydict(pydict, schema=schema)
-    pa.parquet.write_table(tab, output_path)
+    parquet.write_table(tab, output_path)


 def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -55,23 +57,21 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
     """
     from pixeltable.utils.arrow import to_arrow_schema

-    column_names = df.get_column_names()
-    column_types = df.get_column_types()
-    type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
-    arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
+    arrow_schema = to_arrow_schema(df.schema)

     # store the changes atomically
     with transactional_directory(dest_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))  # pylint: disable=protected-access
+        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata

         batch_num = 0
-        current_value_batch: Dict[str, deque] = {k: deque() for k in column_names}
+        current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
         current_byte_estimate = 0

-        for data_row in df._exec():  # pylint: disable=protected-access
-            for col_name, col_type, e in zip(column_names, column_types, df._select_list_exprs):  # pylint: disable=protected-access
+        for data_row in df._exec():
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
                 val = data_row[e.slot_idx]
                 if val is None:
                     current_value_batch[col_name].append(val)
@@ -122,7 +122,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
             assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
             _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
             batch_num += 1
-            current_value_batch = {k: deque() for k in column_names}
+            current_value_batch = {k: deque() for k in df.schema.keys()}
             current_byte_estimate = 0

     _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
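
A usage sketch of `save_parquet` after the switch to the DataFrame's consolidated `schema` property (paths illustrative):

```python
from pathlib import Path

import pixeltable as pxt
from pixeltable.io.parquet import save_parquet

films = pxt.get_table('films')
# Column names and types are now both read from df.schema rather than two parallel lists.
save_parquet(films.select(films.title, films.year), Path('/tmp/films_pq'))
```
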
@@ -130,11 +130,11 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:


 def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-    import pyarrow as pa
+    from pyarrow import parquet
     from pixeltable.utils.arrow import to_pixeltable_schema

     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = pa.parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(input_path)
     return to_pixeltable_schema(parquet_dataset.schema)

@@ -159,11 +159,11 @@ def import_parquet(
         The newly created table. The table will have loaded the data from the Parquet file(s).
     """
     import pixeltable as pxt
-    import pyarrow as pa
+    from pyarrow import parquet
     from pixeltable.utils.arrow import iter_tuples

     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = pa.parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(input_path)

     schema = parquet_schema_to_pixeltable_schema(parquet_path)
     if schema_override is None:
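
The recurring `pa.parquet` → `from pyarrow import parquet` edits in this file are a correctness fix: `import pyarrow as pa` does not reliably bind the `pyarrow.parquet` submodule, so `pa.parquet.…` can raise `AttributeError` depending on what else has been imported. The safe pattern:

```python
import pyarrow as pa
from pyarrow import parquet  # explicit submodule import; pa.parquet alone may be unbound

tab = pa.table({'x': [1, 2, 3]})
parquet.write_table(tab, '/tmp/example.parquet')
print(parquet.ParquetDataset('/tmp/example.parquet').schema)
```
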
pixeltable/iterators/document.py CHANGED
@@ -38,7 +38,7 @@ class DocumentSectionMetadata:
     sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[Dict[int, str]] = None
+    heading: Optional[Dict[str, str]] = None

     # pdf-specific metadata
     page: Optional[int] = None
@@ -236,7 +236,7 @@ class DocumentSplitter(ComponentIterator):
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation

-        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline

         def update_metadata(el: bs4.Tag) -> None:
@@ -244,12 +244,11 @@ class DocumentSplitter(ComponentIterator):
             nonlocal headings, sourceline
             sourceline = el.sourceline
             if el.name in _HTML_HEADINGS:
-                level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings if l > level]
+                lower_levels = [l for l in headings if l > el.name]
                 for l in lower_levels:
                     del headings[l]
-                headings[level] = el.get_text().strip()
+                headings[el.name] = el.get_text().strip()

         def emit() -> None:
             nonlocal accumulated_text, headings, sourceline
@@ -295,13 +294,14 @@ class DocumentSplitter(ComponentIterator):
         # current state
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)

         def update_headings(heading: Dict) -> None:
             # update current state
             nonlocal headings
             assert 'type' in heading and heading['type'] == 'heading'
-            level = heading['attrs']['level']
+            lint = heading['attrs']['level']
+            level = f'h{lint}'
             text = heading['children'][0]['raw'].strip()
             # remove the previously seen lower levels
             lower_levels = [l for l in headings.keys() if l > level]
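
Both the HTML and Markdown paths now key `headings` on tag names ('h1' through 'h6') instead of int levels; since the tags share a prefix and a single digit, lexicographic comparison preserves the numeric ordering that the pruning logic (`l > level`) relies on:

```python
# 'h1' < 'h2' < ... < 'h6' lexicographically, so string keys sort like int levels.
headings = {'h1': 'Intro', 'h2': 'Setup', 'h3': 'Details'}
level = 'h2'  # a newly observed h2 heading
for l in [k for k in headings if k > level]:
    del headings[l]            # drop h3 and below
headings[level] = 'Usage'      # record the new heading text
assert headings == {'h1': 'Intro', 'h2': 'Usage'}
```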