pixeltable 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (60)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +3 -11
  4. pixeltable/catalog/catalog.py +575 -220
  5. pixeltable/catalog/column.py +22 -23
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +2 -148
  8. pixeltable/catalog/insertable_table.py +15 -13
  9. pixeltable/catalog/path.py +6 -0
  10. pixeltable/catalog/schema_object.py +9 -4
  11. pixeltable/catalog/table.py +96 -85
  12. pixeltable/catalog/table_version.py +257 -174
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/tbl_ops.py +44 -0
  15. pixeltable/catalog/update_status.py +179 -0
  16. pixeltable/catalog/view.py +50 -56
  17. pixeltable/config.py +76 -12
  18. pixeltable/dataframe.py +19 -6
  19. pixeltable/env.py +50 -4
  20. pixeltable/exec/data_row_batch.py +3 -1
  21. pixeltable/exec/exec_node.py +7 -24
  22. pixeltable/exec/expr_eval/schedulers.py +134 -7
  23. pixeltable/exec/in_memory_data_node.py +6 -7
  24. pixeltable/exprs/column_property_ref.py +21 -9
  25. pixeltable/exprs/column_ref.py +7 -2
  26. pixeltable/exprs/function_call.py +2 -2
  27. pixeltable/exprs/row_builder.py +10 -9
  28. pixeltable/exprs/rowid_ref.py +0 -4
  29. pixeltable/func/function.py +3 -3
  30. pixeltable/functions/audio.py +36 -9
  31. pixeltable/functions/gemini.py +4 -4
  32. pixeltable/functions/openai.py +1 -2
  33. pixeltable/functions/video.py +59 -16
  34. pixeltable/globals.py +109 -24
  35. pixeltable/io/__init__.py +1 -1
  36. pixeltable/io/datarows.py +2 -1
  37. pixeltable/io/external_store.py +3 -55
  38. pixeltable/io/globals.py +4 -4
  39. pixeltable/io/hf_datasets.py +10 -2
  40. pixeltable/io/label_studio.py +16 -16
  41. pixeltable/io/pandas.py +1 -0
  42. pixeltable/io/table_data_conduit.py +12 -13
  43. pixeltable/iterators/audio.py +17 -8
  44. pixeltable/iterators/image.py +5 -2
  45. pixeltable/metadata/__init__.py +1 -1
  46. pixeltable/metadata/converters/convert_39.py +125 -0
  47. pixeltable/metadata/converters/util.py +3 -0
  48. pixeltable/metadata/notes.py +1 -0
  49. pixeltable/metadata/schema.py +50 -1
  50. pixeltable/plan.py +4 -0
  51. pixeltable/share/packager.py +20 -38
  52. pixeltable/store.py +40 -51
  53. pixeltable/type_system.py +2 -2
  54. pixeltable/utils/coroutine.py +6 -23
  55. pixeltable/utils/media_store.py +50 -0
  56. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
  57. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
  58. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
  59. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
  60. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
pixeltable/functions/video.py CHANGED
@@ -1,19 +1,7 @@
1
1
  """
2
2
  Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
3
-
4
- Example:
5
- ```python
6
- import pixeltable as pxt
7
- import pixeltable.functions as pxtf
8
-
9
- t = pxt.get_table(...)
10
- t.select(pxtf.video.extract_audio(t.video_col)).collect()
11
- ```
12
3
  """
13
4
 
14
- import tempfile
15
- import uuid
16
- from pathlib import Path
17
5
  from typing import Any, Optional
18
6
 
19
7
  import av
@@ -68,8 +56,7 @@ class make_video(pxt.Aggregator):
68
56
  if frame is None:
69
57
  return
70
58
  if self.container is None:
71
- (_, output_filename) = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
72
- self.out_file = Path(output_filename)
59
+ self.out_file = env.Env.get().create_tmp_path('.mp4')
73
60
  self.container = av.open(str(self.out_file), mode='w')
74
61
  self.stream = self.container.add_stream('h264', rate=self.fps)
75
62
  self.stream.pix_fmt = 'yuv420p'
@@ -92,12 +79,22 @@ def extract_audio(
92
79
  video_path: pxt.Video, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
93
80
  ) -> pxt.Audio:
94
81
  """
95
- Extract an audio stream from a video file, save it as a media file and return its path.
82
+ Extract an audio stream from a video.
96
83
 
97
84
  Args:
98
85
  stream_idx: Index of the audio stream to extract.
99
86
  format: The target audio format. (`'wav'`, `'mp3'`, `'flac'`).
100
87
  codec: The codec to use for the audio stream. If not provided, a default codec will be used.
88
+
89
+ Returns:
90
+ The extracted audio.
91
+
92
+ Examples:
93
+ Add a computed column to a table `tbl` that extracts audio from an existing column `video_col`:
94
+
95
+ >>> tbl.add_computed_column(
96
+ ... extracted_audio=tbl.video_col.extract_audio(format='flac')
97
+ ... )
101
98
  """
102
99
  if format not in _format_defaults:
103
100
  raise ValueError(f'extract_audio(): unsupported audio format: {format}')
@@ -108,7 +105,7 @@ def extract_audio(
108
105
  return None
109
106
  audio_stream = container.streams.audio[stream_idx]
110
107
  # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
111
- output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')
108
+ output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
112
109
 
113
110
  with av.open(output_filename, 'w', format=format) as output_container:
114
111
  output_stream = output_container.add_stream(codec or default_codec)
@@ -124,6 +121,52 @@ def extract_audio(
124
121
  def get_metadata(video: pxt.Video) -> dict:
125
122
  """
126
123
  Gets various metadata associated with a video file and returns it as a dictionary.
124
+
125
+ Args:
126
+ video: The video to get metadata for.
127
+
128
+ Returns:
129
+ A `dict` such as the following:
130
+
131
+ ```json
132
+ {
133
+ 'bit_exact': False,
134
+ 'bit_rate': 967260,
135
+ 'size': 2234371,
136
+ 'metadata': {
137
+ 'encoder': 'Lavf60.16.100',
138
+ 'major_brand': 'isom',
139
+ 'minor_version': '512',
140
+ 'compatible_brands': 'isomiso2avc1mp41',
141
+ },
142
+ 'streams': [
143
+ {
144
+ 'type': 'video',
145
+ 'width': 640,
146
+ 'height': 360,
147
+ 'frames': 462,
148
+ 'time_base': 1.0 / 12800,
149
+ 'duration': 236544,
150
+ 'duration_seconds': 236544.0 / 12800,
151
+ 'average_rate': 25.0,
152
+ 'base_rate': 25.0,
153
+ 'guessed_rate': 25.0,
154
+ 'metadata': {
155
+ 'language': 'und',
156
+ 'handler_name': 'L-SMASH Video Handler',
157
+ 'vendor_id': '[0][0][0][0]',
158
+ 'encoder': 'Lavc60.31.102 libx264',
159
+ },
160
+ 'codec_context': {'name': 'h264', 'codec_tag': 'avc1', 'profile': 'High', 'pix_fmt': 'yuv420p'},
161
+ }
162
+ ],
163
+ }
164
+ ```
165
+
166
+ Examples:
167
+ Extract metadata for files in the `video_col` column of the table `tbl`:
168
+
169
+ >>> tbl.select(tbl.video_col.get_metadata()).collect()
127
170
  """
128
171
  return _get_metadata(video)
129
172
 
pixeltable/globals.py CHANGED
@@ -8,9 +8,10 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
8
8
  import pandas as pd
9
9
  from pandas.io.formats.style import Styler
10
10
 
11
- from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
11
+ from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
12
12
  from pixeltable.catalog import Catalog, TableVersionPath
13
13
  from pixeltable.catalog.insertable_table import OnErrorParameter
14
+ from pixeltable.config import Config
14
15
  from pixeltable.env import Env
15
16
  from pixeltable.iterators import ComponentIterator
16
17
 
@@ -34,13 +35,16 @@ if TYPE_CHECKING:
34
35
  _logger = logging.getLogger('pixeltable')
35
36
 
36
37
 
37
- def init() -> None:
38
+ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
38
39
  """Initializes the Pixeltable environment."""
40
+ if config_overrides is None:
41
+ config_overrides = {}
42
+ Config.init(config_overrides)
39
43
  _ = Catalog.get()
40
44
 
41
45
 
42
46
  def create_table(
43
- path_str: str,
47
+ path: str,
44
48
  schema: Optional[dict[str, Any]] = None,
45
49
  *,
46
50
  source: Optional[TableDataSource] = None,
@@ -54,14 +58,24 @@ def create_table(
54
58
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
55
59
  extra_args: Optional[dict[str, Any]] = None, # Additional arguments to data source provider
56
60
  ) -> catalog.Table:
57
- """Create a new base table.
61
+ """Create a new base table. Exactly one of `schema` or `source` must be provided.
62
+
63
+ If a `schema` is provided, then an empty table will be created with the specified schema.
64
+
65
+ If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
66
+ contents of the specified data, and the data will be imported from the specified source into the new table. The
67
+ source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
58
68
 
59
69
  Args:
60
- path_str: Path to the table.
61
- schema: A dictionary that maps column names to column types
62
- source: A data source from which a table schema can be inferred and data imported
63
- source_format: A hint to the format of the source data
64
- schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
70
+ path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
71
+ schema: Schema for the new table, mapping column names to Pixeltable types.
72
+ source: A data source (file, URL, DataFrame, or list of rows) to import from.
73
+ source_format: Must be used in conjunction with a `source`.
74
+ If specified, then the given format will be used to read the source data. (Otherwise,
75
+ Pixeltable will attempt to infer the format from the source data.)
76
+ schema_overrides: Must be used in conjunction with a `source`.
77
+ If specified, then columns in `schema_overrides` will be given the specified types.
78
+ (Pixeltable will attempt to infer the types of any columns not specified.)
65
79
  on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
66
80
  invalid media file (such as a corrupt image) for one of the inserted rows.
67
81
 
@@ -77,14 +91,15 @@ def create_table(
77
91
 
78
92
  - `'on_read'`: validate media files at query time
79
93
  - `'on_write'`: validate media files during insert/update operations
80
- if_exists: Directive regarding how to handle if the path already exists.
81
- Must be one of the following:
94
+ if_exists: Determines the behavior if a table already exists at the specified path location.
82
95
 
83
96
  - `'error'`: raise an error
84
97
  - `'ignore'`: do nothing and return the existing table handle
85
- - `'replace'`: if the existing table has no views, drop and replace it with a new one
86
- - `'replace_force'`: drop the existing table and all its views, and create a new one
87
- extra_args: Additional arguments to pass to the source data provider
98
+ - `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
99
+ raise an error if the existing table has views or snapshots
100
+ - `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
101
+ extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
102
+ passed along to the source data provider.
88
103
 
89
104
  Returns:
90
105
  A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
@@ -110,7 +125,7 @@ def create_table(
110
125
  >>> tbl1 = pxt.get_table('orig_table')
111
126
  ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
112
127
 
113
- Create a table if does not already exist, otherwise get the existing table:
128
+ Create a table if it does not already exist, otherwise get the existing table:
114
129
 
115
130
  >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
116
131
 
@@ -126,12 +141,12 @@ def create_table(
126
141
  from pixeltable.io.utils import normalize_primary_key_parameter
127
142
 
128
143
  if (schema is None) == (source is None):
129
- raise excs.Error('Must provide either a `schema` or a `source`')
144
+ raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
130
145
 
131
146
  if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
132
147
  raise excs.Error('`schema` must be a non-empty dictionary')
133
148
 
134
- path_obj = catalog.Path(path_str)
149
+ path_obj = catalog.Path(path)
135
150
  if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
136
151
  media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
137
152
  primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
@@ -142,7 +157,14 @@ def create_table(
142
157
  tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
143
158
  tds.check_source_format()
144
159
  data_source = tds.specialize()
145
- data_source.src_schema_overrides = schema_overrides
160
+ src_schema_overrides: dict[str, ts.ColumnType] = {}
161
+ if schema_overrides is not None:
162
+ for col_name, py_type in schema_overrides.items():
163
+ col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
164
+ if col_type is None:
165
+ raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
166
+ src_schema_overrides[col_name] = col_type
167
+ data_source.src_schema_overrides = src_schema_overrides
146
168
  data_source.src_pk = primary_key
147
169
  data_source.infer_schema()
148
170
  schema = data_source.pxt_schema
@@ -251,9 +273,7 @@ def create_view(
251
273
  tbl_version_path = base._tbl_version_path
252
274
  sample_clause = None
253
275
  elif isinstance(base, DataFrame):
254
- base._validate_mutable('create_view', allow_select=True)
255
- if len(base._from_clause.tbls) > 1:
256
- raise excs.Error('Cannot create a view of a join')
276
+ base._validate_mutable_op_sequence('create_view', allow_select=True)
257
277
  tbl_version_path = base._from_clause.tbls[0]
258
278
  where = base.where_clause
259
279
  sample_clause = base.sample_clause
@@ -533,9 +553,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
533
553
 
534
554
  >>> pxt.list_tables('dir1')
535
555
  """
536
- path_obj = catalog.Path(dir_path, empty_is_valid=True) # validate format
537
- cat = Catalog.get()
538
- contents = cat.get_dir_contents(path_obj, recursive=recursive)
556
+ return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
557
+
558
+
559
+ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
560
+ path_obj = catalog.Path(dir_path, empty_is_valid=True, allow_system_paths=allow_system_paths)
561
+ contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
539
562
  return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
540
563
 
541
564
 
@@ -633,6 +656,68 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
633
656
  Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
634
657
 
635
658
 
659
+ def ls(path: str = '') -> pd.DataFrame:
660
+ """
661
+ List the contents of a Pixeltable directory.
662
+
663
+ This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
664
+ including various attributes such as version and base table, as appropriate.
665
+
666
+ To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
667
+ [list_dirs()][pixeltable.list_dirs] instead.
668
+ """
669
+ from pixeltable.catalog import retry_loop
670
+ from pixeltable.metadata import schema
671
+
672
+ cat = Catalog.get()
673
+ path_obj = catalog.Path(path, empty_is_valid=True)
674
+ dir_entries = cat.get_dir_contents(path_obj)
675
+
676
+ @retry_loop(for_write=False)
677
+ def op() -> list[list[str]]:
678
+ rows: list[list[str]] = []
679
+ for name, entry in dir_entries.items():
680
+ if name.startswith('_'):
681
+ continue
682
+ if entry.dir is not None:
683
+ kind = 'dir'
684
+ version = ''
685
+ base = ''
686
+ else:
687
+ assert entry.table is not None
688
+ assert isinstance(entry.table, schema.Table)
689
+ tbl = cat.get_table_by_id(entry.table.id)
690
+ md = tbl.get_metadata()
691
+ base = md['base'] or ''
692
+ if base.startswith('_'):
693
+ base = '<anonymous base table>'
694
+ if md['is_snapshot']:
695
+ kind = 'snapshot'
696
+ elif md['is_view']:
697
+ kind = 'view'
698
+ else:
699
+ kind = 'table'
700
+ version = '' if kind == 'snapshot' else md['version']
701
+ if md['is_replica']:
702
+ kind = f'{kind}-replica'
703
+ rows.append([name, kind, version, base])
704
+ return rows
705
+
706
+ rows = op()
707
+
708
+ rows = sorted(rows, key=lambda x: x[0])
709
+ df = pd.DataFrame(
710
+ {
711
+ 'Name': [row[0] for row in rows],
712
+ 'Kind': [row[1] for row in rows],
713
+ 'Version': [row[2] for row in rows],
714
+ 'Base': [row[3] for row in rows],
715
+ },
716
+ index=([''] * len(rows)),
717
+ )
718
+ return df
719
+
720
+
636
721
  def _extract_paths(
637
722
  dir_entries: dict[str, Catalog.DirEntry],
638
723
  parent: catalog.Path,
pixeltable/io/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
1
  # ruff: noqa: F401
2
2
 
3
3
  from .datarows import import_json, import_rows
4
- from .external_store import ExternalStore, SyncStatus
4
+ from .external_store import ExternalStore
5
5
  from .globals import create_label_studio_project, export_images_as_fo_dataset
6
6
  from .hf_datasets import import_huggingface_dataset
7
7
  from .pandas import import_csv, import_excel, import_pandas
pixeltable/io/datarows.py CHANGED
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
8
8
 
9
9
 
10
10
  def _infer_schema_from_rows(
11
- rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
11
+ rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
12
12
  ) -> dict[str, ts.ColumnType]:
13
13
  schema: dict[str, ts.ColumnType] = {}
14
14
  cols_with_nones: set[str] = set()
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
20
20
  # in which the column names are encountered in the input data, even if `schema_overrides`
21
21
  # is specified.
22
22
  if col_name not in schema:
23
+ assert isinstance(schema_overrides[col_name], ts.ColumnType)
23
24
  schema[col_name] = schema_overrides[col_name]
24
25
  elif value is not None:
25
26
  # If `key` is not in `schema_overrides`, then we infer its type from the data.
pixeltable/io/external_store.py CHANGED
@@ -3,14 +3,13 @@ from __future__ import annotations
3
3
  import abc
4
4
  import itertools
5
5
  import logging
6
- from dataclasses import dataclass, field
7
6
  from typing import Any, Optional
8
7
 
9
8
  import pixeltable.exceptions as excs
10
9
  import pixeltable.type_system as ts
11
10
  from pixeltable import Column, Table
12
11
  from pixeltable.catalog import ColumnHandle, TableVersion
13
- from pixeltable.catalog.globals import RowCountStats, UpdateStatus
12
+ from pixeltable.catalog.update_status import UpdateStatus
14
13
 
15
14
  _logger = logging.getLogger('pixeltable')
16
15
 
@@ -46,7 +45,7 @@ class ExternalStore(abc.ABC):
46
45
  """
47
46
 
48
47
  @abc.abstractmethod
49
- def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
48
+ def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
50
49
  """
51
50
  Called by `Table.sync()` to implement store-specific synchronization logic.
52
51
  """
@@ -263,57 +262,6 @@ class Project(ExternalStore, abc.ABC):
263
262
  return resolved_col_mapping
264
263
 
265
264
 
266
- @dataclass(frozen=True)
267
- class SyncStatus:
268
- # stats for the rows affected by the operation in the external store
269
- ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
270
-
271
- # stats for the rows affected by the operation
272
- row_count_stats: RowCountStats = field(default_factory=RowCountStats)
273
-
274
- @property
275
- def num_excs(self) -> int:
276
- """
277
- Returns the total number of Pixeltable exceptions that occurred during the operation.
278
- """
279
- return self.row_count_stats.num_excs
280
-
281
- @property
282
- def pxt_rows_updated(self) -> int:
283
- """
284
- Returns the number of Pixeltable rows that were updated as a result of the operation.
285
- """
286
- return self.row_count_stats.upd_rows
287
-
288
- @property
289
- def external_rows_updated(self) -> int:
290
- return self.ext_row_count_stats.upd_rows
291
-
292
- @property
293
- def external_rows_created(self) -> int:
294
- return self.ext_row_count_stats.ins_rows
295
-
296
- @property
297
- def external_rows_deleted(self) -> int:
298
- return self.ext_row_count_stats.del_rows
299
-
300
- def __add__(self, other: 'SyncStatus') -> 'SyncStatus':
301
- """
302
- Add the sync status from two SyncStatus objects together.
303
- """
304
- return SyncStatus(
305
- ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
306
- row_count_stats=self.row_count_stats + other.row_count_stats,
307
- )
308
-
309
- @classmethod
310
- def from_update_status(cls, us: UpdateStatus) -> 'SyncStatus':
311
- """
312
- Copy information from an UpdateStatus to a SyncStatus.
313
- """
314
- return SyncStatus(row_count_stats=us.row_count_stats + us.cascade_row_count_stats)
315
-
316
-
317
265
  class MockProject(Project):
318
266
  """A project that cannot be synced, used mainly for testing."""
319
267
 
@@ -348,7 +296,7 @@ class MockProject(Project):
348
296
  def get_import_columns(self) -> dict[str, ts.ColumnType]:
349
297
  return self.import_cols
350
298
 
351
- def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
299
+ def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
352
300
  raise NotImplementedError()
353
301
 
354
302
  def delete(self) -> None:
pixeltable/io/globals.py CHANGED
@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
5
5
  import pixeltable as pxt
6
6
  import pixeltable.exceptions as excs
7
7
  from pixeltable import Table, exprs
8
+ from pixeltable.catalog.update_status import UpdateStatus
8
9
  from pixeltable.env import Env
9
- from pixeltable.io.external_store import SyncStatus
10
10
 
11
11
  if TYPE_CHECKING:
12
12
  import fiftyone as fo # type: ignore[import-untyped]
@@ -22,7 +22,7 @@ def create_label_studio_project(
22
22
  sync_immediately: bool = True,
23
23
  s3_configuration: Optional[dict[str, Any]] = None,
24
24
  **kwargs: Any,
25
- ) -> SyncStatus:
25
+ ) -> UpdateStatus:
26
26
  """
27
27
  Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
28
28
 
@@ -96,7 +96,7 @@ def create_label_studio_project(
96
96
  [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
97
97
 
98
98
  Returns:
99
- A `SyncStatus` representing the status of any synchronization operations that occurred.
99
+ An `UpdateStatus` representing the status of any synchronization operations that occurred.
100
100
 
101
101
  Examples:
102
102
  Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
@@ -136,7 +136,7 @@ def create_label_studio_project(
136
136
  if sync_immediately:
137
137
  return t.sync()
138
138
  else:
139
- return SyncStatus()
139
+ return UpdateStatus()
140
140
 
141
141
 
142
142
  def export_images_as_fo_dataset(
pixeltable/io/hf_datasets.py CHANGED
@@ -50,10 +50,18 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
50
50
  elif isinstance(feature_type, datasets.Sequence):
51
51
  # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
52
52
  dtype = _to_pixeltable_type(feature_type.feature, nullable)
53
- length = feature_type.length if feature_type.length != -1 else None
54
- return ts.ArrayType(shape=(length,), dtype=dtype)
53
+ if dtype is None:
54
+ return None
55
+ if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
56
+ length = feature_type.length if feature_type.length != -1 else None
57
+ return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
58
+ else:
59
+ # Sequence of dicts must be cast as Json
60
+ return ts.JsonType(nullable=nullable)
55
61
  elif isinstance(feature_type, datasets.Image):
56
62
  return ts.ImageType(nullable=nullable)
63
+ elif isinstance(feature_type, dict):
64
+ return ts.JsonType(nullable=nullable)
57
65
  else:
58
66
  return None
59
67
 
pixeltable/io/label_studio.py CHANGED
@@ -14,10 +14,10 @@ from requests.exceptions import HTTPError
14
14
  import pixeltable.type_system as ts
15
15
  from pixeltable import Column, Table, env, exceptions as excs
16
16
  from pixeltable.catalog import ColumnHandle
17
- from pixeltable.catalog.globals import RowCountStats
17
+ from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
18
18
  from pixeltable.config import Config
19
19
  from pixeltable.exprs import ColumnRef, DataRow, Expr
20
- from pixeltable.io.external_store import Project, SyncStatus
20
+ from pixeltable.io.external_store import Project
21
21
  from pixeltable.utils import coco
22
22
 
23
23
  # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
@@ -111,14 +111,14 @@ class LabelStudioProject(Project):
111
111
  """
112
112
  return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
113
113
 
114
- def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
114
+ def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
115
115
  _logger.info(
116
116
  f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
117
117
  f' (export: {export_data}, import: {import_data}).'
118
118
  )
119
119
  # Collect all existing tasks into a dict with entries `rowid: task`
120
120
  tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
121
- sync_status = SyncStatus()
121
+ sync_status = UpdateStatus()
122
122
  if export_data:
123
123
  export_sync_status = self.__update_tasks(t, tasks)
124
124
  sync_status += export_sync_status
@@ -148,7 +148,7 @@ class LabelStudioProject(Project):
148
148
  f'Label Studio project {self.project_title!r}.'
149
149
  )
150
150
 
151
- def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
151
+ def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
152
152
  """
153
153
  Updates all tasks in this Label Studio project based on the Pixeltable data:
154
154
  - Creates new tasks for rows that don't map to any existing task;
@@ -161,7 +161,7 @@ class LabelStudioProject(Project):
161
161
  t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
162
162
 
163
163
  if len(t_data_cols) == 0:
164
- return SyncStatus()
164
+ return UpdateStatus()
165
165
 
166
166
  # Columns in `t` that map to `rectanglelabels` preannotations
167
167
  t_rl_cols = [
@@ -192,7 +192,7 @@ class LabelStudioProject(Project):
192
192
  media_col: ColumnHandle,
193
193
  t_rl_cols: list[ColumnHandle],
194
194
  rl_info: list['_RectangleLabel'],
195
- ) -> SyncStatus:
195
+ ) -> UpdateStatus:
196
196
  is_stored = media_col.get().is_stored
197
197
  # If it's a stored column, we can use `localpath`
198
198
  localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
@@ -238,7 +238,7 @@ class LabelStudioProject(Project):
238
238
 
239
239
  env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
240
240
 
241
- sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
241
+ sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
242
242
 
243
243
  deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
244
244
  sync_status += deletion_sync_status
@@ -251,7 +251,7 @@ class LabelStudioProject(Project):
251
251
  t_data_cols: list[ColumnHandle],
252
252
  t_rl_cols: list[ColumnHandle],
253
253
  rl_info: list['_RectangleLabel'],
254
- ) -> SyncStatus:
254
+ ) -> UpdateStatus:
255
255
  ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
256
256
  expr_refs: dict[str, Expr] = {} # kwargs for the select statement
257
257
  for col in t_data_cols:
@@ -342,7 +342,7 @@ class LabelStudioProject(Project):
342
342
  f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
343
343
  )
344
344
 
345
- sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
345
+ sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
346
346
 
347
347
  deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
348
348
  sync_status += deletion_sync_status
@@ -367,7 +367,7 @@ class LabelStudioProject(Project):
367
367
 
368
368
  def __delete_stale_tasks(
369
369
  self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
370
- ) -> SyncStatus:
370
+ ) -> UpdateStatus:
371
371
  deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
372
372
  # Sanity check the math
373
373
  assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
@@ -383,11 +383,11 @@ class LabelStudioProject(Project):
383
383
  for rowid in deleted_rowids:
384
384
  del existing_tasks[rowid]
385
385
 
386
- return SyncStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
386
+ return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
387
387
 
388
- def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
388
+ def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
389
389
  if ANNOTATIONS_COLUMN not in self.col_mapping.values():
390
- return SyncStatus()
390
+ return UpdateStatus()
391
391
 
392
392
  annotations = {
393
393
  # Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
@@ -422,9 +422,9 @@ class LabelStudioProject(Project):
422
422
  ancestor = ancestor._get_base_table()
423
423
  update_status = ancestor.batch_update(updates)
424
424
  env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
425
- return SyncStatus.from_update_status(update_status)
425
+ return update_status
426
426
  else:
427
- return SyncStatus()
427
+ return UpdateStatus()
428
428
 
429
429
  def as_dict(self) -> dict[str, Any]:
430
430
  return {
pixeltable/io/pandas.py CHANGED
@@ -132,6 +132,7 @@ def df_infer_schema(
132
132
  pd_schema: dict[str, ts.ColumnType] = {}
133
133
  for pd_name, pd_dtype in zip(df.columns, df.dtypes):
134
134
  if pd_name in schema_overrides:
135
+ assert isinstance(schema_overrides[pd_name], ts.ColumnType)
135
136
  pxt_type = schema_overrides[pd_name]
136
137
  else:
137
138
  pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)