pixeltable 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (47)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -10
  4. pixeltable/catalog/catalog.py +64 -38
  5. pixeltable/catalog/column.py +22 -23
  6. pixeltable/catalog/globals.py +2 -148
  7. pixeltable/catalog/insertable_table.py +6 -4
  8. pixeltable/catalog/path.py +6 -0
  9. pixeltable/catalog/table.py +51 -32
  10. pixeltable/catalog/table_version.py +69 -45
  11. pixeltable/catalog/update_status.py +179 -0
  12. pixeltable/catalog/view.py +9 -2
  13. pixeltable/config.py +76 -12
  14. pixeltable/dataframe.py +1 -1
  15. pixeltable/env.py +29 -0
  16. pixeltable/exec/exec_node.py +7 -24
  17. pixeltable/exec/expr_eval/schedulers.py +134 -7
  18. pixeltable/exprs/column_property_ref.py +21 -9
  19. pixeltable/exprs/column_ref.py +5 -1
  20. pixeltable/exprs/function_call.py +2 -2
  21. pixeltable/exprs/row_builder.py +10 -9
  22. pixeltable/exprs/rowid_ref.py +0 -4
  23. pixeltable/func/function.py +3 -3
  24. pixeltable/functions/audio.py +36 -9
  25. pixeltable/functions/video.py +57 -10
  26. pixeltable/globals.py +61 -1
  27. pixeltable/io/__init__.py +1 -1
  28. pixeltable/io/external_store.py +3 -55
  29. pixeltable/io/globals.py +4 -4
  30. pixeltable/io/hf_datasets.py +10 -2
  31. pixeltable/io/label_studio.py +16 -16
  32. pixeltable/metadata/__init__.py +1 -1
  33. pixeltable/metadata/converters/convert_39.py +125 -0
  34. pixeltable/metadata/converters/util.py +3 -0
  35. pixeltable/metadata/notes.py +1 -0
  36. pixeltable/metadata/schema.py +14 -2
  37. pixeltable/plan.py +4 -0
  38. pixeltable/share/packager.py +20 -38
  39. pixeltable/store.py +18 -50
  40. pixeltable/type_system.py +2 -2
  41. pixeltable/utils/coroutine.py +6 -23
  42. pixeltable/utils/media_store.py +39 -0
  43. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
  44. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/RECORD +47 -45
  45. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
  46. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
  47. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/functions/audio.py CHANGED
@@ -1,14 +1,5 @@
 """
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `AudioType`.
-
-Example:
-```python
-import pixeltable as pxt
-import pixeltable.functions as pxtf
-
-t = pxt.get_table(...)
-t.select(pxtf.audio.get_metadata()).collect()
-```
 """
 
 import pixeltable as pxt
@@ -19,6 +10,42 @@ from pixeltable.utils.code import local_public_names
 def get_metadata(audio: pxt.Audio) -> dict:
     """
     Gets various metadata associated with an audio file and returns it as a dictionary.
+
+    Args:
+        audio: The audio to get metadata for.
+
+    Returns:
+        A `dict` such as the following:
+
+        ```json
+        {
+            'size': 2568827,
+            'streams': [
+                {
+                    'type': 'audio',
+                    'frames': 0,
+                    'duration': 2646000,
+                    'metadata': {},
+                    'time_base': 2.2675736961451248e-05,
+                    'codec_context': {
+                        'name': 'flac',
+                        'profile': None,
+                        'channels': 1,
+                        'codec_tag': '\\x00\\x00\\x00\\x00',
+                    },
+                    'duration_seconds': 60.0,
+                }
+            ],
+            'bit_rate': 342510,
+            'metadata': {'encoder': 'Lavf61.1.100'},
+            'bit_exact': False,
+        }
+        ```
+
+    Examples:
+        Extract metadata for files in the `audio_col` column of the table `tbl`:
+
+        >>> tbl.select(tbl.audio_col.get_metadata()).collect()
     """
     return pxt.functions.video._get_metadata(audio)
 
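Taken together with the new docstring, a minimal end-to-end sketch of the documented usage (the table name and file path below are hypothetical):

```python
import pixeltable as pxt

# Hypothetical table with one audio column.
t = pxt.create_table('audio_demo', {'clip': pxt.Audio})
t.insert([{'clip': '/tmp/sample.flac'}])  # hypothetical local file

# Per the docstring's example, the UDF can be called method-style on the column ref.
t.select(t.clip.get_metadata()).collect()
```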
pixeltable/functions/video.py CHANGED
@@ -1,14 +1,5 @@
 """
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
-
-Example:
-```python
-import pixeltable as pxt
-import pixeltable.functions as pxtf
-
-t = pxt.get_table(...)
-t.select(pxtf.video.extract_audio(t.video_col)).collect()
-```
 """
 
 import tempfile
@@ -92,12 +83,22 @@ def extract_audio(
     video_path: pxt.Video, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
 ) -> pxt.Audio:
     """
-    Extract an audio stream from a video file, save it as a media file and return its path.
+    Extract an audio stream from a video.
 
     Args:
         stream_idx: Index of the audio stream to extract.
         format: The target audio format. (`'wav'`, `'mp3'`, `'flac'`).
         codec: The codec to use for the audio stream. If not provided, a default codec will be used.
+
+    Returns:
+        The extracted audio.
+
+    Examples:
+        Add a computed column to a table `tbl` that extracts audio from an existing column `video_col`:
+
+        >>> tbl.add_computed_column(
+        ...     extracted_audio=tbl.video_col.extract_audio(format='flac')
+        ... )
     """
     if format not in _format_defaults:
         raise ValueError(f'extract_audio(): unsupported audio format: {format}')
@@ -124,6 +125,52 @@ def extract_audio(
 def get_metadata(video: pxt.Video) -> dict:
     """
     Gets various metadata associated with a video file and returns it as a dictionary.
+
+    Args:
+        video: The video to get metadata for.
+
+    Returns:
+        A `dict` such as the following:
+
+        ```json
+        {
+            'bit_exact': False,
+            'bit_rate': 967260,
+            'size': 2234371,
+            'metadata': {
+                'encoder': 'Lavf60.16.100',
+                'major_brand': 'isom',
+                'minor_version': '512',
+                'compatible_brands': 'isomiso2avc1mp41',
+            },
+            'streams': [
+                {
+                    'type': 'video',
+                    'width': 640,
+                    'height': 360,
+                    'frames': 462,
+                    'time_base': 1.0 / 12800,
+                    'duration': 236544,
+                    'duration_seconds': 236544.0 / 12800,
+                    'average_rate': 25.0,
+                    'base_rate': 25.0,
+                    'guessed_rate': 25.0,
+                    'metadata': {
+                        'language': 'und',
+                        'handler_name': 'L-SMASH Video Handler',
+                        'vendor_id': '[0][0][0][0]',
+                        'encoder': 'Lavc60.31.102 libx264',
+                    },
+                    'codec_context': {'name': 'h264', 'codec_tag': 'avc1', 'profile': 'High', 'pix_fmt': 'yuv420p'},
+                }
+            ],
+        }
+        ```
+
+    Examples:
+        Extract metadata for files in the `video_col` column of the table `tbl`:
+
+        >>> tbl.select(tbl.video_col.get_metadata()).collect()
     """
     return _get_metadata(video)
 
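The two video UDFs compose naturally; a sketch combining them (the table name and file path are hypothetical):

```python
import pixeltable as pxt

t = pxt.create_table('video_demo', {'video': pxt.Video})
t.insert([{'video': '/tmp/clip.mp4'}])  # hypothetical local file

# Materialize the audio track as a computed column (per the new Examples section),
# then inspect it with audio.get_metadata(), which delegates to video._get_metadata().
t.add_computed_column(extracted_audio=t.video.extract_audio(format='flac'))
t.select(t.extracted_audio.get_metadata()).collect()
```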
pixeltable/globals.py CHANGED
@@ -11,6 +11,7 @@ from pandas.io.formats.style import Styler
 from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
 from pixeltable.catalog import Catalog, TableVersionPath
 from pixeltable.catalog.insertable_table import OnErrorParameter
+from pixeltable.config import Config
 from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
 
@@ -34,8 +35,11 @@
 _logger = logging.getLogger('pixeltable')
 
 
-def init() -> None:
+def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
     """Initializes the Pixeltable environment."""
+    if config_overrides is None:
+        config_overrides = {}
+    Config.init(config_overrides)
     _ = Catalog.get()
 
 
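A sketch of the new entry point; `Config.init()` receives the overrides before the catalog is created. The override key shown is an assumption, not a documented option:

```python
import pixeltable as pxt

# Old call still works: pxt.init()
# New: pass config overrides programmatically instead of relying solely on
# config.toml or environment variables. The key below is hypothetical.
pxt.init(config_overrides={'pixeltable.verbosity': 1})
```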
@@ -633,6 +637,62 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
     Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
 
 
+def ls(path: str = '') -> pd.DataFrame:
+    """
+    List the contents of a Pixeltable directory.
+
+    This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
+    including various attributes such as version and base table, as appropriate.
+
+    To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
+    [list_dirs()][pixeltable.list_dirs] instead.
+    """
+    from pixeltable.metadata import schema
+
+    cat = Catalog.get()
+    path_obj = catalog.Path(path, empty_is_valid=True)
+    dir_entries = cat.get_dir_contents(path_obj)
+    rows: list[list[str]] = []
+    with Catalog.get().begin_xact():
+        for name, entry in dir_entries.items():
+            if name.startswith('_'):
+                continue
+            if entry.dir is not None:
+                kind = 'dir'
+                version = ''
+                base = ''
+            else:
+                assert entry.table is not None
+                assert isinstance(entry.table, schema.Table)
+                tbl = cat.get_table_by_id(entry.table.id)
+                md = tbl.get_metadata()
+                base = md['base'] or ''
+                if base.startswith('_'):
+                    base = '<anonymous base table>'
+                if md['is_snapshot']:
+                    kind = 'snapshot'
+                elif md['is_view']:
+                    kind = 'view'
+                else:
+                    kind = 'table'
+                version = '' if kind == 'snapshot' else md['version']
+                if md['is_replica']:
+                    kind = f'{kind}-replica'
+            rows.append([name, kind, version, base])
+
+    rows = sorted(rows, key=lambda x: x[0])
+    df = pd.DataFrame(
+        {
+            'Name': [row[0] for row in rows],
+            'Kind': [row[1] for row in rows],
+            'Version': [row[2] for row in rows],
+            'Base': [row[3] for row in rows],
+        },
+        index=([''] * len(rows)),
+    )
+    return df
+
+
 def _extract_paths(
     dir_entries: dict[str, Catalog.DirEntry],
     parent: catalog.Path,
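Usage is straightforward; the column set (`Name`, `Kind`, `Version`, `Base`) follows directly from the implementation above:

```python
import pixeltable as pxt

listing = pxt.ls()  # root directory; pass e.g. 'my_dir' for a subdirectory
print(listing)

# For programmatic access, the docstring points to these instead:
table_paths = pxt.list_tables()
dir_paths = pxt.list_dirs()
```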
pixeltable/io/__init__.py CHANGED
@@ -1,7 +1,7 @@
 # ruff: noqa: F401
 
 from .datarows import import_json, import_rows
-from .external_store import ExternalStore, SyncStatus
+from .external_store import ExternalStore
 from .globals import create_label_studio_project, export_images_as_fo_dataset
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
pixeltable/io/external_store.py CHANGED
@@ -3,14 +3,13 @@ from __future__ import annotations
 import abc
 import itertools
 import logging
-from dataclasses import dataclass, field
 from typing import Any, Optional
 
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 from pixeltable import Column, Table
 from pixeltable.catalog import ColumnHandle, TableVersion
-from pixeltable.catalog.globals import RowCountStats, UpdateStatus
+from pixeltable.catalog.update_status import UpdateStatus
 
 _logger = logging.getLogger('pixeltable')
 
@@ -46,7 +45,7 @@ class ExternalStore(abc.ABC):
     """
 
     @abc.abstractmethod
-    def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
         """
         Called by `Table.sync()` to implement store-specific synchronization logic.
         """
@@ -263,57 +262,6 @@ class Project(ExternalStore, abc.ABC):
         return resolved_col_mapping
 
 
-@dataclass(frozen=True)
-class SyncStatus:
-    # stats for the rows affected by the operation in the external store
-    ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
-
-    # stats for the rows affected by the operation
-    row_count_stats: RowCountStats = field(default_factory=RowCountStats)
-
-    @property
-    def num_excs(self) -> int:
-        """
-        Returns the total number of Pixeltable exceptions that occurred during the operation.
-        """
-        return self.row_count_stats.num_excs
-
-    @property
-    def pxt_rows_updated(self) -> int:
-        """
-        Returns the number of Pixeltable rows that were updated as a result of the operation.
-        """
-        return self.row_count_stats.upd_rows
-
-    @property
-    def external_rows_updated(self) -> int:
-        return self.ext_row_count_stats.upd_rows
-
-    @property
-    def external_rows_created(self) -> int:
-        return self.ext_row_count_stats.ins_rows
-
-    @property
-    def external_rows_deleted(self) -> int:
-        return self.ext_row_count_stats.del_rows
-
-    def __add__(self, other: 'SyncStatus') -> 'SyncStatus':
-        """
-        Add the sync status from two SyncStatus objects together.
-        """
-        return SyncStatus(
-            ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
-            row_count_stats=self.row_count_stats + other.row_count_stats,
-        )
-
-    @classmethod
-    def from_update_status(cls, us: UpdateStatus) -> 'SyncStatus':
-        """
-        Copy information from an UpdateStatus to a SyncStatus.
-        """
-        return SyncStatus(row_count_stats=us.row_count_stats + us.cascade_row_count_stats)
-
-
 class MockProject(Project):
     """A project that cannot be synced, used mainly for testing."""
 
@@ -348,7 +296,7 @@ class MockProject(Project):
     def get_import_columns(self) -> dict[str, ts.ColumnType]:
         return self.import_cols
 
-    def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
         raise NotImplementedError()
 
     def delete(self) -> None:
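With `SyncStatus` removed, `UpdateStatus` (now in `pixeltable.catalog.update_status`) takes over its role, including the `ext_row_count_stats` field and `+` accumulation seen in the Label Studio changes below. A sketch of the bookkeeping pattern (the helper function itself is hypothetical):

```python
from pixeltable.catalog.update_status import RowCountStats, UpdateStatus

def report_sync(tasks_created: int, tasks_deleted: int) -> UpdateStatus:
    # Accumulate per-phase stats the way LabelStudioProject.sync() does below.
    status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
    status += UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=tasks_deleted))
    return status
```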
pixeltable/io/globals.py CHANGED
@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import Table, exprs
+from pixeltable.catalog.update_status import UpdateStatus
 from pixeltable.env import Env
-from pixeltable.io.external_store import SyncStatus
 
 if TYPE_CHECKING:
     import fiftyone as fo  # type: ignore[import-untyped]
@@ -22,7 +22,7 @@ def create_label_studio_project(
     sync_immediately: bool = True,
     s3_configuration: Optional[dict[str, Any]] = None,
     **kwargs: Any,
-) -> SyncStatus:
+) -> UpdateStatus:
     """
     Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
 
@@ -96,7 +96,7 @@
     [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
 
     Returns:
-        A `SyncStatus` representing the status of any synchronization operations that occurred.
+        An `UpdateStatus` representing the status of any synchronization operations that occurred.
 
     Examples:
         Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
@@ -136,7 +136,7 @@
     if sync_immediately:
         return t.sync()
     else:
-        return SyncStatus()
+        return UpdateStatus()
 
 
 def export_images_as_fo_dataset(
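A sketch of calling the updated API; the labeling config and table are hypothetical, and a running Label Studio instance is assumed:

```python
import pixeltable as pxt
from pixeltable.io import create_label_studio_project

t = pxt.get_table('videos')  # hypothetical table with a `video_col` column

label_config = '''
<View>
  <Video name="video" value="$video_col"/>
  <Choices name="label" toName="video">
    <Choice value="ok"/>
  </Choices>
</View>
'''
# Now returns an UpdateStatus (previously a SyncStatus).
status = create_label_studio_project(t, label_config)
```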
pixeltable/io/hf_datasets.py CHANGED
@@ -50,10 +50,18 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
     elif isinstance(feature_type, datasets.Sequence):
         # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
         dtype = _to_pixeltable_type(feature_type.feature, nullable)
-        length = feature_type.length if feature_type.length != -1 else None
-        return ts.ArrayType(shape=(length,), dtype=dtype)
+        if dtype is None:
+            return None
+        if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
+            length = feature_type.length if feature_type.length != -1 else None
+            return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
+        else:
+            # Sequence of dicts must be cast as Json
+            return ts.JsonType(nullable=nullable)
     elif isinstance(feature_type, datasets.Image):
         return ts.ImageType(nullable=nullable)
+    elif isinstance(feature_type, dict):
+        return ts.JsonType(nullable=nullable)
     else:
         return None
 
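The practical effect: feature types that previously broke the Array mapping now import as Json. A sketch (the dataset choice is illustrative; `squad`'s `answers` feature is a Sequence of dicts, which the new branch maps to Json):

```python
import datasets
import pixeltable as pxt

# 'answers' is a Sequence whose element type is a dict -> now mapped to a
# Pixeltable Json column instead of an invalid Array type.
ds = datasets.load_dataset('squad', split='validation[:100]')
t = pxt.io.import_huggingface_dataset('squad_sample', ds)
```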
pixeltable/io/label_studio.py CHANGED
@@ -14,10 +14,10 @@ from requests.exceptions import HTTPError
 import pixeltable.type_system as ts
 from pixeltable import Column, Table, env, exceptions as excs
 from pixeltable.catalog import ColumnHandle
-from pixeltable.catalog.globals import RowCountStats
+from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
-from pixeltable.io.external_store import Project, SyncStatus
+from pixeltable.io.external_store import Project
 from pixeltable.utils import coco
 
 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
@@ -111,14 +111,14 @@ class LabelStudioProject(Project):
         """
         return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
 
-    def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
         _logger.info(
             f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
             f' (export: {export_data}, import: {import_data}).'
         )
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
-        sync_status = SyncStatus()
+        sync_status = UpdateStatus()
         if export_data:
             export_sync_status = self.__update_tasks(t, tasks)
             sync_status += export_sync_status
@@ -148,7 +148,7 @@ class LabelStudioProject(Project):
                 f'Label Studio project {self.project_title!r}.'
             )
 
-    def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
+    def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
         """
         Updates all tasks in this Label Studio project based on the Pixeltable data:
         - Creates new tasks for rows that don't map to any existing task;
@@ -161,7 +161,7 @@ class LabelStudioProject(Project):
         t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
 
         if len(t_data_cols) == 0:
-            return SyncStatus()
+            return UpdateStatus()
 
         # Columns in `t` that map to `rectanglelabels` preannotations
         t_rl_cols = [
@@ -192,7 +192,7 @@ class LabelStudioProject(Project):
         media_col: ColumnHandle,
         t_rl_cols: list[ColumnHandle],
         rl_info: list['_RectangleLabel'],
-    ) -> SyncStatus:
+    ) -> UpdateStatus:
         is_stored = media_col.get().is_stored
         # If it's a stored column, we can use `localpath`
         localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
@@ -238,7 +238,7 @@ class LabelStudioProject(Project):
 
         env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
 
-        sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
+        sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
 
         deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
         sync_status += deletion_sync_status
@@ -251,7 +251,7 @@ class LabelStudioProject(Project):
         t_data_cols: list[ColumnHandle],
         t_rl_cols: list[ColumnHandle],
         rl_info: list['_RectangleLabel'],
-    ) -> SyncStatus:
+    ) -> UpdateStatus:
         ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
         expr_refs: dict[str, Expr] = {}  # kwargs for the select statement
         for col in t_data_cols:
@@ -342,7 +342,7 @@ class LabelStudioProject(Project):
             f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
         )
 
-        sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
+        sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
 
         deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
         sync_status += deletion_sync_status
@@ -367,7 +367,7 @@ class LabelStudioProject(Project):
 
     def __delete_stale_tasks(
         self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
-    ) -> SyncStatus:
+    ) -> UpdateStatus:
         deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
         # Sanity check the math
         assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
@@ -383,11 +383,11 @@ class LabelStudioProject(Project):
         for rowid in deleted_rowids:
             del existing_tasks[rowid]
 
-        return SyncStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
+        return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
 
-    def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
+    def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
         if ANNOTATIONS_COLUMN not in self.col_mapping.values():
-            return SyncStatus()
+            return UpdateStatus()
 
         annotations = {
             # Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
@@ -422,9 +422,9 @@ class LabelStudioProject(Project):
                 ancestor = ancestor._get_base_table()
             update_status = ancestor.batch_update(updates)
             env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
-            return SyncStatus.from_update_status(update_status)
+            return update_status
         else:
-            return SyncStatus()
+            return UpdateStatus()
 
     def as_dict(self) -> dict[str, Any]:
         return {
pixeltable/metadata/__init__.py CHANGED
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 _logger = logging.getLogger('pixeltable')
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 39
+VERSION = 40
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_39.py ADDED
@@ -0,0 +1,125 @@
+import logging
+from typing import Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=39)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need to be migrated
+    col_names = find_error_columns(conn=conn, store_name=store_name)
+    if len(col_names) == 0:
+        _logger.info(f'No error columns found in table {store_name}. Skipping migration.')
+        return
+
+    # Check if the table exists, outside of the metadata we were given.
+    # There seem to be cases where the metadata is present in the catalog,
+    # but the table itself is not in the database.
+    check_table_sql = sql.text(f"""
+        SELECT EXISTS (
+            SELECT 1
+            FROM information_schema.tables
+            WHERE table_name = '{store_name}'
+        )
+    """)
+    table_exists = conn.execute(check_table_sql).scalar()
+    if not table_exists:
+        _logger.warning(f'Table {store_name} does not exist. Skipping migration.')
+        return
+
+    return migrate_error_to_cellmd_columns(conn, store_name, col_names)
+
+
+def find_error_columns(conn: sql.Connection, store_name: str) -> list[str]:
+    """
+    Return any errormsg or errortype columns in the given table.
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to check
+
+    Returns:
+        List of column name roots (root_errormsg, root_errortype)
+    """
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    found_columns = [
+        row[0]
+        for row in conn.execute(check_columns_sql)
+        if row[0].endswith('_errormsg') or row[0].endswith('_errortype')
+    ]
+    column_roots = {s.removesuffix('_errormsg').removesuffix('_errortype') for s in found_columns}
+    return [*column_roots]
+
+
+def migrate_error_to_cellmd_columns(
+    conn: sql.Connection, store_name: str, col_names: list[str], backup_table: Optional[str] = None
+) -> None:
+    """
+    Safe version with error handling and optional backup.
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to modify
+        col_names: List of column name prefixes
+        backup_table: Optional name for backup table
+
+    Usage:
+        migrate_error_to_cellmd_columns(conn, 'my_table', ['columnname'], 'my_table_backup')
+    """
+
+    try:
+        # Optional: Create backup
+        if backup_table:
+            backup_sql = sql.text(f"""
+                CREATE TABLE {backup_table} AS SELECT * FROM {store_name}
+            """)
+            conn.execute(backup_sql)
+            _logger.info(f'Backup created: {backup_table}')
+
+        # Step 1: Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN {col}_cellmd JSONB DEFAULT NULL' for col in col_names)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns: {", ".join(f"{col}_cellmd" for col in col_names)}')

+        # Step 2: Populate new columns
+        set_column_str = ', '.join(
+            [
+                f'{col}_cellmd = CASE WHEN {col}_errormsg IS NULL OR {col}_errortype IS NULL '
+                f"THEN NULL ELSE jsonb_build_object('errormsg', {col}_errormsg, 'errortype', {col}_errortype) END"
+                for col in col_names
+            ]
+        )
+        populate_sql = sql.text(f'UPDATE {store_name} SET {set_column_str}')
+        result = conn.execute(populate_sql)
+        _logger.info(f'Updated {result.rowcount} rows')
+
+        # Step 3: Drop old columns
+        drop_columns_str = ', '.join(
+            [f'DROP COLUMN IF EXISTS {col}_errormsg, DROP COLUMN IF EXISTS {col}_errortype' for col in col_names]
+        )
+        drop_columns_sql = sql.text(f'ALTER TABLE {store_name} {drop_columns_str}')
+        conn.execute(drop_columns_sql)
+        _logger.info(f'Dropped columns: {", ".join(f"{col}_errormsg, {col}_errortype" for col in col_names)}')
+        _logger.info(f'Migration completed successfully for table: {store_name}')
+
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
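For reference, the per-row transform performed by the `UPDATE` in Step 2, restated as plain Python (the helper is illustrative only):

```python
from typing import Optional

def to_cellmd(errormsg: Optional[str], errortype: Optional[str]) -> Optional[dict]:
    # Mirrors the SQL: CASE WHEN errormsg IS NULL OR errortype IS NULL
    #   THEN NULL ELSE jsonb_build_object('errormsg', ..., 'errortype', ...) END
    if errormsg is None or errortype is None:
        return None
    return {'errormsg': errormsg, 'errortype': errortype}

assert to_cellmd('division by zero', 'ZeroDivisionError') == {
    'errormsg': 'division by zero',
    'errortype': 'ZeroDivisionError',
}
assert to_cellmd(None, None) is None
```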
pixeltable/metadata/converters/util.py CHANGED
@@ -16,6 +16,7 @@ def convert_table_md(
     column_md_updater: Optional[Callable[[dict], None]] = None,
     external_store_md_updater: Optional[Callable[[dict], None]] = None,
     substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None,
+    table_modifier: Optional[Callable[[sql.Connection, UUID, dict, dict], None]] = None,
 ) -> None:
     """
     Converts schema.TableMd dicts based on the specified conversion functions.
@@ -50,6 +51,8 @@
         if updated_table_md != table_md:
             __logger.info(f'Updating schema for table: {tbl_id}')
             conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
+        if table_modifier is not None:
+            table_modifier(conn, tbl_id, table_md, updated_table_md)
 
     for row in conn.execute(sql.select(Function)):
         fn_id = row[0]
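The new hook runs once per table, after the metadata row is (possibly) rewritten. A sketch of a conforming callback (the callback itself is hypothetical):

```python
import logging
from uuid import UUID

import sqlalchemy as sql

_logger = logging.getLogger('pixeltable')

def log_md_changes(conn: sql.Connection, tbl_id: UUID, orig_md: dict, updated_md: dict) -> None:
    # Matches the Callable[[sql.Connection, UUID, dict, dict], None] signature above.
    if orig_md != updated_md:
        _logger.info(f'table {tbl_id}: metadata rewritten during conversion')

# e.g.: convert_table_md(engine, table_modifier=log_md_changes)
```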
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    40: 'Convert error property columns to cellmd columns',
     39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',