pixeltable 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -11
- pixeltable/catalog/catalog.py +575 -220
- pixeltable/catalog/column.py +22 -23
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +2 -148
- pixeltable/catalog/insertable_table.py +15 -13
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/schema_object.py +9 -4
- pixeltable/catalog/table.py +96 -85
- pixeltable/catalog/table_version.py +257 -174
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +50 -56
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +19 -6
- pixeltable/env.py +50 -4
- pixeltable/exec/data_row_batch.py +3 -1
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exec/in_memory_data_node.py +6 -7
- pixeltable/exprs/column_property_ref.py +21 -9
- pixeltable/exprs/column_ref.py +7 -2
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +10 -9
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/openai.py +1 -2
- pixeltable/functions/video.py +59 -16
- pixeltable/globals.py +109 -24
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/external_store.py +3 -55
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +16 -16
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +50 -1
- pixeltable/plan.py +4 -0
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +40 -51
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +50 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
pixeltable/functions/video.py
CHANGED
|
@@ -1,19 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
|
|
3
|
-
|
|
4
|
-
Example:
|
|
5
|
-
```python
|
|
6
|
-
import pixeltable as pxt
|
|
7
|
-
import pixeltable.functions as pxtf
|
|
8
|
-
|
|
9
|
-
t = pxt.get_table(...)
|
|
10
|
-
t.select(pxtf.video.extract_audio(t.video_col)).collect()
|
|
11
|
-
```
|
|
12
3
|
"""
|
|
13
4
|
|
|
14
|
-
import tempfile
|
|
15
|
-
import uuid
|
|
16
|
-
from pathlib import Path
|
|
17
5
|
from typing import Any, Optional
|
|
18
6
|
|
|
19
7
|
import av
|
|
@@ -68,8 +56,7 @@ class make_video(pxt.Aggregator):
|
|
|
68
56
|
if frame is None:
|
|
69
57
|
return
|
|
70
58
|
if self.container is None:
|
|
71
|
-
|
|
72
|
-
self.out_file = Path(output_filename)
|
|
59
|
+
self.out_file = env.Env.get().create_tmp_path('.mp4')
|
|
73
60
|
self.container = av.open(str(self.out_file), mode='w')
|
|
74
61
|
self.stream = self.container.add_stream('h264', rate=self.fps)
|
|
75
62
|
self.stream.pix_fmt = 'yuv420p'
|
|
@@ -92,12 +79,22 @@ def extract_audio(
|
|
|
92
79
|
video_path: pxt.Video, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
|
|
93
80
|
) -> pxt.Audio:
|
|
94
81
|
"""
|
|
95
|
-
Extract an audio stream from a video
|
|
82
|
+
Extract an audio stream from a video.
|
|
96
83
|
|
|
97
84
|
Args:
|
|
98
85
|
stream_idx: Index of the audio stream to extract.
|
|
99
86
|
format: The target audio format. (`'wav'`, `'mp3'`, `'flac'`).
|
|
100
87
|
codec: The codec to use for the audio stream. If not provided, a default codec will be used.
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
The extracted audio.
|
|
91
|
+
|
|
92
|
+
Examples:
|
|
93
|
+
Add a computed column to a table `tbl` that extracts audio from an existing column `video_col`:
|
|
94
|
+
|
|
95
|
+
>>> tbl.add_computed_column(
|
|
96
|
+
... extracted_audio=tbl.video_col.extract_audio(format='flac')
|
|
97
|
+
... )
|
|
101
98
|
"""
|
|
102
99
|
if format not in _format_defaults:
|
|
103
100
|
raise ValueError(f'extract_audio(): unsupported audio format: {format}')
|
|
@@ -108,7 +105,7 @@ def extract_audio(
|
|
|
108
105
|
return None
|
|
109
106
|
audio_stream = container.streams.audio[stream_idx]
|
|
110
107
|
# create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
|
|
111
|
-
output_filename = str(env.Env.get().
|
|
108
|
+
output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
|
|
112
109
|
|
|
113
110
|
with av.open(output_filename, 'w', format=format) as output_container:
|
|
114
111
|
output_stream = output_container.add_stream(codec or default_codec)
|
|
@@ -124,6 +121,52 @@ def extract_audio(
|
|
|
124
121
|
def get_metadata(video: pxt.Video) -> dict:
|
|
125
122
|
"""
|
|
126
123
|
Gets various metadata associated with a video file and returns it as a dictionary.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
video: The video to get metadata for.
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
A `dict` such as the following:
|
|
130
|
+
|
|
131
|
+
```json
|
|
132
|
+
{
|
|
133
|
+
'bit_exact': False,
|
|
134
|
+
'bit_rate': 967260,
|
|
135
|
+
'size': 2234371,
|
|
136
|
+
'metadata': {
|
|
137
|
+
'encoder': 'Lavf60.16.100',
|
|
138
|
+
'major_brand': 'isom',
|
|
139
|
+
'minor_version': '512',
|
|
140
|
+
'compatible_brands': 'isomiso2avc1mp41',
|
|
141
|
+
},
|
|
142
|
+
'streams': [
|
|
143
|
+
{
|
|
144
|
+
'type': 'video',
|
|
145
|
+
'width': 640,
|
|
146
|
+
'height': 360,
|
|
147
|
+
'frames': 462,
|
|
148
|
+
'time_base': 1.0 / 12800,
|
|
149
|
+
'duration': 236544,
|
|
150
|
+
'duration_seconds': 236544.0 / 12800,
|
|
151
|
+
'average_rate': 25.0,
|
|
152
|
+
'base_rate': 25.0,
|
|
153
|
+
'guessed_rate': 25.0,
|
|
154
|
+
'metadata': {
|
|
155
|
+
'language': 'und',
|
|
156
|
+
'handler_name': 'L-SMASH Video Handler',
|
|
157
|
+
'vendor_id': '[0][0][0][0]',
|
|
158
|
+
'encoder': 'Lavc60.31.102 libx264',
|
|
159
|
+
},
|
|
160
|
+
'codec_context': {'name': 'h264', 'codec_tag': 'avc1', 'profile': 'High', 'pix_fmt': 'yuv420p'},
|
|
161
|
+
}
|
|
162
|
+
],
|
|
163
|
+
}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Examples:
|
|
167
|
+
Extract metadata for files in the `video_col` column of the table `tbl`:
|
|
168
|
+
|
|
169
|
+
>>> tbl.select(tbl.video_col.get_metadata()).collect()
|
|
127
170
|
"""
|
|
128
171
|
return _get_metadata(video)
|
|
129
172
|
|
pixeltable/globals.py
CHANGED
|
@@ -8,9 +8,10 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from pandas.io.formats.style import Styler
|
|
10
10
|
|
|
11
|
-
from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
|
|
11
|
+
from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
|
|
12
12
|
from pixeltable.catalog import Catalog, TableVersionPath
|
|
13
13
|
from pixeltable.catalog.insertable_table import OnErrorParameter
|
|
14
|
+
from pixeltable.config import Config
|
|
14
15
|
from pixeltable.env import Env
|
|
15
16
|
from pixeltable.iterators import ComponentIterator
|
|
16
17
|
|
|
@@ -34,13 +35,16 @@ if TYPE_CHECKING:
|
|
|
34
35
|
_logger = logging.getLogger('pixeltable')
|
|
35
36
|
|
|
36
37
|
|
|
37
|
-
def init() -> None:
|
|
38
|
+
def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
|
|
38
39
|
"""Initializes the Pixeltable environment."""
|
|
40
|
+
if config_overrides is None:
|
|
41
|
+
config_overrides = {}
|
|
42
|
+
Config.init(config_overrides)
|
|
39
43
|
_ = Catalog.get()
|
|
40
44
|
|
|
41
45
|
|
|
42
46
|
def create_table(
|
|
43
|
-
|
|
47
|
+
path: str,
|
|
44
48
|
schema: Optional[dict[str, Any]] = None,
|
|
45
49
|
*,
|
|
46
50
|
source: Optional[TableDataSource] = None,
|
|
@@ -54,14 +58,24 @@ def create_table(
|
|
|
54
58
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
55
59
|
extra_args: Optional[dict[str, Any]] = None, # Additional arguments to data source provider
|
|
56
60
|
) -> catalog.Table:
|
|
57
|
-
"""Create a new base table.
|
|
61
|
+
"""Create a new base table. Exactly one of `schema` or `source` must be provided.
|
|
62
|
+
|
|
63
|
+
If a `schema` is provided, then an empty table will be created with the specified schema.
|
|
64
|
+
|
|
65
|
+
If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
|
|
66
|
+
contents of the specified data, and the data will be imported from the specified source into the new table. The
|
|
67
|
+
source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
|
|
58
68
|
|
|
59
69
|
Args:
|
|
60
|
-
|
|
61
|
-
schema:
|
|
62
|
-
source: A data source
|
|
63
|
-
source_format:
|
|
64
|
-
|
|
70
|
+
path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
|
|
71
|
+
schema: Schema for the new table, mapping column names to Pixeltable types.
|
|
72
|
+
source: A data source (file, URL, DataFrame, or list of rows) to import from.
|
|
73
|
+
source_format: Must be used in conjunction with a `source`.
|
|
74
|
+
If specified, then the given format will be used to read the source data. (Otherwise,
|
|
75
|
+
Pixeltable will attempt to infer the format from the source data.)
|
|
76
|
+
schema_overrides: Must be used in conjunction with a `source`.
|
|
77
|
+
If specified, then columns in `schema_overrides` will be given the specified types.
|
|
78
|
+
(Pixeltable will attempt to infer the types of any columns not specified.)
|
|
65
79
|
on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
|
|
66
80
|
invalid media file (such as a corrupt image) for one of the inserted rows.
|
|
67
81
|
|
|
@@ -77,14 +91,15 @@ def create_table(
|
|
|
77
91
|
|
|
78
92
|
- `'on_read'`: validate media files at query time
|
|
79
93
|
- `'on_write'`: validate media files during insert/update operations
|
|
80
|
-
if_exists:
|
|
81
|
-
Must be one of the following:
|
|
94
|
+
if_exists: Determines the behavior if a table already exists at the specified path location.
|
|
82
95
|
|
|
83
96
|
- `'error'`: raise an error
|
|
84
97
|
- `'ignore'`: do nothing and return the existing table handle
|
|
85
|
-
- `'replace'`: if the existing table has no views, drop and replace it with a new one
|
|
86
|
-
|
|
87
|
-
|
|
98
|
+
- `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
|
|
99
|
+
raise an error if the existing table has views or snapshots
|
|
100
|
+
- `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
|
|
101
|
+
extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
|
|
102
|
+
passed along to the source data provider.
|
|
88
103
|
|
|
89
104
|
Returns:
|
|
90
105
|
A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
|
|
@@ -110,7 +125,7 @@ def create_table(
|
|
|
110
125
|
>>> tbl1 = pxt.get_table('orig_table')
|
|
111
126
|
... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
|
|
112
127
|
|
|
113
|
-
Create a table if does not already exist, otherwise get the existing table:
|
|
128
|
+
Create a table if it does not already exist, otherwise get the existing table:
|
|
114
129
|
|
|
115
130
|
>>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
|
|
116
131
|
|
|
@@ -126,12 +141,12 @@ def create_table(
|
|
|
126
141
|
from pixeltable.io.utils import normalize_primary_key_parameter
|
|
127
142
|
|
|
128
143
|
if (schema is None) == (source is None):
|
|
129
|
-
raise excs.Error('
|
|
144
|
+
raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
|
|
130
145
|
|
|
131
146
|
if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
|
|
132
147
|
raise excs.Error('`schema` must be a non-empty dictionary')
|
|
133
148
|
|
|
134
|
-
path_obj = catalog.Path(
|
|
149
|
+
path_obj = catalog.Path(path)
|
|
135
150
|
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
136
151
|
media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
|
|
137
152
|
primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
|
|
@@ -142,7 +157,14 @@ def create_table(
|
|
|
142
157
|
tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
|
|
143
158
|
tds.check_source_format()
|
|
144
159
|
data_source = tds.specialize()
|
|
145
|
-
|
|
160
|
+
src_schema_overrides: dict[str, ts.ColumnType] = {}
|
|
161
|
+
if schema_overrides is not None:
|
|
162
|
+
for col_name, py_type in schema_overrides.items():
|
|
163
|
+
col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
|
|
164
|
+
if col_type is None:
|
|
165
|
+
raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
|
|
166
|
+
src_schema_overrides[col_name] = col_type
|
|
167
|
+
data_source.src_schema_overrides = src_schema_overrides
|
|
146
168
|
data_source.src_pk = primary_key
|
|
147
169
|
data_source.infer_schema()
|
|
148
170
|
schema = data_source.pxt_schema
|
|
@@ -251,9 +273,7 @@ def create_view(
|
|
|
251
273
|
tbl_version_path = base._tbl_version_path
|
|
252
274
|
sample_clause = None
|
|
253
275
|
elif isinstance(base, DataFrame):
|
|
254
|
-
base.
|
|
255
|
-
if len(base._from_clause.tbls) > 1:
|
|
256
|
-
raise excs.Error('Cannot create a view of a join')
|
|
276
|
+
base._validate_mutable_op_sequence('create_view', allow_select=True)
|
|
257
277
|
tbl_version_path = base._from_clause.tbls[0]
|
|
258
278
|
where = base.where_clause
|
|
259
279
|
sample_clause = base.sample_clause
|
|
@@ -533,9 +553,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
533
553
|
|
|
534
554
|
>>> pxt.list_tables('dir1')
|
|
535
555
|
"""
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
556
|
+
return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
|
|
560
|
+
path_obj = catalog.Path(dir_path, empty_is_valid=True, allow_system_paths=allow_system_paths)
|
|
561
|
+
contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
|
|
539
562
|
return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
|
|
540
563
|
|
|
541
564
|
|
|
@@ -633,6 +656,68 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
|
|
|
633
656
|
Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
|
|
634
657
|
|
|
635
658
|
|
|
659
|
+
def ls(path: str = '') -> pd.DataFrame:
|
|
660
|
+
"""
|
|
661
|
+
List the contents of a Pixeltable directory.
|
|
662
|
+
|
|
663
|
+
This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
|
|
664
|
+
including various attributes such as version and base table, as appropriate.
|
|
665
|
+
|
|
666
|
+
To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
|
|
667
|
+
[list_dirs()][pixeltable.list_dirs] instead.
|
|
668
|
+
"""
|
|
669
|
+
from pixeltable.catalog import retry_loop
|
|
670
|
+
from pixeltable.metadata import schema
|
|
671
|
+
|
|
672
|
+
cat = Catalog.get()
|
|
673
|
+
path_obj = catalog.Path(path, empty_is_valid=True)
|
|
674
|
+
dir_entries = cat.get_dir_contents(path_obj)
|
|
675
|
+
|
|
676
|
+
@retry_loop(for_write=False)
|
|
677
|
+
def op() -> list[list[str]]:
|
|
678
|
+
rows: list[list[str]] = []
|
|
679
|
+
for name, entry in dir_entries.items():
|
|
680
|
+
if name.startswith('_'):
|
|
681
|
+
continue
|
|
682
|
+
if entry.dir is not None:
|
|
683
|
+
kind = 'dir'
|
|
684
|
+
version = ''
|
|
685
|
+
base = ''
|
|
686
|
+
else:
|
|
687
|
+
assert entry.table is not None
|
|
688
|
+
assert isinstance(entry.table, schema.Table)
|
|
689
|
+
tbl = cat.get_table_by_id(entry.table.id)
|
|
690
|
+
md = tbl.get_metadata()
|
|
691
|
+
base = md['base'] or ''
|
|
692
|
+
if base.startswith('_'):
|
|
693
|
+
base = '<anonymous base table>'
|
|
694
|
+
if md['is_snapshot']:
|
|
695
|
+
kind = 'snapshot'
|
|
696
|
+
elif md['is_view']:
|
|
697
|
+
kind = 'view'
|
|
698
|
+
else:
|
|
699
|
+
kind = 'table'
|
|
700
|
+
version = '' if kind == 'snapshot' else md['version']
|
|
701
|
+
if md['is_replica']:
|
|
702
|
+
kind = f'{kind}-replica'
|
|
703
|
+
rows.append([name, kind, version, base])
|
|
704
|
+
return rows
|
|
705
|
+
|
|
706
|
+
rows = op()
|
|
707
|
+
|
|
708
|
+
rows = sorted(rows, key=lambda x: x[0])
|
|
709
|
+
df = pd.DataFrame(
|
|
710
|
+
{
|
|
711
|
+
'Name': [row[0] for row in rows],
|
|
712
|
+
'Kind': [row[1] for row in rows],
|
|
713
|
+
'Version': [row[2] for row in rows],
|
|
714
|
+
'Base': [row[3] for row in rows],
|
|
715
|
+
},
|
|
716
|
+
index=([''] * len(rows)),
|
|
717
|
+
)
|
|
718
|
+
return df
|
|
719
|
+
|
|
720
|
+
|
|
636
721
|
def _extract_paths(
|
|
637
722
|
dir_entries: dict[str, Catalog.DirEntry],
|
|
638
723
|
parent: catalog.Path,
|
pixeltable/io/__init__.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# ruff: noqa: F401
|
|
2
2
|
|
|
3
3
|
from .datarows import import_json, import_rows
|
|
4
|
-
from .external_store import ExternalStore
|
|
4
|
+
from .external_store import ExternalStore
|
|
5
5
|
from .globals import create_label_studio_project, export_images_as_fo_dataset
|
|
6
6
|
from .hf_datasets import import_huggingface_dataset
|
|
7
7
|
from .pandas import import_csv, import_excel, import_pandas
|
pixeltable/io/datarows.py
CHANGED
|
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def _infer_schema_from_rows(
|
|
11
|
-
rows: Iterable[dict[str, Any]], schema_overrides: dict[str,
|
|
11
|
+
rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
|
|
12
12
|
) -> dict[str, ts.ColumnType]:
|
|
13
13
|
schema: dict[str, ts.ColumnType] = {}
|
|
14
14
|
cols_with_nones: set[str] = set()
|
|
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
|
|
|
20
20
|
# in which the column names are encountered in the input data, even if `schema_overrides`
|
|
21
21
|
# is specified.
|
|
22
22
|
if col_name not in schema:
|
|
23
|
+
assert isinstance(schema_overrides[col_name], ts.ColumnType)
|
|
23
24
|
schema[col_name] = schema_overrides[col_name]
|
|
24
25
|
elif value is not None:
|
|
25
26
|
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
pixeltable/io/external_store.py
CHANGED
|
@@ -3,14 +3,13 @@ from __future__ import annotations
|
|
|
3
3
|
import abc
|
|
4
4
|
import itertools
|
|
5
5
|
import logging
|
|
6
|
-
from dataclasses import dataclass, field
|
|
7
6
|
from typing import Any, Optional
|
|
8
7
|
|
|
9
8
|
import pixeltable.exceptions as excs
|
|
10
9
|
import pixeltable.type_system as ts
|
|
11
10
|
from pixeltable import Column, Table
|
|
12
11
|
from pixeltable.catalog import ColumnHandle, TableVersion
|
|
13
|
-
from pixeltable.catalog.
|
|
12
|
+
from pixeltable.catalog.update_status import UpdateStatus
|
|
14
13
|
|
|
15
14
|
_logger = logging.getLogger('pixeltable')
|
|
16
15
|
|
|
@@ -46,7 +45,7 @@ class ExternalStore(abc.ABC):
|
|
|
46
45
|
"""
|
|
47
46
|
|
|
48
47
|
@abc.abstractmethod
|
|
49
|
-
def sync(self, t: Table, export_data: bool, import_data: bool) ->
|
|
48
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
|
|
50
49
|
"""
|
|
51
50
|
Called by `Table.sync()` to implement store-specific synchronization logic.
|
|
52
51
|
"""
|
|
@@ -263,57 +262,6 @@ class Project(ExternalStore, abc.ABC):
|
|
|
263
262
|
return resolved_col_mapping
|
|
264
263
|
|
|
265
264
|
|
|
266
|
-
@dataclass(frozen=True)
|
|
267
|
-
class SyncStatus:
|
|
268
|
-
# stats for the rows affected by the operation in the external store
|
|
269
|
-
ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
270
|
-
|
|
271
|
-
# stats for the rows affected by the operation
|
|
272
|
-
row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
273
|
-
|
|
274
|
-
@property
|
|
275
|
-
def num_excs(self) -> int:
|
|
276
|
-
"""
|
|
277
|
-
Returns the total number of Pixeltable exceptions that occurred during the operation.
|
|
278
|
-
"""
|
|
279
|
-
return self.row_count_stats.num_excs
|
|
280
|
-
|
|
281
|
-
@property
|
|
282
|
-
def pxt_rows_updated(self) -> int:
|
|
283
|
-
"""
|
|
284
|
-
Returns the number of Pixeltable rows that were updated as a result of the operation.
|
|
285
|
-
"""
|
|
286
|
-
return self.row_count_stats.upd_rows
|
|
287
|
-
|
|
288
|
-
@property
|
|
289
|
-
def external_rows_updated(self) -> int:
|
|
290
|
-
return self.ext_row_count_stats.upd_rows
|
|
291
|
-
|
|
292
|
-
@property
|
|
293
|
-
def external_rows_created(self) -> int:
|
|
294
|
-
return self.ext_row_count_stats.ins_rows
|
|
295
|
-
|
|
296
|
-
@property
|
|
297
|
-
def external_rows_deleted(self) -> int:
|
|
298
|
-
return self.ext_row_count_stats.del_rows
|
|
299
|
-
|
|
300
|
-
def __add__(self, other: 'SyncStatus') -> 'SyncStatus':
|
|
301
|
-
"""
|
|
302
|
-
Add the sync status from two SyncStatus objects together.
|
|
303
|
-
"""
|
|
304
|
-
return SyncStatus(
|
|
305
|
-
ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
|
|
306
|
-
row_count_stats=self.row_count_stats + other.row_count_stats,
|
|
307
|
-
)
|
|
308
|
-
|
|
309
|
-
@classmethod
|
|
310
|
-
def from_update_status(cls, us: UpdateStatus) -> 'SyncStatus':
|
|
311
|
-
"""
|
|
312
|
-
Copy information from an UpdateStatus to a SyncStatus.
|
|
313
|
-
"""
|
|
314
|
-
return SyncStatus(row_count_stats=us.row_count_stats + us.cascade_row_count_stats)
|
|
315
|
-
|
|
316
|
-
|
|
317
265
|
class MockProject(Project):
|
|
318
266
|
"""A project that cannot be synced, used mainly for testing."""
|
|
319
267
|
|
|
@@ -348,7 +296,7 @@ class MockProject(Project):
|
|
|
348
296
|
def get_import_columns(self) -> dict[str, ts.ColumnType]:
|
|
349
297
|
return self.import_cols
|
|
350
298
|
|
|
351
|
-
def sync(self, t: Table, export_data: bool, import_data: bool) ->
|
|
299
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
|
|
352
300
|
raise NotImplementedError()
|
|
353
301
|
|
|
354
302
|
def delete(self) -> None:
|
pixeltable/io/globals.py
CHANGED
|
@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
|
5
5
|
import pixeltable as pxt
|
|
6
6
|
import pixeltable.exceptions as excs
|
|
7
7
|
from pixeltable import Table, exprs
|
|
8
|
+
from pixeltable.catalog.update_status import UpdateStatus
|
|
8
9
|
from pixeltable.env import Env
|
|
9
|
-
from pixeltable.io.external_store import SyncStatus
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
12
|
import fiftyone as fo # type: ignore[import-untyped]
|
|
@@ -22,7 +22,7 @@ def create_label_studio_project(
|
|
|
22
22
|
sync_immediately: bool = True,
|
|
23
23
|
s3_configuration: Optional[dict[str, Any]] = None,
|
|
24
24
|
**kwargs: Any,
|
|
25
|
-
) ->
|
|
25
|
+
) -> UpdateStatus:
|
|
26
26
|
"""
|
|
27
27
|
Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
|
|
28
28
|
|
|
@@ -96,7 +96,7 @@ def create_label_studio_project(
|
|
|
96
96
|
[Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
|
|
97
97
|
|
|
98
98
|
Returns:
|
|
99
|
-
|
|
99
|
+
An `UpdateStatus` representing the status of any synchronization operations that occurred.
|
|
100
100
|
|
|
101
101
|
Examples:
|
|
102
102
|
Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
|
|
@@ -136,7 +136,7 @@ def create_label_studio_project(
|
|
|
136
136
|
if sync_immediately:
|
|
137
137
|
return t.sync()
|
|
138
138
|
else:
|
|
139
|
-
return
|
|
139
|
+
return UpdateStatus()
|
|
140
140
|
|
|
141
141
|
|
|
142
142
|
def export_images_as_fo_dataset(
|
pixeltable/io/hf_datasets.py
CHANGED
|
@@ -50,10 +50,18 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
|
|
|
50
50
|
elif isinstance(feature_type, datasets.Sequence):
|
|
51
51
|
# example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
|
|
52
52
|
dtype = _to_pixeltable_type(feature_type.feature, nullable)
|
|
53
|
-
|
|
54
|
-
|
|
53
|
+
if dtype is None:
|
|
54
|
+
return None
|
|
55
|
+
if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
|
|
56
|
+
length = feature_type.length if feature_type.length != -1 else None
|
|
57
|
+
return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
|
|
58
|
+
else:
|
|
59
|
+
# Sequence of dicts must be cast as Json
|
|
60
|
+
return ts.JsonType(nullable=nullable)
|
|
55
61
|
elif isinstance(feature_type, datasets.Image):
|
|
56
62
|
return ts.ImageType(nullable=nullable)
|
|
63
|
+
elif isinstance(feature_type, dict):
|
|
64
|
+
return ts.JsonType(nullable=nullable)
|
|
57
65
|
else:
|
|
58
66
|
return None
|
|
59
67
|
|
pixeltable/io/label_studio.py
CHANGED
|
@@ -14,10 +14,10 @@ from requests.exceptions import HTTPError
|
|
|
14
14
|
import pixeltable.type_system as ts
|
|
15
15
|
from pixeltable import Column, Table, env, exceptions as excs
|
|
16
16
|
from pixeltable.catalog import ColumnHandle
|
|
17
|
-
from pixeltable.catalog.
|
|
17
|
+
from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
|
|
18
18
|
from pixeltable.config import Config
|
|
19
19
|
from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
20
|
-
from pixeltable.io.external_store import Project
|
|
20
|
+
from pixeltable.io.external_store import Project
|
|
21
21
|
from pixeltable.utils import coco
|
|
22
22
|
|
|
23
23
|
# label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
|
|
@@ -111,14 +111,14 @@ class LabelStudioProject(Project):
|
|
|
111
111
|
"""
|
|
112
112
|
return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
|
|
113
113
|
|
|
114
|
-
def sync(self, t: Table, export_data: bool, import_data: bool) ->
|
|
114
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
|
|
115
115
|
_logger.info(
|
|
116
116
|
f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
|
|
117
117
|
f' (export: {export_data}, import: {import_data}).'
|
|
118
118
|
)
|
|
119
119
|
# Collect all existing tasks into a dict with entries `rowid: task`
|
|
120
120
|
tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
|
|
121
|
-
sync_status =
|
|
121
|
+
sync_status = UpdateStatus()
|
|
122
122
|
if export_data:
|
|
123
123
|
export_sync_status = self.__update_tasks(t, tasks)
|
|
124
124
|
sync_status += export_sync_status
|
|
@@ -148,7 +148,7 @@ class LabelStudioProject(Project):
|
|
|
148
148
|
f'Label Studio project {self.project_title!r}.'
|
|
149
149
|
)
|
|
150
150
|
|
|
151
|
-
def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) ->
|
|
151
|
+
def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
|
|
152
152
|
"""
|
|
153
153
|
Updates all tasks in this Label Studio project based on the Pixeltable data:
|
|
154
154
|
- Creates new tasks for rows that don't map to any existing task;
|
|
@@ -161,7 +161,7 @@ class LabelStudioProject(Project):
|
|
|
161
161
|
t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
|
|
162
162
|
|
|
163
163
|
if len(t_data_cols) == 0:
|
|
164
|
-
return
|
|
164
|
+
return UpdateStatus()
|
|
165
165
|
|
|
166
166
|
# Columns in `t` that map to `rectanglelabels` preannotations
|
|
167
167
|
t_rl_cols = [
|
|
@@ -192,7 +192,7 @@ class LabelStudioProject(Project):
|
|
|
192
192
|
media_col: ColumnHandle,
|
|
193
193
|
t_rl_cols: list[ColumnHandle],
|
|
194
194
|
rl_info: list['_RectangleLabel'],
|
|
195
|
-
) ->
|
|
195
|
+
) -> UpdateStatus:
|
|
196
196
|
is_stored = media_col.get().is_stored
|
|
197
197
|
# If it's a stored column, we can use `localpath`
|
|
198
198
|
localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
|
|
@@ -238,7 +238,7 @@ class LabelStudioProject(Project):
|
|
|
238
238
|
|
|
239
239
|
env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
|
|
240
240
|
|
|
241
|
-
sync_status =
|
|
241
|
+
sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
|
|
242
242
|
|
|
243
243
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
244
244
|
sync_status += deletion_sync_status
|
|
@@ -251,7 +251,7 @@ class LabelStudioProject(Project):
|
|
|
251
251
|
t_data_cols: list[ColumnHandle],
|
|
252
252
|
t_rl_cols: list[ColumnHandle],
|
|
253
253
|
rl_info: list['_RectangleLabel'],
|
|
254
|
-
) ->
|
|
254
|
+
) -> UpdateStatus:
|
|
255
255
|
ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
|
|
256
256
|
expr_refs: dict[str, Expr] = {} # kwargs for the select statement
|
|
257
257
|
for col in t_data_cols:
|
|
@@ -342,7 +342,7 @@ class LabelStudioProject(Project):
|
|
|
342
342
|
f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
|
|
343
343
|
)
|
|
344
344
|
|
|
345
|
-
sync_status =
|
|
345
|
+
sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
|
|
346
346
|
|
|
347
347
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
348
348
|
sync_status += deletion_sync_status
|
|
@@ -367,7 +367,7 @@ class LabelStudioProject(Project):
|
|
|
367
367
|
|
|
368
368
|
def __delete_stale_tasks(
|
|
369
369
|
self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
|
|
370
|
-
) ->
|
|
370
|
+
) -> UpdateStatus:
|
|
371
371
|
deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
|
|
372
372
|
# Sanity check the math
|
|
373
373
|
assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
|
|
@@ -383,11 +383,11 @@ class LabelStudioProject(Project):
|
|
|
383
383
|
for rowid in deleted_rowids:
|
|
384
384
|
del existing_tasks[rowid]
|
|
385
385
|
|
|
386
|
-
return
|
|
386
|
+
return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
|
|
387
387
|
|
|
388
|
-
def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) ->
|
|
388
|
+
def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
|
|
389
389
|
if ANNOTATIONS_COLUMN not in self.col_mapping.values():
|
|
390
|
-
return
|
|
390
|
+
return UpdateStatus()
|
|
391
391
|
|
|
392
392
|
annotations = {
|
|
393
393
|
# Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
|
|
@@ -422,9 +422,9 @@ class LabelStudioProject(Project):
|
|
|
422
422
|
ancestor = ancestor._get_base_table()
|
|
423
423
|
update_status = ancestor.batch_update(updates)
|
|
424
424
|
env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
|
|
425
|
-
return
|
|
425
|
+
return update_status
|
|
426
426
|
else:
|
|
427
|
-
return
|
|
427
|
+
return UpdateStatus()
|
|
428
428
|
|
|
429
429
|
def as_dict(self) -> dict[str, Any]:
|
|
430
430
|
return {
|
pixeltable/io/pandas.py
CHANGED
|
@@ -132,6 +132,7 @@ def df_infer_schema(
|
|
|
132
132
|
pd_schema: dict[str, ts.ColumnType] = {}
|
|
133
133
|
for pd_name, pd_dtype in zip(df.columns, df.dtypes):
|
|
134
134
|
if pd_name in schema_overrides:
|
|
135
|
+
assert isinstance(schema_overrides[pd_name], ts.ColumnType)
|
|
135
136
|
pxt_type = schema_overrides[pd_name]
|
|
136
137
|
else:
|
|
137
138
|
pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
|