pixeltable 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +5 -0
- pixeltable/catalog/globals.py +8 -0
- pixeltable/catalog/insertable_table.py +2 -2
- pixeltable/catalog/table.py +27 -9
- pixeltable/catalog/table_version.py +41 -68
- pixeltable/catalog/view.py +3 -3
- pixeltable/dataframe.py +7 -6
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/expr_eval_node.py +8 -1
- pixeltable/exec/row_update_node.py +61 -0
- pixeltable/exec/{sql_scan_node.py → sql_node.py} +120 -56
- pixeltable/exprs/__init__.py +1 -2
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +12 -12
- pixeltable/exprs/expr.py +67 -22
- pixeltable/exprs/function_call.py +60 -29
- pixeltable/exprs/globals.py +2 -0
- pixeltable/exprs/in_predicate.py +3 -3
- pixeltable/exprs/inline_array.py +18 -11
- pixeltable/exprs/is_null.py +5 -5
- pixeltable/exprs/method_ref.py +63 -0
- pixeltable/ext/__init__.py +9 -0
- pixeltable/ext/functions/__init__.py +8 -0
- pixeltable/ext/functions/whisperx.py +45 -5
- pixeltable/ext/functions/yolox.py +60 -14
- pixeltable/func/aggregate_function.py +10 -4
- pixeltable/func/callable_function.py +16 -4
- pixeltable/func/expr_template_function.py +1 -1
- pixeltable/func/function.py +12 -2
- pixeltable/func/function_registry.py +26 -9
- pixeltable/func/udf.py +32 -4
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/fireworks.py +33 -0
- pixeltable/functions/globals.py +36 -1
- pixeltable/functions/huggingface.py +155 -7
- pixeltable/functions/image.py +242 -40
- pixeltable/functions/openai.py +214 -0
- pixeltable/functions/string.py +600 -8
- pixeltable/functions/timestamp.py +210 -0
- pixeltable/functions/together.py +106 -0
- pixeltable/functions/video.py +28 -10
- pixeltable/functions/whisper.py +32 -0
- pixeltable/globals.py +3 -3
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/globals.py +186 -5
- pixeltable/io/label_studio.py +42 -2
- pixeltable/io/pandas.py +70 -34
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_18.py +39 -0
- pixeltable/metadata/notes.py +10 -0
- pixeltable/plan.py +82 -7
- pixeltable/tool/create_test_db_dump.py +4 -5
- pixeltable/tool/doc_plugins/griffe.py +81 -0
- pixeltable/tool/doc_plugins/mkdocstrings.py +6 -0
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +135 -0
- pixeltable/type_system.py +15 -14
- pixeltable/utils/s3.py +1 -1
- pixeltable-0.2.14.dist-info/METADATA +206 -0
- {pixeltable-0.2.12.dist-info → pixeltable-0.2.14.dist-info}/RECORD +64 -56
- pixeltable-0.2.14.dist-info/entry_points.txt +3 -0
- pixeltable/exprs/image_member_access.py +0 -96
- pixeltable/exprs/predicate.py +0 -44
- pixeltable-0.2.12.dist-info/METADATA +0 -137
- {pixeltable-0.2.12.dist-info → pixeltable-0.2.14.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.12.dist-info → pixeltable-0.2.14.dist-info}/WHEEL +0 -0
pixeltable/io/globals.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
-
from typing import Any, Optional,
|
|
1
|
+
from typing import Any, Literal, Optional, Union
|
|
2
|
+
import urllib.request
|
|
2
3
|
|
|
4
|
+
import pixeltable as pxt
|
|
3
5
|
import pixeltable.exceptions as excs
|
|
4
6
|
from pixeltable import Table
|
|
5
7
|
from pixeltable.io.external_store import SyncStatus
|
|
@@ -13,11 +15,14 @@ def create_label_studio_project(
|
|
|
13
15
|
media_import_method: Literal['post', 'file', 'url'] = 'post',
|
|
14
16
|
col_mapping: Optional[dict[str, str]] = None,
|
|
15
17
|
sync_immediately: bool = True,
|
|
18
|
+
s3_configuration: Optional[dict[str, Any]] = None,
|
|
16
19
|
**kwargs: Any
|
|
17
20
|
) -> SyncStatus:
|
|
18
|
-
# TODO(aaron-siegel): Add link in docstring to a Label Studio howto
|
|
19
21
|
"""
|
|
20
|
-
|
|
22
|
+
Create a new Label Studio project and link it to the specified `Table`.
|
|
23
|
+
|
|
24
|
+
- A tutorial notebook with fully worked examples can be found here:
|
|
25
|
+
[Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
|
|
21
26
|
|
|
22
27
|
The required parameter `label_config` specifies the Label Studio project configuration,
|
|
23
28
|
in XML format, as described in the Label Studio documentation. The linked project will
|
|
@@ -41,6 +46,11 @@ def create_label_studio_project(
|
|
|
41
46
|
* Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
|
|
42
47
|
* Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
|
|
43
48
|
|
|
49
|
+
__Requirements:__
|
|
50
|
+
|
|
51
|
+
- `pip install label-studio-sdk`
|
|
52
|
+
- `pip install boto3` (if using S3 import storage)
|
|
53
|
+
|
|
44
54
|
Args:
|
|
45
55
|
t: The Table to link to.
|
|
46
56
|
label_config: The Label Studio project configuration, in XML format.
|
|
@@ -52,6 +62,7 @@ def create_label_studio_project(
|
|
|
52
62
|
will see inside Label Studio. Unlike `name`, it does not need to be an identifier and
|
|
53
63
|
does not need to be unique. If not specified, the table name `t.name` will be used.
|
|
54
64
|
media_import_method: The method to use when transferring media files to Label Studio:
|
|
65
|
+
|
|
55
66
|
- `post`: Media will be sent to Label Studio via HTTP post. This should generally only be used for
|
|
56
67
|
prototyping; due to restrictions in Label Studio, it can only be used with projects that have
|
|
57
68
|
just one data field, and does not scale well.
|
|
@@ -63,9 +74,48 @@ def create_label_studio_project(
|
|
|
63
74
|
col_mapping: An optional mapping of local column names to Label Studio fields.
|
|
64
75
|
sync_immediately: If `True`, immediately perform an initial synchronization by
|
|
65
76
|
exporting all rows of the `Table` as Label Studio tasks.
|
|
77
|
+
s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
|
|
78
|
+
be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
|
|
79
|
+
referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
|
|
80
|
+
in the Label Studio interface.
|
|
81
|
+
|
|
82
|
+
The items in the `s3_configuration` dictionary correspond to kwarg
|
|
83
|
+
parameters of the Label Studio `connect_s3_import_storage` method, as described in the
|
|
84
|
+
[Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
|
|
85
|
+
`bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
|
|
86
|
+
Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`). If a title is not
|
|
87
|
+
specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`. All other parameters use their Label
|
|
88
|
+
Studio defaults.
|
|
66
89
|
kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
|
|
67
|
-
Studio SDK, as described
|
|
68
|
-
https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project
|
|
90
|
+
Studio SDK, as described in the
|
|
91
|
+
[Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
A `SyncStatus` representing the status of any synchronization operations that occurred.
|
|
95
|
+
|
|
96
|
+
Examples:
|
|
97
|
+
Create a Label Studio project whose tasks correspond to videos stored in the `video_col` column of the table `tbl`:
|
|
98
|
+
|
|
99
|
+
>>> config = \"\"\"
|
|
100
|
+
<View>
|
|
101
|
+
<Video name="video_obj" value="$video_col"/>
|
|
102
|
+
<Choices name="video-category" toName="video" showInLine="true">
|
|
103
|
+
<Choice value="city"/>
|
|
104
|
+
<Choice value="food"/>
|
|
105
|
+
<Choice value="sports"/>
|
|
106
|
+
</Choices>
|
|
107
|
+
</View>\"\"\"
|
|
108
|
+
create_label_studio_project(tbl, config)
|
|
109
|
+
|
|
110
|
+
Create a Label Studio project with the same configuration, using `media_import_method='url'`,
|
|
111
|
+
whose media are stored in an S3 bucket:
|
|
112
|
+
|
|
113
|
+
>>> create_label_studio_project(
|
|
114
|
+
tbl,
|
|
115
|
+
config,
|
|
116
|
+
media_import_method='url',
|
|
117
|
+
s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
|
|
118
|
+
)
|
|
69
119
|
"""
|
|
70
120
|
from pixeltable.io.label_studio import LabelStudioProject
|
|
71
121
|
|
|
@@ -76,6 +126,7 @@ def create_label_studio_project(
|
|
|
76
126
|
title,
|
|
77
127
|
media_import_method,
|
|
78
128
|
col_mapping,
|
|
129
|
+
s3_configuration,
|
|
79
130
|
**kwargs
|
|
80
131
|
)
|
|
81
132
|
|
|
@@ -85,3 +136,133 @@ def create_label_studio_project(
|
|
|
85
136
|
return t.sync()
|
|
86
137
|
else:
|
|
87
138
|
return SyncStatus.empty()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def import_rows(
    tbl_path: str,
    rows: list[dict[str, Any]],
    *,
    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
    primary_key: Optional[Union[str, list[str]]] = None,
    num_retained_versions: int = 10,
    comment: str = ''
) -> Table:
    """
    Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
    supplied data, using the most specific type that can represent all the values in a column.

    If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
    Pixeltable will force the specified column to the specified type (and will not attempt any type inference
    for that column).

    All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
    `schema_overrides`.

    Args:
        tbl_path: The qualified name of the table to create.
        rows: The list of dictionaries to import.
        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
            as described above.
        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).

    Returns:
        The newly created `Table`.

    Raises:
        excs.Error: If the values of a column have no common supertype, if `schema_overrides` names a
            column that does not occur in `rows`, or if a column without an override contains only
            `None` values.
    """
    if schema_overrides is None:
        schema_overrides = {}
    schema: dict[str, pxt.ColumnType] = {}
    cols_with_nones: set[str] = set()

    for n, row in enumerate(rows):
        for col_name, value in row.items():
            if col_name in schema_overrides:
                # We do the insertion here; this will ensure that the column order matches the order
                # in which the column names are encountered in the input data, even if `schema_overrides`
                # is specified.
                if col_name not in schema:
                    schema[col_name] = schema_overrides[col_name]
            elif value is not None:
                # If `col_name` is not in `schema_overrides`, then we infer its type from the data.
                # The column type will always be nullable by default.
                col_type = pxt.ColumnType.infer_literal_type(value).copy(nullable=True)
                if col_name not in schema:
                    schema[col_name] = col_type
                else:
                    # Widen the running type so that it also covers this row's value.
                    supertype = pxt.ColumnType.supertype(schema[col_name], col_type)
                    if supertype is None:
                        raise excs.Error(
                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
                            'Consider specifying the type explicitly in `schema_overrides`.'
                        )
                    schema[col_name] = supertype
            else:
                cols_with_nones.add(col_name)

    extraneous_keys = schema_overrides.keys() - schema.keys()
    if len(extraneous_keys) > 0:
        # Sort the names so that the message is deterministic (set iteration order is arbitrary).
        raise excs.Error(
            f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(sorted(extraneous_keys))}'
        )

    entirely_none_cols = cols_with_nones - schema.keys()
    if len(entirely_none_cols) > 0:
        # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
        # was not encountered in any row with a non-None value.
        raise excs.Error(
            f'The following columns have no non-null values: {", ".join(sorted(entirely_none_cols))}\n'
            'Consider specifying the type(s) explicitly in `schema_overrides`.'
        )

    t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
    t.insert(rows)
    return t
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def import_json(
    tbl_path: str,
    filepath_or_url: str,
    *,
    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
    primary_key: Optional[Union[str, list[str]]] = None,
    num_retained_versions: int = 10,
    comment: str = '',
    **kwargs: Any
) -> Table:
    """
    Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
    to calling `import_rows(tbl_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
    is the contents of the specified `filepath_or_url`.

    Args:
        tbl_path: The name of the table to create.
        filepath_or_url: The path or URL of the JSON file.
        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
            (see [`import_rows()`][pixeltable.io.import_rows]).
        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
        kwargs: Additional keyword arguments to pass to `json.loads`.

    Returns:
        The newly created `Table`.
    """
    import json
    import urllib.parse
    import urllib.request

    # TODO Consolidate this logic with other places where files/URLs are parsed
    parsed = urllib.parse.urlparse(filepath_or_url)
    if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
        # local file path
        # (an empty or single-character scheme is treated as a plain path; presumably the
        # single-character case covers Windows drive letters such as `C:` -- TODO confirm)
        if len(parsed.scheme) <= 1:
            filepath = filepath_or_url
        else:
            # `url2pathname` already percent-decodes the path; decoding a second time would
            # corrupt file names that contain a literal `%`.
            filepath = urllib.request.url2pathname(parsed.path)
        # Read raw bytes, as in the URL branch, so that `json.loads` detects the encoding instead
        # of relying on the platform's default text encoding.
        with open(filepath, 'rb') as fp:
            contents = fp.read()
    else:
        # URL; the context manager makes sure the connection is closed once the body is read
        with urllib.request.urlopen(filepath_or_url) as response:
            contents = response.read()
    data = json.loads(contents, **kwargs)
    return import_rows(
        tbl_path,
        data,
        schema_overrides=schema_overrides,
        primary_key=primary_key,
        num_retained_versions=num_retained_versions,
        comment=comment
    )
|
pixeltable/io/label_studio.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import copy
|
|
1
2
|
import json
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
@@ -18,6 +19,15 @@ from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
|
18
19
|
from pixeltable.io.external_store import Project, SyncStatus
|
|
19
20
|
from pixeltable.utils import coco
|
|
20
21
|
|
|
22
|
+
# label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
|
|
23
|
+
# the import two different ways to ensure intercompatibility
|
|
24
|
+
try:
|
|
25
|
+
# label_studio_sdk<1 compatibility
|
|
26
|
+
import label_studio_sdk.project as ls_project # type: ignore
|
|
27
|
+
except ImportError:
|
|
28
|
+
# label_studio_sdk>=1 compatibility
|
|
29
|
+
import label_studio_sdk._legacy.project as ls_project # type: ignore
|
|
30
|
+
|
|
21
31
|
_logger = logging.getLogger('pixeltable')
|
|
22
32
|
|
|
23
33
|
|
|
@@ -50,11 +60,11 @@ class LabelStudioProject(Project):
|
|
|
50
60
|
"""
|
|
51
61
|
self.project_id = project_id
|
|
52
62
|
self.media_import_method = media_import_method
|
|
53
|
-
self._project: Optional[
|
|
63
|
+
self._project: Optional[ls_project.Project] = None
|
|
54
64
|
super().__init__(name, col_mapping, stored_proxies)
|
|
55
65
|
|
|
56
66
|
@property
|
|
57
|
-
def project(self) ->
|
|
67
|
+
def project(self) -> ls_project.Project:
|
|
58
68
|
"""The `Project` object corresponding to this Label Studio project."""
|
|
59
69
|
if self._project is None:
|
|
60
70
|
try:
|
|
@@ -536,6 +546,7 @@ class LabelStudioProject(Project):
|
|
|
536
546
|
title: Optional[str],
|
|
537
547
|
media_import_method: Literal['post', 'file', 'url'],
|
|
538
548
|
col_mapping: Optional[dict[str, str]],
|
|
549
|
+
s3_configuration: Optional[dict[str, Any]],
|
|
539
550
|
**kwargs: Any
|
|
540
551
|
) -> 'LabelStudioProject':
|
|
541
552
|
"""
|
|
@@ -572,6 +583,31 @@ class LabelStudioProject(Project):
|
|
|
572
583
|
if media_import_method == 'post' and len(config.data_keys) > 1:
|
|
573
584
|
raise excs.Error('`media_import_method` cannot be `post` if there is more than one data key')
|
|
574
585
|
|
|
586
|
+
if s3_configuration is not None:
|
|
587
|
+
if media_import_method != 'url':
|
|
588
|
+
raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
|
|
589
|
+
s3_configuration = copy.copy(s3_configuration)
|
|
590
|
+
if not 'bucket' in s3_configuration:
|
|
591
|
+
raise excs.Error('`s3_configuration` must contain a `bucket` field')
|
|
592
|
+
if not 'title' in s3_configuration:
|
|
593
|
+
s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
|
|
594
|
+
if ('aws_access_key_id' not in s3_configuration and
|
|
595
|
+
'aws_secret_access_key' not in s3_configuration and
|
|
596
|
+
'aws_session_token' not in s3_configuration):
|
|
597
|
+
# Attempt to fill any missing credentials from the environment
|
|
598
|
+
try:
|
|
599
|
+
import boto3
|
|
600
|
+
s3_credentials = boto3.Session().get_credentials().get_frozen_credentials()
|
|
601
|
+
_logger.info(f'Using AWS credentials from the environment for Label Studio project: {title}')
|
|
602
|
+
s3_configuration['aws_access_key_id'] = s3_credentials.access_key
|
|
603
|
+
s3_configuration['aws_secret_access_key'] = s3_credentials.secret_key
|
|
604
|
+
s3_configuration['aws_session_token'] = s3_credentials.token
|
|
605
|
+
except Exception as exc:
|
|
606
|
+
# This is not necessarily a problem, but we should log that it happened
|
|
607
|
+
_logger.debug(f'Unable to retrieve AWS credentials from the environment: {exc}')
|
|
608
|
+
pass
|
|
609
|
+
|
|
610
|
+
_logger.info(f'Creating Label Studio project: {title}')
|
|
575
611
|
project = _label_studio_client().start_project(title=title, label_config=label_config, **kwargs)
|
|
576
612
|
|
|
577
613
|
if media_import_method == 'file':
|
|
@@ -591,6 +627,10 @@ class LabelStudioProject(Project):
|
|
|
591
627
|
) from exc
|
|
592
628
|
raise # Handle any other exception type normally
|
|
593
629
|
|
|
630
|
+
if s3_configuration is not None:
|
|
631
|
+
_logger.info(f'Setting up S3 import storage for Label Studio project: {title}')
|
|
632
|
+
project.connect_s3_import_storage(**s3_configuration)
|
|
633
|
+
|
|
594
634
|
project_id = project.get_params()['id']
|
|
595
635
|
return LabelStudioProject(name, project_id, media_import_method, resolved_col_mapping)
|
|
596
636
|
|
pixeltable/io/pandas.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Any, Optional, Union
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pandas as pd
|
|
@@ -9,10 +9,13 @@ import pixeltable.type_system as ts
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def import_pandas(
|
|
12
|
-
tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
|
|
12
|
+
tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
|
|
13
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
14
|
+
num_retained_versions: int = 10,
|
|
15
|
+
comment: str = ''
|
|
13
16
|
) -> pxt.catalog.InsertableTable:
|
|
14
17
|
"""Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
|
|
15
|
-
will be inferred from the `DataFrame
|
|
18
|
+
will be inferred from the `DataFrame`.
|
|
16
19
|
|
|
17
20
|
The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
|
|
18
21
|
Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
|
|
@@ -29,15 +32,26 @@ def import_pandas(
|
|
|
29
32
|
`schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
|
|
30
33
|
Pixeltable identifiers).
|
|
31
34
|
"""
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
+
if schema_overrides is None:
|
|
36
|
+
schema_overrides = {}
|
|
37
|
+
if primary_key is None:
|
|
38
|
+
primary_key = []
|
|
39
|
+
elif isinstance(primary_key, str):
|
|
40
|
+
primary_key = [primary_key]
|
|
41
|
+
|
|
42
|
+
schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
|
|
43
|
+
tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
|
|
44
|
+
table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
|
|
35
45
|
table.insert(tbl_rows)
|
|
36
46
|
return table
|
|
37
47
|
|
|
38
48
|
|
|
39
49
|
def import_csv(
|
|
40
|
-
|
|
50
|
+
tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
51
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
52
|
+
num_retained_versions: int = 10,
|
|
53
|
+
comment: str = '',
|
|
54
|
+
**kwargs
|
|
41
55
|
) -> pxt.catalog.InsertableTable:
|
|
42
56
|
"""
|
|
43
57
|
Creates a new `Table` from a csv file. This is a convenience method and is equivalent
|
|
@@ -45,11 +59,15 @@ def import_csv(
|
|
|
45
59
|
See the Pandas documentation for `read_csv` for more details.
|
|
46
60
|
"""
|
|
47
61
|
df = pd.read_csv(filepath_or_buffer, **kwargs)
|
|
48
|
-
return import_pandas(
|
|
62
|
+
return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
|
|
49
63
|
|
|
50
64
|
|
|
51
65
|
def import_excel(
|
|
52
|
-
|
|
66
|
+
tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
67
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
68
|
+
num_retained_versions: int = 10,
|
|
69
|
+
comment: str = '',
|
|
70
|
+
**kwargs
|
|
53
71
|
) -> pxt.catalog.InsertableTable:
|
|
54
72
|
"""
|
|
55
73
|
Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
|
|
@@ -57,25 +75,36 @@ def import_excel(
|
|
|
57
75
|
See the Pandas documentation for `read_excel` for more details.
|
|
58
76
|
"""
|
|
59
77
|
df = pd.read_excel(io, *args, **kwargs)
|
|
60
|
-
return import_pandas(
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def
|
|
64
|
-
df: pd.DataFrame, schema_overrides:
|
|
65
|
-
) -> dict[str, pxt.ColumnType]:
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
78
|
+
return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def __df_to_pxt_schema(
|
|
82
|
+
df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
|
|
83
|
+
) -> tuple[dict[str, pxt.ColumnType], list[str]]:
|
|
84
|
+
"""
|
|
85
|
+
Infers a Pixeltable schema from a Pandas DataFrame.
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
A tuple containing a Pixeltable schema and a list of primary key column names.
|
|
89
|
+
"""
|
|
90
|
+
for pd_name in schema_overrides:
|
|
91
|
+
if pd_name not in df.columns:
|
|
92
|
+
raise excs.Error(
|
|
93
|
+
f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
|
|
94
|
+
)
|
|
95
|
+
for pd_name in primary_key:
|
|
96
|
+
if pd_name not in df.columns:
|
|
97
|
+
raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
|
|
98
|
+
|
|
99
|
+
schema: dict[str, pxt.ColumnType] = {}
|
|
100
|
+
col_mapping: dict[str, str] = {} # Maps Pandas column names to Pixeltable column names
|
|
101
|
+
|
|
73
102
|
for pd_name, pd_dtype in zip(df.columns, df.dtypes):
|
|
74
|
-
if
|
|
103
|
+
if pd_name in schema_overrides:
|
|
75
104
|
pxt_type = schema_overrides[pd_name]
|
|
76
105
|
else:
|
|
77
|
-
pxt_type =
|
|
78
|
-
pxt_name =
|
|
106
|
+
pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
|
|
107
|
+
pxt_name = __normalize_pxt_col_name(pd_name)
|
|
79
108
|
# Ensure that column names are unique by appending a distinguishing suffix
|
|
80
109
|
# to any collisions
|
|
81
110
|
if pxt_name in schema:
|
|
@@ -84,10 +113,13 @@ def _df_to_pxt_schema(
|
|
|
84
113
|
n += 1
|
|
85
114
|
pxt_name = f'{pxt_name}_{n}'
|
|
86
115
|
schema[pxt_name] = pxt_type
|
|
87
|
-
|
|
116
|
+
col_mapping[pd_name] = pxt_name
|
|
117
|
+
|
|
118
|
+
pxt_pk = [col_mapping[pk] for pk in primary_key]
|
|
119
|
+
return schema, pxt_pk
|
|
88
120
|
|
|
89
121
|
|
|
90
|
-
def
|
|
122
|
+
def __normalize_pxt_col_name(pd_name: str) -> str:
|
|
91
123
|
"""
|
|
92
124
|
Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
|
|
93
125
|
- replacing any non-ascii or non-alphanumeric characters with an underscore _
|
|
@@ -102,26 +134,30 @@ def _normalize_pxt_col_name(pd_name: str) -> str:
|
|
|
102
134
|
return id
|
|
103
135
|
|
|
104
136
|
|
|
105
|
-
def
|
|
137
|
+
def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
    """
    Infers a Pixeltable type based on a Numpy dtype.

    Args:
        np_dtype: The dtype of the column.
        data_col: The column data; only inspected for missing values when `nullable` is `False`.
        nullable: Whether the resulting Pixeltable type should be nullable.

    Returns:
        The Pixeltable type matching `np_dtype`, with the requested nullability.

    Raises:
        excs.Error: If the dtype is unsupported, or if `nullable` is `False` and a string-like or
            datetime column contains missing values.
    """
    if np.issubdtype(np_dtype, np.integer):
        return pxt.IntType(nullable=nullable)
    if np.issubdtype(np_dtype, np.floating):
        return pxt.FloatType(nullable=nullable)
    if np.issubdtype(np_dtype, np.bool_):
        return pxt.BoolType(nullable=nullable)
    if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
        # The scan for NaN is only needed for non-nullable columns, so skip the full pass over the
        # data otherwise.
        if not nullable and any(isinstance(val, float) and np.isnan(val) for val in data_col):
            raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
        return pxt.StringType(nullable=nullable)
    if np.issubdtype(np_dtype, np.datetime64):
        # Same as above: only look for NaT values when the column must not contain nulls.
        if not nullable and any(pd.isnull(val) for val in data_col):
            raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
        return pxt.TimestampType(nullable=nullable)
    raise excs.Error(f'Unsupported dtype: {np_dtype}')
|
|
122
158
|
|
|
123
159
|
|
|
124
|
-
def
|
|
160
|
+
def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
|
|
125
161
|
rows = {}
|
|
126
162
|
for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
|
|
127
163
|
if pxt_type.is_float_type():
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 19
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
import sqlalchemy as sql
|
|
3
|
+
|
|
4
|
+
from pixeltable.metadata import register_converter
|
|
5
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@register_converter(version=18)
def _(engine: sql.engine.Engine) -> None:
    """Converter registered for metadata version 18: rewrites table metadata via `__substitute_md`."""
    convert_table_md(engine, substitution_fn=__substitute_md)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def __substitute_md(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
    """
    Substitution callback for a single metadata key/value pair.

    Args:
        k: The metadata key.
        v: The metadata value stored under `k`.

    Returns:
        A `(key, value)` replacement pair if the entry needs to be migrated, otherwise `None`.
    """
    # Migrate a few changed function names. Only string values can be function paths; without the
    # type check, a non-string value stored under 'path' would raise an AttributeError below.
    if k == 'path' and isinstance(v, str):
        if v == 'pixeltable.functions.string.str_format':
            return 'path', 'pixeltable.functions.string.format'
        old_prefix = 'pixeltable.functions.pil.image'
        if v.startswith(old_prefix):
            # Rewrite only the leading occurrence, i.e. the module prefix itself.
            return 'path', v.replace(old_prefix, 'pixeltable.functions.image', 1)
    # Migrate deprecated `ImageMemberAccess` expressions to `FunctionCall`s
    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ImageMemberAccess':
        member_name = v['member_name']
        new_v = {
            'fn': {
                'path': f'pixeltable.functions.image.{member_name}',
                '_classpath': 'pixeltable.func.callable_function.CallableFunction',
            },
            'args': [[0, None]],
            'kwargs': {},
            '_classname': 'FunctionCall',
            'components': v['components'],
            'group_by_stop_idx': 0,
            'group_by_start_idx': 0,
            'order_by_start_idx': 1,
        }
        return k, new_v
    return None
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Descriptive notes for each new metadata version. These are stored in a Python dict
|
|
2
|
+
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
|
+
# the unit tests when new versions are added.
|
|
4
|
+
VERSION_NOTES = {
|
|
5
|
+
19: 'UDF renames; ImageMemberAccess removal',
|
|
6
|
+
18: 'Restructured index metadata',
|
|
7
|
+
17: 'Renamed remotes to external_stores',
|
|
8
|
+
16: 'Query functions; deferred Expr deserialization',
|
|
9
|
+
15: 'Remotes in table metadata',
|
|
10
|
+
}
|