pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -3
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +63 -36
- pixeltable/catalog/column.py +11 -4
- pixeltable/catalog/dir.py +5 -5
- pixeltable/catalog/globals.py +28 -14
- pixeltable/catalog/insertable_table.py +81 -43
- pixeltable/catalog/path.py +2 -2
- pixeltable/catalog/table.py +140 -109
- pixeltable/catalog/table_version.py +60 -43
- pixeltable/catalog/table_version_handle.py +3 -0
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/view.py +17 -9
- pixeltable/dataframe.py +5 -3
- pixeltable/env.py +109 -43
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/aggregation_node.py +6 -8
- pixeltable/exec/cache_prefetch_node.py +4 -7
- pixeltable/exec/component_iteration_node.py +1 -3
- pixeltable/exec/data_row_batch.py +1 -2
- pixeltable/exec/exec_context.py +1 -1
- pixeltable/exec/exec_node.py +2 -3
- pixeltable/exec/expr_eval/__init__.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +137 -20
- pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
- pixeltable/exec/expr_eval/globals.py +68 -7
- pixeltable/exec/expr_eval/schedulers.py +25 -23
- pixeltable/exec/in_memory_data_node.py +8 -6
- pixeltable/exec/row_update_node.py +3 -4
- pixeltable/exec/sql_node.py +16 -17
- pixeltable/exprs/__init__.py +3 -2
- pixeltable/exprs/arithmetic_expr.py +2 -0
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +39 -3
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/data_row.py +17 -1
- pixeltable/exprs/expr.py +51 -21
- pixeltable/exprs/function_call.py +34 -2
- pixeltable/exprs/globals.py +12 -0
- pixeltable/exprs/json_mapper.py +95 -48
- pixeltable/exprs/json_path.py +3 -10
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +33 -6
- pixeltable/exprs/similarity_expr.py +6 -21
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/ext/__init__.py +1 -1
- pixeltable/ext/functions/__init__.py +1 -1
- pixeltable/ext/functions/whisperx.py +1 -1
- pixeltable/ext/functions/yolox.py +22 -65
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -5
- pixeltable/func/expr_template_function.py +22 -2
- pixeltable/func/function.py +4 -5
- pixeltable/func/function_registry.py +1 -1
- pixeltable/func/signature.py +1 -1
- pixeltable/func/tools.py +2 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/anthropic.py +2 -2
- pixeltable/functions/audio.py +1 -1
- pixeltable/functions/deepseek.py +1 -1
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +22 -11
- pixeltable/functions/huggingface.py +1 -1
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/replicate.py +1 -1
- pixeltable/functions/string.py +1 -1
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +85 -33
- pixeltable/index/embedding_index.py +12 -1
- pixeltable/io/__init__.py +8 -5
- pixeltable/io/datarows.py +138 -0
- pixeltable/io/external_store.py +8 -5
- pixeltable/io/fiftyone.py +6 -7
- pixeltable/io/globals.py +7 -160
- pixeltable/io/hf_datasets.py +21 -98
- pixeltable/io/label_studio.py +21 -20
- pixeltable/io/pandas.py +35 -48
- pixeltable/io/parquet.py +17 -42
- pixeltable/io/table_data_conduit.py +569 -0
- pixeltable/io/utils.py +6 -21
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/metadata/__init__.py +6 -4
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_29.py +1 -1
- pixeltable/metadata/converters/convert_30.py +50 -0
- pixeltable/metadata/converters/util.py +26 -1
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +3 -0
- pixeltable/store.py +2 -2
- pixeltable/type_system.py +19 -7
- pixeltable/utils/arrow.py +32 -7
- pixeltable/utils/console_output.py +3 -2
- pixeltable/utils/coroutine.py +3 -3
- pixeltable/utils/dbms.py +66 -0
- pixeltable/utils/documents.py +61 -67
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +3 -2
- pixeltable/utils/pytorch.py +1 -1
- pixeltable/utils/sql.py +1 -1
- pixeltable-0.3.11.dist-info/METADATA +436 -0
- pixeltable-0.3.11.dist-info/RECORD +179 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
- pixeltable/catalog/path_dict.py +0 -169
- pixeltable-0.3.9.dist-info/METADATA +0 -382
- pixeltable-0.3.9.dist-info/RECORD +0 -175
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py
CHANGED
|
@@ -1,16 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
4
|
+
import os
|
|
2
5
|
import urllib.parse
|
|
3
|
-
from
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union
|
|
4
8
|
|
|
5
9
|
import pandas as pd
|
|
6
10
|
from pandas.io.formats.style import Styler
|
|
7
11
|
|
|
8
12
|
from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
|
|
9
13
|
from pixeltable.catalog import Catalog, TableVersionPath
|
|
10
|
-
from pixeltable.
|
|
14
|
+
from pixeltable.catalog.insertable_table import OnErrorParameter
|
|
11
15
|
from pixeltable.env import Env
|
|
12
16
|
from pixeltable.iterators import ComponentIterator
|
|
13
17
|
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
import datasets # type: ignore[import-untyped]
|
|
20
|
+
|
|
21
|
+
RowData = list[dict[str, Any]]
|
|
22
|
+
TableDataSource = Union[
|
|
23
|
+
str,
|
|
24
|
+
os.PathLike,
|
|
25
|
+
Path, # OS paths, filenames, URLs
|
|
26
|
+
Iterator[dict[str, Any]], # iterator producing dictionaries of values
|
|
27
|
+
RowData, # list of dictionaries
|
|
28
|
+
DataFrame, # Pixeltable DataFrame
|
|
29
|
+
pd.DataFrame, # pandas DataFrame
|
|
30
|
+
'datasets.Dataset',
|
|
31
|
+
'datasets.DatasetDict', # Huggingface datasets
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
14
35
|
_logger = logging.getLogger('pixeltable')
|
|
15
36
|
|
|
16
37
|
|
|
@@ -20,21 +41,35 @@ def init() -> None:
|
|
|
20
41
|
|
|
21
42
|
|
|
22
43
|
def create_table(
|
|
23
|
-
|
|
24
|
-
|
|
44
|
+
path_str: str,
|
|
45
|
+
schema: Optional[dict[str, Any]] = None,
|
|
25
46
|
*,
|
|
47
|
+
source: Optional[TableDataSource] = None,
|
|
48
|
+
source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
|
|
49
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
50
|
+
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
26
51
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
27
52
|
num_retained_versions: int = 10,
|
|
28
53
|
comment: str = '',
|
|
29
54
|
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
30
55
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
56
|
+
extra_args: Optional[dict[str, Any]] = None, # Additional arguments to data source provider
|
|
31
57
|
) -> catalog.Table:
|
|
32
58
|
"""Create a new base table.
|
|
33
59
|
|
|
34
60
|
Args:
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
61
|
+
path_str: Path to the table.
|
|
62
|
+
schema: A dictionary that maps column names to column types
|
|
63
|
+
source: A data source from which a table schema can be inferred and data imported
|
|
64
|
+
source_format: A hint to the format of the source data
|
|
65
|
+
schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
|
|
66
|
+
on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
|
|
67
|
+
invalid media file (such as a corrupt image) for one of the inserted rows.
|
|
68
|
+
|
|
69
|
+
- If `on_error='abort'`, then an exception will be raised and the rows will not be inserted.
|
|
70
|
+
- If `on_error='ignore'`, then execution will continue and the rows will be inserted. Any cells
|
|
71
|
+
with errors will have a `None` value for that cell, with information about the error stored in the
|
|
72
|
+
corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
|
|
38
73
|
primary_key: An optional column name or list of column names to use as the primary key(s) of the
|
|
39
74
|
table.
|
|
40
75
|
num_retained_versions: Number of versions of the table to retain.
|
|
@@ -50,6 +85,7 @@ def create_table(
|
|
|
50
85
|
- `'ignore'`: do nothing and return the existing table handle
|
|
51
86
|
- `'replace'`: if the existing table has no views, drop and replace it with a new one
|
|
52
87
|
- `'replace_force'`: drop the existing table and all its views, and create a new one
|
|
88
|
+
extra_args: Additional arguments to pass to the source data provider
|
|
53
89
|
|
|
54
90
|
Returns:
|
|
55
91
|
A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
|
|
@@ -61,7 +97,8 @@ def create_table(
|
|
|
61
97
|
- the path is invalid, or
|
|
62
98
|
- the path already exists and `if_exists='error'`, or
|
|
63
99
|
- the path already exists and is not a table, or
|
|
64
|
-
- an error occurs while attempting to create the table
|
|
100
|
+
- an error occurs while attempting to create the table, or
|
|
101
|
+
- an error occurs while attempting to import data from the source.
|
|
65
102
|
|
|
66
103
|
Examples:
|
|
67
104
|
Create a table with an int and a string column:
|
|
@@ -81,45 +118,60 @@ def create_table(
|
|
|
81
118
|
Create a table with an int and a float column, and replace any existing table:
|
|
82
119
|
|
|
83
120
|
>>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.Float}, if_exists='replace')
|
|
121
|
+
|
|
122
|
+
Create a table from a CSV file:
|
|
123
|
+
|
|
124
|
+
>>> tbl = pxt.create_table('my_table', source='data.csv')
|
|
84
125
|
"""
|
|
85
|
-
|
|
126
|
+
from pixeltable.io.table_data_conduit import DFTableDataConduit, UnkTableDataConduit
|
|
127
|
+
from pixeltable.io.utils import normalize_primary_key_parameter
|
|
128
|
+
|
|
129
|
+
if (schema is None) == (source is None):
|
|
130
|
+
raise excs.Error('Must provide either a `schema` or a `source`')
|
|
131
|
+
|
|
132
|
+
if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
|
|
133
|
+
raise excs.Error('`schema` must be a non-empty dictionary')
|
|
134
|
+
|
|
135
|
+
path_obj = catalog.Path(path_str)
|
|
86
136
|
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
137
|
+
media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
|
|
138
|
+
primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
|
|
139
|
+
table: catalog.Table = None
|
|
140
|
+
tds = None
|
|
141
|
+
data_source = None
|
|
142
|
+
if source is not None:
|
|
143
|
+
tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
|
|
144
|
+
tds.check_source_format()
|
|
145
|
+
data_source = tds.specialize()
|
|
146
|
+
data_source.src_schema_overrides = schema_overrides
|
|
147
|
+
data_source.src_pk = primary_key
|
|
148
|
+
data_source.infer_schema()
|
|
149
|
+
schema = data_source.pxt_schema
|
|
150
|
+
primary_key = data_source.pxt_pk
|
|
151
|
+
is_direct_df = data_source.is_direct_df()
|
|
152
|
+
else:
|
|
153
|
+
is_direct_df = False
|
|
87
154
|
|
|
88
|
-
|
|
89
|
-
if isinstance(schema_or_df, dict):
|
|
90
|
-
schema = schema_or_df
|
|
91
|
-
elif isinstance(schema_or_df, DataFrame):
|
|
92
|
-
df = schema_or_df
|
|
93
|
-
schema = df.schema
|
|
94
|
-
elif isinstance(schema_or_df, DataFrameResultSet):
|
|
155
|
+
if len(schema) == 0 or not isinstance(schema, dict):
|
|
95
156
|
raise excs.Error(
|
|
96
|
-
'
|
|
97
|
-
'(Is there an extraneous call to `collect()`?)'
|
|
157
|
+
'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
|
|
98
158
|
)
|
|
99
|
-
else:
|
|
100
|
-
raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame.')
|
|
101
|
-
|
|
102
|
-
if len(schema) == 0:
|
|
103
|
-
raise excs.Error(f'Table schema is empty: {path!r}')
|
|
104
|
-
|
|
105
|
-
if primary_key is None:
|
|
106
|
-
primary_key = []
|
|
107
|
-
elif isinstance(primary_key, str):
|
|
108
|
-
primary_key = [primary_key]
|
|
109
|
-
elif not isinstance(primary_key, list) or not all(isinstance(pk, str) for pk in primary_key):
|
|
110
|
-
raise excs.Error('primary_key must be a single column name or a list of column names')
|
|
111
159
|
|
|
112
|
-
|
|
113
|
-
return Catalog.get().create_table(
|
|
160
|
+
table = Catalog.get().create_table(
|
|
114
161
|
path_obj,
|
|
115
162
|
schema,
|
|
116
|
-
|
|
163
|
+
data_source.pxt_df if isinstance(data_source, DFTableDataConduit) else None,
|
|
117
164
|
if_exists=if_exists_,
|
|
118
165
|
primary_key=primary_key,
|
|
119
166
|
comment=comment,
|
|
120
167
|
media_validation=media_validation_,
|
|
121
168
|
num_retained_versions=num_retained_versions,
|
|
122
169
|
)
|
|
170
|
+
if data_source is not None and not is_direct_df:
|
|
171
|
+
fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
|
|
172
|
+
table.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
|
|
173
|
+
|
|
174
|
+
return table
|
|
123
175
|
|
|
124
176
|
|
|
125
177
|
def create_view(
|
|
@@ -208,7 +208,18 @@ class EmbeddingIndex(IndexBase):
|
|
|
208
208
|
and len(sig.required_parameters) <= 1
|
|
209
209
|
and sig.parameters_by_pos[0].col_type.type_enum == expected_type
|
|
210
210
|
):
|
|
211
|
-
|
|
211
|
+
# We found a valid signature. Now, if it has more than one parameter, we need to transform it into a
|
|
212
|
+
# 1-ary function by fixing all the other parameters to their defaults. This is to ensure that
|
|
213
|
+
# conditional_return_type resolves correctly.
|
|
214
|
+
if len(sig.parameters) == 1:
|
|
215
|
+
unary_fn = resolved_fn
|
|
216
|
+
else:
|
|
217
|
+
assert all(sig.parameters_by_pos[i].has_default for i in range(1, len(sig.parameters)))
|
|
218
|
+
defaults = {param.name: param.default for param in sig.parameters_by_pos[1:]}
|
|
219
|
+
unary_fn = resolved_fn.using(**defaults)
|
|
220
|
+
assert not unary_fn.is_polymorphic
|
|
221
|
+
assert len(unary_fn.signature.parameters) == 1
|
|
222
|
+
return unary_fn
|
|
212
223
|
return None
|
|
213
224
|
|
|
214
225
|
@classmethod
|
pixeltable/io/__init__.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
|
+
# ruff: noqa: F401
|
|
2
|
+
|
|
3
|
+
from .datarows import import_json, import_rows
|
|
1
4
|
from .external_store import ExternalStore, SyncStatus
|
|
2
|
-
from .globals import create_label_studio_project, export_images_as_fo_dataset
|
|
5
|
+
from .globals import create_label_studio_project, export_images_as_fo_dataset
|
|
3
6
|
from .hf_datasets import import_huggingface_dataset
|
|
4
7
|
from .pandas import import_csv, import_excel, import_pandas
|
|
5
8
|
from .parquet import export_parquet, import_parquet
|
|
6
9
|
|
|
7
|
-
__default_dir =
|
|
8
|
-
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
|
|
9
|
-
__all__ = sorted(
|
|
10
|
+
__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
|
|
11
|
+
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
|
|
12
|
+
__all__ = sorted(__default_dir - __removed_symbols)
|
|
10
13
|
|
|
11
14
|
|
|
12
|
-
def __dir__():
|
|
15
|
+
def __dir__() -> list[str]:
|
|
13
16
|
return __all__
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Iterable, Optional, Union
|
|
4
|
+
|
|
5
|
+
import pixeltable as pxt
|
|
6
|
+
from pixeltable import exceptions as excs
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _infer_schema_from_rows(
|
|
10
|
+
rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
|
|
11
|
+
) -> dict[str, pxt.ColumnType]:
|
|
12
|
+
schema: dict[str, pxt.ColumnType] = {}
|
|
13
|
+
cols_with_nones: set[str] = set()
|
|
14
|
+
|
|
15
|
+
for n, row in enumerate(rows):
|
|
16
|
+
for col_name, value in row.items():
|
|
17
|
+
if col_name in schema_overrides:
|
|
18
|
+
# We do the insertion here; this will ensure that the column order matches the order
|
|
19
|
+
# in which the column names are encountered in the input data, even if `schema_overrides`
|
|
20
|
+
# is specified.
|
|
21
|
+
if col_name not in schema:
|
|
22
|
+
schema[col_name] = schema_overrides[col_name]
|
|
23
|
+
elif value is not None:
|
|
24
|
+
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
|
25
|
+
# The column type will always be nullable by default.
|
|
26
|
+
col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
|
|
27
|
+
if col_type is None:
|
|
28
|
+
raise excs.Error(
|
|
29
|
+
f'Could not infer type for column `{col_name}`; the value in row {n} '
|
|
30
|
+
f'has an unsupported type: {type(value)}'
|
|
31
|
+
)
|
|
32
|
+
if col_name not in schema:
|
|
33
|
+
schema[col_name] = col_type
|
|
34
|
+
else:
|
|
35
|
+
supertype = schema[col_name].supertype(col_type)
|
|
36
|
+
if supertype is None:
|
|
37
|
+
raise excs.Error(
|
|
38
|
+
f'Could not infer type of column `{col_name}`; the value in row {n} '
|
|
39
|
+
f'does not match preceding type {schema[col_name]}: {value!r}\n'
|
|
40
|
+
'Consider specifying the type explicitly in `schema_overrides`.'
|
|
41
|
+
)
|
|
42
|
+
schema[col_name] = supertype
|
|
43
|
+
else:
|
|
44
|
+
cols_with_nones.add(col_name)
|
|
45
|
+
|
|
46
|
+
entirely_none_cols = cols_with_nones - schema.keys()
|
|
47
|
+
if len(entirely_none_cols) > 0:
|
|
48
|
+
# A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
|
|
49
|
+
# was not encountered in any row with a non-None value.
|
|
50
|
+
raise excs.Error(
|
|
51
|
+
f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
|
|
52
|
+
'Consider specifying the type(s) explicitly in `schema_overrides`.'
|
|
53
|
+
)
|
|
54
|
+
return schema
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def import_rows(
|
|
58
|
+
tbl_path: str,
|
|
59
|
+
rows: list[dict[str, Any]],
|
|
60
|
+
*,
|
|
61
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
62
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
63
|
+
num_retained_versions: int = 10,
|
|
64
|
+
comment: str = '',
|
|
65
|
+
) -> pxt.Table:
|
|
66
|
+
"""
|
|
67
|
+
Creates a new base table from a list of dictionaries. The dictionaries must be of the
|
|
68
|
+
form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
|
|
69
|
+
supplied data, using the most specific type that can represent all the values in a column.
|
|
70
|
+
|
|
71
|
+
If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
|
|
72
|
+
Pixeltable will force the specified column to the specified type (and will not attempt any type inference
|
|
73
|
+
for that column).
|
|
74
|
+
|
|
75
|
+
All column types of the new table will be nullable unless explicitly specified as non-nullable in
|
|
76
|
+
`schema_overrides`.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
tbl_path: The qualified name of the table to create.
|
|
80
|
+
rows: The list of dictionaries to import.
|
|
81
|
+
schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
|
|
82
|
+
as described above.
|
|
83
|
+
primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
|
|
84
|
+
num_retained_versions: The number of retained versions of the table
|
|
85
|
+
(see [`create_table()`][pixeltable.create_table]).
|
|
86
|
+
comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
A handle to the newly created [`Table`][pixeltable.Table].
|
|
90
|
+
"""
|
|
91
|
+
return pxt.create_table(
|
|
92
|
+
tbl_path,
|
|
93
|
+
source=rows,
|
|
94
|
+
schema_overrides=schema_overrides,
|
|
95
|
+
primary_key=primary_key,
|
|
96
|
+
num_retained_versions=num_retained_versions,
|
|
97
|
+
comment=comment,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def import_json(
|
|
102
|
+
tbl_path: str,
|
|
103
|
+
filepath_or_url: str,
|
|
104
|
+
*,
|
|
105
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
106
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
107
|
+
num_retained_versions: int = 10,
|
|
108
|
+
comment: str = '',
|
|
109
|
+
**kwargs: Any,
|
|
110
|
+
) -> pxt.Table:
|
|
111
|
+
"""
|
|
112
|
+
Creates a new base table from a JSON file. This is a convenience method and is
|
|
113
|
+
equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
|
|
114
|
+
is the contents of the specified `filepath_or_url`.
|
|
115
|
+
|
|
116
|
+
Args:
|
|
117
|
+
tbl_path: The name of the table to create.
|
|
118
|
+
filepath_or_url: The path or URL of the JSON file.
|
|
119
|
+
schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
|
|
120
|
+
(see [`import_rows()`][pixeltable.io.import_rows]).
|
|
121
|
+
primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
|
|
122
|
+
num_retained_versions: The number of retained versions of the table
|
|
123
|
+
(see [`create_table()`][pixeltable.create_table]).
|
|
124
|
+
comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
|
|
125
|
+
kwargs: Additional keyword arguments to pass to `json.loads`.
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
A handle to the newly created [`Table`][pixeltable.Table].
|
|
129
|
+
"""
|
|
130
|
+
return pxt.create_table(
|
|
131
|
+
tbl_path,
|
|
132
|
+
source=filepath_or_url,
|
|
133
|
+
schema_overrides=schema_overrides,
|
|
134
|
+
primary_key=primary_key,
|
|
135
|
+
num_retained_versions=num_retained_versions,
|
|
136
|
+
comment=comment,
|
|
137
|
+
extra_args=kwargs,
|
|
138
|
+
)
|
pixeltable/io/external_store.py
CHANGED
|
@@ -97,7 +97,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
97
97
|
# This ensures that the media in those columns resides in the media store.
|
|
98
98
|
# First determine which columns (if any) need stored proxies, but don't have one yet.
|
|
99
99
|
stored_proxies_needed: list[Column] = []
|
|
100
|
-
for col in self.col_mapping
|
|
100
|
+
for col in self.col_mapping:
|
|
101
101
|
if col.col_type.is_media_type() and not (col.is_stored and col.is_computed):
|
|
102
102
|
# If this column is already proxied in some other Project, use the existing proxy to avoid
|
|
103
103
|
# duplication. Otherwise, we'll create a new one.
|
|
@@ -234,7 +234,8 @@ class Project(ExternalStore, abc.ABC):
|
|
|
234
234
|
else:
|
|
235
235
|
raise excs.Error(
|
|
236
236
|
f'Column `{t_col}` does not exist in Table `{table._name}`. Either add a column `{t_col}`, '
|
|
237
|
-
f'or specify a `col_mapping` to associate a different column with
|
|
237
|
+
f'or specify a `col_mapping` to associate a different column with '
|
|
238
|
+
f'the external field `{ext_col}`.'
|
|
238
239
|
)
|
|
239
240
|
if ext_col not in export_cols and ext_col not in import_cols:
|
|
240
241
|
raise excs.Error(
|
|
@@ -253,7 +254,8 @@ class Project(ExternalStore, abc.ABC):
|
|
|
253
254
|
ext_col_type = export_cols[ext_col]
|
|
254
255
|
if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
|
|
255
256
|
raise excs.Error(
|
|
256
|
-
f'Column `{t_col}` cannot be exported to external column `{ext_col}`
|
|
257
|
+
f'Column `{t_col}` cannot be exported to external column `{ext_col}` '
|
|
258
|
+
f'(incompatible types; expecting `{ext_col_type}`)'
|
|
257
259
|
)
|
|
258
260
|
if ext_col in import_cols:
|
|
259
261
|
# Validate that the external column can be assigned to the table column
|
|
@@ -264,7 +266,8 @@ class Project(ExternalStore, abc.ABC):
|
|
|
264
266
|
ext_col_type = import_cols[ext_col]
|
|
265
267
|
if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
|
|
266
268
|
raise excs.Error(
|
|
267
|
-
f'Column `{t_col}` cannot be imported from external column `{ext_col}`
|
|
269
|
+
f'Column `{t_col}` cannot be imported from external column `{ext_col}` '
|
|
270
|
+
f'(incompatible types; expecting `{ext_col_type}`)'
|
|
268
271
|
)
|
|
269
272
|
return resolved_col_mapping
|
|
270
273
|
|
|
@@ -368,7 +371,7 @@ class MockProject(Project):
|
|
|
368
371
|
{cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']},
|
|
369
372
|
)
|
|
370
373
|
|
|
371
|
-
def __eq__(self, other:
|
|
374
|
+
def __eq__(self, other: object) -> bool:
|
|
372
375
|
if not isinstance(other, MockProject):
|
|
373
376
|
return False
|
|
374
377
|
return self.name == other.name
|
pixeltable/io/fiftyone.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from typing import Iterator, Optional, Union
|
|
2
|
+
from typing import Any, Iterator, Optional, Union
|
|
3
3
|
|
|
4
4
|
import fiftyone as fo # type: ignore[import-untyped]
|
|
5
5
|
import fiftyone.utils.data as foud # type: ignore[import-untyped]
|
|
@@ -59,10 +59,9 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
|
|
|
59
59
|
for exprs_, label_cls, default_name in label_categories:
|
|
60
60
|
if exprs_ is None or isinstance(exprs_, dict):
|
|
61
61
|
continue
|
|
62
|
-
if isinstance(exprs_, exprs.Expr)
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
for expr in exprs_:
|
|
62
|
+
exprs_list = [exprs_] if isinstance(exprs_, exprs.Expr) else exprs_
|
|
63
|
+
assert isinstance(exprs_list, list)
|
|
64
|
+
for expr in exprs_list:
|
|
66
65
|
if default_name not in self.__labels:
|
|
67
66
|
name = default_name
|
|
68
67
|
else:
|
|
@@ -125,7 +124,7 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
|
|
|
125
124
|
elif label_cls is fo.Detections:
|
|
126
125
|
label = fo.Detections(detections=self.__as_fo_detections(label_data))
|
|
127
126
|
else:
|
|
128
|
-
|
|
127
|
+
raise AssertionError()
|
|
129
128
|
labels[label_name] = label
|
|
130
129
|
|
|
131
130
|
return file, metadata, labels
|
|
@@ -166,5 +165,5 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
|
|
|
166
165
|
def get_dataset_info(self) -> dict:
|
|
167
166
|
pass
|
|
168
167
|
|
|
169
|
-
def close(self, *args) -> None:
|
|
168
|
+
def close(self, *args: Any) -> None:
|
|
170
169
|
pass
|
pixeltable/io/globals.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
import urllib.request
|
|
4
|
-
from pathlib import Path
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
5
3
|
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
6
4
|
|
|
7
5
|
import pixeltable as pxt
|
|
@@ -9,61 +7,11 @@ import pixeltable.exceptions as excs
|
|
|
9
7
|
from pixeltable import Table, exprs
|
|
10
8
|
from pixeltable.env import Env
|
|
11
9
|
from pixeltable.io.external_store import SyncStatus
|
|
12
|
-
from pixeltable.utils import parse_local_file_path
|
|
13
10
|
|
|
14
11
|
if TYPE_CHECKING:
|
|
15
12
|
import fiftyone as fo # type: ignore[import-untyped]
|
|
16
13
|
|
|
17
14
|
|
|
18
|
-
from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def _infer_schema_from_rows(
|
|
22
|
-
rows: list[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
|
|
23
|
-
) -> dict[str, pxt.ColumnType]:
|
|
24
|
-
schema: dict[str, pxt.ColumnType] = {}
|
|
25
|
-
cols_with_nones: set[str] = set()
|
|
26
|
-
|
|
27
|
-
for n, row in enumerate(rows):
|
|
28
|
-
for col_name, value in row.items():
|
|
29
|
-
if col_name in schema_overrides:
|
|
30
|
-
# We do the insertion here; this will ensure that the column order matches the order
|
|
31
|
-
# in which the column names are encountered in the input data, even if `schema_overrides`
|
|
32
|
-
# is specified.
|
|
33
|
-
if col_name not in schema:
|
|
34
|
-
schema[col_name] = schema_overrides[col_name]
|
|
35
|
-
elif value is not None:
|
|
36
|
-
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
|
37
|
-
# The column type will always be nullable by default.
|
|
38
|
-
col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
|
|
39
|
-
if col_type is None:
|
|
40
|
-
raise excs.Error(
|
|
41
|
-
f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
|
|
42
|
-
)
|
|
43
|
-
if col_name not in schema:
|
|
44
|
-
schema[col_name] = col_type
|
|
45
|
-
else:
|
|
46
|
-
supertype = schema[col_name].supertype(col_type)
|
|
47
|
-
if supertype is None:
|
|
48
|
-
raise excs.Error(
|
|
49
|
-
f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
|
|
50
|
-
'Consider specifying the type explicitly in `schema_overrides`.'
|
|
51
|
-
)
|
|
52
|
-
schema[col_name] = supertype
|
|
53
|
-
else:
|
|
54
|
-
cols_with_nones.add(col_name)
|
|
55
|
-
|
|
56
|
-
entirely_none_cols = cols_with_nones - schema.keys()
|
|
57
|
-
if len(entirely_none_cols) > 0:
|
|
58
|
-
# A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
|
|
59
|
-
# was not encountered in any row with a non-None value.
|
|
60
|
-
raise excs.Error(
|
|
61
|
-
f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
|
|
62
|
-
'Consider specifying the type(s) explicitly in `schema_overrides`.'
|
|
63
|
-
)
|
|
64
|
-
return schema
|
|
65
|
-
|
|
66
|
-
|
|
67
15
|
def create_label_studio_project(
|
|
68
16
|
t: Table,
|
|
69
17
|
label_config: str,
|
|
@@ -140,9 +88,9 @@ def create_label_studio_project(
|
|
|
140
88
|
parameters of the Label Studio `connect_s3_import_storage` method, as described in the
|
|
141
89
|
[Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
|
|
142
90
|
`bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
|
|
143
|
-
Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`).
|
|
144
|
-
specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`.
|
|
145
|
-
Studio defaults.
|
|
91
|
+
Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`).
|
|
92
|
+
If a title is not specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`.
|
|
93
|
+
All other parameters use their Label Studio defaults.
|
|
146
94
|
kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
|
|
147
95
|
Studio SDK, as described in the
|
|
148
96
|
[Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
|
|
@@ -151,7 +99,8 @@ def create_label_studio_project(
|
|
|
151
99
|
A `SyncStatus` representing the status of any synchronization operations that occurred.
|
|
152
100
|
|
|
153
101
|
Examples:
|
|
154
|
-
Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
|
|
102
|
+
Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
|
|
103
|
+
column of the table `tbl`:
|
|
155
104
|
|
|
156
105
|
>>> config = \"\"\"
|
|
157
106
|
<View>
|
|
@@ -190,108 +139,6 @@ def create_label_studio_project(
|
|
|
190
139
|
return SyncStatus.empty()
|
|
191
140
|
|
|
192
141
|
|
|
193
|
-
def import_rows(
|
|
194
|
-
tbl_path: str,
|
|
195
|
-
rows: list[dict[str, Any]],
|
|
196
|
-
*,
|
|
197
|
-
schema_overrides: Optional[dict[str, Any]] = None,
|
|
198
|
-
primary_key: Optional[Union[str, list[str]]] = None,
|
|
199
|
-
num_retained_versions: int = 10,
|
|
200
|
-
comment: str = '',
|
|
201
|
-
) -> Table:
|
|
202
|
-
"""
|
|
203
|
-
Creates a new base table from a list of dictionaries. The dictionaries must be of the
|
|
204
|
-
form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
|
|
205
|
-
supplied data, using the most specific type that can represent all the values in a column.
|
|
206
|
-
|
|
207
|
-
If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
|
|
208
|
-
Pixeltable will force the specified column to the specified type (and will not attempt any type inference
|
|
209
|
-
for that column).
|
|
210
|
-
|
|
211
|
-
All column types of the new table will be nullable unless explicitly specified as non-nullable in
|
|
212
|
-
`schema_overrides`.
|
|
213
|
-
|
|
214
|
-
Args:
|
|
215
|
-
tbl_path: The qualified name of the table to create.
|
|
216
|
-
rows: The list of dictionaries to import.
|
|
217
|
-
schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
|
|
218
|
-
as described above.
|
|
219
|
-
primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
|
|
220
|
-
num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
|
|
221
|
-
comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
|
|
222
|
-
|
|
223
|
-
Returns:
|
|
224
|
-
A handle to the newly created [`Table`][pixeltable.Table].
|
|
225
|
-
"""
|
|
226
|
-
schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
|
|
227
|
-
row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
|
|
228
|
-
schema, pxt_pk, _ = normalize_schema_names(row_schema, primary_key, schema_overrides, True)
|
|
229
|
-
|
|
230
|
-
table = find_or_create_table(
|
|
231
|
-
tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
|
|
232
|
-
)
|
|
233
|
-
table.insert(rows)
|
|
234
|
-
return table
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def import_json(
|
|
238
|
-
tbl_path: str,
|
|
239
|
-
filepath_or_url: str,
|
|
240
|
-
*,
|
|
241
|
-
schema_overrides: Optional[dict[str, Any]] = None,
|
|
242
|
-
primary_key: Optional[Union[str, list[str]]] = None,
|
|
243
|
-
num_retained_versions: int = 10,
|
|
244
|
-
comment: str = '',
|
|
245
|
-
**kwargs: Any,
|
|
246
|
-
) -> Table:
|
|
247
|
-
"""
|
|
248
|
-
Creates a new base table from a JSON file. This is a convenience method and is
|
|
249
|
-
equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
|
|
250
|
-
is the contents of the specified `filepath_or_url`.
|
|
251
|
-
|
|
252
|
-
Args:
|
|
253
|
-
tbl_path: The name of the table to create.
|
|
254
|
-
filepath_or_url: The path or URL of the JSON file.
|
|
255
|
-
schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
|
|
256
|
-
(see [`import_rows()`][pixeltable.io.import_rows]).
|
|
257
|
-
primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
|
|
258
|
-
num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
|
|
259
|
-
comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
|
|
260
|
-
kwargs: Additional keyword arguments to pass to `json.loads`.
|
|
261
|
-
|
|
262
|
-
Returns:
|
|
263
|
-
A handle to the newly created [`Table`][pixeltable.Table].
|
|
264
|
-
"""
|
|
265
|
-
path = parse_local_file_path(filepath_or_url)
|
|
266
|
-
if path is None: # it's a URL
|
|
267
|
-
# TODO: This should read from S3 as well.
|
|
268
|
-
contents = urllib.request.urlopen(filepath_or_url).read()
|
|
269
|
-
else:
|
|
270
|
-
with open(path) as fp:
|
|
271
|
-
contents = fp.read()
|
|
272
|
-
|
|
273
|
-
rows = json.loads(contents, **kwargs)
|
|
274
|
-
|
|
275
|
-
schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
|
|
276
|
-
row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
|
|
277
|
-
schema, pxt_pk, col_mapping = normalize_schema_names(row_schema, primary_key, schema_overrides, False)
|
|
278
|
-
|
|
279
|
-
# Convert all rows to insertable format - not needed, misnamed columns and types are errors in the incoming row format
|
|
280
|
-
if col_mapping is not None:
|
|
281
|
-
tbl_rows = [
|
|
282
|
-
{field if col_mapping is None else col_mapping[field]: val for field, val in row.items()} for row in rows
|
|
283
|
-
]
|
|
284
|
-
else:
|
|
285
|
-
tbl_rows = rows
|
|
286
|
-
|
|
287
|
-
table = find_or_create_table(
|
|
288
|
-
tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
|
|
289
|
-
)
|
|
290
|
-
|
|
291
|
-
table.insert(tbl_rows)
|
|
292
|
-
return table
|
|
293
|
-
|
|
294
|
-
|
|
295
142
|
def export_images_as_fo_dataset(
|
|
296
143
|
tbl: pxt.Table,
|
|
297
144
|
images: exprs.Expr,
|