pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/io/label_studio.py
CHANGED

@@ -4,19 +4,22 @@ import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Iterator, Literal
+from typing import Any, Iterator, Literal
 from xml.etree import ElementTree as ET

-import label_studio_sdk
+import label_studio_sdk
 import PIL.Image
 from requests.exceptions import HTTPError

 import pixeltable.type_system as ts
 from pixeltable import Column, Table, env, exceptions as excs
+from pixeltable.catalog import ColumnHandle
+from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
-from pixeltable.io.external_store import Project
+from pixeltable.io.external_store import Project
 from pixeltable.utils import coco
+from pixeltable.utils.local_store import TempStore

 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
 # the import two different ways to insure intercompatibility
@@ -25,7 +28,7 @@ try:
     import label_studio_sdk.project as ls_project  # type: ignore
 except ImportError:
     # label_studio_sdk>=1 compatibility
-    import label_studio_sdk._legacy.project as ls_project
+    import label_studio_sdk._legacy.project as ls_project

 _logger = logging.getLogger('pixeltable')

@@ -43,23 +46,26 @@ class LabelStudioProject(Project):
     """
     An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
     for synchronizing between a Pixeltable table and a Label Studio project.
+
+    The constructor will NOT create a new Label Studio project; it is also used when loading
+    metadata for existing projects.
     """

+    project_id: int  # Label Studio project ID
+    media_import_method: Literal['post', 'file', 'url']
+    _project: ls_project.Project | None
+
     def __init__(
         self,
         name: str,
         project_id: int,
         media_import_method: Literal['post', 'file', 'url'],
-        col_mapping: dict[
-        stored_proxies:
+        col_mapping: dict[ColumnHandle, str],
+        stored_proxies: dict[ColumnHandle, ColumnHandle] | None = None,
     ):
-        """
-        The constructor will NOT create a new Label Studio project; it is also used when loading
-        metadata for existing projects.
-        """
         self.project_id = project_id
         self.media_import_method = media_import_method
-        self._project
+        self._project = None
         super().__init__(name, col_mapping, stored_proxies)

     @property
@@ -105,20 +111,20 @@ class LabelStudioProject(Project):
         """
         return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}

-    def sync(self, t: Table, export_data: bool, import_data: bool) ->
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
         _logger.info(
             f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
             f' (export: {export_data}, import: {import_data}).'
         )
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
-        sync_status =
+        sync_status = UpdateStatus()
         if export_data:
             export_sync_status = self.__update_tasks(t, tasks)
-            sync_status
+            sync_status += export_sync_status
         if import_data:
             import_sync_status = self.__update_table_from_tasks(t, tasks)
-            sync_status
+            sync_status += import_sync_status
         return sync_status

     def __fetch_all_tasks(self) -> Iterator[dict[str, Any]]:
@@ -142,7 +148,7 @@ class LabelStudioProject(Project):
                 f'Label Studio project {self.project_title!r}.'
             )

-    def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) ->
+    def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
         """
         Updates all tasks in this Label Studio project based on the Pixeltable data:
         - Creates new tasks for rows that don't map to any existing task;
@@ -155,7 +161,7 @@ class LabelStudioProject(Project):
         t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]

         if len(t_data_cols) == 0:
-            return
+            return UpdateStatus()

         # Columns in `t` that map to `rectanglelabels` preannotations
         t_rl_cols = [
@@ -183,15 +189,15 @@ class LabelStudioProject(Project):
         self,
         t: Table,
         existing_tasks: dict[tuple, dict],
-        media_col:
-        t_rl_cols: list[
+        media_col: ColumnHandle,
+        t_rl_cols: list[ColumnHandle],
         rl_info: list['_RectangleLabel'],
-    ) ->
-        is_stored = media_col.is_stored
+    ) -> UpdateStatus:
+        is_stored = media_col.get().is_stored
         # If it's a stored column, we can use `localpath`
-        localpath_col_opt = [t[media_col.name].localpath] if is_stored else []
+        localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
         # Select the media column, rectanglelabels columns, and localpath (if appropriate)
-        rows = t.select(t[media_col.name], *[t[col.name] for col in t_rl_cols], *localpath_col_opt)
+        rows = t.select(t[media_col.get().name], *[t[col.get().name] for col in t_rl_cols], *localpath_col_opt)
         tasks_created = 0
         row_ids_in_pxt: set[tuple] = set()

@@ -209,7 +215,7 @@ class LabelStudioProject(Project):
             else:
                 # No localpath; create a temp file and upload it
                 assert isinstance(row[media_col_idx], PIL.Image.Image)
-                file =
+                file = TempStore.create_path(extension='.png')
                 row[media_col_idx].save(file, format='png')
                 task_id = self.project.import_tasks(file)[0]
                 os.remove(file)
@@ -232,48 +238,48 @@ class LabelStudioProject(Project):

         env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')

-        sync_status =
+        sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))

         deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
-
-        return sync_status
+        sync_status += deletion_sync_status
+        return sync_status

     def __update_tasks_by_files(
         self,
         t: Table,
         existing_tasks: dict[tuple, dict],
-        t_data_cols: list[
-        t_rl_cols: list[
+        t_data_cols: list[ColumnHandle],
+        t_rl_cols: list[ColumnHandle],
         rl_info: list['_RectangleLabel'],
-    ) ->
+    ) -> UpdateStatus:
         ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
         expr_refs: dict[str, Expr] = {}  # kwargs for the select statement
         for col in t_data_cols:
-            col_name = col.name
+            col_name = col.get().name
             if self.media_import_method == 'url':
                 expr_refs[col_name] = t[col_name].fileurl
             else:
                 assert self.media_import_method == 'file'
-                if not col.col_type.is_media_type():
+                if not col.get().col_type.is_media_type():
                     # Not a media column; query the data directly
-                    expr_refs[col_name] =
+                    expr_refs[col_name] = t[col_name]
                 elif col in self.stored_proxies:
                     # Media column that has a stored proxy; use it. We have to give it a name,
                     # since it's an anonymous column
-                    stored_proxy_col = self.stored_proxies[col]
+                    stored_proxy_col = self.stored_proxies[col].get()
                     expr_refs[f'{col_name}_proxy'] = ColumnRef(stored_proxy_col).localpath
                 else:
                     # Media column without a stored proxy; this means it's a stored computed column,
                     # and we can just use the localpath
                     expr_refs[col_name] = t[col_name].localpath

-        df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
+        df = t.select(*[t[col.get().name] for col in t_rl_cols], **expr_refs)
         # The following buffers will hold `DataRow` indices that correspond to each of the selected
         # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
         # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
         # We have to wait until we begin iterating to populate them, so they're initially `None`.
-        rl_col_idxs:
-        data_col_idxs:
+        rl_col_idxs: list[int] | None = None
+        data_col_idxs: list[int] | None = None

         row_ids_in_pxt: set[tuple] = set()
         tasks_created = 0
@@ -286,11 +292,11 @@ class LabelStudioProject(Project):
             data_vals = [row[idx] for idx in data_col_idxs]
             coco_annotations = [row[idx] for idx in rl_col_idxs]
             for i in range(len(t_data_cols)):
-                if t_data_cols[i].col_type.is_media_type():
+                if t_data_cols[i].get().col_type.is_media_type():
                     # Special handling for media columns
                     assert isinstance(data_vals[i], str)
                     if self.media_import_method == 'url':
-                        data_vals[i] = self.__validate_fileurl(t_data_cols[i], data_vals[i])
+                        data_vals[i] = self.__validate_fileurl(t_data_cols[i].get(), data_vals[i])
                     else:
                         assert self.media_import_method == 'file'
                         data_vals[i] = self.__localpath_to_lspath(data_vals[i])
@@ -336,14 +342,14 @@ class LabelStudioProject(Project):
             f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
         )

-        sync_status =
+        sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))

         deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
-
-        return sync_status
+        sync_status += deletion_sync_status
+        return sync_status

     @classmethod
-    def __validate_fileurl(cls, col: Column, url: str) ->
+    def __validate_fileurl(cls, col: Column, url: str) -> str | None:
         # Check that the URL is one that will be visible to Label Studio. If it isn't, log an info message
         # to help users debug the issue.
         if not (url.startswith('http://') or url.startswith('https://')):
@@ -361,7 +367,7 @@ class LabelStudioProject(Project):

     def __delete_stale_tasks(
         self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
-    ) ->
+    ) -> UpdateStatus:
         deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
         # Sanity check the math
         assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
@@ -377,11 +383,11 @@ class LabelStudioProject(Project):
         for rowid in deleted_rowids:
             del existing_tasks[rowid]

-        return
+        return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))

-    def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) ->
+    def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
         if ANNOTATIONS_COLUMN not in self.col_mapping.values():
-            return
+            return UpdateStatus()

         annotations = {
             # Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
@@ -391,7 +397,7 @@ class LabelStudioProject(Project):
             for task in tasks.values()
         }

-        local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN)
+        local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN).get()

         # Prune the annotations down to just the ones that have actually changed.
         rows = t.select(t[local_annotations_col.name])
@@ -412,23 +418,21 @@ class LabelStudioProject(Project):
             # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
             ancestor = t
             while local_annotations_col not in ancestor._tbl_version.get().cols:
-                assert ancestor.
-                ancestor = ancestor.
+                assert ancestor._get_base_table is not None
+                ancestor = ancestor._get_base_table()
             update_status = ancestor.batch_update(updates)
             env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
-            return
+            return update_status
         else:
-            return
+            return UpdateStatus()

     def as_dict(self) -> dict[str, Any]:
         return {
             'name': self.name,
             'project_id': self.project_id,
             'media_import_method': self.media_import_method,
-            'col_mapping': [[
-            'stored_proxies': [
-                [self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
-            ],
+            'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
+            'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
         }

     @classmethod
@@ -437,8 +441,8 @@ class LabelStudioProject(Project):
             md['name'],
             md['project_id'],
             md['media_import_method'],
-            {
-            {
+            {ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
+            {ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
         )

     def __repr__(self) -> str:
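The as_dict()/from_dict() hunks above replace ad-hoc column serialization with ColumnHandle round trips: col_mapping and stored_proxies are now persisted as [handle_dict, value] pairs. Below is a minimal sketch of that round trip using a stand-in ColumnHandle; the real pixeltable.catalog.ColumnHandle's fields are not visible in this diff, so the tbl_id/col_id fields here are invented for illustration.

from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class ColumnHandle:  # stand-in for pixeltable.catalog.ColumnHandle; fields are hypothetical
    tbl_id: str
    col_id: int

    def as_dict(self) -> dict[str, Any]:
        # JSON-friendly representation, assumed to be the inverse of from_dict()
        return {'tbl_id': self.tbl_id, 'col_id': self.col_id}

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> 'ColumnHandle':
        return cls(**d)


col_mapping = {ColumnHandle('t1', 0): 'image', ColumnHandle('t1', 1): 'annotations'}
# serialize the way LabelStudioProject.as_dict() does in the hunk above
md = {'col_mapping': [[k.as_dict(), v] for k, v in col_mapping.items()]}
# ...and reconstruct the way from_dict() does
restored = {ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']}
assert restored == col_mapping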
@@ -493,7 +497,7 @@ class LabelStudioProject(Project):

     @classmethod
     def __coco_to_predictions(
-        cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id:
+        cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id: int | None = None
     ) -> dict[str, Any]:
         width = coco_annotations['image']['width']
         height = coco_annotations['image']['height']
@@ -545,11 +549,11 @@ class LabelStudioProject(Project):
         cls,
         t: Table,
         label_config: str,
-        name:
-        title:
+        name: str | None,
+        title: str | None,
         media_import_method: Literal['post', 'file', 'url'],
-        col_mapping:
-        s3_configuration:
+        col_mapping: dict[str, str] | None,
+        s3_configuration: dict[str, Any] | None,
         **kwargs: Any,
     ) -> 'LabelStudioProject':
         """
@@ -560,7 +564,7 @@ class LabelStudioProject(Project):

         if name is None:
             # Create a default name that's unique to the table
-            all_stores = t.external_stores
+            all_stores = t.external_stores()
             n = 0
             while f'ls_project_{n}' in all_stores:
                 n += 1
@@ -576,7 +580,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t.
+        if local_annotations_column not in t._get_schema():
             t.add_columns({local_annotations_column: ts.Json})

         resolved_col_mapping = cls.validate_columns(
@@ -648,7 +652,7 @@ class LabelStudioProject(Project):

 @dataclass(frozen=True)
 class _DataKey:
-    name:
+    name: str | None  # The 'name' attribute of the data key; may differ from the field name
     column_type: ts.ColumnType

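The recurring change across this file (and the new pixeltable/catalog/update_status.py in the manifest above) is that every sync path now returns a concrete UpdateStatus and accumulates partial results with +=. Here is a minimal sketch of how such additive status objects could compose; only the names UpdateStatus and RowCountStats and the keywords ext_row_count_stats, ins_rows, upd_rows, and del_rows come from the diff, while the operator implementations are assumed for illustration.

from dataclasses import dataclass, field


@dataclass(frozen=True)
class RowCountStats:  # sketch; real class lives in pixeltable.catalog.update_status
    ins_rows: int = 0
    upd_rows: int = 0
    del_rows: int = 0

    def __add__(self, other: 'RowCountStats') -> 'RowCountStats':
        # combine counters field by field
        return RowCountStats(
            ins_rows=self.ins_rows + other.ins_rows,
            upd_rows=self.upd_rows + other.upd_rows,
            del_rows=self.del_rows + other.del_rows,
        )


@dataclass
class UpdateStatus:  # sketch; assumes the class supports in-place accumulation
    ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)

    def __iadd__(self, other: 'UpdateStatus') -> 'UpdateStatus':
        self.ext_row_count_stats = self.ext_row_count_stats + other.ext_row_count_stats
        return self


# mirrors the control flow of LabelStudioProject.sync() in the diff above
sync_status = UpdateStatus()
sync_status += UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=3))  # export step
sync_status += UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=1))  # import step
assert sync_status.ext_row_count_stats.ins_rows == 3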
pixeltable/io/lancedb.py
ADDED
pixeltable/io/pandas.py
CHANGED

@@ -1,5 +1,5 @@
 import os
-from typing import Any
+from typing import Any

 import numpy as np
 import pandas as pd
@@ -16,8 +16,8 @@ def import_pandas(
     tbl_name: str,
     df: pd.DataFrame,
     *,
-    schema_overrides:
-    primary_key:
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
 ) -> pxt.Table:
@@ -55,9 +55,9 @@ def import_pandas(

 def import_csv(
     tbl_name: str,
-    filepath_or_buffer:
-    schema_overrides:
-    primary_key:
+    filepath_or_buffer: str | os.PathLike,
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs: Any,
@@ -84,10 +84,10 @@ def import_csv(

 def import_excel(
     tbl_name: str,
-    io:
+    io: str | os.PathLike,
     *,
-    schema_overrides:
-    primary_key:
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs: Any,
@@ -132,6 +132,7 @@ def df_infer_schema(
     pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
+            assert isinstance(schema_overrides[pd_name], ts.ColumnType)
             pxt_type = schema_overrides[pd_name]
         else:
             pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
@@ -140,7 +141,7 @@ def df_infer_schema(
     return pd_schema


-def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) ->
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> ts.ColumnType | None:
     """
     Determines a pixeltable ColumnType from a pandas dtype

@@ -191,7 +192,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:


 def _df_row_to_pxt_row(
-    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping:
+    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: dict[str, str] | None
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
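The new assert in df_infer_schema() makes the contract explicit: by the time inference runs, every schema_overrides entry must already be a resolved ts.ColumnType. A small sketch of that inference rule (override wins, otherwise infer from the pandas dtype) follows; the ts.StringType/ts.IntType constructors are assumed by analogy with ts.JsonType(nullable=True) elsewhere in this diff, and the dtype branch is a simplified stand-in for the real __pd_coltype_to_pxt_type().

import pandas as pd

import pixeltable.type_system as ts

df = pd.DataFrame({'id': [1, 2], 'note': ['a', None]})
# override values must be ts.ColumnType instances, not e.g. raw Python types
overrides = {'note': ts.StringType(nullable=True)}  # constructor assumed

schema: dict[str, ts.ColumnType] = {}
for name, dtype in zip(df.columns, df.dtypes):
    if name in overrides:
        # mirrors the assert added in df_infer_schema()
        assert isinstance(overrides[name], ts.ColumnType)
        schema[name] = overrides[name]
    elif pd.api.types.is_integer_dtype(dtype):
        schema[name] = ts.IntType(nullable=False)  # constructor assumed
    else:
        schema[name] = ts.StringType(nullable=True)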
pixeltable/io/parquet.py
CHANGED

@@ -1,16 +1,10 @@
 from __future__ import annotations

-import datetime
-import io
 import json
 import logging
 import typing
-from collections import deque
 from pathlib import Path
-from typing import Any
-
-import numpy as np
-import PIL.Image
+from typing import Any

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -18,31 +12,13 @@ from pixeltable.catalog import Catalog
 from pixeltable.utils.transactional_directory import transactional_directory

 if typing.TYPE_CHECKING:
-    import pyarrow as pa
-
     import pixeltable as pxt

 _logger = logging.getLogger('pixeltable')


-def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
-    import pyarrow as pa
-    from pyarrow import parquet
-
-    pydict = {}
-    for field in schema:
-        if isinstance(field.type, pa.FixedShapeTensorType):
-            stacked_arr = np.stack(value_batch[field.name])
-            pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
-        else:
-            pydict[field.name] = value_batch[field.name]
-
-    tab = pa.Table.from_pydict(pydict, schema=schema)
-    parquet.write_table(tab, str(output_path))
-
-
 def export_parquet(
-    table_or_df:
+    table_or_df: pxt.Table | pxt.DataFrame,
     parquet_path: Path,
     partition_size_bytes: int = 100_000_000,
     inline_images: bool = False,
@@ -63,7 +39,9 @@ def export_parquet(
             If False, will raise an error if the Dataframe has any image column.
             Default False.
     """
-
+    import pyarrow as pa
+
+    from pixeltable.utils.arrow import to_record_batches

     df: pxt.DataFrame
     if isinstance(table_or_df, pxt.catalog.Table):
@@ -71,9 +49,6 @@ def export_parquet(
     else:
         df = table_or_df

-    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
-    arrow_schema = to_arrow_schema(df.schema)
-
     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
         raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')

@@ -81,78 +56,23 @@
     with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        type_dict = {k: v.as_dict() for k, v in df.schema.items()}
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
-
         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
-        current_byte_estimate = 0
-
         with Catalog.get().begin_xact(for_write=False):
-            for
-
-
-
-
-                        continue
-
-                    assert val is not None
-                    if col_type.is_image_type():
-                        # images get inlined into the parquet file
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            # if there is a file, read directly to preserve information
-                            with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                                val = f.read()
-                        elif isinstance(val, PIL.Image.Image):
-                            # if no file available, eg. bc it is computed, convert to png
-                            buf = io.BytesIO()
-                            val.save(buf, format='PNG')
-                            val = buf.getvalue()
-                        else:
-                            raise excs.Error(f'unknown image type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_string_type():
-                        length = len(val)
-                    elif col_type.is_video_type() or col_type.is_audio_type():
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            val = data_row.file_paths[e.slot_idx]
-                        else:
-                            raise excs.Error(f'unknown audio/video type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_json_type():
-                        val = json.dumps(val)
-                        length = len(val)
-                    elif col_type.is_array_type():
-                        length = val.nbytes
-                    elif col_type.is_int_type() or col_type.is_float_type():
-                        length = 8
-                    elif col_type.is_bool_type():
-                        length = 1
-                    elif col_type.is_date_type():
-                        length = 4
-                    elif col_type.is_timestamp_type():
-                        val = val.astimezone(datetime.timezone.utc)
-                        length = 8
-                    else:
-                        raise excs.Error(f'unknown type {col_type} for {col_name}')
-
-                    current_value_batch[col_name].append(val)
-                    current_byte_estimate += length
-                    if current_byte_estimate > partition_size_bytes:
-                        assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-                        batch_num += 1
-                        current_value_batch = {k: deque() for k in df.schema}
-                        current_byte_estimate = 0
-
-            _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+            for record_batch in to_record_batches(df, partition_size_bytes):
+                output_path = temp_path / f'part-{batch_num:05d}.parquet'
+                arrow_tbl = pa.Table.from_batches([record_batch])
+                pa.parquet.write_table(arrow_tbl, str(output_path))
+                batch_num += 1


 def import_parquet(
     table: str,
     *,
     parquet_path: str,
-    schema_overrides:
-    primary_key:
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     **kwargs: Any,
 ) -> pxt.Table:
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.