pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (82) hide show
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/column.py +37 -11
  4. pixeltable/catalog/globals.py +18 -0
  5. pixeltable/catalog/insertable_table.py +6 -4
  6. pixeltable/catalog/table.py +19 -3
  7. pixeltable/catalog/table_version.py +34 -14
  8. pixeltable/catalog/view.py +16 -17
  9. pixeltable/dataframe.py +7 -8
  10. pixeltable/env.py +5 -0
  11. pixeltable/exec/__init__.py +0 -1
  12. pixeltable/exec/aggregation_node.py +6 -3
  13. pixeltable/exec/cache_prefetch_node.py +1 -1
  14. pixeltable/exec/data_row_batch.py +2 -19
  15. pixeltable/exec/exec_node.py +2 -1
  16. pixeltable/exec/expr_eval_node.py +17 -10
  17. pixeltable/exec/in_memory_data_node.py +6 -3
  18. pixeltable/exec/sql_node.py +24 -25
  19. pixeltable/exprs/arithmetic_expr.py +3 -1
  20. pixeltable/exprs/array_slice.py +7 -7
  21. pixeltable/exprs/column_property_ref.py +37 -10
  22. pixeltable/exprs/column_ref.py +93 -14
  23. pixeltable/exprs/comparison.py +5 -5
  24. pixeltable/exprs/compound_predicate.py +8 -7
  25. pixeltable/exprs/data_row.py +27 -18
  26. pixeltable/exprs/expr.py +53 -52
  27. pixeltable/exprs/expr_set.py +5 -0
  28. pixeltable/exprs/function_call.py +32 -16
  29. pixeltable/exprs/globals.py +4 -1
  30. pixeltable/exprs/in_predicate.py +8 -7
  31. pixeltable/exprs/inline_expr.py +4 -4
  32. pixeltable/exprs/is_null.py +4 -4
  33. pixeltable/exprs/json_mapper.py +11 -12
  34. pixeltable/exprs/json_path.py +5 -10
  35. pixeltable/exprs/literal.py +5 -5
  36. pixeltable/exprs/method_ref.py +5 -4
  37. pixeltable/exprs/object_ref.py +2 -1
  38. pixeltable/exprs/row_builder.py +88 -36
  39. pixeltable/exprs/rowid_ref.py +12 -11
  40. pixeltable/exprs/similarity_expr.py +12 -7
  41. pixeltable/exprs/sql_element_cache.py +7 -5
  42. pixeltable/exprs/type_cast.py +8 -6
  43. pixeltable/exprs/variable.py +5 -4
  44. pixeltable/func/aggregate_function.py +1 -1
  45. pixeltable/func/function.py +11 -10
  46. pixeltable/functions/__init__.py +2 -2
  47. pixeltable/functions/globals.py +5 -7
  48. pixeltable/functions/huggingface.py +19 -20
  49. pixeltable/functions/llama_cpp.py +106 -0
  50. pixeltable/functions/ollama.py +147 -0
  51. pixeltable/functions/replicate.py +72 -0
  52. pixeltable/functions/string.py +9 -0
  53. pixeltable/globals.py +12 -20
  54. pixeltable/index/btree.py +16 -3
  55. pixeltable/index/embedding_index.py +4 -4
  56. pixeltable/io/__init__.py +1 -2
  57. pixeltable/io/fiftyone.py +178 -0
  58. pixeltable/io/globals.py +96 -2
  59. pixeltable/iterators/base.py +3 -2
  60. pixeltable/iterators/document.py +1 -1
  61. pixeltable/iterators/video.py +120 -63
  62. pixeltable/metadata/__init__.py +1 -1
  63. pixeltable/metadata/converters/convert_21.py +34 -0
  64. pixeltable/metadata/converters/util.py +45 -4
  65. pixeltable/metadata/notes.py +1 -0
  66. pixeltable/metadata/schema.py +8 -0
  67. pixeltable/plan.py +16 -14
  68. pixeltable/py.typed +0 -0
  69. pixeltable/store.py +7 -2
  70. pixeltable/tool/create_test_video.py +1 -1
  71. pixeltable/tool/embed_udf.py +1 -1
  72. pixeltable/tool/mypy_plugin.py +28 -5
  73. pixeltable/type_system.py +17 -1
  74. pixeltable/utils/documents.py +15 -1
  75. pixeltable/utils/formatter.py +9 -10
  76. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
  77. pixeltable-0.2.22.dist-info/RECORD +153 -0
  78. pixeltable/exec/media_validation_node.py +0 -43
  79. pixeltable-0.2.21.dist-info/RECORD +0 -148
  80. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  81. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  82. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,72 @@
1
+ """
2
+ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
3
+ that wrap various endpoints from the Replicate API. In order to use them, you must
4
+ first `pip install replicate` and configure your Replicate credentials, as described in
5
+ the [Working with Replicate](https://pixeltable.readme.io/docs/working-with-replicate) tutorial.
6
+ """
7
+
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ import pixeltable as pxt
11
+ from pixeltable.env import Env, register_client
12
+ from pixeltable.utils.code import local_public_names
13
+
14
+ if TYPE_CHECKING:
15
+ import replicate # type: ignore[import-untyped]
16
+
17
+
18
+ @register_client('replicate')
19
+ def _(api_token: str) -> 'replicate.Client':
20
+ import replicate
21
+ return replicate.Client(api_token=api_token)
22
+
23
+
24
+ def _replicate_client() -> 'replicate.Client':
25
+ return Env.get().get_client('replicate')
26
+
27
+
28
+ @pxt.udf
29
+ def run(
30
+ input: dict[str, Any],
31
+ *,
32
+ ref: str,
33
+ ) -> dict[str, Any]:
34
+ """
35
+ Run a model on Replicate.
36
+
37
+ For additional details, see: <https://replicate.com/docs/topics/models/run-a-model>
38
+
39
+ __Requirements:__
40
+
41
+ - `pip install replicate`
42
+
43
+ Args:
44
+ input: The input parameters for the model.
45
+ ref: The name of the model to run.
46
+
47
+ Returns:
48
+ The output of the model.
49
+
50
+ Examples:
51
+ Add a computed column that applies the model `meta/meta-llama-3-8b-instruct`
52
+ to an existing Pixeltable column `tbl.prompt` of the table `tbl`:
53
+
54
+ >>> input = {'system_prompt': 'You are a helpful assistant.', 'prompt': tbl.prompt}
55
+ ... tbl['response'] = run(input, ref='meta/meta-llama-3-8b-instruct')
56
+
57
+ Add a computed column that uses the model `black-forest-labs/flux-schnell`
58
+ to generate images from an existing Pixeltable column `tbl.prompt`:
59
+
60
+ >>> input = {'prompt': tbl.prompt, 'go_fast': True, 'megapixels': '1'}
61
+ ... tbl['response'] = run(input, ref='black-forest-labs/flux-schnell')
62
+ ... tbl['image'] = tbl.response.output[0].astype(pxt.Image)
63
+ """
64
+ Env.get().require_package('replicate')
65
+ return _replicate_client().run(ref, input, use_file_output=False)
66
+
67
+
68
+ __all__ = local_public_names(__name__)
69
+
70
+
71
+ def __dir__():
72
+ return __all__
@@ -283,6 +283,15 @@ def isspace(self: str) -> bool:
283
283
  """
284
284
  return self.isspace()
285
285
 
286
+ @pxt.udf
287
+ def join(sep: str, elements: list) -> str:
288
+ """
289
+ Return a string which is the concatenation of the strings in `elements`.
290
+
291
+ Equivalent to [`str.join()`](https://docs.python.org/3/library/stdtypes.html#str.join)
292
+ """
293
+ return sep.join(elements)
294
+
286
295
  @pxt.udf(is_method=True)
287
296
  def len(self: str) -> int:
288
297
  """
pixeltable/globals.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import dataclasses
2
2
  import logging
3
- from typing import Any, Iterable, Optional, Union
3
+ from typing import Any, Iterable, Optional, Union, Literal
4
4
  from uuid import UUID
5
5
 
6
6
  import pandas as pd
@@ -33,6 +33,7 @@ def create_table(
33
33
  primary_key: Optional[Union[str, list[str]]] = None,
34
34
  num_retained_versions: int = 10,
35
35
  comment: str = '',
36
+ media_validation: Literal['on_read', 'on_write'] = 'on_write'
36
37
  ) -> catalog.Table:
37
38
  """Create a new base table.
38
39
 
@@ -44,6 +45,9 @@ def create_table(
44
45
  table.
45
46
  num_retained_versions: Number of versions of the table to retain.
46
47
  comment: An optional comment; its meaning is user-defined.
48
+ media_validation: Media validation policy for the table.
49
+ - `'on_read'`: validate media files at query time
50
+ - `'on_write'`: validate media files during insert/update operations
47
51
 
48
52
  Returns:
49
53
  A handle to the newly created [`Table`][pixeltable.Table].
@@ -89,14 +93,8 @@ def create_table(
89
93
  raise excs.Error('primary_key must be a single column name or a list of column names')
90
94
 
91
95
  tbl = catalog.InsertableTable._create(
92
- dir._id,
93
- path.name,
94
- schema,
95
- df,
96
- primary_key=primary_key,
97
- num_retained_versions=num_retained_versions,
98
- comment=comment,
99
- )
96
+ dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
97
+ comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
100
98
  Catalog.get().paths[path] = tbl
101
99
 
102
100
  _logger.info(f'Created table `{path_str}`.')
@@ -112,6 +110,7 @@ def create_view(
112
110
  iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
113
111
  num_retained_versions: int = 10,
114
112
  comment: str = '',
113
+ media_validation: Literal['on_read', 'on_write'] = 'on_write',
115
114
  ignore_errors: bool = False,
116
115
  ) -> Optional[catalog.Table]:
117
116
  """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
@@ -177,17 +176,10 @@ def create_view(
177
176
  iterator_class, iterator_args = iterator
178
177
 
179
178
  view = catalog.View._create(
180
- dir._id,
181
- path.name,
182
- base=tbl_version_path,
183
- additional_columns=additional_columns,
184
- predicate=where,
185
- is_snapshot=is_snapshot,
186
- iterator_cls=iterator_class,
187
- iterator_args=iterator_args,
188
- num_retained_versions=num_retained_versions,
189
- comment=comment,
190
- )
179
+ dir._id, path.name, base=tbl_version_path, additional_columns=additional_columns, predicate=where,
180
+ is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
181
+ num_retained_versions=num_retained_versions, comment=comment,
182
+ media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
191
183
  Catalog.get().paths[path] = view
192
184
  _logger.info(f'Created view `{path_str}`.')
193
185
  FileCache.get().emit_eviction_warnings()
pixeltable/index/btree.py CHANGED
@@ -1,13 +1,16 @@
1
- from typing import Optional
1
+ from typing import Optional, TYPE_CHECKING
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
5
+ # TODO: why does this import result in a circular import, but the one in embedding_index.py doesn't?
6
+ # import pixeltable.catalog as catalog
5
7
  import pixeltable.exceptions as excs
6
8
  from pixeltable import catalog, exprs
7
9
  from pixeltable.func.udf import udf
8
-
9
10
  from .base import IndexBase
10
11
 
12
+ if TYPE_CHECKING:
13
+ import pixeltable.exprs
11
14
 
12
15
  class BtreeIndex(IndexBase):
13
16
  """
@@ -15,6 +18,8 @@ class BtreeIndex(IndexBase):
15
18
  """
16
19
  MAX_STRING_LEN = 256
17
20
 
21
+ value_expr: 'pixeltable.exprs.Expr'
22
+
18
23
  @staticmethod
19
24
  @udf
20
25
  def str_filter(s: Optional[str]) -> Optional[str]:
@@ -25,7 +30,14 @@ class BtreeIndex(IndexBase):
25
30
  def __init__(self, c: 'catalog.Column'):
26
31
  if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
27
32
  raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
28
- self.value_expr = BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
33
+ if c.col_type.is_media_type():
34
+ # an index on a media column is an index on the file url
35
+ # no validation for media columns: we're only interested in the string value
36
+ self.value_expr = exprs.ColumnRef(c, perform_validation=False)
37
+ else:
38
+ self.value_expr = (
39
+ BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
40
+ )
29
41
 
30
42
  def index_value_expr(self) -> 'exprs.Expr':
31
43
  return self.value_expr
@@ -52,3 +64,4 @@ class BtreeIndex(IndexBase):
52
64
  @classmethod
53
65
  def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
54
66
  return cls(c)
67
+
@@ -86,8 +86,8 @@ class EmbeddingIndex(IndexBase):
86
86
  )
87
87
  idx.create(bind=conn)
88
88
 
89
- def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ClauseElement:
90
- """Create a ClauseElement that represents '<val_column> <op> <item>'"""
89
+ def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
90
+ """Create a ColumnElement that represents '<val_column> <op> <item>'"""
91
91
  assert isinstance(item, (str, PIL.Image.Image))
92
92
  if isinstance(item, str):
93
93
  assert self.string_embed is not None
@@ -104,8 +104,8 @@ class EmbeddingIndex(IndexBase):
104
104
  assert self.metric == self.Metric.L2
105
105
  return val_column.sa_col.l2_distance(embedding)
106
106
 
107
- def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ClauseElement:
108
- """Create a ClauseElement that is used in an ORDER BY clause"""
107
+ def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
108
+ """Create a ColumnElement that is used in an ORDER BY clause"""
109
109
  assert isinstance(item, (str, PIL.Image.Image))
110
110
  embedding: Optional[np.ndarray] = None
111
111
  if isinstance(item, str):
pixeltable/io/__init__.py CHANGED
@@ -1,10 +1,9 @@
1
1
  from .external_store import ExternalStore, SyncStatus
2
- from .globals import create_label_studio_project, import_rows, import_json
2
+ from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
3
3
  from .hf_datasets import import_huggingface_dataset
4
4
  from .pandas import import_csv, import_excel, import_pandas
5
5
  from .parquet import import_parquet
6
6
 
7
-
8
7
  __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
9
8
  __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
10
9
  __all__ = sorted(list(__default_dir - __removed_symbols))
@@ -0,0 +1,178 @@
1
+ import os
2
+ from typing import Iterator, Optional, Union
3
+
4
+ import fiftyone as fo # type: ignore[import-untyped]
5
+ import fiftyone.utils.data as foud # type: ignore[import-untyped]
6
+ import PIL.Image
7
+ import puremagic
8
+
9
+ import pixeltable as pxt
10
+ import pixeltable.exceptions as excs
11
+ from pixeltable import exprs
12
+ from pixeltable.env import Env
13
+
14
+
15
+ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
16
+ """
17
+ Implementation of a FiftyOne `DatasetImporter` that reads image data from a Pixeltable table.
18
+ """
19
+ __image_format: str # format to use for any exported images that are not already stored on disk
20
+ __labels: dict[str, tuple[exprs.Expr, type[fo.Label]]] # label_name -> (expr, label_cls)
21
+ __image_idx: int # index of the image expr in the select list
22
+ __localpath_idx: Optional[int] # index of the image localpath in the select list, if present
23
+ __row_iter: Iterator[list] # iterator over the table rows, to be converted to FiftyOne samples
24
+
25
+ def __init__(
26
+ self,
27
+ tbl: pxt.Table,
28
+ image: exprs.Expr,
29
+ image_format: str,
30
+ classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
31
+ detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
32
+ dataset_dir: Optional[os.PathLike] = None,
33
+ shuffle: bool = False,
34
+ seed: Union[int, float, str, bytes, bytearray, None] = None,
35
+ max_samples: Optional[int] = None,
36
+ ):
37
+ super().__init__(
38
+ dataset_dir=dataset_dir,
39
+ shuffle=shuffle,
40
+ seed=seed,
41
+ max_samples=max_samples
42
+ )
43
+
44
+ self.__image_format = image_format
45
+
46
+ label_categories = [
47
+ (classifications, fo.Classifications, 'classifications'),
48
+ (detections, fo.Detections, 'detections'),
49
+ ]
50
+
51
+ # Construct the labels. First add labels for all label types that have named dictionaries.
52
+ self.__labels = {}
53
+ for exprs_, label_cls, _ in label_categories:
54
+ if isinstance(exprs_, dict):
55
+ for label_name, expr in exprs_.items():
56
+ if not label_name.isidentifier():
57
+ raise excs.Error(f"Invalid label name: {label_name}")
58
+ if label_name in self.__labels:
59
+ raise excs.Error(f"Duplicate label name: {label_name}")
60
+ self.__labels[label_name] = (expr, label_cls)
61
+
62
+ # Now add the remaining labels, assigning unused default names.
63
+ for exprs_, label_cls, default_name in label_categories:
64
+ if exprs_ is None or isinstance(exprs_, dict):
65
+ continue
66
+ if isinstance(exprs_, exprs.Expr):
67
+ exprs_ = [exprs_]
68
+ assert isinstance(exprs_, list)
69
+ for expr in exprs_:
70
+ if default_name not in self.__labels:
71
+ name = default_name
72
+ else:
73
+ i = 1
74
+ while f'{default_name}_{i}' in self.__labels:
75
+ i += 1
76
+ name = f'{default_name}_{i}'
77
+ self.__labels[name] = (expr, label_cls)
78
+
79
+ # Build the select list:
80
+ # - Labels first, in the order they appear in self.__labels
81
+ # - Then the `image` expr
82
+ # - Then `image.localpath`, if `image` is a stored columnref
83
+
84
+ selection = [expr for expr, _ in self.__labels.values()]
85
+ self.__image_idx = len(selection)
86
+ selection.append(image)
87
+
88
+ if isinstance(image, exprs.ColumnRef) and image.col.is_stored:
89
+ # A stored image column; we can use the existing localpaths
90
+ self.__localpath_idx = len(selection)
91
+ selection.append(image.localpath)
92
+ else:
93
+ self.__localpath_idx = None
94
+
95
+ df = tbl.select(*selection)
96
+ self.__row_iter = df._output_row_iterator()
97
+
98
+ def __next__(self) -> tuple[str, Optional[fo.ImageMetadata], Optional[dict[str, fo.Label]]]:
99
+ row = next(self.__row_iter)
100
+ img = row[self.__image_idx]
101
+ assert isinstance(img, PIL.Image.Image)
102
+ if self.__localpath_idx is not None:
103
+ # Use the existing localpath of the stored image
104
+ file = row[self.__localpath_idx]
105
+ assert isinstance(file, str)
106
+ else:
107
+ # Write the dynamically created image to a temp file
108
+ file = str(Env.get().create_tmp_path(f'.{self.__image_format}'))
109
+ img.save(file, format=self.__image_format)
110
+
111
+ metadata = fo.ImageMetadata(
112
+ size_bytes=os.path.getsize(file),
113
+ mime_type=puremagic.from_file(file, mime=True),
114
+ width=img.width,
115
+ height=img.height,
116
+ filepath=file,
117
+ num_channels=len(img.getbands()),
118
+ )
119
+
120
+ labels: dict[str, fo.Label] = {}
121
+ for idx, (label_name, (_, label_cls)) in enumerate(self.__labels.items()):
122
+ label_data = row[idx]
123
+ if label_data is None:
124
+ continue
125
+
126
+ label: fo.Label
127
+ if label_cls is fo.Classifications:
128
+ label = fo.Classifications(classifications=self.__as_fo_classifications(label_data))
129
+ elif label_cls is fo.Detections:
130
+ label = fo.Detections(detections=self.__as_fo_detections(label_data))
131
+ else:
132
+ assert False
133
+ labels[label_name] = label
134
+
135
+ return file, metadata, labels
136
+
137
+ def __as_fo_classifications(self, data: list) -> list[fo.Classification]:
138
+ if not isinstance(data, list) or any('label' not in entry for entry in data):
139
+ raise excs.Error(
140
+ f'Invalid classifications data: {data}\n'
141
+ "(Expected a list of dicts, each containing a 'label' key)"
142
+ )
143
+ return [
144
+ fo.Classification(label=entry['label'], confidence=entry.get('confidence'))
145
+ for entry in data
146
+ ]
147
+
148
+ def __as_fo_detections(self, data: list) -> list[fo.Detections]:
149
+ if not isinstance(data, list) or any('label' not in entry or 'bounding_box' not in entry for entry in data):
150
+ raise excs.Error(
151
+ f'Invalid detections data: {data}\n'
152
+ "(Expected a list of dicts, each containing a 'label' and 'bounding_box' key)"
153
+ )
154
+ return [
155
+ fo.Detection(label=entry['label'], bounding_box=entry['bounding_box'], confidence=entry.get('confidence'))
156
+ for entry in data
157
+ ]
158
+
159
+ @property
160
+ def has_dataset_info(self) -> bool:
161
+ return False
162
+
163
+ @property
164
+ def has_image_metadata(self) -> bool:
165
+ return True
166
+
167
+ @property
168
+ def label_cls(self) -> dict[str, type]:
169
+ return {label_name: label_cls for label_name, (_, label_cls) in self.__labels.items()}
170
+
171
+ def setup(self) -> None:
172
+ pass
173
+
174
+ def get_dataset_info(self) -> dict:
175
+ pass
176
+
177
+ def close(self, *args) -> None:
178
+ pass
pixeltable/io/globals.py CHANGED
@@ -1,10 +1,14 @@
1
- from typing import Any, Literal, Optional, Union
1
+ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
2
2
 
3
3
  import pixeltable as pxt
4
4
  import pixeltable.exceptions as excs
5
- from pixeltable import Table
5
+ from pixeltable import Table, exprs
6
+ from pixeltable.env import Env
6
7
  from pixeltable.io.external_store import SyncStatus
7
8
 
9
+ if TYPE_CHECKING:
10
+ import fiftyone as fo # type: ignore[import-untyped]
11
+
8
12
 
9
13
  def create_label_studio_project(
10
14
  t: Table,
@@ -116,6 +120,8 @@ def create_label_studio_project(
116
120
  s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
117
121
  )
118
122
  """
123
+ Env.get().require_package('label_studio_sdk')
124
+
119
125
  from pixeltable.io.label_studio import LabelStudioProject
120
126
 
121
127
  ls_project = LabelStudioProject.create(
@@ -267,3 +273,91 @@ def import_json(
267
273
  contents = urllib.request.urlopen(filepath_or_url).read()
268
274
  data = json.loads(contents, **kwargs)
269
275
  return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
276
+
277
+
278
+ def export_images_as_fo_dataset(
279
+ tbl: pxt.Table,
280
+ images: exprs.Expr,
281
+ image_format: str = 'webp',
282
+ classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
283
+ detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
284
+ ) -> 'fo.Dataset':
285
+ """
286
+ Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
287
+ (or expression) containing image data, along with optional additional columns containing labels. Currently, only
288
+ classification and detection labels are supported.
289
+
290
+ The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
291
+ fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.
292
+
293
+ Images in the dataset that already exist on disk will be exported directly, in whatever format they
294
+ are stored in. Images that are not already on disk (such as frames extracted using a
295
+ [`FrameIterator`][pixeltable.iterators.FrameIterator]) will first be written to disk in the specified
296
+ `image_format`.
297
+
298
+ The label parameters accept one or more sets of labels of each type. If a single `Expr` is provided, then it will
299
+ be exported as a single set of labels with a default name such as `classifications`.
300
+ (The single set of labels may still contain multiple individual labels; see below.)
301
+ If a list of `Expr`s is provided, then each one will be exported as a separate set of labels with a default name
302
+ such as `classifications`, `classifications_1`, etc. If a dictionary of `Expr`s is provided, then each entry will
303
+ be exported as a set of labels with the specified name.
304
+
305
+ __Requirements:__
306
+
307
+ - `pip install fiftyone`
308
+
309
+ Args:
310
+ tbl: The table from which to export data.
311
+ images: A column or expression that contains the images to export.
312
+ image_format: The format to use when writing out images for export.
313
+ classifications: Optional image classification labels. If a single `Expr` is provided, it must be a table
314
+ column or an expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds
315
+ to an image class and must have the following structure:
316
+
317
+ ```python
318
+ {'label': 'zebra', 'confidence': 0.325}
319
+ ```
320
+
321
+ If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.
322
+ detections: Optional image detection labels. If a single `Expr` is provided, it must be a table column or an
323
+ expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds to an image
324
+ detection, and must have the following structure:
325
+
326
+ ```python
327
+ {
328
+ 'label': 'giraffe',
329
+ 'confidence': 0.99,
330
+ 'bounding_box': [0.081, 0.836, 0.202, 0.136] # [x, y, w, h], fractional coordinates
331
+ }
332
+ ```
333
+
334
+ If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.
335
+
336
+ Returns:
337
+ A Voxel51 dataset.
338
+
339
+ Example:
340
+ Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
341
+ labels from `tbl.classifications`:
342
+
343
+ >>> export_images_as_fo_dataset(
344
+ ... tbl,
345
+ ... tbl.image,
346
+ ... classifications=tbl.classifications
347
+ ... )
348
+
349
+ See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
350
+ for a fully worked example.
351
+ """
352
+ Env.get().require_package('fiftyone')
353
+
354
+ import fiftyone as fo
355
+
356
+ from pixeltable.io.fiftyone import PxtImageDatasetImporter
357
+
358
+ if not images.col_type.is_image_type():
359
+ raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')
360
+
361
+ return fo.Dataset.from_importer(PxtImageDatasetImporter(
362
+ tbl, images, image_format, classifications=classifications, detections=detections
363
+ ))
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
- from typing import Dict, Any, Tuple, List
3
- from abc import abstractmethod, ABC
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
4
5
 
5
6
  from pixeltable.type_system import ColumnType
6
7
 
@@ -152,7 +152,7 @@ class DocumentSplitter(ComponentIterator):
152
152
  assert self._doc_handle.pdf_doc is not None
153
153
  self._sections = self._pdf_sections()
154
154
  else:
155
- assert False, f'unknown document format: {self._doc_handle.format}'
155
+ assert False, f'Unsupported document format: {self._doc_handle.format}'
156
156
 
157
157
  if Separator.SENTENCE in self._separators:
158
158
  self._sections = self._sentence_sections(self._sections)