pixeltable 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (48) hide show
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/insertable_table.py +2 -2
  4. pixeltable/catalog/schema_object.py +28 -2
  5. pixeltable/catalog/table.py +68 -30
  6. pixeltable/catalog/table_version.py +14 -43
  7. pixeltable/catalog/view.py +2 -2
  8. pixeltable/dataframe.py +8 -7
  9. pixeltable/exec/expr_eval_node.py +8 -1
  10. pixeltable/exec/sql_scan_node.py +1 -1
  11. pixeltable/exprs/__init__.py +0 -1
  12. pixeltable/exprs/column_ref.py +2 -7
  13. pixeltable/exprs/comparison.py +5 -5
  14. pixeltable/exprs/compound_predicate.py +12 -12
  15. pixeltable/exprs/expr.py +32 -0
  16. pixeltable/exprs/in_predicate.py +3 -3
  17. pixeltable/exprs/is_null.py +5 -5
  18. pixeltable/exprs/similarity_expr.py +27 -16
  19. pixeltable/func/aggregate_function.py +10 -4
  20. pixeltable/func/callable_function.py +4 -0
  21. pixeltable/func/function_registry.py +2 -0
  22. pixeltable/functions/globals.py +36 -1
  23. pixeltable/functions/huggingface.py +62 -4
  24. pixeltable/functions/image.py +17 -0
  25. pixeltable/functions/openai.py +1 -1
  26. pixeltable/functions/string.py +622 -7
  27. pixeltable/functions/video.py +26 -8
  28. pixeltable/globals.py +54 -50
  29. pixeltable/index/embedding_index.py +28 -27
  30. pixeltable/io/external_store.py +2 -2
  31. pixeltable/io/globals.py +54 -5
  32. pixeltable/io/label_studio.py +45 -5
  33. pixeltable/io/pandas.py +18 -7
  34. pixeltable/metadata/__init__.py +1 -1
  35. pixeltable/metadata/converters/convert_17.py +26 -0
  36. pixeltable/plan.py +6 -6
  37. pixeltable/tool/create_test_db_dump.py +2 -2
  38. pixeltable/tool/doc_plugins/griffe.py +77 -0
  39. pixeltable/tool/doc_plugins/mkdocstrings.py +6 -0
  40. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +135 -0
  41. pixeltable/utils/s3.py +1 -1
  42. pixeltable-0.2.13.dist-info/METADATA +206 -0
  43. {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/RECORD +46 -42
  44. pixeltable-0.2.13.dist-info/entry_points.txt +3 -0
  45. pixeltable/exprs/predicate.py +0 -44
  46. pixeltable-0.2.11.dist-info/METADATA +0 -137
  47. {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/LICENSE +0 -0
  48. {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/WHEEL +0 -0
pixeltable/globals.py CHANGED
@@ -7,10 +7,10 @@ import sqlalchemy as sql
7
7
  from sqlalchemy.util.preloaded import orm
8
8
 
9
9
  import pixeltable.exceptions as excs
10
+ import pixeltable.exprs as exprs
10
11
  from pixeltable import catalog, func, DataFrame
11
12
  from pixeltable.catalog import Catalog
12
13
  from pixeltable.env import Env
13
- from pixeltable.exprs import Predicate
14
14
  from pixeltable.iterators import ComponentIterator
15
15
  from pixeltable.metadata import schema
16
16
 
@@ -81,7 +81,7 @@ def create_view(
81
81
  base: Union[catalog.Table, DataFrame],
82
82
  *,
83
83
  schema: Optional[dict[str, Any]] = None,
84
- filter: Optional[Predicate] = None,
84
+ filter: Optional[exprs.Expr] = None,
85
85
  is_snapshot: bool = False,
86
86
  iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
87
87
  num_retained_versions: int = 10,
@@ -94,7 +94,7 @@ def create_view(
94
94
  path_str: Path to the view.
95
95
  base: Table (i.e., table or view or snapshot) or DataFrame to base the view on.
96
96
  schema: dictionary mapping column names to column types, value expressions, or to column specifications.
97
- filter: Predicate to filter rows of the base table.
97
+ filter: predicate to filter rows of the base table.
98
98
  is_snapshot: Whether the view is a snapshot.
99
99
  iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
100
100
  the base table.
@@ -234,7 +234,7 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
234
234
 
235
235
  Args:
236
236
  path: Path to the table.
237
- force: Whether to drop the table even if it has unsaved changes.
237
+ force: If `True`, will also drop all views or sub-views of this table.
238
238
  ignore_errors: Whether to ignore errors if the table does not exist.
239
239
 
240
240
  Raises:
@@ -243,21 +243,27 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
243
243
  Examples:
244
244
  >>> cl.drop_table('my_table')
245
245
  """
246
+ cat = Catalog.get()
246
247
  path_obj = catalog.Path(path)
247
248
  try:
248
- Catalog.get().paths.check_is_valid(path_obj, expected=catalog.Table)
249
+ cat.paths.check_is_valid(path_obj, expected=catalog.Table)
249
250
  except Exception as e:
250
- if ignore_errors:
251
+ if ignore_errors or force:
251
252
  _logger.info(f'Skipped table `{path}` (does not exist).')
252
253
  return
253
254
  else:
254
255
  raise e
255
- tbl = Catalog.get().paths[path_obj]
256
- if len(Catalog.get().tbl_dependents[tbl._id]) > 0:
257
- dependent_paths = [get_path(dep) for dep in Catalog.get().tbl_dependents[tbl._id]]
258
- raise excs.Error(f'Table {path} has dependents: {", ".join(dependent_paths)}')
256
+ tbl = cat.paths[path_obj]
257
+ assert isinstance(tbl, catalog.Table)
258
+ if len(cat.tbl_dependents[tbl._id]) > 0:
259
+ dependent_paths = [dep.path for dep in cat.tbl_dependents[tbl._id]]
260
+ if force:
261
+ for dependent_path in dependent_paths:
262
+ drop_table(dependent_path, force=True)
263
+ else:
264
+ raise excs.Error(f'Table {path} has dependents: {", ".join(dependent_paths)}')
259
265
  tbl._drop()
260
- del Catalog.get().paths[path_obj]
266
+ del cat.paths[path_obj]
261
267
  _logger.info(f'Dropped table `{path}`.')
262
268
 
263
269
 
@@ -291,7 +297,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
291
297
  return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]
292
298
 
293
299
 
294
- def create_dir(path_str: str, ignore_errors: bool = False) -> None:
300
+ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
295
301
  """Create a directory.
296
302
 
297
303
  Args:
@@ -319,10 +325,12 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
319
325
  session.add(dir_record)
320
326
  session.flush()
321
327
  assert dir_record.id is not None
322
- Catalog.get().paths[path] = catalog.Dir(dir_record.id, parent._id, path.name)
328
+ dir = catalog.Dir(dir_record.id, parent._id, path.name)
329
+ Catalog.get().paths[path] = dir
323
330
  session.commit()
324
331
  _logger.info(f'Created directory `{path_str}`.')
325
332
  print(f'Created directory `{path_str}`.')
333
+ return dir
326
334
  except excs.Error as e:
327
335
  if ignore_errors:
328
336
  return
@@ -330,7 +338,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
330
338
  raise e
331
339
 
332
340
 
333
- def rm_dir(path_str: str) -> None:
341
+ def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
334
342
  """Remove a directory.
335
343
 
336
344
  Args:
@@ -340,31 +348,49 @@ def rm_dir(path_str: str) -> None:
340
348
  Error: If the path does not exist or does not designate a directory or if the directory is not empty.
341
349
 
342
350
  Examples:
343
- >>> cl.rm_dir('my_dir')
351
+ >>> cl.drop_dir('my_dir')
344
352
 
345
353
  Remove a subdirectory:
346
354
 
347
- >>> cl.rm_dir('my_dir.sub_dir')
355
+ >>> cl.drop_dir('my_dir.sub_dir')
348
356
  """
357
+ cat = Catalog.get()
349
358
  path = catalog.Path(path_str)
350
- Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
351
359
 
352
- # make sure it's empty
353
- if len(Catalog.get().paths.get_children(path, child_type=None, recursive=True)) > 0:
354
- raise excs.Error(f'Directory {path_str} is not empty')
355
- # TODO: figure out how to make force=True work in the presence of snapshots
356
- # # delete tables
357
- # for tbl_path in self.paths.get_children(path, child_type=MutableTable, recursive=True):
358
- # self.drop_table(str(tbl_path), force=True)
359
- # # rm subdirs
360
- # for dir_path in self.paths.get_children(path, child_type=Dir, recursive=False):
361
- # self.rm_dir(str(dir_path), force=True)
360
+ try:
361
+ cat.paths.check_is_valid(path, expected=catalog.Dir)
362
+ except Exception as e:
363
+ if ignore_errors or force:
364
+ _logger.info(f'Skipped directory `{path}` (does not exist).')
365
+ return
366
+ else:
367
+ raise e
368
+
369
+ children = cat.paths.get_children(path, child_type=None, recursive=True)
370
+
371
+ if len(children) > 0 and not force:
372
+ raise excs.Error(f'Directory `{path_str}` is not empty.')
373
+
374
+ for child in children:
375
+ assert isinstance(child, catalog.Path)
376
+ # We need to check that the child is still in `cat.paths`, since it is possible it was
377
+ # already deleted as a dependent of a preceding child in the iteration.
378
+ try:
379
+ obj = cat.paths[child]
380
+ except excs.Error:
381
+ continue
382
+ if isinstance(obj, catalog.Dir):
383
+ drop_dir(str(child), force=True)
384
+ else:
385
+ assert isinstance(obj, catalog.Table)
386
+ assert not obj._is_dropped # else it should have been removed from `cat.paths` already
387
+ drop_table(str(child), force=True)
362
388
 
363
389
  with Env.get().engine.begin() as conn:
364
390
  dir = Catalog.get().paths[path]
365
391
  conn.execute(sql.delete(schema.Dir.__table__).where(schema.Dir.id == dir._id))
366
392
  del Catalog.get().paths[path]
367
- _logger.info(f'Removed directory {path_str}')
393
+ _logger.info(f'Removed directory `{path_str}`.')
368
394
 
369
395
 
370
396
  def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
@@ -416,28 +442,6 @@ def list_functions() -> pd.DataFrame:
416
442
  return pd_df.hide(axis='index')
417
443
 
418
444
 
419
- def get_path(schema_obj: catalog.SchemaObject) -> str:
420
- """Returns the path to a SchemaObject.
421
-
422
- Args:
423
- schema_obj: SchemaObject to get the path for.
424
-
425
- Returns:
426
- Path to the SchemaObject.
427
- """
428
- path_elements: list[str] = []
429
- dir_id = schema_obj._dir_id
430
- while dir_id is not None:
431
- dir = Catalog.get().paths.get_schema_obj(dir_id)
432
- if dir._dir_id is None:
433
- # this is the root dir with name '', which we don't want to include in the path
434
- break
435
- path_elements.insert(0, dir._name)
436
- dir_id = dir._dir_id
437
- path_elements.append(schema_obj._name)
438
- return '.'.join(path_elements)
439
-
440
-
441
445
  def configure_logging(
442
446
  *,
443
447
  to_stdout: Optional[bool] = None,
@@ -24,6 +24,7 @@ class EmbeddingIndex(IndexBase):
24
24
  - similarity_clause() converts those metrics back to their original form; it is used in expressions outside
25
25
  the Order By clause
26
26
  - order_by_clause() is used exclusively in the ORDER BY clause
27
+ - embedding function parameters are named '<type-name>_embed', where type-name is ColumnType.Type.name
27
28
  """
28
29
 
29
30
  class Metric(enum.Enum):
@@ -38,30 +39,30 @@ class EmbeddingIndex(IndexBase):
38
39
  }
39
40
 
40
41
  def __init__(
41
- self, c: catalog.Column, metric: str, text_embed: Optional[func.Function] = None,
42
- img_embed: Optional[func.Function] = None):
42
+ self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
43
+ image_embed: Optional[func.Function] = None):
43
44
  metric_names = [m.name.lower() for m in self.Metric]
44
45
  if metric.lower() not in metric_names:
45
46
  raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
46
47
  if not c.col_type.is_string_type() and not c.col_type.is_image_type():
47
48
  raise excs.Error(f'Embedding index requires string or image column')
48
- if c.col_type.is_string_type() and text_embed is None:
49
- raise excs.Error(f'Text embedding function is required for column {c.name} (parameter `txt_embed`)')
50
- if c.col_type.is_image_type() and img_embed is None:
51
- raise excs.Error(f'Image embedding function is required for column {c.name} (parameter `img_embed`)')
52
- if text_embed is not None:
49
+ if c.col_type.is_string_type() and string_embed is None:
50
+ raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
51
+ if c.col_type.is_image_type() and image_embed is None:
52
+ raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
53
+ if string_embed is not None:
53
54
  # verify signature
54
- self._validate_embedding_fn(text_embed, 'txt_embed', ts.ColumnType.Type.STRING)
55
- if img_embed is not None:
55
+ self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
56
+ if image_embed is not None:
56
57
  # verify signature
57
- self._validate_embedding_fn(img_embed, 'img_embed', ts.ColumnType.Type.IMAGE)
58
+ self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
58
59
 
59
60
  self.metric = self.Metric[metric.upper()]
60
61
  from pixeltable.exprs import ColumnRef
61
- self.value_expr = text_embed(ColumnRef(c)) if c.col_type.is_string_type() else img_embed(ColumnRef(c))
62
+ self.value_expr = string_embed(ColumnRef(c)) if c.col_type.is_string_type() else image_embed(ColumnRef(c))
62
63
  assert self.value_expr.col_type.is_array_type()
63
- self.txt_embed = text_embed
64
- self.img_embed = img_embed
64
+ self.string_embed = string_embed
65
+ self.image_embed = image_embed
65
66
  vector_size = self.value_expr.col_type.shape[0]
66
67
  assert vector_size is not None
67
68
  self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
@@ -88,14 +89,14 @@ class EmbeddingIndex(IndexBase):
88
89
  idx.create(bind=conn)
89
90
 
90
91
  def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ClauseElement:
91
- """Create a ClauseElement to that represents '<val_column> <op> <item>'"""
92
+ """Create a ClauseElement that represents '<val_column> <op> <item>'"""
92
93
  assert isinstance(item, (str, PIL.Image.Image))
93
94
  if isinstance(item, str):
94
- assert self.txt_embed is not None
95
- embedding = self.txt_embed.exec(item)
95
+ assert self.string_embed is not None
96
+ embedding = self.string_embed.exec(item)
96
97
  if isinstance(item, PIL.Image.Image):
97
- assert self.img_embed is not None
98
- embedding = self.img_embed.exec(item)
98
+ assert self.image_embed is not None
99
+ embedding = self.image_embed.exec(item)
99
100
 
100
101
  if self.metric == self.Metric.COSINE:
101
102
  return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -110,11 +111,11 @@ class EmbeddingIndex(IndexBase):
110
111
  assert isinstance(item, (str, PIL.Image.Image))
111
112
  embedding: Optional[np.ndarray] = None
112
113
  if isinstance(item, str):
113
- assert self.txt_embed is not None
114
- embedding = self.txt_embed.exec(item)
114
+ assert self.string_embed is not None
115
+ embedding = self.string_embed.exec(item)
115
116
  if isinstance(item, PIL.Image.Image):
116
- assert self.img_embed is not None
117
- embedding = self.img_embed.exec(item)
117
+ assert self.image_embed is not None
118
+ embedding = self.image_embed.exec(item)
118
119
  assert embedding is not None
119
120
 
120
121
  if self.metric == self.Metric.COSINE:
@@ -160,12 +161,12 @@ class EmbeddingIndex(IndexBase):
160
161
  def as_dict(self) -> dict:
161
162
  return {
162
163
  'metric': self.metric.name.lower(),
163
- 'txt_embed': None if self.txt_embed is None else self.txt_embed.as_dict(),
164
- 'img_embed': None if self.img_embed is None else self.img_embed.as_dict()
164
+ 'string_embed': None if self.string_embed is None else self.string_embed.as_dict(),
165
+ 'image_embed': None if self.image_embed is None else self.image_embed.as_dict()
165
166
  }
166
167
 
167
168
  @classmethod
168
169
  def from_dict(cls, c: catalog.Column, d: dict) -> EmbeddingIndex:
169
- txt_embed = func.Function.from_dict(d['txt_embed']) if d['txt_embed'] is not None else None
170
- img_embed = func.Function.from_dict(d['img_embed']) if d['img_embed'] is not None else None
171
- return cls(c, metric=d['metric'], text_embed=txt_embed, img_embed=img_embed)
170
+ string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
171
+ image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
172
+ return cls(c, metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
@@ -222,12 +222,12 @@ class Project(ExternalStore, abc.ABC):
222
222
  if t_col not in t_cols:
223
223
  if is_user_specified_col_mapping:
224
224
  raise excs.Error(
225
- f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.get_name()}` '
225
+ f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.name}` '
226
226
  'contains no such column.'
227
227
  )
228
228
  else:
229
229
  raise excs.Error(
230
- f'Column `{t_col}` does not exist in Table `{table.get_name()}`. Either add a column `{t_col}`, '
230
+ f'Column `{t_col}` does not exist in Table `{table.name}`. Either add a column `{t_col}`, '
231
231
  f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
232
232
  )
233
233
  if ext_col not in export_cols and ext_col not in import_cols:
pixeltable/io/globals.py CHANGED
@@ -13,11 +13,14 @@ def create_label_studio_project(
13
13
  media_import_method: Literal['post', 'file', 'url'] = 'post',
14
14
  col_mapping: Optional[dict[str, str]] = None,
15
15
  sync_immediately: bool = True,
16
+ s3_configuration: Optional[dict[str, Any]] = None,
16
17
  **kwargs: Any
17
18
  ) -> SyncStatus:
18
- # TODO(aaron-siegel): Add link in docstring to a Label Studio howto
19
19
  """
20
- Creates a new Label Studio project and links it to the specified `Table`.
20
+ Create a new Label Studio project and link it to the specified `Table`.
21
+
22
+ - A tutorial notebook with fully worked examples can be found here:
23
+ [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
21
24
 
22
25
  The required parameter `label_config` specifies the Label Studio project configuration,
23
26
  in XML format, as described in the Label Studio documentation. The linked project will
@@ -41,6 +44,11 @@ def create_label_studio_project(
41
44
  * Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
42
45
  * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
43
46
 
47
+ __Requirements:__
48
+
49
+ - `pip install label-studio-sdk`
50
+ - `pip install boto3` (if using S3 import storage)
51
+
44
52
  Args:
45
53
  t: The Table to link to.
46
54
  label_config: The Label Studio project configuration, in XML format.
@@ -50,8 +58,9 @@ def create_label_studio_project(
50
58
  `ls_project_0`, `ls_project_1`, etc.
51
59
  title: An optional title for the Label Studio project. This is the title that annotators
52
60
  will see inside Label Studio. Unlike `name`, it does not need to be an identifier and
53
- does not need to be unique. If not specified, the table name `t.get_name()` will be used.
61
+ does not need to be unique. If not specified, the table name `t.name` will be used.
54
62
  media_import_method: The method to use when transferring media files to Label Studio:
63
+
55
64
  - `post`: Media will be sent to Label Studio via HTTP post. This should generally only be used for
56
65
  prototyping; due to restrictions in Label Studio, it can only be used with projects that have
57
66
  just one data field, and does not scale well.
@@ -63,9 +72,48 @@ def create_label_studio_project(
63
72
  col_mapping: An optional mapping of local column names to Label Studio fields.
64
73
  sync_immediately: If `True`, immediately perform an initial synchronization by
65
74
  exporting all rows of the `Table` as Label Studio tasks.
75
+ s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
76
+ be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
77
+ referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
78
+ in the Label Studio interface.
79
+
80
+ The items in the `s3_configuration` dictionary correspond to kwarg
81
+ parameters of the Label Studio `connect_s3_import_storage` method, as described in the
82
+ [Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
83
+ `bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
84
+ Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`). If a title is not
85
+ specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`. All other parameters use their Label
86
+ Studio defaults.
66
87
  kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
67
- Studio SDK, as described here:
68
- https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project
88
+ Studio SDK, as described in the
89
+ [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
90
+
91
+ Returns:
92
+ A `SyncStatus` representing the status of any synchronization operations that occurred.
93
+
94
+ Examples:
95
+ Create a Label Studio project whose tasks correspond to videos stored in the `video_col` column of the table `tbl`:
96
+
97
+ >>> config = \"\"\"
98
+ <View>
99
+ <Video name="video_obj" value="$video_col"/>
100
+ <Choices name="video-category" toName="video" showInLine="true">
101
+ <Choice value="city"/>
102
+ <Choice value="food"/>
103
+ <Choice value="sports"/>
104
+ </Choices>
105
+ </View>\"\"\"
106
+ create_label_studio_project(tbl, config)
107
+
108
+ Create a Label Studio project with the same configuration, using `media_import_method='url'`,
109
+ whose media are stored in an S3 bucket:
110
+
111
+ >>> create_label_studio_project(
112
+ tbl,
113
+ config,
114
+ media_import_method='url',
115
+ s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
116
+ )
69
117
  """
70
118
  from pixeltable.io.label_studio import LabelStudioProject
71
119
 
@@ -76,6 +124,7 @@ def create_label_studio_project(
76
124
  title,
77
125
  media_import_method,
78
126
  col_mapping,
127
+ s3_configuration,
79
128
  **kwargs
80
129
  )
81
130
 
@@ -1,3 +1,4 @@
1
+ import copy
1
2
  import json
2
3
  import logging
3
4
  import os
@@ -18,6 +19,15 @@ from pixeltable.exprs import ColumnRef, DataRow, Expr
18
19
  from pixeltable.io.external_store import Project, SyncStatus
19
20
  from pixeltable.utils import coco
20
21
 
22
+ # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
23
+ # the import two different ways to ensure compatibility
24
+ try:
25
+ # label_studio_sdk<1 compatibility
26
+ import label_studio_sdk.project as ls_project # type: ignore
27
+ except ImportError:
28
+ # label_studio_sdk>=1 compatibility
29
+ import label_studio_sdk._legacy.project as ls_project # type: ignore
30
+
21
31
  _logger = logging.getLogger('pixeltable')
22
32
 
23
33
 
@@ -50,11 +60,11 @@ class LabelStudioProject(Project):
50
60
  """
51
61
  self.project_id = project_id
52
62
  self.media_import_method = media_import_method
53
- self._project: Optional[label_studio_sdk.project.Project] = None
63
+ self._project: Optional[ls_project.Project] = None
54
64
  super().__init__(name, col_mapping, stored_proxies)
55
65
 
56
66
  @property
57
- def project(self) -> label_studio_sdk.project.Project:
67
+ def project(self) -> ls_project.Project:
58
68
  """The `Project` object corresponding to this Label Studio project."""
59
69
  if self._project is None:
60
70
  try:
@@ -95,7 +105,7 @@ class LabelStudioProject(Project):
95
105
  return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
96
106
 
97
107
  def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
98
- _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.get_name()}`'
108
+ _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
99
109
  f' (export: {export_data}, import: {import_data}).')
100
110
  # Collect all existing tasks into a dict with entries `rowid: task`
101
111
  tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -386,7 +396,7 @@ class LabelStudioProject(Project):
386
396
  updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
387
397
  if len(updates) > 0:
388
398
  _logger.info(
389
- f'Updating table `{t.get_name()}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
399
+ f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
390
400
  )
391
401
  # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
392
402
  # batch_update on the actual ancestor table that holds the annotations column.
@@ -536,6 +546,7 @@ class LabelStudioProject(Project):
536
546
  title: Optional[str],
537
547
  media_import_method: Literal['post', 'file', 'url'],
538
548
  col_mapping: Optional[dict[str, str]],
549
+ s3_configuration: Optional[dict[str, Any]],
539
550
  **kwargs: Any
540
551
  ) -> 'LabelStudioProject':
541
552
  """
@@ -554,7 +565,7 @@ class LabelStudioProject(Project):
554
565
 
555
566
  if title is None:
556
567
  # `title` defaults to table name
557
- title = t.get_name()
568
+ title = t.name
558
569
 
559
570
  # Create a column to hold the annotations, if one does not yet exist
560
571
  if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -572,6 +583,31 @@ class LabelStudioProject(Project):
572
583
  if media_import_method == 'post' and len(config.data_keys) > 1:
573
584
  raise excs.Error('`media_import_method` cannot be `post` if there is more than one data key')
574
585
 
586
+ if s3_configuration is not None:
587
+ if media_import_method != 'url':
588
+ raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
589
+ s3_configuration = copy.copy(s3_configuration)
590
+ if not 'bucket' in s3_configuration:
591
+ raise excs.Error('`s3_configuration` must contain a `bucket` field')
592
+ if not 'title' in s3_configuration:
593
+ s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
594
+ if ('aws_access_key_id' not in s3_configuration and
595
+ 'aws_secret_access_key' not in s3_configuration and
596
+ 'aws_session_token' not in s3_configuration):
597
+ # Attempt to fill any missing credentials from the environment
598
+ try:
599
+ import boto3
600
+ s3_credentials = boto3.Session().get_credentials().get_frozen_credentials()
601
+ _logger.info(f'Using AWS credentials from the environment for Label Studio project: {title}')
602
+ s3_configuration['aws_access_key_id'] = s3_credentials.access_key
603
+ s3_configuration['aws_secret_access_key'] = s3_credentials.secret_key
604
+ s3_configuration['aws_session_token'] = s3_credentials.token
605
+ except Exception as exc:
606
+ # This is not necessarily a problem, but we should log that it happened
607
+ _logger.debug(f'Unable to retrieve AWS credentials from the environment: {exc}')
608
+ pass
609
+
610
+ _logger.info(f'Creating Label Studio project: {title}')
575
611
  project = _label_studio_client().start_project(title=title, label_config=label_config, **kwargs)
576
612
 
577
613
  if media_import_method == 'file':
@@ -591,6 +627,10 @@ class LabelStudioProject(Project):
591
627
  ) from exc
592
628
  raise # Handle any other exception type normally
593
629
 
630
+ if s3_configuration is not None:
631
+ _logger.info(f'Setting up S3 import storage for Label Studio project: {title}')
632
+ project.connect_s3_import_storage(**s3_configuration)
633
+
594
634
  project_id = project.get_params()['id']
595
635
  return LabelStudioProject(name, project_id, media_import_method, resolved_col_mapping)
596
636
 
pixeltable/io/pandas.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Optional, Any, Iterable
1
+ from typing import Optional, Any, Iterable, Union
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
@@ -9,7 +9,10 @@ import pixeltable.type_system as ts
9
9
 
10
10
 
11
11
  def import_pandas(
12
- tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
12
+ tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
13
+ primary_key: Optional[Union[str, list[str]]] = None,
14
+ num_retained_versions: int = 10,
15
+ comment: str = ''
13
16
  ) -> pxt.catalog.InsertableTable:
14
17
  """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
15
18
  will be inferred from the `DataFrame`, unless `schema` is specified.
@@ -31,13 +34,17 @@ def import_pandas(
31
34
  """
32
35
  schema = _df_to_pxt_schema(df, schema_overrides)
33
36
  tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
34
- table = pxt.create_table(tbl_name, schema)
37
+ table = pxt.create_table(tbl_name, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
35
38
  table.insert(tbl_rows)
36
39
  return table
37
40
 
38
41
 
39
42
  def import_csv(
40
- table_path: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
43
+ tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
44
+ primary_key: Optional[Union[str, list[str]]] = None,
45
+ num_retained_versions: int = 10,
46
+ comment: str = '',
47
+ **kwargs
41
48
  ) -> pxt.catalog.InsertableTable:
42
49
  """
43
50
  Creates a new `Table` from a csv file. This is a convenience method and is equivalent
@@ -45,11 +52,15 @@ def import_csv(
45
52
  See the Pandas documentation for `read_csv` for more details.
46
53
  """
47
54
  df = pd.read_csv(filepath_or_buffer, **kwargs)
48
- return import_pandas(table_path, df, schema_overrides=schema_overrides)
55
+ return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
49
56
 
50
57
 
51
58
  def import_excel(
52
- table_path: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
59
+ tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
60
+ primary_key: Optional[Union[str, list[str]]] = None,
61
+ num_retained_versions: int = 10,
62
+ comment: str = '',
63
+ **kwargs
53
64
  ) -> pxt.catalog.InsertableTable:
54
65
  """
55
66
  Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
@@ -57,7 +68,7 @@ def import_excel(
57
68
  See the Pandas documentation for `read_excel` for more details.
58
69
  """
59
70
  df = pd.read_excel(io, *args, **kwargs)
60
- return import_pandas(table_path, df, schema_overrides=schema_overrides)
71
+ return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
61
72
 
62
73
 
63
74
  def _df_to_pxt_schema(
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 17
13
+ VERSION = 18
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -0,0 +1,26 @@
1
+ import sqlalchemy as sql
2
+
3
+ from pixeltable.metadata import register_converter
4
+ from pixeltable.metadata.converters.util import convert_table_md
5
+
6
+
7
+ @register_converter(version=17)
8
+ def _(engine: sql.engine.Engine) -> None:
9
+ convert_table_md(
10
+ engine,
11
+ table_md_updater=__update_table_md
12
+ )
13
+
14
+
15
+ def __update_table_md(table_md: dict) -> None:
16
+ # key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
17
+ if len(table_md['index_md']) == 0:
18
+ return
19
+ for idx_md in table_md['index_md'].values():
20
+ if not idx_md['class_fqn'].endswith('.EmbeddingIndex'):
21
+ continue
22
+ init_dict = idx_md['init_args']
23
+ init_dict['image_embed'] = init_dict['img_embed']
24
+ del init_dict['img_embed']
25
+ init_dict['string_embed'] = init_dict['txt_embed']
26
+ del init_dict['txt_embed']