pixeltable 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (48) hide show
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/insertable_table.py +2 -2
  4. pixeltable/catalog/schema_object.py +28 -2
  5. pixeltable/catalog/table.py +68 -30
  6. pixeltable/catalog/table_version.py +14 -43
  7. pixeltable/catalog/view.py +2 -2
  8. pixeltable/dataframe.py +8 -7
  9. pixeltable/exec/expr_eval_node.py +8 -1
  10. pixeltable/exec/sql_scan_node.py +1 -1
  11. pixeltable/exprs/__init__.py +0 -1
  12. pixeltable/exprs/column_ref.py +2 -7
  13. pixeltable/exprs/comparison.py +5 -5
  14. pixeltable/exprs/compound_predicate.py +12 -12
  15. pixeltable/exprs/expr.py +32 -0
  16. pixeltable/exprs/in_predicate.py +3 -3
  17. pixeltable/exprs/is_null.py +5 -5
  18. pixeltable/exprs/similarity_expr.py +27 -16
  19. pixeltable/func/aggregate_function.py +10 -4
  20. pixeltable/func/callable_function.py +4 -0
  21. pixeltable/func/function_registry.py +2 -0
  22. pixeltable/functions/globals.py +36 -1
  23. pixeltable/functions/huggingface.py +62 -4
  24. pixeltable/functions/image.py +17 -0
  25. pixeltable/functions/openai.py +1 -1
  26. pixeltable/functions/string.py +622 -7
  27. pixeltable/functions/video.py +26 -8
  28. pixeltable/globals.py +54 -50
  29. pixeltable/index/embedding_index.py +28 -27
  30. pixeltable/io/external_store.py +2 -2
  31. pixeltable/io/globals.py +54 -5
  32. pixeltable/io/label_studio.py +45 -5
  33. pixeltable/io/pandas.py +18 -7
  34. pixeltable/metadata/__init__.py +1 -1
  35. pixeltable/metadata/converters/convert_17.py +26 -0
  36. pixeltable/plan.py +6 -6
  37. pixeltable/tool/create_test_db_dump.py +2 -2
  38. pixeltable/tool/doc_plugins/griffe.py +77 -0
  39. pixeltable/tool/doc_plugins/mkdocstrings.py +6 -0
  40. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +135 -0
  41. pixeltable/utils/s3.py +1 -1
  42. pixeltable-0.2.13.dist-info/METADATA +206 -0
  43. {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/RECORD +46 -42
  44. pixeltable-0.2.13.dist-info/entry_points.txt +3 -0
  45. pixeltable/exprs/predicate.py +0 -44
  46. pixeltable-0.2.11.dist-info/METADATA +0 -137
  47. {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/LICENSE +0 -0
  48. {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/WHEEL +0 -0
pixeltable/globals.py CHANGED
@@ -7,10 +7,10 @@ import sqlalchemy as sql
7
7
  from sqlalchemy.util.preloaded import orm
8
8
 
9
9
  import pixeltable.exceptions as excs
10
+ import pixeltable.exprs as exprs
10
11
  from pixeltable import catalog, func, DataFrame
11
12
  from pixeltable.catalog import Catalog
12
13
  from pixeltable.env import Env
13
- from pixeltable.exprs import Predicate
14
14
  from pixeltable.iterators import ComponentIterator
15
15
  from pixeltable.metadata import schema
16
16
 
@@ -81,7 +81,7 @@ def create_view(
81
81
  base: Union[catalog.Table, DataFrame],
82
82
  *,
83
83
  schema: Optional[dict[str, Any]] = None,
84
- filter: Optional[Predicate] = None,
84
+ filter: Optional[exprs.Expr] = None,
85
85
  is_snapshot: bool = False,
86
86
  iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
87
87
  num_retained_versions: int = 10,
@@ -94,7 +94,7 @@ def create_view(
94
94
  path_str: Path to the view.
95
95
  base: Table (i.e., table or view or snapshot) or DataFrame to base the view on.
96
96
  schema: dictionary mapping column names to column types, value expressions, or to column specifications.
97
- filter: Predicate to filter rows of the base table.
97
+ filter: predicate to filter rows of the base table.
98
98
  is_snapshot: Whether the view is a snapshot.
99
99
  iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
100
100
  the base table.
@@ -234,7 +234,7 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
234
234
 
235
235
  Args:
236
236
  path: Path to the table.
237
- force: Whether to drop the table even if it has unsaved changes.
237
+ force: If `True`, will also drop all views or sub-views of this table.
238
238
  ignore_errors: Whether to ignore errors if the table does not exist.
239
239
 
240
240
  Raises:
@@ -243,21 +243,27 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
243
243
  Examples:
244
244
  >>> cl.drop_table('my_table')
245
245
  """
246
+ cat = Catalog.get()
246
247
  path_obj = catalog.Path(path)
247
248
  try:
248
- Catalog.get().paths.check_is_valid(path_obj, expected=catalog.Table)
249
+ cat.paths.check_is_valid(path_obj, expected=catalog.Table)
249
250
  except Exception as e:
250
- if ignore_errors:
251
+ if ignore_errors or force:
251
252
  _logger.info(f'Skipped table `{path}` (does not exist).')
252
253
  return
253
254
  else:
254
255
  raise e
255
- tbl = Catalog.get().paths[path_obj]
256
- if len(Catalog.get().tbl_dependents[tbl._id]) > 0:
257
- dependent_paths = [get_path(dep) for dep in Catalog.get().tbl_dependents[tbl._id]]
258
- raise excs.Error(f'Table {path} has dependents: {", ".join(dependent_paths)}')
256
+ tbl = cat.paths[path_obj]
257
+ assert isinstance(tbl, catalog.Table)
258
+ if len(cat.tbl_dependents[tbl._id]) > 0:
259
+ dependent_paths = [dep.path for dep in cat.tbl_dependents[tbl._id]]
260
+ if force:
261
+ for dependent_path in dependent_paths:
262
+ drop_table(dependent_path, force=True)
263
+ else:
264
+ raise excs.Error(f'Table {path} has dependents: {", ".join(dependent_paths)}')
259
265
  tbl._drop()
260
- del Catalog.get().paths[path_obj]
266
+ del cat.paths[path_obj]
261
267
  _logger.info(f'Dropped table `{path}`.')
262
268
 
263
269
 
@@ -291,7 +297,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
291
297
  return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]
292
298
 
293
299
 
294
- def create_dir(path_str: str, ignore_errors: bool = False) -> None:
300
+ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
295
301
  """Create a directory.
296
302
 
297
303
  Args:
@@ -319,10 +325,12 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
319
325
  session.add(dir_record)
320
326
  session.flush()
321
327
  assert dir_record.id is not None
322
- Catalog.get().paths[path] = catalog.Dir(dir_record.id, parent._id, path.name)
328
+ dir = catalog.Dir(dir_record.id, parent._id, path.name)
329
+ Catalog.get().paths[path] = dir
323
330
  session.commit()
324
331
  _logger.info(f'Created directory `{path_str}`.')
325
332
  print(f'Created directory `{path_str}`.')
333
+ return dir
326
334
  except excs.Error as e:
327
335
  if ignore_errors:
328
336
  return
@@ -330,7 +338,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
330
338
  raise e
331
339
 
332
340
 
333
- def rm_dir(path_str: str) -> None:
341
+ def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
334
342
  """Remove a directory.
335
343
 
336
344
  Args:
@@ -340,31 +348,49 @@ def rm_dir(path_str: str) -> None:
340
348
  Error: If the path does not exist or does not designate a directory or if the directory is not empty.
341
349
 
342
350
  Examples:
343
- >>> cl.rm_dir('my_dir')
351
+ >>> cl.drop_dir('my_dir')
344
352
 
345
353
  Remove a subdirectory:
346
354
 
347
- >>> cl.rm_dir('my_dir.sub_dir')
355
+ >>> cl.drop_dir('my_dir.sub_dir')
348
356
  """
357
+ cat = Catalog.get()
349
358
  path = catalog.Path(path_str)
350
- Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
351
359
 
352
- # make sure it's empty
353
- if len(Catalog.get().paths.get_children(path, child_type=None, recursive=True)) > 0:
354
- raise excs.Error(f'Directory {path_str} is not empty')
355
- # TODO: figure out how to make force=True work in the presence of snapshots
356
- # # delete tables
357
- # for tbl_path in self.paths.get_children(path, child_type=MutableTable, recursive=True):
358
- # self.drop_table(str(tbl_path), force=True)
359
- # # rm subdirs
360
- # for dir_path in self.paths.get_children(path, child_type=Dir, recursive=False):
361
- # self.rm_dir(str(dir_path), force=True)
360
+ try:
361
+ cat.paths.check_is_valid(path, expected=catalog.Dir)
362
+ except Exception as e:
363
+ if ignore_errors or force:
364
+ _logger.info(f'Skipped directory `{path}` (does not exist).')
365
+ return
366
+ else:
367
+ raise e
368
+
369
+ children = cat.paths.get_children(path, child_type=None, recursive=True)
370
+
371
+ if len(children) > 0 and not force:
372
+ raise excs.Error(f'Directory `{path_str}` is not empty.')
373
+
374
+ for child in children:
375
+ assert isinstance(child, catalog.Path)
376
+ # We need to check that the child is still in `cat.paths`, since it is possible it was
377
+ # already deleted as a dependent of a preceding child in the iteration.
378
+ try:
379
+ obj = cat.paths[child]
380
+ except excs.Error:
381
+ continue
382
+ if isinstance(obj, catalog.Dir):
383
+ drop_dir(str(child), force=True)
384
+ else:
385
+ assert isinstance(obj, catalog.Table)
386
+ assert not obj._is_dropped # else it should have been removed from `cat.paths` already
387
+ drop_table(str(child), force=True)
362
388
 
363
389
  with Env.get().engine.begin() as conn:
364
390
  dir = Catalog.get().paths[path]
365
391
  conn.execute(sql.delete(schema.Dir.__table__).where(schema.Dir.id == dir._id))
366
392
  del Catalog.get().paths[path]
367
- _logger.info(f'Removed directory {path_str}')
393
+ _logger.info(f'Removed directory `{path_str}`.')
368
394
 
369
395
 
370
396
  def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
@@ -416,28 +442,6 @@ def list_functions() -> pd.DataFrame:
416
442
  return pd_df.hide(axis='index')
417
443
 
418
444
 
419
- def get_path(schema_obj: catalog.SchemaObject) -> str:
420
- """Returns the path to a SchemaObject.
421
-
422
- Args:
423
- schema_obj: SchemaObject to get the path for.
424
-
425
- Returns:
426
- Path to the SchemaObject.
427
- """
428
- path_elements: list[str] = []
429
- dir_id = schema_obj._dir_id
430
- while dir_id is not None:
431
- dir = Catalog.get().paths.get_schema_obj(dir_id)
432
- if dir._dir_id is None:
433
- # this is the root dir with name '', which we don't want to include in the path
434
- break
435
- path_elements.insert(0, dir._name)
436
- dir_id = dir._dir_id
437
- path_elements.append(schema_obj._name)
438
- return '.'.join(path_elements)
439
-
440
-
441
445
  def configure_logging(
442
446
  *,
443
447
  to_stdout: Optional[bool] = None,
@@ -24,6 +24,7 @@ class EmbeddingIndex(IndexBase):
24
24
  - similarity_clause() converts those metrics back to their original form; it is used in expressions outside
25
25
  the Order By clause
26
26
  - order_by_clause() is used exclusively in the ORDER BY clause
27
+ - embedding function parameters are named '<type-name>_embed', where type-name is ColumnType.Type.name
27
28
  """
28
29
 
29
30
  class Metric(enum.Enum):
@@ -38,30 +39,30 @@ class EmbeddingIndex(IndexBase):
38
39
  }
39
40
 
40
41
  def __init__(
41
- self, c: catalog.Column, metric: str, text_embed: Optional[func.Function] = None,
42
- img_embed: Optional[func.Function] = None):
42
+ self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
43
+ image_embed: Optional[func.Function] = None):
43
44
  metric_names = [m.name.lower() for m in self.Metric]
44
45
  if metric.lower() not in metric_names:
45
46
  raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
46
47
  if not c.col_type.is_string_type() and not c.col_type.is_image_type():
47
48
  raise excs.Error(f'Embedding index requires string or image column')
48
- if c.col_type.is_string_type() and text_embed is None:
49
- raise excs.Error(f'Text embedding function is required for column {c.name} (parameter `txt_embed`)')
50
- if c.col_type.is_image_type() and img_embed is None:
51
- raise excs.Error(f'Image embedding function is required for column {c.name} (parameter `img_embed`)')
52
- if text_embed is not None:
49
+ if c.col_type.is_string_type() and string_embed is None:
50
+ raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
51
+ if c.col_type.is_image_type() and image_embed is None:
52
+ raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
53
+ if string_embed is not None:
53
54
  # verify signature
54
- self._validate_embedding_fn(text_embed, 'txt_embed', ts.ColumnType.Type.STRING)
55
- if img_embed is not None:
55
+ self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
56
+ if image_embed is not None:
56
57
  # verify signature
57
- self._validate_embedding_fn(img_embed, 'img_embed', ts.ColumnType.Type.IMAGE)
58
+ self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
58
59
 
59
60
  self.metric = self.Metric[metric.upper()]
60
61
  from pixeltable.exprs import ColumnRef
61
- self.value_expr = text_embed(ColumnRef(c)) if c.col_type.is_string_type() else img_embed(ColumnRef(c))
62
+ self.value_expr = string_embed(ColumnRef(c)) if c.col_type.is_string_type() else image_embed(ColumnRef(c))
62
63
  assert self.value_expr.col_type.is_array_type()
63
- self.txt_embed = text_embed
64
- self.img_embed = img_embed
64
+ self.string_embed = string_embed
65
+ self.image_embed = image_embed
65
66
  vector_size = self.value_expr.col_type.shape[0]
66
67
  assert vector_size is not None
67
68
  self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
@@ -88,14 +89,14 @@ class EmbeddingIndex(IndexBase):
88
89
  idx.create(bind=conn)
89
90
 
90
91
  def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ClauseElement:
91
- """Create a ClauseElement to that represents '<val_column> <op> <item>'"""
92
+ """Create a ClauseElement that represents '<val_column> <op> <item>'"""
92
93
  assert isinstance(item, (str, PIL.Image.Image))
93
94
  if isinstance(item, str):
94
- assert self.txt_embed is not None
95
- embedding = self.txt_embed.exec(item)
95
+ assert self.string_embed is not None
96
+ embedding = self.string_embed.exec(item)
96
97
  if isinstance(item, PIL.Image.Image):
97
- assert self.img_embed is not None
98
- embedding = self.img_embed.exec(item)
98
+ assert self.image_embed is not None
99
+ embedding = self.image_embed.exec(item)
99
100
 
100
101
  if self.metric == self.Metric.COSINE:
101
102
  return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -110,11 +111,11 @@ class EmbeddingIndex(IndexBase):
110
111
  assert isinstance(item, (str, PIL.Image.Image))
111
112
  embedding: Optional[np.ndarray] = None
112
113
  if isinstance(item, str):
113
- assert self.txt_embed is not None
114
- embedding = self.txt_embed.exec(item)
114
+ assert self.string_embed is not None
115
+ embedding = self.string_embed.exec(item)
115
116
  if isinstance(item, PIL.Image.Image):
116
- assert self.img_embed is not None
117
- embedding = self.img_embed.exec(item)
117
+ assert self.image_embed is not None
118
+ embedding = self.image_embed.exec(item)
118
119
  assert embedding is not None
119
120
 
120
121
  if self.metric == self.Metric.COSINE:
@@ -160,12 +161,12 @@ class EmbeddingIndex(IndexBase):
160
161
  def as_dict(self) -> dict:
161
162
  return {
162
163
  'metric': self.metric.name.lower(),
163
- 'txt_embed': None if self.txt_embed is None else self.txt_embed.as_dict(),
164
- 'img_embed': None if self.img_embed is None else self.img_embed.as_dict()
164
+ 'string_embed': None if self.string_embed is None else self.string_embed.as_dict(),
165
+ 'image_embed': None if self.image_embed is None else self.image_embed.as_dict()
165
166
  }
166
167
 
167
168
  @classmethod
168
169
  def from_dict(cls, c: catalog.Column, d: dict) -> EmbeddingIndex:
169
- txt_embed = func.Function.from_dict(d['txt_embed']) if d['txt_embed'] is not None else None
170
- img_embed = func.Function.from_dict(d['img_embed']) if d['img_embed'] is not None else None
171
- return cls(c, metric=d['metric'], text_embed=txt_embed, img_embed=img_embed)
170
+ string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
171
+ image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
172
+ return cls(c, metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
@@ -222,12 +222,12 @@ class Project(ExternalStore, abc.ABC):
222
222
  if t_col not in t_cols:
223
223
  if is_user_specified_col_mapping:
224
224
  raise excs.Error(
225
- f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.get_name()}` '
225
+ f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.name}` '
226
226
  'contains no such column.'
227
227
  )
228
228
  else:
229
229
  raise excs.Error(
230
- f'Column `{t_col}` does not exist in Table `{table.get_name()}`. Either add a column `{t_col}`, '
230
+ f'Column `{t_col}` does not exist in Table `{table.name}`. Either add a column `{t_col}`, '
231
231
  f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
232
232
  )
233
233
  if ext_col not in export_cols and ext_col not in import_cols:
pixeltable/io/globals.py CHANGED
@@ -13,11 +13,14 @@ def create_label_studio_project(
13
13
  media_import_method: Literal['post', 'file', 'url'] = 'post',
14
14
  col_mapping: Optional[dict[str, str]] = None,
15
15
  sync_immediately: bool = True,
16
+ s3_configuration: Optional[dict[str, Any]] = None,
16
17
  **kwargs: Any
17
18
  ) -> SyncStatus:
18
- # TODO(aaron-siegel): Add link in docstring to a Label Studio howto
19
19
  """
20
- Creates a new Label Studio project and links it to the specified `Table`.
20
+ Create a new Label Studio project and link it to the specified `Table`.
21
+
22
+ - A tutorial notebook with fully worked examples can be found here:
23
+ [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
21
24
 
22
25
  The required parameter `label_config` specifies the Label Studio project configuration,
23
26
  in XML format, as described in the Label Studio documentation. The linked project will
@@ -41,6 +44,11 @@ def create_label_studio_project(
41
44
  * Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
42
45
  * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
43
46
 
47
+ __Requirements:__
48
+
49
+ - `pip install label-studio-sdk`
50
+ - `pip install boto3` (if using S3 import storage)
51
+
44
52
  Args:
45
53
  t: The Table to link to.
46
54
  label_config: The Label Studio project configuration, in XML format.
@@ -50,8 +58,9 @@ def create_label_studio_project(
50
58
  `ls_project_0`, `ls_project_1`, etc.
51
59
  title: An optional title for the Label Studio project. This is the title that annotators
52
60
  will see inside Label Studio. Unlike `name`, it does not need to be an identifier and
53
- does not need to be unique. If not specified, the table name `t.get_name()` will be used.
61
+ does not need to be unique. If not specified, the table name `t.name` will be used.
54
62
  media_import_method: The method to use when transferring media files to Label Studio:
63
+
55
64
  - `post`: Media will be sent to Label Studio via HTTP post. This should generally only be used for
56
65
  prototyping; due to restrictions in Label Studio, it can only be used with projects that have
57
66
  just one data field, and does not scale well.
@@ -63,9 +72,48 @@ def create_label_studio_project(
63
72
  col_mapping: An optional mapping of local column names to Label Studio fields.
64
73
  sync_immediately: If `True`, immediately perform an initial synchronization by
65
74
  exporting all rows of the `Table` as Label Studio tasks.
75
+ s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
76
+ be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
77
+ referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
78
+ in the Label Studio interface.
79
+
80
+ The items in the `s3_configuration` dictionary correspond to kwarg
81
+ parameters of the Label Studio `connect_s3_import_storage` method, as described in the
82
+ [Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
83
+ `bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
84
+ Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`). If a title is not
85
+ specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`. All other parameters use their Label
86
+ Studio defaults.
66
87
  kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
67
- Studio SDK, as described here:
68
- https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project
88
+ Studio SDK, as described in the
89
+ [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
90
+
91
+ Returns:
92
+ A `SyncStatus` representing the status of any synchronization operations that occurred.
93
+
94
+ Examples:
95
+ Create a Label Studio project whose tasks correspond to videos stored in the `video_col` column of the table `tbl`:
96
+
97
+ >>> config = \"\"\"
98
+ <View>
99
+ <Video name="video_obj" value="$video_col"/>
100
+ <Choices name="video-category" toName="video" showInLine="true">
101
+ <Choice value="city"/>
102
+ <Choice value="food"/>
103
+ <Choice value="sports"/>
104
+ </Choices>
105
+ </View>\"\"\"
106
+ create_label_studio_project(tbl, config)
107
+
108
+ Create a Label Studio project with the same configuration, using `media_import_method='url'`,
109
+ whose media are stored in an S3 bucket:
110
+
111
+ >>> create_label_studio_project(
112
+ tbl,
113
+ config,
114
+ media_import_method='url',
115
+ s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
116
+ )
69
117
  """
70
118
  from pixeltable.io.label_studio import LabelStudioProject
71
119
 
@@ -76,6 +124,7 @@ def create_label_studio_project(
76
124
  title,
77
125
  media_import_method,
78
126
  col_mapping,
127
+ s3_configuration,
79
128
  **kwargs
80
129
  )
81
130
 
@@ -1,3 +1,4 @@
1
+ import copy
1
2
  import json
2
3
  import logging
3
4
  import os
@@ -18,6 +19,15 @@ from pixeltable.exprs import ColumnRef, DataRow, Expr
18
19
  from pixeltable.io.external_store import Project, SyncStatus
19
20
  from pixeltable.utils import coco
20
21
 
22
+ # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
23
+ # the import two different ways to ensure compatibility
24
+ try:
25
+ # label_studio_sdk<1 compatibility
26
+ import label_studio_sdk.project as ls_project # type: ignore
27
+ except ImportError:
28
+ # label_studio_sdk>=1 compatibility
29
+ import label_studio_sdk._legacy.project as ls_project # type: ignore
30
+
21
31
  _logger = logging.getLogger('pixeltable')
22
32
 
23
33
 
@@ -50,11 +60,11 @@ class LabelStudioProject(Project):
50
60
  """
51
61
  self.project_id = project_id
52
62
  self.media_import_method = media_import_method
53
- self._project: Optional[label_studio_sdk.project.Project] = None
63
+ self._project: Optional[ls_project.Project] = None
54
64
  super().__init__(name, col_mapping, stored_proxies)
55
65
 
56
66
  @property
57
- def project(self) -> label_studio_sdk.project.Project:
67
+ def project(self) -> ls_project.Project:
58
68
  """The `Project` object corresponding to this Label Studio project."""
59
69
  if self._project is None:
60
70
  try:
@@ -95,7 +105,7 @@ class LabelStudioProject(Project):
95
105
  return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
96
106
 
97
107
  def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
98
- _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.get_name()}`'
108
+ _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
99
109
  f' (export: {export_data}, import: {import_data}).')
100
110
  # Collect all existing tasks into a dict with entries `rowid: task`
101
111
  tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -386,7 +396,7 @@ class LabelStudioProject(Project):
386
396
  updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
387
397
  if len(updates) > 0:
388
398
  _logger.info(
389
- f'Updating table `{t.get_name()}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
399
+ f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
390
400
  )
391
401
  # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
392
402
  # batch_update on the actual ancestor table that holds the annotations column.
@@ -536,6 +546,7 @@ class LabelStudioProject(Project):
536
546
  title: Optional[str],
537
547
  media_import_method: Literal['post', 'file', 'url'],
538
548
  col_mapping: Optional[dict[str, str]],
549
+ s3_configuration: Optional[dict[str, Any]],
539
550
  **kwargs: Any
540
551
  ) -> 'LabelStudioProject':
541
552
  """
@@ -554,7 +565,7 @@ class LabelStudioProject(Project):
554
565
 
555
566
  if title is None:
556
567
  # `title` defaults to table name
557
- title = t.get_name()
568
+ title = t.name
558
569
 
559
570
  # Create a column to hold the annotations, if one does not yet exist
560
571
  if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -572,6 +583,31 @@ class LabelStudioProject(Project):
572
583
  if media_import_method == 'post' and len(config.data_keys) > 1:
573
584
  raise excs.Error('`media_import_method` cannot be `post` if there is more than one data key')
574
585
 
586
+ if s3_configuration is not None:
587
+ if media_import_method != 'url':
588
+ raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
589
+ s3_configuration = copy.copy(s3_configuration)
590
+ if not 'bucket' in s3_configuration:
591
+ raise excs.Error('`s3_configuration` must contain a `bucket` field')
592
+ if not 'title' in s3_configuration:
593
+ s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
594
+ if ('aws_access_key_id' not in s3_configuration and
595
+ 'aws_secret_access_key' not in s3_configuration and
596
+ 'aws_session_token' not in s3_configuration):
597
+ # Attempt to fill any missing credentials from the environment
598
+ try:
599
+ import boto3
600
+ s3_credentials = boto3.Session().get_credentials().get_frozen_credentials()
601
+ _logger.info(f'Using AWS credentials from the environment for Label Studio project: {title}')
602
+ s3_configuration['aws_access_key_id'] = s3_credentials.access_key
603
+ s3_configuration['aws_secret_access_key'] = s3_credentials.secret_key
604
+ s3_configuration['aws_session_token'] = s3_credentials.token
605
+ except Exception as exc:
606
+ # This is not necessarily a problem, but we should log that it happened
607
+ _logger.debug(f'Unable to retrieve AWS credentials from the environment: {exc}')
608
+ pass
609
+
610
+ _logger.info(f'Creating Label Studio project: {title}')
575
611
  project = _label_studio_client().start_project(title=title, label_config=label_config, **kwargs)
576
612
 
577
613
  if media_import_method == 'file':
@@ -591,6 +627,10 @@ class LabelStudioProject(Project):
591
627
  ) from exc
592
628
  raise # Handle any other exception type normally
593
629
 
630
+ if s3_configuration is not None:
631
+ _logger.info(f'Setting up S3 import storage for Label Studio project: {title}')
632
+ project.connect_s3_import_storage(**s3_configuration)
633
+
594
634
  project_id = project.get_params()['id']
595
635
  return LabelStudioProject(name, project_id, media_import_method, resolved_col_mapping)
596
636
 
pixeltable/io/pandas.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Optional, Any, Iterable
1
+ from typing import Optional, Any, Iterable, Union
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
@@ -9,7 +9,10 @@ import pixeltable.type_system as ts
9
9
 
10
10
 
11
11
  def import_pandas(
12
- tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
12
+ tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
13
+ primary_key: Optional[Union[str, list[str]]] = None,
14
+ num_retained_versions: int = 10,
15
+ comment: str = ''
13
16
  ) -> pxt.catalog.InsertableTable:
14
17
  """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
15
18
  will be inferred from the `DataFrame`, unless `schema` is specified.
@@ -31,13 +34,17 @@ def import_pandas(
31
34
  """
32
35
  schema = _df_to_pxt_schema(df, schema_overrides)
33
36
  tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
34
- table = pxt.create_table(tbl_name, schema)
37
+ table = pxt.create_table(tbl_name, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
35
38
  table.insert(tbl_rows)
36
39
  return table
37
40
 
38
41
 
39
42
  def import_csv(
40
- table_path: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
43
+ tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
44
+ primary_key: Optional[Union[str, list[str]]] = None,
45
+ num_retained_versions: int = 10,
46
+ comment: str = '',
47
+ **kwargs
41
48
  ) -> pxt.catalog.InsertableTable:
42
49
  """
43
50
  Creates a new `Table` from a csv file. This is a convenience method and is equivalent
@@ -45,11 +52,15 @@ def import_csv(
45
52
  See the Pandas documentation for `read_csv` for more details.
46
53
  """
47
54
  df = pd.read_csv(filepath_or_buffer, **kwargs)
48
- return import_pandas(table_path, df, schema_overrides=schema_overrides)
55
+ return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
49
56
 
50
57
 
51
58
  def import_excel(
52
- table_path: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
59
+ tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
60
+ primary_key: Optional[Union[str, list[str]]] = None,
61
+ num_retained_versions: int = 10,
62
+ comment: str = '',
63
+ **kwargs
53
64
  ) -> pxt.catalog.InsertableTable:
54
65
  """
55
66
  Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
@@ -57,7 +68,7 @@ def import_excel(
57
68
  See the Pandas documentation for `read_excel` for more details.
58
69
  """
59
70
  df = pd.read_excel(io, *args, **kwargs)
60
- return import_pandas(table_path, df, schema_overrides=schema_overrides)
71
+ return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
61
72
 
62
73
 
63
74
  def _df_to_pxt_schema(
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 17
13
+ VERSION = 18
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -0,0 +1,26 @@
1
+ import sqlalchemy as sql
2
+
3
+ from pixeltable.metadata import register_converter
4
+ from pixeltable.metadata.converters.util import convert_table_md
5
+
6
+
7
+ @register_converter(version=17)
8
+ def _(engine: sql.engine.Engine) -> None:
9
+ convert_table_md(
10
+ engine,
11
+ table_md_updater=__update_table_md
12
+ )
13
+
14
+
15
+ def __update_table_md(table_md: dict) -> None:
16
+ # key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
17
+ if len(table_md['index_md']) == 0:
18
+ return
19
+ for idx_md in table_md['index_md'].values():
20
+ if not idx_md['class_fqn'].endswith('.EmbeddingIndex'):
21
+ continue
22
+ init_dict = idx_md['init_args']
23
+ init_dict['image_embed'] = init_dict['img_embed']
24
+ del init_dict['img_embed']
25
+ init_dict['string_embed'] = init_dict['txt_embed']
26
+ del init_dict['txt_embed']