pixeltable 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (67) hide show
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +5 -0
  4. pixeltable/catalog/globals.py +8 -0
  5. pixeltable/catalog/insertable_table.py +2 -2
  6. pixeltable/catalog/table.py +27 -9
  7. pixeltable/catalog/table_version.py +41 -68
  8. pixeltable/catalog/view.py +3 -3
  9. pixeltable/dataframe.py +7 -6
  10. pixeltable/exec/__init__.py +2 -1
  11. pixeltable/exec/expr_eval_node.py +8 -1
  12. pixeltable/exec/row_update_node.py +61 -0
  13. pixeltable/exec/{sql_scan_node.py → sql_node.py} +120 -56
  14. pixeltable/exprs/__init__.py +1 -2
  15. pixeltable/exprs/comparison.py +5 -5
  16. pixeltable/exprs/compound_predicate.py +12 -12
  17. pixeltable/exprs/expr.py +67 -22
  18. pixeltable/exprs/function_call.py +60 -29
  19. pixeltable/exprs/globals.py +2 -0
  20. pixeltable/exprs/in_predicate.py +3 -3
  21. pixeltable/exprs/inline_array.py +18 -11
  22. pixeltable/exprs/is_null.py +5 -5
  23. pixeltable/exprs/method_ref.py +63 -0
  24. pixeltable/ext/__init__.py +9 -0
  25. pixeltable/ext/functions/__init__.py +8 -0
  26. pixeltable/ext/functions/whisperx.py +45 -5
  27. pixeltable/ext/functions/yolox.py +60 -14
  28. pixeltable/func/aggregate_function.py +10 -4
  29. pixeltable/func/callable_function.py +16 -4
  30. pixeltable/func/expr_template_function.py +1 -1
  31. pixeltable/func/function.py +12 -2
  32. pixeltable/func/function_registry.py +26 -9
  33. pixeltable/func/udf.py +32 -4
  34. pixeltable/functions/__init__.py +1 -1
  35. pixeltable/functions/fireworks.py +33 -0
  36. pixeltable/functions/globals.py +36 -1
  37. pixeltable/functions/huggingface.py +155 -7
  38. pixeltable/functions/image.py +242 -40
  39. pixeltable/functions/openai.py +214 -0
  40. pixeltable/functions/string.py +600 -8
  41. pixeltable/functions/timestamp.py +210 -0
  42. pixeltable/functions/together.py +106 -0
  43. pixeltable/functions/video.py +28 -10
  44. pixeltable/functions/whisper.py +32 -0
  45. pixeltable/globals.py +3 -3
  46. pixeltable/io/__init__.py +1 -1
  47. pixeltable/io/globals.py +186 -5
  48. pixeltable/io/label_studio.py +42 -2
  49. pixeltable/io/pandas.py +70 -34
  50. pixeltable/metadata/__init__.py +1 -1
  51. pixeltable/metadata/converters/convert_18.py +39 -0
  52. pixeltable/metadata/notes.py +10 -0
  53. pixeltable/plan.py +82 -7
  54. pixeltable/tool/create_test_db_dump.py +4 -5
  55. pixeltable/tool/doc_plugins/griffe.py +81 -0
  56. pixeltable/tool/doc_plugins/mkdocstrings.py +6 -0
  57. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +135 -0
  58. pixeltable/type_system.py +15 -14
  59. pixeltable/utils/s3.py +1 -1
  60. pixeltable-0.2.14.dist-info/METADATA +206 -0
  61. {pixeltable-0.2.12.dist-info → pixeltable-0.2.14.dist-info}/RECORD +64 -56
  62. pixeltable-0.2.14.dist-info/entry_points.txt +3 -0
  63. pixeltable/exprs/image_member_access.py +0 -96
  64. pixeltable/exprs/predicate.py +0 -44
  65. pixeltable-0.2.12.dist-info/METADATA +0 -137
  66. {pixeltable-0.2.12.dist-info → pixeltable-0.2.14.dist-info}/LICENSE +0 -0
  67. {pixeltable-0.2.12.dist-info → pixeltable-0.2.14.dist-info}/WHEEL +0 -0
pixeltable/io/globals.py CHANGED
@@ -1,5 +1,7 @@
1
- from typing import Any, Optional, Literal
1
+ from typing import Any, Literal, Optional, Union
2
+ import urllib.request
2
3
 
4
+ import pixeltable as pxt
3
5
  import pixeltable.exceptions as excs
4
6
  from pixeltable import Table
5
7
  from pixeltable.io.external_store import SyncStatus
@@ -13,11 +15,14 @@ def create_label_studio_project(
13
15
  media_import_method: Literal['post', 'file', 'url'] = 'post',
14
16
  col_mapping: Optional[dict[str, str]] = None,
15
17
  sync_immediately: bool = True,
18
+ s3_configuration: Optional[dict[str, Any]] = None,
16
19
  **kwargs: Any
17
20
  ) -> SyncStatus:
18
- # TODO(aaron-siegel): Add link in docstring to a Label Studio howto
19
21
  """
20
- Creates a new Label Studio project and links it to the specified `Table`.
22
+ Create a new Label Studio project and link it to the specified `Table`.
23
+
24
+ - A tutorial notebook with fully worked examples can be found here:
25
+ [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
21
26
 
22
27
  The required parameter `label_config` specifies the Label Studio project configuration,
23
28
  in XML format, as described in the Label Studio documentation. The linked project will
@@ -41,6 +46,11 @@ def create_label_studio_project(
41
46
  * Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
42
47
  * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
43
48
 
49
+ __Requirements:__
50
+
51
+ - `pip install label-studio-sdk`
52
+ - `pip install boto3` (if using S3 import storage)
53
+
44
54
  Args:
45
55
  t: The Table to link to.
46
56
  label_config: The Label Studio project configuration, in XML format.
@@ -52,6 +62,7 @@ def create_label_studio_project(
52
62
  will see inside Label Studio. Unlike `name`, it does not need to be an identifier and
53
63
  does not need to be unique. If not specified, the table name `t.name` will be used.
54
64
  media_import_method: The method to use when transferring media files to Label Studio:
65
+
55
66
  - `post`: Media will be sent to Label Studio via HTTP post. This should generally only be used for
56
67
  prototyping; due to restrictions in Label Studio, it can only be used with projects that have
57
68
  just one data field, and does not scale well.
@@ -63,9 +74,48 @@ def create_label_studio_project(
63
74
  col_mapping: An optional mapping of local column names to Label Studio fields.
64
75
  sync_immediately: If `True`, immediately perform an initial synchronization by
65
76
  exporting all rows of the `Table` as Label Studio tasks.
77
+ s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
78
+ be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
79
+ referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
80
+ in the Label Studio interface.
81
+
82
+ The items in the `s3_configuration` dictionary correspond to kwarg
83
+ parameters of the Label Studio `connect_s3_import_storage` method, as described in the
84
+ [Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
85
+ `bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
86
+ Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`). If a title is not
87
+ specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`. All other parameters use their Label
88
+ Studio defaults.
66
89
  kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
67
- Studio SDK, as described here:
68
- https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project
90
+ Studio SDK, as described in the
91
+ [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
92
+
93
+ Returns:
94
+ A `SyncStatus` representing the status of any synchronization operations that occurred.
95
+
96
+ Examples:
97
+ Create a Label Studio project whose tasks correspond to videos stored in the `video_col` column of the table `tbl`:
98
+
99
+ >>> config = \"\"\"
100
+ <View>
101
+ <Video name="video_obj" value="$video_col"/>
102
 + <Choices name="video-category" toName="video_obj" showInLine="true">
103
+ <Choice value="city"/>
104
+ <Choice value="food"/>
105
+ <Choice value="sports"/>
106
+ </Choices>
107
+ </View>\"\"\"
108
+ create_label_studio_project(tbl, config)
109
+
110
+ Create a Label Studio project with the same configuration, using `media_import_method='url'`,
111
+ whose media are stored in an S3 bucket:
112
+
113
+ >>> create_label_studio_project(
114
+ tbl,
115
+ config,
116
+ media_import_method='url',
117
+ s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
118
+ )
69
119
  """
70
120
  from pixeltable.io.label_studio import LabelStudioProject
71
121
 
@@ -76,6 +126,7 @@ def create_label_studio_project(
76
126
  title,
77
127
  media_import_method,
78
128
  col_mapping,
129
+ s3_configuration,
79
130
  **kwargs
80
131
  )
81
132
 
@@ -85,3 +136,133 @@ def create_label_studio_project(
85
136
  return t.sync()
86
137
  else:
87
138
  return SyncStatus.empty()
139
+
140
+
141
+ def import_rows(
142
+ tbl_path: str,
143
+ rows: list[dict[str, Any]],
144
+ *,
145
+ schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
146
+ primary_key: Optional[Union[str, list[str]]] = None,
147
+ num_retained_versions: int = 10,
148
+ comment: str = ''
149
+ ) -> Table:
150
+ """
151
+ Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
152
+ `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
153
+ supplied data, using the most specific type that can represent all the values in a column.
154
+
155
+ If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
156
+ Pixeltable will force the specified column to the specified type (and will not attempt any type inference
157
+ for that column).
158
+
159
+ All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
160
+ `schema_overrides`.
161
+
162
+ Args:
163
+ tbl_path: The qualified name of the table to create.
164
+ rows: The list of dictionaries to import.
165
+ schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
166
+ as described above.
167
+ primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
168
+ num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
169
+ comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
170
+
171
+ Returns:
172
+ The newly created `Table`.
173
+ """
174
+ if schema_overrides is None:
175
+ schema_overrides = {}
176
+ schema: dict[str, pxt.ColumnType] = {}
177
+ cols_with_nones: set[str] = set()
178
+
179
+ for n, row in enumerate(rows):
180
+ for col_name, value in row.items():
181
+ if col_name in schema_overrides:
182
+ # We do the insertion here; this will ensure that the column order matches the order
183
+ # in which the column names are encountered in the input data, even if `schema_overrides`
184
+ # is specified.
185
+ if col_name not in schema:
186
+ schema[col_name] = schema_overrides[col_name]
187
+ elif value is not None:
188
+ # If `key` is not in `schema_overrides`, then we infer its type from the data.
189
+ # The column type will always be nullable by default.
190
+ col_type = pxt.ColumnType.infer_literal_type(value).copy(nullable=True)
191
+ if col_name not in schema:
192
+ schema[col_name] = col_type
193
+ else:
194
+ supertype = pxt.ColumnType.supertype(schema[col_name], col_type)
195
+ if supertype is None:
196
+ raise excs.Error(
197
+ f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
198
+ 'Consider specifying the type explicitly in `schema_overrides`.'
199
+ )
200
+ schema[col_name] = supertype
201
+ else:
202
+ cols_with_nones.add(col_name)
203
+
204
+ extraneous_keys = schema_overrides.keys() - schema.keys()
205
+ if len(extraneous_keys) > 0:
206
+ raise excs.Error(f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}')
207
+
208
+ entirely_none_cols = cols_with_nones - schema.keys()
209
+ if len(entirely_none_cols) > 0:
210
 + # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
211
+ # was not encountered in any row with a non-None value.
212
+ raise excs.Error(
213
+ f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
214
+ 'Consider specifying the type(s) explicitly in `schema_overrides`.'
215
+ )
216
+
217
+ t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
218
+ t.insert(rows)
219
+ return t
220
+
221
+
222
+ def import_json(
223
+ tbl_path: str,
224
+ filepath_or_url: str,
225
+ *,
226
+ schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
227
+ primary_key: Optional[Union[str, list[str]]] = None,
228
+ num_retained_versions: int = 10,
229
+ comment: str = '',
230
+ **kwargs: Any
231
+ ) -> Table:
232
+ """
233
+ Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
234
 + to calling `import_rows(tbl_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
235
+ is the contents of the specified `filepath_or_url`.
236
+
237
+ Args:
238
+ tbl_path: The name of the table to create.
239
+ filepath_or_url: The path or URL of the JSON file.
240
+ schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
241
+ (see [`import_rows()`][pixeltable.io.import_rows]).
242
+ primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
243
+ num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
244
+ comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
245
+ kwargs: Additional keyword arguments to pass to `json.loads`.
246
+
247
+ Returns:
248
+ The newly created `Table`.
249
+ """
250
+ import json
251
+ import urllib.parse
252
+ import urllib.request
253
+
254
+ # TODO Consolidate this logic with other places where files/URLs are parsed
255
+ parsed = urllib.parse.urlparse(filepath_or_url)
256
+ if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
257
+ # local file path
258
+ if len(parsed.scheme) <= 1:
259
+ filepath = filepath_or_url
260
+ else:
261
+ filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
262
+ with open(filepath) as fp:
263
+ contents = fp.read()
264
+ else:
265
+ # URL
266
+ contents = urllib.request.urlopen(filepath_or_url).read()
267
+ data = json.loads(contents, **kwargs)
268
+ return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -1,3 +1,4 @@
1
+ import copy
1
2
  import json
2
3
  import logging
3
4
  import os
@@ -18,6 +19,15 @@ from pixeltable.exprs import ColumnRef, DataRow, Expr
18
19
  from pixeltable.io.external_store import Project, SyncStatus
19
20
  from pixeltable.utils import coco
20
21
 
22
+ # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
23
 + the import two different ways to ensure compatibility with both
24
+ try:
25
+ # label_studio_sdk<1 compatibility
26
+ import label_studio_sdk.project as ls_project # type: ignore
27
+ except ImportError:
28
+ # label_studio_sdk>=1 compatibility
29
+ import label_studio_sdk._legacy.project as ls_project # type: ignore
30
+
21
31
  _logger = logging.getLogger('pixeltable')
22
32
 
23
33
 
@@ -50,11 +60,11 @@ class LabelStudioProject(Project):
50
60
  """
51
61
  self.project_id = project_id
52
62
  self.media_import_method = media_import_method
53
- self._project: Optional[label_studio_sdk.project.Project] = None
63
+ self._project: Optional[ls_project.Project] = None
54
64
  super().__init__(name, col_mapping, stored_proxies)
55
65
 
56
66
  @property
57
- def project(self) -> label_studio_sdk.project.Project:
67
+ def project(self) -> ls_project.Project:
58
68
  """The `Project` object corresponding to this Label Studio project."""
59
69
  if self._project is None:
60
70
  try:
@@ -536,6 +546,7 @@ class LabelStudioProject(Project):
536
546
  title: Optional[str],
537
547
  media_import_method: Literal['post', 'file', 'url'],
538
548
  col_mapping: Optional[dict[str, str]],
549
+ s3_configuration: Optional[dict[str, Any]],
539
550
  **kwargs: Any
540
551
  ) -> 'LabelStudioProject':
541
552
  """
@@ -572,6 +583,31 @@ class LabelStudioProject(Project):
572
583
  if media_import_method == 'post' and len(config.data_keys) > 1:
573
584
  raise excs.Error('`media_import_method` cannot be `post` if there is more than one data key')
574
585
 
586
+ if s3_configuration is not None:
587
+ if media_import_method != 'url':
588
+ raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
589
+ s3_configuration = copy.copy(s3_configuration)
590
+ if not 'bucket' in s3_configuration:
591
+ raise excs.Error('`s3_configuration` must contain a `bucket` field')
592
+ if not 'title' in s3_configuration:
593
+ s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
594
+ if ('aws_access_key_id' not in s3_configuration and
595
+ 'aws_secret_access_key' not in s3_configuration and
596
+ 'aws_session_token' not in s3_configuration):
597
+ # Attempt to fill any missing credentials from the environment
598
+ try:
599
+ import boto3
600
+ s3_credentials = boto3.Session().get_credentials().get_frozen_credentials()
601
+ _logger.info(f'Using AWS credentials from the environment for Label Studio project: {title}')
602
+ s3_configuration['aws_access_key_id'] = s3_credentials.access_key
603
+ s3_configuration['aws_secret_access_key'] = s3_credentials.secret_key
604
+ s3_configuration['aws_session_token'] = s3_credentials.token
605
+ except Exception as exc:
606
+ # This is not necessarily a problem, but we should log that it happened
607
+ _logger.debug(f'Unable to retrieve AWS credentials from the environment: {exc}')
608
+ pass
609
+
610
+ _logger.info(f'Creating Label Studio project: {title}')
575
611
  project = _label_studio_client().start_project(title=title, label_config=label_config, **kwargs)
576
612
 
577
613
  if media_import_method == 'file':
@@ -591,6 +627,10 @@ class LabelStudioProject(Project):
591
627
  ) from exc
592
628
  raise # Handle any other exception type normally
593
629
 
630
+ if s3_configuration is not None:
631
+ _logger.info(f'Setting up S3 import storage for Label Studio project: {title}')
632
+ project.connect_s3_import_storage(**s3_configuration)
633
+
594
634
  project_id = project.get_params()['id']
595
635
  return LabelStudioProject(name, project_id, media_import_method, resolved_col_mapping)
596
636
 
pixeltable/io/pandas.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Optional, Any, Iterable
1
+ from typing import Any, Optional, Union
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
@@ -9,10 +9,13 @@ import pixeltable.type_system as ts
9
9
 
10
10
 
11
11
  def import_pandas(
12
- tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
12
+ tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
13
+ primary_key: Optional[Union[str, list[str]]] = None,
14
+ num_retained_versions: int = 10,
15
+ comment: str = ''
13
16
  ) -> pxt.catalog.InsertableTable:
14
17
  """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
15
- will be inferred from the `DataFrame`, unless `schema` is specified.
18
+ will be inferred from the `DataFrame`.
16
19
 
17
20
  The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
18
21
  Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
@@ -29,15 +32,26 @@ def import_pandas(
29
32
  `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
30
33
  Pixeltable identifiers).
31
34
  """
32
- schema = _df_to_pxt_schema(df, schema_overrides)
33
- tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
34
- table = pxt.create_table(tbl_name, schema)
35
+ if schema_overrides is None:
36
+ schema_overrides = {}
37
+ if primary_key is None:
38
+ primary_key = []
39
+ elif isinstance(primary_key, str):
40
+ primary_key = [primary_key]
41
+
42
+ schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
43
+ tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
44
+ table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
35
45
  table.insert(tbl_rows)
36
46
  return table
37
47
 
38
48
 
39
49
  def import_csv(
40
- table_path: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
50
+ tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
51
+ primary_key: Optional[Union[str, list[str]]] = None,
52
+ num_retained_versions: int = 10,
53
+ comment: str = '',
54
+ **kwargs
41
55
  ) -> pxt.catalog.InsertableTable:
42
56
  """
43
57
  Creates a new `Table` from a csv file. This is a convenience method and is equivalent
@@ -45,11 +59,15 @@ def import_csv(
45
59
  See the Pandas documentation for `read_csv` for more details.
46
60
  """
47
61
  df = pd.read_csv(filepath_or_buffer, **kwargs)
48
- return import_pandas(table_path, df, schema_overrides=schema_overrides)
62
+ return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
49
63
 
50
64
 
51
65
  def import_excel(
52
- table_path: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
66
+ tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
67
+ primary_key: Optional[Union[str, list[str]]] = None,
68
+ num_retained_versions: int = 10,
69
+ comment: str = '',
70
+ **kwargs
53
71
  ) -> pxt.catalog.InsertableTable:
54
72
  """
55
73
  Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
@@ -57,25 +75,36 @@ def import_excel(
57
75
  See the Pandas documentation for `read_excel` for more details.
58
76
  """
59
77
  df = pd.read_excel(io, *args, **kwargs)
60
- return import_pandas(table_path, df, schema_overrides=schema_overrides)
61
-
62
-
63
- def _df_to_pxt_schema(
64
- df: pd.DataFrame, schema_overrides: Optional[dict[str, pxt.ColumnType]]
65
- ) -> dict[str, pxt.ColumnType]:
66
- if schema_overrides is not None:
67
- for pd_name in schema_overrides:
68
- if pd_name not in df.columns:
69
- raise excs.Error(
70
- f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
71
- )
72
- schema = {}
78
+ return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
79
+
80
+
81
+ def __df_to_pxt_schema(
82
+ df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
83
+ ) -> tuple[dict[str, pxt.ColumnType], list[str]]:
84
+ """
85
+ Infers a Pixeltable schema from a Pandas DataFrame.
86
+
87
+ Returns:
88
+ A tuple containing a Pixeltable schema and a list of primary key column names.
89
+ """
90
+ for pd_name in schema_overrides:
91
+ if pd_name not in df.columns:
92
+ raise excs.Error(
93
+ f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
94
+ )
95
+ for pd_name in primary_key:
96
+ if pd_name not in df.columns:
97
+ raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
98
+
99
+ schema: dict[str, pxt.ColumnType] = {}
100
+ col_mapping: dict[str, str] = {} # Maps Pandas column names to Pixeltable column names
101
+
73
102
  for pd_name, pd_dtype in zip(df.columns, df.dtypes):
74
- if schema_overrides is not None and pd_name in schema_overrides:
103
+ if pd_name in schema_overrides:
75
104
  pxt_type = schema_overrides[pd_name]
76
105
  else:
77
- pxt_type = _np_dtype_to_pxt_type(pd_dtype, df[pd_name])
78
- pxt_name = _normalize_pxt_col_name(pd_name)
106
+ pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
107
+ pxt_name = __normalize_pxt_col_name(pd_name)
79
108
  # Ensure that column names are unique by appending a distinguishing suffix
80
109
  # to any collisions
81
110
  if pxt_name in schema:
@@ -84,10 +113,13 @@ def _df_to_pxt_schema(
84
113
  n += 1
85
114
  pxt_name = f'{pxt_name}_{n}'
86
115
  schema[pxt_name] = pxt_type
87
- return schema
116
+ col_mapping[pd_name] = pxt_name
117
+
118
+ pxt_pk = [col_mapping[pk] for pk in primary_key]
119
+ return schema, pxt_pk
88
120
 
89
121
 
90
- def _normalize_pxt_col_name(pd_name: str) -> str:
122
+ def __normalize_pxt_col_name(pd_name: str) -> str:
91
123
  """
92
124
  Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
93
125
  - replacing any non-ascii or non-alphanumeric characters with an underscore _
@@ -102,26 +134,30 @@ def _normalize_pxt_col_name(pd_name: str) -> str:
102
134
  return id
103
135
 
104
136
 
105
- def _np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series) -> pxt.ColumnType:
137
+ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
106
138
  """
107
139
  Infers a Pixeltable type based on a Numpy dtype.
108
140
  """
109
141
  if np.issubdtype(np_dtype, np.integer):
110
- return pxt.IntType()
142
+ return pxt.IntType(nullable=nullable)
111
143
  if np.issubdtype(np_dtype, np.floating):
112
- return pxt.FloatType()
144
+ return pxt.FloatType(nullable=nullable)
113
145
  if np.issubdtype(np_dtype, np.bool_):
114
- return pxt.BoolType()
146
+ return pxt.BoolType(nullable=nullable)
115
147
  if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
116
148
  has_nan = any(isinstance(val, float) and np.isnan(val) for val in data_col)
117
- return pxt.StringType(nullable=has_nan)
149
+ if has_nan and not nullable:
150
+ raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
151
+ return pxt.StringType(nullable=nullable)
118
152
  if np.issubdtype(np_dtype, np.datetime64):
119
153
  has_nat = any(pd.isnull(val) for val in data_col)
120
- return pxt.TimestampType(nullable=has_nat)
154
+ if has_nat and not nullable:
155
+ raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
156
+ return pxt.TimestampType(nullable=nullable)
121
157
  raise excs.Error(f'Unsupported dtype: {np_dtype}')
122
158
 
123
159
 
124
- def _df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
160
+ def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
125
161
  rows = {}
126
162
  for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
127
163
  if pxt_type.is_float_type():
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 18
13
+ VERSION = 19
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -0,0 +1,39 @@
1
+ from typing import Any, Optional
2
+ import sqlalchemy as sql
3
+
4
+ from pixeltable.metadata import register_converter
5
+ from pixeltable.metadata.converters.util import convert_table_md
6
+
7
+
8
+ @register_converter(version=18)
9
+ def _(engine: sql.engine.Engine) -> None:
10
+ convert_table_md(
11
+ engine,
12
+ substitution_fn=__substitute_md
13
+ )
14
+
15
+
16
+ def __substitute_md(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
17
+ # Migrate a few changed function names
18
+ if k == 'path' and v == 'pixeltable.functions.string.str_format':
19
+ return 'path', 'pixeltable.functions.string.format'
20
+ if k == 'path' and v.startswith('pixeltable.functions.pil.image'):
21
+ return 'path', v.replace('pixeltable.functions.pil.image', 'pixeltable.functions.image')
22
+ # Migrate deprecated `ImageMemberAccess` expressions to `FunctionCall`s
23
+ if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ImageMemberAccess':
24
+ member_name = v['member_name']
25
+ new_v = {
26
+ 'fn': {
27
+ 'path': f'pixeltable.functions.image.{member_name}',
28
+ '_classpath': 'pixeltable.func.callable_function.CallableFunction',
29
+ },
30
+ 'args': [[0, None]],
31
+ 'kwargs': {},
32
+ '_classname': 'FunctionCall',
33
+ 'components': v['components'],
34
+ 'group_by_stop_idx': 0,
35
+ 'group_by_start_idx': 0,
36
+ 'order_by_start_idx': 1,
37
+ }
38
+ return k, new_v
39
+ return None
@@ -0,0 +1,10 @@
1
+ # Descriptive notes for each new metadata version. These are stored in a Python dict
2
+ # rather than as a comment, so that the existence of a description can be enforced by
3
+ # the unit tests when new versions are added.
4
+ VERSION_NOTES = {
5
+ 19: 'UDF renames; ImageMemberAccess removal',
6
+ 18: 'Restructured index metadata',
7
+ 17: 'Renamed remotes to external_stores',
8
+ 16: 'Query functions; deferred Expr deserialization',
9
+ 15: 'Remotes in table metadata',
10
+ }