pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import typing
4
- from typing import Any, Optional, Union
4
+ from typing import Any
5
5
 
6
6
  import pixeltable as pxt
7
7
  import pixeltable.type_system as ts
@@ -31,12 +31,13 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
31
31
  'timestamp[s]': ts.TimestampType(nullable=True),
32
32
  'timestamp[ms]': ts.TimestampType(nullable=True), # HF dataset iterator converts timestamps to datetime.datetime
33
33
  'timestamp[us]': ts.TimestampType(nullable=True),
34
+ 'timestamp[ns]': ts.TimestampType(nullable=True),
34
35
  'date32': ts.DateType(nullable=True),
35
36
  'date64': ts.DateType(nullable=True),
36
37
  }
37
38
 
38
39
 
39
- def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.ColumnType]:
40
+ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> ts.ColumnType | None:
40
41
  """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
41
42
  import datasets
42
43
 
@@ -47,28 +48,51 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
47
48
  # example: Value(dtype='int64', id=None)
48
49
  pt = _hf_to_pxt.get(feature_type.dtype, None)
49
50
  return pt.copy(nullable=nullable) if pt is not None else None
50
- elif isinstance(feature_type, datasets.Sequence):
51
+ elif isinstance(feature_type, (datasets.Sequence, datasets.LargeList)):
51
52
  # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
52
53
  dtype = _to_pixeltable_type(feature_type.feature, nullable)
53
- length = feature_type.length if feature_type.length != -1 else None
54
- return ts.ArrayType(shape=(length,), dtype=dtype)
54
+ if dtype is None:
55
+ return None
56
+ if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
57
+ length = feature_type.length if feature_type.length != -1 else None
58
+ return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
59
+ else:
60
+ # Sequence of dicts must be cast as Json
61
+ return ts.JsonType(nullable=nullable)
55
62
  elif isinstance(feature_type, datasets.Image):
56
63
  return ts.ImageType(nullable=nullable)
64
+ elif isinstance(feature_type, datasets.Audio):
65
+ return ts.AudioType(nullable=nullable)
66
+ elif isinstance(feature_type, datasets.Video):
67
+ return ts.VideoType(nullable=nullable)
68
+ elif isinstance(feature_type, (datasets.Array2D, datasets.Array3D, datasets.Array4D, datasets.Array5D)):
69
+ # Multi-dimensional arrays with fixed shape and dtype
70
+ inner_dtype = _hf_to_pxt.get(feature_type.dtype, None)
71
+ if inner_dtype is None:
72
+ return None
73
+ return ts.ArrayType(shape=feature_type.shape, dtype=inner_dtype, nullable=nullable)
74
+ elif isinstance(feature_type, (datasets.Translation, datasets.TranslationVariableLanguages)):
75
+ # Translation types are dict-like structures
76
+ return ts.JsonType(nullable=nullable)
77
+ elif isinstance(feature_type, (list, dict)):
78
+ return ts.JsonType(nullable=nullable)
57
79
  else:
58
80
  return None
59
81
 
60
82
 
61
- def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
83
+ def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets.Features:
62
84
  """Get the schema of a huggingface dataset as a dictionary."""
63
85
  import datasets
64
86
 
65
- first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
87
+ first_dataset = (
88
+ dataset if isinstance(dataset, (datasets.Dataset, datasets.IterableDataset)) else next(iter(dataset.values()))
89
+ )
66
90
  return first_dataset.features
67
91
 
68
92
 
69
93
  def huggingface_schema_to_pxt_schema(
70
94
  hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
71
- ) -> dict[str, Optional[ts.ColumnType]]:
95
+ ) -> dict[str, ts.ColumnType | None]:
72
96
  """Generate a pixeltable schema from a huggingface dataset schema.
73
97
  Columns without a known mapping are mapped to None
74
98
  """
@@ -83,20 +107,41 @@ def huggingface_schema_to_pxt_schema(
83
107
 
84
108
  def import_huggingface_dataset(
85
109
  table_path: str,
86
- dataset: Union[datasets.Dataset, datasets.DatasetDict],
110
+ dataset: datasets.Dataset | datasets.DatasetDict | datasets.IterableDataset | datasets.IterableDatasetDict,
87
111
  *,
88
- schema_overrides: Optional[dict[str, Any]] = None,
89
- primary_key: Optional[Union[str, list[str]]] = None,
112
+ schema_overrides: dict[str, Any] | None = None,
113
+ primary_key: str | list[str] | None = None,
90
114
  **kwargs: Any,
91
115
  ) -> pxt.Table:
92
- """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
93
- Requires `datasets` library to be installed.
116
+ """
117
+ Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
118
+ Requires `datasets` library to be installed.
119
+
120
+ HuggingFace feature types are mapped to Pixeltable column types as follows:
121
+
122
+ - `Value(bool)`: `Bool`<br/>
123
+ `Value(int*/uint*)`: `Int`<br/>
124
+ `Value(float*)`: `Float`<br/>
125
+ `Value(string/large_string)`: `String`<br/>
126
+ `Value(timestamp*)`: `Timestamp`<br/>
127
+ `Value(date*)`: `Date`
128
+ - `ClassLabel`: `String` (converted to label names)
129
+ - `Sequence`/`LargeList` of numeric types: `Array`
130
+ - `Sequence`/`LargeList` of string: `Json`
131
+ - `Sequence`/`LargeList` of dicts: `Json`
132
+ - `Array2D`-`Array5D`: `Array` (preserves shape)
133
+ - `Image`: `Image`
134
+ - `Audio`: `Audio`
135
+ - `Video`: `Video`
136
+ - `Translation`/`TranslationVariableLanguages`: `Json`
94
137
 
95
138
  Args:
96
139
  table_path: Path to the table.
97
- dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
98
- or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
99
- to insert into the table.
140
+ dataset: An instance of any of the Huggingface dataset classes:
141
+ [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset),
142
+ [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict),
143
+ [`datasets.IterableDataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.IterableDataset),
144
+ [`datasets.IterableDatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.IterableDatasetDict)
100
145
  schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
101
146
  name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
102
147
  The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
@@ -4,19 +4,22 @@ import logging
4
4
  import os
5
5
  from dataclasses import dataclass
6
6
  from pathlib import Path
7
- from typing import Any, Iterator, Literal, Optional, cast
7
+ from typing import Any, Iterator, Literal
8
8
  from xml.etree import ElementTree as ET
9
9
 
10
- import label_studio_sdk # type: ignore[import-untyped]
10
+ import label_studio_sdk
11
11
  import PIL.Image
12
12
  from requests.exceptions import HTTPError
13
13
 
14
14
  import pixeltable.type_system as ts
15
15
  from pixeltable import Column, Table, env, exceptions as excs
16
+ from pixeltable.catalog import ColumnHandle
17
+ from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
16
18
  from pixeltable.config import Config
17
19
  from pixeltable.exprs import ColumnRef, DataRow, Expr
18
- from pixeltable.io.external_store import Project, SyncStatus
20
+ from pixeltable.io.external_store import Project
19
21
  from pixeltable.utils import coco
22
+ from pixeltable.utils.local_store import TempStore
20
23
 
21
24
  # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
22
25
  # the import two different ways to insure intercompatibility
@@ -25,7 +28,7 @@ try:
25
28
  import label_studio_sdk.project as ls_project # type: ignore
26
29
  except ImportError:
27
30
  # label_studio_sdk>=1 compatibility
28
- import label_studio_sdk._legacy.project as ls_project # type: ignore
31
+ import label_studio_sdk._legacy.project as ls_project
29
32
 
30
33
  _logger = logging.getLogger('pixeltable')
31
34
 
@@ -43,23 +46,26 @@ class LabelStudioProject(Project):
43
46
  """
44
47
  An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
45
48
  for synchronizing between a Pixeltable table and a Label Studio project.
49
+
50
+ The constructor will NOT create a new Label Studio project; it is also used when loading
51
+ metadata for existing projects.
46
52
  """
47
53
 
54
+ project_id: int # Label Studio project ID
55
+ media_import_method: Literal['post', 'file', 'url']
56
+ _project: ls_project.Project | None
57
+
48
58
  def __init__(
49
59
  self,
50
60
  name: str,
51
61
  project_id: int,
52
62
  media_import_method: Literal['post', 'file', 'url'],
53
- col_mapping: dict[Column, str],
54
- stored_proxies: Optional[dict[Column, Column]] = None,
63
+ col_mapping: dict[ColumnHandle, str],
64
+ stored_proxies: dict[ColumnHandle, ColumnHandle] | None = None,
55
65
  ):
56
- """
57
- The constructor will NOT create a new Label Studio project; it is also used when loading
58
- metadata for existing projects.
59
- """
60
66
  self.project_id = project_id
61
67
  self.media_import_method = media_import_method
62
- self._project: Optional[ls_project.Project] = None
68
+ self._project = None
63
69
  super().__init__(name, col_mapping, stored_proxies)
64
70
 
65
71
  @property
@@ -105,20 +111,20 @@ class LabelStudioProject(Project):
105
111
  """
106
112
  return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
107
113
 
108
- def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
114
+ def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
109
115
  _logger.info(
110
116
  f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
111
117
  f' (export: {export_data}, import: {import_data}).'
112
118
  )
113
119
  # Collect all existing tasks into a dict with entries `rowid: task`
114
120
  tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
115
- sync_status = SyncStatus.empty()
121
+ sync_status = UpdateStatus()
116
122
  if export_data:
117
123
  export_sync_status = self.__update_tasks(t, tasks)
118
- sync_status = sync_status.combine(export_sync_status)
124
+ sync_status += export_sync_status
119
125
  if import_data:
120
126
  import_sync_status = self.__update_table_from_tasks(t, tasks)
121
- sync_status = sync_status.combine(import_sync_status)
127
+ sync_status += import_sync_status
122
128
  return sync_status
123
129
 
124
130
  def __fetch_all_tasks(self) -> Iterator[dict[str, Any]]:
@@ -142,7 +148,7 @@ class LabelStudioProject(Project):
142
148
  f'Label Studio project {self.project_title!r}.'
143
149
  )
144
150
 
145
- def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
151
+ def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
146
152
  """
147
153
  Updates all tasks in this Label Studio project based on the Pixeltable data:
148
154
  - Creates new tasks for rows that don't map to any existing task;
@@ -155,7 +161,7 @@ class LabelStudioProject(Project):
155
161
  t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
156
162
 
157
163
  if len(t_data_cols) == 0:
158
- return SyncStatus.empty()
164
+ return UpdateStatus()
159
165
 
160
166
  # Columns in `t` that map to `rectanglelabels` preannotations
161
167
  t_rl_cols = [
@@ -183,15 +189,15 @@ class LabelStudioProject(Project):
183
189
  self,
184
190
  t: Table,
185
191
  existing_tasks: dict[tuple, dict],
186
- media_col: Column,
187
- t_rl_cols: list[Column],
192
+ media_col: ColumnHandle,
193
+ t_rl_cols: list[ColumnHandle],
188
194
  rl_info: list['_RectangleLabel'],
189
- ) -> SyncStatus:
190
- is_stored = media_col.is_stored
195
+ ) -> UpdateStatus:
196
+ is_stored = media_col.get().is_stored
191
197
  # If it's a stored column, we can use `localpath`
192
- localpath_col_opt = [t[media_col.name].localpath] if is_stored else []
198
+ localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
193
199
  # Select the media column, rectanglelabels columns, and localpath (if appropriate)
194
- rows = t.select(t[media_col.name], *[t[col.name] for col in t_rl_cols], *localpath_col_opt)
200
+ rows = t.select(t[media_col.get().name], *[t[col.get().name] for col in t_rl_cols], *localpath_col_opt)
195
201
  tasks_created = 0
196
202
  row_ids_in_pxt: set[tuple] = set()
197
203
 
@@ -209,7 +215,7 @@ class LabelStudioProject(Project):
209
215
  else:
210
216
  # No localpath; create a temp file and upload it
211
217
  assert isinstance(row[media_col_idx], PIL.Image.Image)
212
- file = env.Env.get().create_tmp_path(extension='.png')
218
+ file = TempStore.create_path(extension='.png')
213
219
  row[media_col_idx].save(file, format='png')
214
220
  task_id = self.project.import_tasks(file)[0]
215
221
  os.remove(file)
@@ -232,48 +238,48 @@ class LabelStudioProject(Project):
232
238
 
233
239
  env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
234
240
 
235
- sync_status = SyncStatus(external_rows_created=tasks_created)
241
+ sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
236
242
 
237
243
  deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
238
-
239
- return sync_status.combine(deletion_sync_status)
244
+ sync_status += deletion_sync_status
245
+ return sync_status
240
246
 
241
247
  def __update_tasks_by_files(
242
248
  self,
243
249
  t: Table,
244
250
  existing_tasks: dict[tuple, dict],
245
- t_data_cols: list[Column],
246
- t_rl_cols: list[Column],
251
+ t_data_cols: list[ColumnHandle],
252
+ t_rl_cols: list[ColumnHandle],
247
253
  rl_info: list['_RectangleLabel'],
248
- ) -> SyncStatus:
254
+ ) -> UpdateStatus:
249
255
  ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
250
256
  expr_refs: dict[str, Expr] = {} # kwargs for the select statement
251
257
  for col in t_data_cols:
252
- col_name = col.name
258
+ col_name = col.get().name
253
259
  if self.media_import_method == 'url':
254
260
  expr_refs[col_name] = t[col_name].fileurl
255
261
  else:
256
262
  assert self.media_import_method == 'file'
257
- if not col.col_type.is_media_type():
263
+ if not col.get().col_type.is_media_type():
258
264
  # Not a media column; query the data directly
259
- expr_refs[col_name] = cast(ColumnRef, t[col_name])
265
+ expr_refs[col_name] = t[col_name]
260
266
  elif col in self.stored_proxies:
261
267
  # Media column that has a stored proxy; use it. We have to give it a name,
262
268
  # since it's an anonymous column
263
- stored_proxy_col = self.stored_proxies[col]
269
+ stored_proxy_col = self.stored_proxies[col].get()
264
270
  expr_refs[f'{col_name}_proxy'] = ColumnRef(stored_proxy_col).localpath
265
271
  else:
266
272
  # Media column without a stored proxy; this means it's a stored computed column,
267
273
  # and we can just use the localpath
268
274
  expr_refs[col_name] = t[col_name].localpath
269
275
 
270
- df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
276
+ query = t.select(*[t[col.get().name] for col in t_rl_cols], **expr_refs)
271
277
  # The following buffers will hold `DataRow` indices that correspond to each of the selected
272
278
  # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
273
279
  # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
274
280
  # We have to wait until we begin iterating to populate them, so they're initially `None`.
275
- rl_col_idxs: Optional[list[int]] = None
276
- data_col_idxs: Optional[list[int]] = None
281
+ rl_col_idxs: list[int] | None = None
282
+ data_col_idxs: list[int] | None = None
277
283
 
278
284
  row_ids_in_pxt: set[tuple] = set()
279
285
  tasks_created = 0
@@ -286,11 +292,11 @@ class LabelStudioProject(Project):
286
292
  data_vals = [row[idx] for idx in data_col_idxs]
287
293
  coco_annotations = [row[idx] for idx in rl_col_idxs]
288
294
  for i in range(len(t_data_cols)):
289
- if t_data_cols[i].col_type.is_media_type():
295
+ if t_data_cols[i].get().col_type.is_media_type():
290
296
  # Special handling for media columns
291
297
  assert isinstance(data_vals[i], str)
292
298
  if self.media_import_method == 'url':
293
- data_vals[i] = self.__validate_fileurl(t_data_cols[i], data_vals[i])
299
+ data_vals[i] = self.__validate_fileurl(t_data_cols[i].get(), data_vals[i])
294
300
  else:
295
301
  assert self.media_import_method == 'file'
296
302
  data_vals[i] = self.__localpath_to_lspath(data_vals[i])
@@ -304,10 +310,10 @@ class LabelStudioProject(Project):
304
310
  'predictions': predictions,
305
311
  }
306
312
 
307
- for row in df._exec():
313
+ for row in query._exec():
308
314
  if rl_col_idxs is None:
309
- rl_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[: len(t_rl_cols)]]
310
- data_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[len(t_rl_cols) :]]
315
+ rl_col_idxs = [expr.slot_idx for expr in query._select_list_exprs[: len(t_rl_cols)]]
316
+ data_col_idxs = [expr.slot_idx for expr in query._select_list_exprs[len(t_rl_cols) :]]
311
317
  row_ids_in_pxt.add(row.rowid)
312
318
  task_info = create_task_info(row)
313
319
  # TODO(aaron-siegel): Implement more efficient update logic (currently involves a full table scan)
@@ -336,14 +342,14 @@ class LabelStudioProject(Project):
336
342
  f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
337
343
  )
338
344
 
339
- sync_status = SyncStatus(external_rows_created=tasks_created, external_rows_updated=tasks_updated)
345
+ sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
340
346
 
341
347
  deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
342
-
343
- return sync_status.combine(deletion_sync_status)
348
+ sync_status += deletion_sync_status
349
+ return sync_status
344
350
 
345
351
  @classmethod
346
- def __validate_fileurl(cls, col: Column, url: str) -> Optional[str]:
352
+ def __validate_fileurl(cls, col: Column, url: str) -> str | None:
347
353
  # Check that the URL is one that will be visible to Label Studio. If it isn't, log an info message
348
354
  # to help users debug the issue.
349
355
  if not (url.startswith('http://') or url.startswith('https://')):
@@ -361,7 +367,7 @@ class LabelStudioProject(Project):
361
367
 
362
368
  def __delete_stale_tasks(
363
369
  self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
364
- ) -> SyncStatus:
370
+ ) -> UpdateStatus:
365
371
  deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
366
372
  # Sanity check the math
367
373
  assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
@@ -377,11 +383,11 @@ class LabelStudioProject(Project):
377
383
  for rowid in deleted_rowids:
378
384
  del existing_tasks[rowid]
379
385
 
380
- return SyncStatus(external_rows_deleted=len(deleted_rowids))
386
+ return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
381
387
 
382
- def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
388
+ def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
383
389
  if ANNOTATIONS_COLUMN not in self.col_mapping.values():
384
- return SyncStatus.empty()
390
+ return UpdateStatus()
385
391
 
386
392
  annotations = {
387
393
  # Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
@@ -391,7 +397,7 @@ class LabelStudioProject(Project):
391
397
  for task in tasks.values()
392
398
  }
393
399
 
394
- local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN)
400
+ local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN).get()
395
401
 
396
402
  # Prune the annotations down to just the ones that have actually changed.
397
403
  rows = t.select(t[local_annotations_col.name])
@@ -412,23 +418,21 @@ class LabelStudioProject(Project):
412
418
  # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
413
419
  ancestor = t
414
420
  while local_annotations_col not in ancestor._tbl_version.get().cols:
415
- assert ancestor._base_table is not None
416
- ancestor = ancestor._base_table
421
+ assert ancestor._get_base_table is not None
422
+ ancestor = ancestor._get_base_table()
417
423
  update_status = ancestor.batch_update(updates)
418
424
  env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
419
- return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
425
+ return update_status
420
426
  else:
421
- return SyncStatus.empty()
427
+ return UpdateStatus()
422
428
 
423
429
  def as_dict(self) -> dict[str, Any]:
424
430
  return {
425
431
  'name': self.name,
426
432
  'project_id': self.project_id,
427
433
  'media_import_method': self.media_import_method,
428
- 'col_mapping': [[self._column_as_dict(k), v] for k, v in self.col_mapping.items()],
429
- 'stored_proxies': [
430
- [self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
431
- ],
434
+ 'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
435
+ 'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
432
436
  }
433
437
 
434
438
  @classmethod
@@ -437,8 +441,8 @@ class LabelStudioProject(Project):
437
441
  md['name'],
438
442
  md['project_id'],
439
443
  md['media_import_method'],
440
- {cls._column_from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
441
- {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']},
444
+ {ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
445
+ {ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
442
446
  )
443
447
 
444
448
  def __repr__(self) -> str:
@@ -493,7 +497,7 @@ class LabelStudioProject(Project):
493
497
 
494
498
  @classmethod
495
499
  def __coco_to_predictions(
496
- cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id: Optional[int] = None
500
+ cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id: int | None = None
497
501
  ) -> dict[str, Any]:
498
502
  width = coco_annotations['image']['width']
499
503
  height = coco_annotations['image']['height']
@@ -545,11 +549,11 @@ class LabelStudioProject(Project):
545
549
  cls,
546
550
  t: Table,
547
551
  label_config: str,
548
- name: Optional[str],
549
- title: Optional[str],
552
+ name: str | None,
553
+ title: str | None,
550
554
  media_import_method: Literal['post', 'file', 'url'],
551
- col_mapping: Optional[dict[str, str]],
552
- s3_configuration: Optional[dict[str, Any]],
555
+ col_mapping: dict[str, str] | None,
556
+ s3_configuration: dict[str, Any] | None,
553
557
  **kwargs: Any,
554
558
  ) -> 'LabelStudioProject':
555
559
  """
@@ -560,7 +564,7 @@ class LabelStudioProject(Project):
560
564
 
561
565
  if name is None:
562
566
  # Create a default name that's unique to the table
563
- all_stores = t.external_stores
567
+ all_stores = t.external_stores()
564
568
  n = 0
565
569
  while f'ls_project_{n}' in all_stores:
566
570
  n += 1
@@ -576,8 +580,8 @@ class LabelStudioProject(Project):
576
580
  local_annotations_column = ANNOTATIONS_COLUMN
577
581
  else:
578
582
  local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
579
- if local_annotations_column not in t._schema:
580
- t.add_columns({local_annotations_column: ts.JsonType(nullable=True)})
583
+ if local_annotations_column not in t._get_schema():
584
+ t.add_columns({local_annotations_column: ts.Json})
581
585
 
582
586
  resolved_col_mapping = cls.validate_columns(
583
587
  t, config.export_columns, {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}, col_mapping
@@ -648,7 +652,7 @@ class LabelStudioProject(Project):
648
652
 
649
653
  @dataclass(frozen=True)
650
654
  class _DataKey:
651
- name: Optional[str] # The 'name' attribute of the data key; may differ from the field name
655
+ name: str | None # The 'name' attribute of the data key; may differ from the field name
652
656
  column_type: ts.ColumnType
653
657
 
654
658
 
@@ -0,0 +1,3 @@
1
+ from pixeltable.utils.lancedb import export_lancedb
2
+
3
+ __all__ = ['export_lancedb']
pixeltable/io/pandas.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import os
2
- from typing import Any, Optional, Union
2
+ import uuid
3
+ from typing import Any
3
4
 
4
5
  import numpy as np
5
6
  import pandas as pd
@@ -16,8 +17,8 @@ def import_pandas(
16
17
  tbl_name: str,
17
18
  df: pd.DataFrame,
18
19
  *,
19
- schema_overrides: Optional[dict[str, Any]] = None,
20
- primary_key: Optional[Union[str, list[str]]] = None,
20
+ schema_overrides: dict[str, Any] | None = None,
21
+ primary_key: str | list[str] | None = None,
21
22
  num_retained_versions: int = 10,
22
23
  comment: str = '',
23
24
  ) -> pxt.Table:
@@ -55,9 +56,9 @@ def import_pandas(
55
56
 
56
57
  def import_csv(
57
58
  tbl_name: str,
58
- filepath_or_buffer: Union[str, os.PathLike],
59
- schema_overrides: Optional[dict[str, Any]] = None,
60
- primary_key: Optional[Union[str, list[str]]] = None,
59
+ filepath_or_buffer: str | os.PathLike,
60
+ schema_overrides: dict[str, Any] | None = None,
61
+ primary_key: str | list[str] | None = None,
61
62
  num_retained_versions: int = 10,
62
63
  comment: str = '',
63
64
  **kwargs: Any,
@@ -84,10 +85,10 @@ def import_csv(
84
85
 
85
86
  def import_excel(
86
87
  tbl_name: str,
87
- io: Union[str, os.PathLike],
88
+ io: str | os.PathLike,
88
89
  *,
89
- schema_overrides: Optional[dict[str, Any]] = None,
90
- primary_key: Optional[Union[str, list[str]]] = None,
90
+ schema_overrides: dict[str, Any] | None = None,
91
+ primary_key: str | list[str] | None = None,
91
92
  num_retained_versions: int = 10,
92
93
  comment: str = '',
93
94
  **kwargs: Any,
@@ -132,6 +133,7 @@ def df_infer_schema(
132
133
  pd_schema: dict[str, ts.ColumnType] = {}
133
134
  for pd_name, pd_dtype in zip(df.columns, df.dtypes):
134
135
  if pd_name in schema_overrides:
136
+ assert isinstance(schema_overrides[pd_name], ts.ColumnType)
135
137
  pxt_type = schema_overrides[pd_name]
136
138
  else:
137
139
  pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
@@ -140,7 +142,7 @@ def df_infer_schema(
140
142
  return pd_schema
141
143
 
142
144
 
143
- def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.ColumnType]:
145
+ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> ts.ColumnType | None:
144
146
  """
145
147
  Determines a pixeltable ColumnType from a pandas dtype
146
148
 
@@ -159,7 +161,7 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.Co
159
161
  return None
160
162
  # Most other pandas dtypes are directly NumPy compatible
161
163
  assert isinstance(pd_dtype, np.dtype)
162
- return ts.ArrayType.from_np_dtype(pd_dtype, nullable)
164
+ return ts.ColumnType.from_np_dtype(pd_dtype, nullable)
163
165
 
164
166
 
165
167
  def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
@@ -191,7 +193,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
191
193
 
192
194
 
193
195
  def _df_row_to_pxt_row(
194
- row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: Optional[dict[str, str]]
196
+ row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: dict[str, str] | None
195
197
  ) -> dict[str, Any]:
196
198
  """Convert a row to insertable format"""
197
199
  pxt_row: dict[str, Any] = {}
@@ -229,6 +231,13 @@ def _df_row_to_pxt_row(
229
231
  nval = pd.Timestamp(tval).tz_localize(tz=Env.get().default_time_zone)
230
232
  else:
231
233
  nval = tval.astimezone(Env.get().default_time_zone)
234
+ elif pxt_type.is_uuid_type():
235
+ if pd.isnull(val):
236
+ nval = None
237
+ elif isinstance(val, uuid.UUID):
238
+ nval = val
239
+ else:
240
+ nval = uuid.UUID(val)
232
241
  else:
233
242
  nval = val
234
243
  pxt_row[pxt_name] = nval