pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202) hide show
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -4,19 +4,22 @@ import logging
4
4
  import os
5
5
  from dataclasses import dataclass
6
6
  from pathlib import Path
7
- from typing import Any, Iterator, Literal, Optional, cast
7
+ from typing import Any, Iterator, Literal
8
8
  from xml.etree import ElementTree as ET
9
9
 
10
- import label_studio_sdk # type: ignore[import-untyped]
10
+ import label_studio_sdk
11
11
  import PIL.Image
12
12
  from requests.exceptions import HTTPError
13
13
 
14
14
  import pixeltable.type_system as ts
15
15
  from pixeltable import Column, Table, env, exceptions as excs
16
+ from pixeltable.catalog import ColumnHandle
17
+ from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
16
18
  from pixeltable.config import Config
17
19
  from pixeltable.exprs import ColumnRef, DataRow, Expr
18
- from pixeltable.io.external_store import Project, SyncStatus
20
+ from pixeltable.io.external_store import Project
19
21
  from pixeltable.utils import coco
22
+ from pixeltable.utils.local_store import TempStore
20
23
 
21
24
  # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
22
25
  # the import two different ways to insure intercompatibility
@@ -25,7 +28,7 @@ try:
25
28
  import label_studio_sdk.project as ls_project # type: ignore
26
29
  except ImportError:
27
30
  # label_studio_sdk>=1 compatibility
28
- import label_studio_sdk._legacy.project as ls_project # type: ignore
31
+ import label_studio_sdk._legacy.project as ls_project
29
32
 
30
33
  _logger = logging.getLogger('pixeltable')
31
34
 
@@ -43,23 +46,26 @@ class LabelStudioProject(Project):
43
46
  """
44
47
  An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
45
48
  for synchronizing between a Pixeltable table and a Label Studio project.
49
+
50
+ The constructor will NOT create a new Label Studio project; it is also used when loading
51
+ metadata for existing projects.
46
52
  """
47
53
 
54
+ project_id: int # Label Studio project ID
55
+ media_import_method: Literal['post', 'file', 'url']
56
+ _project: ls_project.Project | None
57
+
48
58
  def __init__(
49
59
  self,
50
60
  name: str,
51
61
  project_id: int,
52
62
  media_import_method: Literal['post', 'file', 'url'],
53
- col_mapping: dict[Column, str],
54
- stored_proxies: Optional[dict[Column, Column]] = None,
63
+ col_mapping: dict[ColumnHandle, str],
64
+ stored_proxies: dict[ColumnHandle, ColumnHandle] | None = None,
55
65
  ):
56
- """
57
- The constructor will NOT create a new Label Studio project; it is also used when loading
58
- metadata for existing projects.
59
- """
60
66
  self.project_id = project_id
61
67
  self.media_import_method = media_import_method
62
- self._project: Optional[ls_project.Project] = None
68
+ self._project = None
63
69
  super().__init__(name, col_mapping, stored_proxies)
64
70
 
65
71
  @property
@@ -105,20 +111,20 @@ class LabelStudioProject(Project):
105
111
  """
106
112
  return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
107
113
 
108
- def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
114
+ def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
109
115
  _logger.info(
110
116
  f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
111
117
  f' (export: {export_data}, import: {import_data}).'
112
118
  )
113
119
  # Collect all existing tasks into a dict with entries `rowid: task`
114
120
  tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
115
- sync_status = SyncStatus.empty()
121
+ sync_status = UpdateStatus()
116
122
  if export_data:
117
123
  export_sync_status = self.__update_tasks(t, tasks)
118
- sync_status = sync_status.combine(export_sync_status)
124
+ sync_status += export_sync_status
119
125
  if import_data:
120
126
  import_sync_status = self.__update_table_from_tasks(t, tasks)
121
- sync_status = sync_status.combine(import_sync_status)
127
+ sync_status += import_sync_status
122
128
  return sync_status
123
129
 
124
130
  def __fetch_all_tasks(self) -> Iterator[dict[str, Any]]:
@@ -142,7 +148,7 @@ class LabelStudioProject(Project):
142
148
  f'Label Studio project {self.project_title!r}.'
143
149
  )
144
150
 
145
- def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
151
+ def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
146
152
  """
147
153
  Updates all tasks in this Label Studio project based on the Pixeltable data:
148
154
  - Creates new tasks for rows that don't map to any existing task;
@@ -155,7 +161,7 @@ class LabelStudioProject(Project):
155
161
  t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
156
162
 
157
163
  if len(t_data_cols) == 0:
158
- return SyncStatus.empty()
164
+ return UpdateStatus()
159
165
 
160
166
  # Columns in `t` that map to `rectanglelabels` preannotations
161
167
  t_rl_cols = [
@@ -183,15 +189,15 @@ class LabelStudioProject(Project):
183
189
  self,
184
190
  t: Table,
185
191
  existing_tasks: dict[tuple, dict],
186
- media_col: Column,
187
- t_rl_cols: list[Column],
192
+ media_col: ColumnHandle,
193
+ t_rl_cols: list[ColumnHandle],
188
194
  rl_info: list['_RectangleLabel'],
189
- ) -> SyncStatus:
190
- is_stored = media_col.is_stored
195
+ ) -> UpdateStatus:
196
+ is_stored = media_col.get().is_stored
191
197
  # If it's a stored column, we can use `localpath`
192
- localpath_col_opt = [t[media_col.name].localpath] if is_stored else []
198
+ localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
193
199
  # Select the media column, rectanglelabels columns, and localpath (if appropriate)
194
- rows = t.select(t[media_col.name], *[t[col.name] for col in t_rl_cols], *localpath_col_opt)
200
+ rows = t.select(t[media_col.get().name], *[t[col.get().name] for col in t_rl_cols], *localpath_col_opt)
195
201
  tasks_created = 0
196
202
  row_ids_in_pxt: set[tuple] = set()
197
203
 
@@ -209,7 +215,7 @@ class LabelStudioProject(Project):
209
215
  else:
210
216
  # No localpath; create a temp file and upload it
211
217
  assert isinstance(row[media_col_idx], PIL.Image.Image)
212
- file = env.Env.get().create_tmp_path(extension='.png')
218
+ file = TempStore.create_path(extension='.png')
213
219
  row[media_col_idx].save(file, format='png')
214
220
  task_id = self.project.import_tasks(file)[0]
215
221
  os.remove(file)
@@ -232,48 +238,48 @@ class LabelStudioProject(Project):
232
238
 
233
239
  env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
234
240
 
235
- sync_status = SyncStatus(external_rows_created=tasks_created)
241
+ sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
236
242
 
237
243
  deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
238
-
239
- return sync_status.combine(deletion_sync_status)
244
+ sync_status += deletion_sync_status
245
+ return sync_status
240
246
 
241
247
  def __update_tasks_by_files(
242
248
  self,
243
249
  t: Table,
244
250
  existing_tasks: dict[tuple, dict],
245
- t_data_cols: list[Column],
246
- t_rl_cols: list[Column],
251
+ t_data_cols: list[ColumnHandle],
252
+ t_rl_cols: list[ColumnHandle],
247
253
  rl_info: list['_RectangleLabel'],
248
- ) -> SyncStatus:
254
+ ) -> UpdateStatus:
249
255
  ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
250
256
  expr_refs: dict[str, Expr] = {} # kwargs for the select statement
251
257
  for col in t_data_cols:
252
- col_name = col.name
258
+ col_name = col.get().name
253
259
  if self.media_import_method == 'url':
254
260
  expr_refs[col_name] = t[col_name].fileurl
255
261
  else:
256
262
  assert self.media_import_method == 'file'
257
- if not col.col_type.is_media_type():
263
+ if not col.get().col_type.is_media_type():
258
264
  # Not a media column; query the data directly
259
- expr_refs[col_name] = cast(ColumnRef, t[col_name])
265
+ expr_refs[col_name] = t[col_name]
260
266
  elif col in self.stored_proxies:
261
267
  # Media column that has a stored proxy; use it. We have to give it a name,
262
268
  # since it's an anonymous column
263
- stored_proxy_col = self.stored_proxies[col]
269
+ stored_proxy_col = self.stored_proxies[col].get()
264
270
  expr_refs[f'{col_name}_proxy'] = ColumnRef(stored_proxy_col).localpath
265
271
  else:
266
272
  # Media column without a stored proxy; this means it's a stored computed column,
267
273
  # and we can just use the localpath
268
274
  expr_refs[col_name] = t[col_name].localpath
269
275
 
270
- df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
276
+ df = t.select(*[t[col.get().name] for col in t_rl_cols], **expr_refs)
271
277
  # The following buffers will hold `DataRow` indices that correspond to each of the selected
272
278
  # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
273
279
  # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
274
280
  # We have to wait until we begin iterating to populate them, so they're initially `None`.
275
- rl_col_idxs: Optional[list[int]] = None
276
- data_col_idxs: Optional[list[int]] = None
281
+ rl_col_idxs: list[int] | None = None
282
+ data_col_idxs: list[int] | None = None
277
283
 
278
284
  row_ids_in_pxt: set[tuple] = set()
279
285
  tasks_created = 0
@@ -286,11 +292,11 @@ class LabelStudioProject(Project):
286
292
  data_vals = [row[idx] for idx in data_col_idxs]
287
293
  coco_annotations = [row[idx] for idx in rl_col_idxs]
288
294
  for i in range(len(t_data_cols)):
289
- if t_data_cols[i].col_type.is_media_type():
295
+ if t_data_cols[i].get().col_type.is_media_type():
290
296
  # Special handling for media columns
291
297
  assert isinstance(data_vals[i], str)
292
298
  if self.media_import_method == 'url':
293
- data_vals[i] = self.__validate_fileurl(t_data_cols[i], data_vals[i])
299
+ data_vals[i] = self.__validate_fileurl(t_data_cols[i].get(), data_vals[i])
294
300
  else:
295
301
  assert self.media_import_method == 'file'
296
302
  data_vals[i] = self.__localpath_to_lspath(data_vals[i])
@@ -336,14 +342,14 @@ class LabelStudioProject(Project):
336
342
  f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
337
343
  )
338
344
 
339
- sync_status = SyncStatus(external_rows_created=tasks_created, external_rows_updated=tasks_updated)
345
+ sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
340
346
 
341
347
  deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
342
-
343
- return sync_status.combine(deletion_sync_status)
348
+ sync_status += deletion_sync_status
349
+ return sync_status
344
350
 
345
351
  @classmethod
346
- def __validate_fileurl(cls, col: Column, url: str) -> Optional[str]:
352
+ def __validate_fileurl(cls, col: Column, url: str) -> str | None:
347
353
  # Check that the URL is one that will be visible to Label Studio. If it isn't, log an info message
348
354
  # to help users debug the issue.
349
355
  if not (url.startswith('http://') or url.startswith('https://')):
@@ -361,7 +367,7 @@ class LabelStudioProject(Project):
361
367
 
362
368
  def __delete_stale_tasks(
363
369
  self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
364
- ) -> SyncStatus:
370
+ ) -> UpdateStatus:
365
371
  deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
366
372
  # Sanity check the math
367
373
  assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
@@ -377,11 +383,11 @@ class LabelStudioProject(Project):
377
383
  for rowid in deleted_rowids:
378
384
  del existing_tasks[rowid]
379
385
 
380
- return SyncStatus(external_rows_deleted=len(deleted_rowids))
386
+ return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
381
387
 
382
- def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
388
+ def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
383
389
  if ANNOTATIONS_COLUMN not in self.col_mapping.values():
384
- return SyncStatus.empty()
390
+ return UpdateStatus()
385
391
 
386
392
  annotations = {
387
393
  # Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
@@ -391,7 +397,7 @@ class LabelStudioProject(Project):
391
397
  for task in tasks.values()
392
398
  }
393
399
 
394
- local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN)
400
+ local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN).get()
395
401
 
396
402
  # Prune the annotations down to just the ones that have actually changed.
397
403
  rows = t.select(t[local_annotations_col.name])
@@ -412,23 +418,21 @@ class LabelStudioProject(Project):
412
418
  # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
413
419
  ancestor = t
414
420
  while local_annotations_col not in ancestor._tbl_version.get().cols:
415
- assert ancestor._base_table is not None
416
- ancestor = ancestor._base_table
421
+ assert ancestor._get_base_table is not None
422
+ ancestor = ancestor._get_base_table()
417
423
  update_status = ancestor.batch_update(updates)
418
424
  env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
419
- return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
425
+ return update_status
420
426
  else:
421
- return SyncStatus.empty()
427
+ return UpdateStatus()
422
428
 
423
429
  def as_dict(self) -> dict[str, Any]:
424
430
  return {
425
431
  'name': self.name,
426
432
  'project_id': self.project_id,
427
433
  'media_import_method': self.media_import_method,
428
- 'col_mapping': [[self._column_as_dict(k), v] for k, v in self.col_mapping.items()],
429
- 'stored_proxies': [
430
- [self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
431
- ],
434
+ 'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
435
+ 'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
432
436
  }
433
437
 
434
438
  @classmethod
@@ -437,8 +441,8 @@ class LabelStudioProject(Project):
437
441
  md['name'],
438
442
  md['project_id'],
439
443
  md['media_import_method'],
440
- {cls._column_from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
441
- {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']},
444
+ {ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
445
+ {ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
442
446
  )
443
447
 
444
448
  def __repr__(self) -> str:
@@ -493,7 +497,7 @@ class LabelStudioProject(Project):
493
497
 
494
498
  @classmethod
495
499
  def __coco_to_predictions(
496
- cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id: Optional[int] = None
500
+ cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id: int | None = None
497
501
  ) -> dict[str, Any]:
498
502
  width = coco_annotations['image']['width']
499
503
  height = coco_annotations['image']['height']
@@ -545,11 +549,11 @@ class LabelStudioProject(Project):
545
549
  cls,
546
550
  t: Table,
547
551
  label_config: str,
548
- name: Optional[str],
549
- title: Optional[str],
552
+ name: str | None,
553
+ title: str | None,
550
554
  media_import_method: Literal['post', 'file', 'url'],
551
- col_mapping: Optional[dict[str, str]],
552
- s3_configuration: Optional[dict[str, Any]],
555
+ col_mapping: dict[str, str] | None,
556
+ s3_configuration: dict[str, Any] | None,
553
557
  **kwargs: Any,
554
558
  ) -> 'LabelStudioProject':
555
559
  """
@@ -560,7 +564,7 @@ class LabelStudioProject(Project):
560
564
 
561
565
  if name is None:
562
566
  # Create a default name that's unique to the table
563
- all_stores = t.external_stores
567
+ all_stores = t.external_stores()
564
568
  n = 0
565
569
  while f'ls_project_{n}' in all_stores:
566
570
  n += 1
@@ -576,7 +580,7 @@ class LabelStudioProject(Project):
576
580
  local_annotations_column = ANNOTATIONS_COLUMN
577
581
  else:
578
582
  local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
579
- if local_annotations_column not in t._schema:
583
+ if local_annotations_column not in t._get_schema():
580
584
  t.add_columns({local_annotations_column: ts.Json})
581
585
 
582
586
  resolved_col_mapping = cls.validate_columns(
@@ -648,7 +652,7 @@ class LabelStudioProject(Project):
648
652
 
649
653
  @dataclass(frozen=True)
650
654
  class _DataKey:
651
- name: Optional[str] # The 'name' attribute of the data key; may differ from the field name
655
+ name: str | None # The 'name' attribute of the data key; may differ from the field name
652
656
  column_type: ts.ColumnType
653
657
 
654
658
 
@@ -0,0 +1,3 @@
1
+ from pixeltable.utils.lancedb import export_lancedb
2
+
3
+ __all__ = ['export_lancedb']
pixeltable/io/pandas.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import os
2
- from typing import Any, Optional, Union
2
+ from typing import Any
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
@@ -16,8 +16,8 @@ def import_pandas(
16
16
  tbl_name: str,
17
17
  df: pd.DataFrame,
18
18
  *,
19
- schema_overrides: Optional[dict[str, Any]] = None,
20
- primary_key: Optional[Union[str, list[str]]] = None,
19
+ schema_overrides: dict[str, Any] | None = None,
20
+ primary_key: str | list[str] | None = None,
21
21
  num_retained_versions: int = 10,
22
22
  comment: str = '',
23
23
  ) -> pxt.Table:
@@ -55,9 +55,9 @@ def import_pandas(
55
55
 
56
56
  def import_csv(
57
57
  tbl_name: str,
58
- filepath_or_buffer: Union[str, os.PathLike],
59
- schema_overrides: Optional[dict[str, Any]] = None,
60
- primary_key: Optional[Union[str, list[str]]] = None,
58
+ filepath_or_buffer: str | os.PathLike,
59
+ schema_overrides: dict[str, Any] | None = None,
60
+ primary_key: str | list[str] | None = None,
61
61
  num_retained_versions: int = 10,
62
62
  comment: str = '',
63
63
  **kwargs: Any,
@@ -84,10 +84,10 @@ def import_csv(
84
84
 
85
85
  def import_excel(
86
86
  tbl_name: str,
87
- io: Union[str, os.PathLike],
87
+ io: str | os.PathLike,
88
88
  *,
89
- schema_overrides: Optional[dict[str, Any]] = None,
90
- primary_key: Optional[Union[str, list[str]]] = None,
89
+ schema_overrides: dict[str, Any] | None = None,
90
+ primary_key: str | list[str] | None = None,
91
91
  num_retained_versions: int = 10,
92
92
  comment: str = '',
93
93
  **kwargs: Any,
@@ -132,6 +132,7 @@ def df_infer_schema(
132
132
  pd_schema: dict[str, ts.ColumnType] = {}
133
133
  for pd_name, pd_dtype in zip(df.columns, df.dtypes):
134
134
  if pd_name in schema_overrides:
135
+ assert isinstance(schema_overrides[pd_name], ts.ColumnType)
135
136
  pxt_type = schema_overrides[pd_name]
136
137
  else:
137
138
  pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
@@ -140,7 +141,7 @@ def df_infer_schema(
140
141
  return pd_schema
141
142
 
142
143
 
143
- def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.ColumnType]:
144
+ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> ts.ColumnType | None:
144
145
  """
145
146
  Determines a pixeltable ColumnType from a pandas dtype
146
147
 
@@ -191,7 +192,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
191
192
 
192
193
 
193
194
  def _df_row_to_pxt_row(
194
- row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: Optional[dict[str, str]]
195
+ row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: dict[str, str] | None
195
196
  ) -> dict[str, Any]:
196
197
  """Convert a row to insertable format"""
197
198
  pxt_row: dict[str, Any] = {}
pixeltable/io/parquet.py CHANGED
@@ -1,16 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
- import datetime
4
- import io
5
3
  import json
6
4
  import logging
7
5
  import typing
8
- from collections import deque
9
6
  from pathlib import Path
10
- from typing import Any, Optional, Union
11
-
12
- import numpy as np
13
- import PIL.Image
7
+ from typing import Any
14
8
 
15
9
  import pixeltable as pxt
16
10
  import pixeltable.exceptions as excs
@@ -18,31 +12,13 @@ from pixeltable.catalog import Catalog
18
12
  from pixeltable.utils.transactional_directory import transactional_directory
19
13
 
20
14
  if typing.TYPE_CHECKING:
21
- import pyarrow as pa
22
-
23
15
  import pixeltable as pxt
24
16
 
25
17
  _logger = logging.getLogger('pixeltable')
26
18
 
27
19
 
28
- def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
29
- import pyarrow as pa
30
- from pyarrow import parquet
31
-
32
- pydict = {}
33
- for field in schema:
34
- if isinstance(field.type, pa.FixedShapeTensorType):
35
- stacked_arr = np.stack(value_batch[field.name])
36
- pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
37
- else:
38
- pydict[field.name] = value_batch[field.name]
39
-
40
- tab = pa.Table.from_pydict(pydict, schema=schema)
41
- parquet.write_table(tab, str(output_path))
42
-
43
-
44
20
  def export_parquet(
45
- table_or_df: Union[pxt.Table, pxt.DataFrame],
21
+ table_or_df: pxt.Table | pxt.DataFrame,
46
22
  parquet_path: Path,
47
23
  partition_size_bytes: int = 100_000_000,
48
24
  inline_images: bool = False,
@@ -63,7 +39,9 @@ def export_parquet(
63
39
  If False, will raise an error if the Dataframe has any image column.
64
40
  Default False.
65
41
  """
66
- from pixeltable.utils.arrow import to_arrow_schema
42
+ import pyarrow as pa
43
+
44
+ from pixeltable.utils.arrow import to_record_batches
67
45
 
68
46
  df: pxt.DataFrame
69
47
  if isinstance(table_or_df, pxt.catalog.Table):
@@ -71,9 +49,6 @@ def export_parquet(
71
49
  else:
72
50
  df = table_or_df
73
51
 
74
- type_dict = {k: v.as_dict() for k, v in df.schema.items()}
75
- arrow_schema = to_arrow_schema(df.schema)
76
-
77
52
  if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
78
53
  raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
79
54
 
@@ -81,78 +56,23 @@ def export_parquet(
81
56
  with transactional_directory(parquet_path) as temp_path:
82
57
  # dump metadata json file so we can inspect what was the source of the parquet file later on.
83
58
  json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
59
+ type_dict = {k: v.as_dict() for k, v in df.schema.items()}
84
60
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
85
-
86
61
  batch_num = 0
87
- current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
88
- current_byte_estimate = 0
89
-
90
62
  with Catalog.get().begin_xact(for_write=False):
91
- for data_row in df._exec():
92
- for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
93
- val = data_row[e.slot_idx]
94
- if val is None:
95
- current_value_batch[col_name].append(val)
96
- continue
97
-
98
- assert val is not None
99
- if col_type.is_image_type():
100
- # images get inlined into the parquet file
101
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
102
- # if there is a file, read directly to preserve information
103
- with open(data_row.file_paths[e.slot_idx], 'rb') as f:
104
- val = f.read()
105
- elif isinstance(val, PIL.Image.Image):
106
- # if no file available, eg. bc it is computed, convert to png
107
- buf = io.BytesIO()
108
- val.save(buf, format='PNG')
109
- val = buf.getvalue()
110
- else:
111
- raise excs.Error(f'unknown image type {type(val)}')
112
- length = len(val)
113
- elif col_type.is_string_type():
114
- length = len(val)
115
- elif col_type.is_video_type() or col_type.is_audio_type():
116
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
117
- val = data_row.file_paths[e.slot_idx]
118
- else:
119
- raise excs.Error(f'unknown audio/video type {type(val)}')
120
- length = len(val)
121
- elif col_type.is_json_type():
122
- val = json.dumps(val)
123
- length = len(val)
124
- elif col_type.is_array_type():
125
- length = val.nbytes
126
- elif col_type.is_int_type() or col_type.is_float_type():
127
- length = 8
128
- elif col_type.is_bool_type():
129
- length = 1
130
- elif col_type.is_date_type():
131
- length = 4
132
- elif col_type.is_timestamp_type():
133
- val = val.astimezone(datetime.timezone.utc)
134
- length = 8
135
- else:
136
- raise excs.Error(f'unknown type {col_type} for {col_name}')
137
-
138
- current_value_batch[col_name].append(val)
139
- current_byte_estimate += length
140
- if current_byte_estimate > partition_size_bytes:
141
- assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
142
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
143
- batch_num += 1
144
- current_value_batch = {k: deque() for k in df.schema}
145
- current_byte_estimate = 0
146
-
147
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
63
+ for record_batch in to_record_batches(df, partition_size_bytes):
64
+ output_path = temp_path / f'part-{batch_num:05d}.parquet'
65
+ arrow_tbl = pa.Table.from_batches([record_batch])
66
+ pa.parquet.write_table(arrow_tbl, str(output_path))
67
+ batch_num += 1
148
68
 
149
69
 
150
70
  def import_parquet(
151
71
  table: str,
152
72
  *,
153
73
  parquet_path: str,
154
- schema_overrides: Optional[dict[str, Any]] = None,
155
- primary_key: Optional[Union[str, list[str]]] = None,
74
+ schema_overrides: dict[str, Any] | None = None,
75
+ primary_key: str | list[str] | None = None,
156
76
  **kwargs: Any,
157
77
  ) -> pxt.Table:
158
78
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.