pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/io/globals.py CHANGED
@@ -1,31 +1,33 @@
1
- from typing import TYPE_CHECKING, Any, Literal, Optional, Union
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Literal
2
4
 
3
5
  import pixeltable as pxt
4
6
  import pixeltable.exceptions as excs
5
7
  from pixeltable import Table, exprs
8
+ from pixeltable.catalog.update_status import UpdateStatus
6
9
  from pixeltable.env import Env
7
- from pixeltable.io.external_store import SyncStatus
8
10
 
9
11
  if TYPE_CHECKING:
10
12
  import fiftyone as fo # type: ignore[import-untyped]
11
13
 
12
14
 
13
15
  def create_label_studio_project(
14
- t: Table,
15
- label_config: str,
16
- name: Optional[str] = None,
17
- title: Optional[str] = None,
18
- media_import_method: Literal['post', 'file', 'url'] = 'post',
19
- col_mapping: Optional[dict[str, str]] = None,
20
- sync_immediately: bool = True,
21
- s3_configuration: Optional[dict[str, Any]] = None,
22
- **kwargs: Any
23
- ) -> SyncStatus:
16
+ t: Table,
17
+ label_config: str,
18
+ name: str | None = None,
19
+ title: str | None = None,
20
+ media_import_method: Literal['post', 'file', 'url'] = 'post',
21
+ col_mapping: dict[str, str] | None = None,
22
+ sync_immediately: bool = True,
23
+ s3_configuration: dict[str, Any] | None = None,
24
+ **kwargs: Any,
25
+ ) -> UpdateStatus:
24
26
  """
25
27
  Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
26
28
 
27
29
  - A tutorial notebook with fully worked examples can be found here:
28
- [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
30
+ [Using Label Studio for Annotations with Pixeltable](https://docs.pixeltable.com/notebooks/integrations/using-label-studio-with-pixeltable)
29
31
 
30
32
  The required parameter `label_config` specifies the Label Studio project configuration,
31
33
  in XML format, as described in the Label Studio documentation. The linked project will
@@ -86,53 +88,48 @@ def create_label_studio_project(
86
88
  parameters of the Label Studio `connect_s3_import_storage` method, as described in the
87
89
  [Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
88
90
  `bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
89
- Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`). If a title is not
90
- specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`. All other parameters use their Label
91
- Studio defaults.
91
+ Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`).
92
+ If a title is not specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`.
93
+ All other parameters use their Label Studio defaults.
92
94
  kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
93
95
  Studio SDK, as described in the
94
96
  [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
95
97
 
96
98
  Returns:
97
- A `SyncStatus` representing the status of any synchronization operations that occurred.
99
+ An `UpdateStatus` representing the status of any synchronization operations that occurred.
98
100
 
99
101
  Examples:
100
- Create a Label Studio project whose tasks correspond to videos stored in the `video_col` column of the table `tbl`:
102
+ Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
103
+ column of the table `tbl`:
101
104
 
102
105
  >>> config = \"\"\"
103
- <View>
104
- <Video name="video_obj" value="$video_col"/>
105
- <Choices name="video-category" toName="video" showInLine="true">
106
- <Choice value="city"/>
107
- <Choice value="food"/>
108
- <Choice value="sports"/>
109
- </Choices>
110
- </View>\"\"\"
111
- create_label_studio_project(tbl, config)
106
+ ... <View>
107
+ ... <Video name="video_obj" value="$video_col"/>
108
+ ... <Choices name="video-category" toName="video" showInLine="true">
109
+ ... <Choice value="city"/>
110
+ ... <Choice value="food"/>
111
+ ... <Choice value="sports"/>
112
+ ... </Choices>
113
+ ... </View>
114
+ ... \"\"\"
115
+ >>> create_label_studio_project(tbl, config)
112
116
 
113
117
  Create a Label Studio project with the same configuration, using `media_import_method='url'`,
114
118
  whose media are stored in an S3 bucket:
115
119
 
116
120
  >>> create_label_studio_project(
117
- tbl,
118
- config,
119
- media_import_method='url',
120
- s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
121
- )
121
+ ... tbl,
122
+ ... config,
123
+ ... media_import_method='url',
124
+ ... s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
125
+ ... )
122
126
  """
123
127
  Env.get().require_package('label_studio_sdk')
124
128
 
125
129
  from pixeltable.io.label_studio import LabelStudioProject
126
130
 
127
131
  ls_project = LabelStudioProject.create(
128
- t,
129
- label_config,
130
- name,
131
- title,
132
- media_import_method,
133
- col_mapping,
134
- s3_configuration,
135
- **kwargs
132
+ t, label_config, name, title, media_import_method, col_mapping, s3_configuration, **kwargs
136
133
  )
137
134
 
138
135
  # Link the project to `t`, and sync if appropriate.
@@ -140,159 +137,27 @@ def create_label_studio_project(
140
137
  if sync_immediately:
141
138
  return t.sync()
142
139
  else:
143
- return SyncStatus.empty()
144
-
145
-
146
- def import_rows(
147
- tbl_path: str,
148
- rows: list[dict[str, Any]],
149
- *,
150
- schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
151
- primary_key: Optional[Union[str, list[str]]] = None,
152
- num_retained_versions: int = 10,
153
- comment: str = ''
154
- ) -> Table:
155
- """
156
- Creates a new base table from a list of dictionaries. The dictionaries must be of the
157
- form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
158
- supplied data, using the most specific type that can represent all the values in a column.
159
-
160
- If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
161
- Pixeltable will force the specified column to the specified type (and will not attempt any type inference
162
- for that column).
163
-
164
- All column types of the new table will be nullable unless explicitly specified as non-nullable in
165
- `schema_overrides`.
166
-
167
- Args:
168
- tbl_path: The qualified name of the table to create.
169
- rows: The list of dictionaries to import.
170
- schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
171
- as described above.
172
- primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
173
- num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
174
- comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
175
-
176
- Returns:
177
- A handle to the newly created [`Table`][pixeltable.Table].
178
- """
179
- if schema_overrides is None:
180
- schema_overrides = {}
181
- schema: dict[str, pxt.ColumnType] = {}
182
- cols_with_nones: set[str] = set()
183
-
184
- for n, row in enumerate(rows):
185
- for col_name, value in row.items():
186
- if col_name in schema_overrides:
187
- # We do the insertion here; this will ensure that the column order matches the order
188
- # in which the column names are encountered in the input data, even if `schema_overrides`
189
- # is specified.
190
- if col_name not in schema:
191
- schema[col_name] = schema_overrides[col_name]
192
- elif value is not None:
193
- # If `key` is not in `schema_overrides`, then we infer its type from the data.
194
- # The column type will always be nullable by default.
195
- col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
196
- if col_type is None:
197
- raise excs.Error(f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}')
198
- if col_name not in schema:
199
- schema[col_name] = col_type
200
- else:
201
- supertype = schema[col_name].supertype(col_type)
202
- if supertype is None:
203
- raise excs.Error(
204
- f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
205
- 'Consider specifying the type explicitly in `schema_overrides`.'
206
- )
207
- schema[col_name] = supertype
208
- else:
209
- cols_with_nones.add(col_name)
210
-
211
- extraneous_keys = schema_overrides.keys() - schema.keys()
212
- if len(extraneous_keys) > 0:
213
- raise excs.Error(f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}')
214
-
215
- entirely_none_cols = cols_with_nones - schema.keys()
216
- if len(entirely_none_cols) > 0:
217
- # A column can only end up in `entirely_null_cols` if it was not in `schema_overrides` and
218
- # was not encountered in any row with a non-None value.
219
- raise excs.Error(
220
- f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
221
- 'Consider specifying the type(s) explicitly in `schema_overrides`.'
222
- )
223
-
224
- t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
225
- t.insert(rows)
226
- return t
227
-
228
-
229
- def import_json(
230
- tbl_path: str,
231
- filepath_or_url: str,
232
- *,
233
- schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
234
- primary_key: Optional[Union[str, list[str]]] = None,
235
- num_retained_versions: int = 10,
236
- comment: str = '',
237
- **kwargs: Any
238
- ) -> Table:
239
- """
240
- Creates a new base table from a JSON file. This is a convenience method and is
241
- equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
242
- is the contents of the specified `filepath_or_url`.
243
-
244
- Args:
245
- tbl_path: The name of the table to create.
246
- filepath_or_url: The path or URL of the JSON file.
247
- schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
248
- (see [`import_rows()`][pixeltable.io.import_rows]).
249
- primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
250
- num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
251
- comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
252
- kwargs: Additional keyword arguments to pass to `json.loads`.
253
-
254
- Returns:
255
- A handle to the newly created [`Table`][pixeltable.Table].
256
- """
257
- import json
258
- import urllib.parse
259
- import urllib.request
260
-
261
- # TODO Consolidate this logic with other places where files/URLs are parsed
262
- parsed = urllib.parse.urlparse(filepath_or_url)
263
- if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
264
- # local file path
265
- if len(parsed.scheme) <= 1:
266
- filepath = filepath_or_url
267
- else:
268
- filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
269
- with open(filepath) as fp:
270
- contents = fp.read()
271
- else:
272
- # URL
273
- contents = urllib.request.urlopen(filepath_or_url).read()
274
- data = json.loads(contents, **kwargs)
275
- return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
140
+ return UpdateStatus()
276
141
 
277
142
 
278
143
  def export_images_as_fo_dataset(
279
144
  tbl: pxt.Table,
280
145
  images: exprs.Expr,
281
146
  image_format: str = 'webp',
282
- classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
283
- detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
147
+ classifications: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
148
+ detections: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
284
149
  ) -> 'fo.Dataset':
285
150
  """
286
151
  Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
287
152
  (or expression) containing image data, along with optional additional columns containing labels. Currently, only
288
153
  classification and detection labels are supported.
289
154
 
290
- The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
155
+ The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial contains a
291
156
  fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.
292
157
 
293
158
  Images in the dataset that already exist on disk will be exported directly, in whatever format they
294
159
  are stored in. Images that are not already on disk (such as frames extracted using a
295
- [`FrameIterator`][pixeltable.iterators.FrameIterator]) will first be written to disk in the specified
160
+ [`frame_iterator`][pixeltable.functions.video.frame_iterator]) will first be written to disk in the specified
296
161
  `image_format`.
297
162
 
298
163
  The label parameters accept one or more sets of labels of each type. If a single `Expr` is provided, then it will
@@ -340,13 +205,13 @@ def export_images_as_fo_dataset(
340
205
  Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
341
206
  labels from `tbl.classifications`:
342
207
 
343
- >>> export_as_fiftyone(
208
+ >>> export_images_as_fo_dataset(
344
209
  ... tbl,
345
210
  ... tbl.image,
346
211
  ... classifications=tbl.classifications
347
212
  ... )
348
213
 
349
- See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
214
+ See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/examples/vision/voxel51) tutorial
350
215
  for a fully worked example.
351
216
  """
352
217
  Env.get().require_package('fiftyone')
@@ -358,6 +223,6 @@ def export_images_as_fo_dataset(
358
223
  if not images.col_type.is_image_type():
359
224
  raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')
360
225
 
361
- return fo.Dataset.from_importer(PxtImageDatasetImporter(
362
- tbl, images, image_format, classifications=classifications, detections=detections
363
- ))
226
+ return fo.Dataset.from_importer(
227
+ PxtImageDatasetImporter(tbl, images, image_format, classifications=classifications, detections=detections)
228
+ )
@@ -1,190 +1,159 @@
1
1
  from __future__ import annotations
2
2
 
3
- import logging
4
- import math
5
- import random
6
3
  import typing
7
- from typing import Union, Optional, Any
4
+ from typing import Any
8
5
 
9
6
  import pixeltable as pxt
10
7
  import pixeltable.type_system as ts
11
- from pixeltable import exceptions as excs
12
8
 
13
9
  if typing.TYPE_CHECKING:
14
10
  import datasets # type: ignore[import-untyped]
15
11
 
16
- _logger = logging.getLogger(__name__)
17
12
 
18
- # use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
19
- # The primary goal is to bound memory use, regardless of dataset size.
20
- # Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
21
- _K_BATCH_SIZE_BYTES = 100_000_000
22
-
23
- # note, there are many more types. we allow overrides in the schema_override parameter
13
+ # note, there are many more types. we allow overrides in the schema_overrides parameter
24
14
  # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
25
15
  # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
26
16
  _hf_to_pxt: dict[str, ts.ColumnType] = {
27
- 'int32': ts.IntType(nullable=True), # pixeltable widens to big int
28
- 'int64': ts.IntType(nullable=True),
29
17
  'bool': ts.BoolType(nullable=True),
18
+ 'int8': ts.IntType(nullable=True),
19
+ 'int16': ts.IntType(nullable=True),
20
+ 'int32': ts.IntType(nullable=True),
21
+ 'int64': ts.IntType(nullable=True),
22
+ 'uint8': ts.IntType(nullable=True),
23
+ 'uint16': ts.IntType(nullable=True),
24
+ 'uint32': ts.IntType(nullable=True),
25
+ 'uint64': ts.IntType(nullable=True),
26
+ 'float16': ts.FloatType(nullable=True),
30
27
  'float32': ts.FloatType(nullable=True),
28
+ 'float64': ts.FloatType(nullable=True),
31
29
  'string': ts.StringType(nullable=True),
30
+ 'large_string': ts.StringType(nullable=True),
32
31
  'timestamp[s]': ts.TimestampType(nullable=True),
33
32
  'timestamp[ms]': ts.TimestampType(nullable=True), # HF dataset iterator converts timestamps to datetime.datetime
33
+ 'timestamp[us]': ts.TimestampType(nullable=True),
34
+ 'timestamp[ns]': ts.TimestampType(nullable=True),
35
+ 'date32': ts.DateType(nullable=True),
36
+ 'date64': ts.DateType(nullable=True),
34
37
  }
35
38
 
36
39
 
37
- def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
40
+ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> ts.ColumnType | None:
38
41
  """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
39
42
  import datasets
40
43
 
41
44
  if isinstance(feature_type, datasets.ClassLabel):
42
45
  # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
43
- return ts.StringType(nullable=True)
46
+ return ts.StringType(nullable=nullable)
44
47
  elif isinstance(feature_type, datasets.Value):
45
48
  # example: Value(dtype='int64', id=None)
46
- return _hf_to_pxt.get(feature_type.dtype, None)
47
- elif isinstance(feature_type, datasets.Sequence):
49
+ pt = _hf_to_pxt.get(feature_type.dtype, None)
50
+ return pt.copy(nullable=nullable) if pt is not None else None
51
+ elif isinstance(feature_type, (datasets.Sequence, datasets.LargeList)):
48
52
  # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
49
- dtype = _to_pixeltable_type(feature_type.feature)
50
- length = feature_type.length if feature_type.length != -1 else None
51
- return ts.ArrayType(shape=(length,), dtype=dtype)
53
+ dtype = _to_pixeltable_type(feature_type.feature, nullable)
54
+ if dtype is None:
55
+ return None
56
+ if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
57
+ length = feature_type.length if feature_type.length != -1 else None
58
+ return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
59
+ else:
60
+ # Sequence of dicts must be cast as Json
61
+ return ts.JsonType(nullable=nullable)
52
62
  elif isinstance(feature_type, datasets.Image):
53
- return ts.ImageType(nullable=True)
63
+ return ts.ImageType(nullable=nullable)
64
+ elif isinstance(feature_type, datasets.Audio):
65
+ return ts.AudioType(nullable=nullable)
66
+ elif isinstance(feature_type, datasets.Video):
67
+ return ts.VideoType(nullable=nullable)
68
+ elif isinstance(feature_type, (datasets.Array2D, datasets.Array3D, datasets.Array4D, datasets.Array5D)):
69
+ # Multi-dimensional arrays with fixed shape and dtype
70
+ inner_dtype = _hf_to_pxt.get(feature_type.dtype, None)
71
+ if inner_dtype is None:
72
+ return None
73
+ return ts.ArrayType(shape=feature_type.shape, dtype=inner_dtype, nullable=nullable)
74
+ elif isinstance(feature_type, (datasets.Translation, datasets.TranslationVariableLanguages)):
75
+ # Translation types are dict-like structures
76
+ return ts.JsonType(nullable=nullable)
77
+ elif isinstance(feature_type, (list, dict)):
78
+ return ts.JsonType(nullable=nullable)
54
79
  else:
55
80
  return None
56
81
 
57
82
 
58
- def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
83
+ def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets.Features:
59
84
  """Get the schema of a huggingface dataset as a dictionary."""
60
85
  import datasets
61
86
 
62
- first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
87
+ first_dataset = (
88
+ dataset if isinstance(dataset, (datasets.Dataset, datasets.IterableDataset)) else next(iter(dataset.values()))
89
+ )
63
90
  return first_dataset.features
64
91
 
65
92
 
66
- def huggingface_schema_to_pixeltable_schema(
67
- hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
68
- ) -> dict[str, Optional[ts.ColumnType]]:
93
+ def huggingface_schema_to_pxt_schema(
94
+ hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
95
+ ) -> dict[str, ts.ColumnType | None]:
69
96
  """Generate a pixeltable schema from a huggingface dataset schema.
70
97
  Columns without a known mapping are mapped to None
71
98
  """
72
- hf_schema = _get_hf_schema(hf_dataset)
73
99
  pixeltable_schema = {
74
- column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
100
+ column_name: _to_pixeltable_type(feature_type, column_name not in primary_key)
101
+ if column_name not in schema_overrides
102
+ else schema_overrides[column_name]
103
+ for column_name, feature_type in hf_schema.items()
75
104
  }
76
105
  return pixeltable_schema
77
106
 
78
107
 
79
108
  def import_huggingface_dataset(
80
109
  table_path: str,
81
- dataset: Union[datasets.Dataset, datasets.DatasetDict],
110
+ dataset: datasets.Dataset | datasets.DatasetDict | datasets.IterableDataset | datasets.IterableDatasetDict,
82
111
  *,
83
- column_name_for_split: Optional[str] = None,
84
- schema_overrides: Optional[dict[str, Any]] = None,
112
+ schema_overrides: dict[str, Any] | None = None,
113
+ primary_key: str | list[str] | None = None,
85
114
  **kwargs: Any,
86
115
  ) -> pxt.Table:
87
- """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
88
- Requires `datasets` library to be installed.
116
+ """
117
+ Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
118
+ Requires `datasets` library to be installed.
119
+
120
+ HuggingFace feature types are mapped to Pixeltable column types as follows:
121
+
122
+ - `Value(bool)`: `Bool`<br/>
123
+ `Value(int*/uint*)`: `Int`<br/>
124
+ `Value(float*)`: `Float`<br/>
125
+ `Value(string/large_string)`: `String`<br/>
126
+ `Value(timestamp*)`: `Timestamp`<br/>
127
+ `Value(date*)`: `Date`
128
+ - `ClassLabel`: `String` (converted to label names)
129
+ - `Sequence`/`LargeList` of numeric types: `Array`
130
+ - `Sequence`/`LargeList` of string: `Json`
131
+ - `Sequence`/`LargeList` of dicts: `Json`
132
+ - `Array2D`-`Array5D`: `Array` (preserves shape)
133
+ - `Image`: `Image`
134
+ - `Audio`: `Audio`
135
+ - `Video`: `Video`
136
+ - `Translation`/`TranslationVariableLanguages`: `Json`
89
137
 
90
138
  Args:
91
139
  table_path: Path to the table.
92
- dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
93
- or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
94
- to insert into the table.
95
- column_name_for_split: column name to use for split information. If None, no split information will be stored.
140
+ dataset: An instance of any of the Huggingface dataset classes:
141
+ [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset),
142
+ [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict),
143
+ [`datasets.IterableDataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.IterableDataset),
144
+ [`datasets.IterableDatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.IterableDatasetDict)
96
145
  schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
97
- name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
98
- `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
99
- Pixeltable identifiers).
146
+ name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
147
+ The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
148
+ they are valid Pixeltable identifiers).
149
+ primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
100
150
  kwargs: Additional arguments to pass to `create_table`.
151
+ An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
152
+ This column name will contain the split information. If None, no split information will be stored.
101
153
 
102
154
  Returns:
103
155
  A handle to the newly created [`Table`][pixeltable.Table].
104
156
  """
105
- import datasets
106
- import pixeltable as pxt
107
-
108
- if table_path in pxt.list_tables():
109
- raise excs.Error(f'table {table_path} already exists')
110
-
111
- if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
112
- raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
113
-
114
- if isinstance(dataset, datasets.Dataset):
115
- # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
116
- raw_name = dataset.split._name
117
- split_name = raw_name.split('[')[0] if raw_name is not None else None
118
- dataset_dict = {split_name: dataset}
119
- else:
120
- dataset_dict = dataset
121
-
122
- pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
123
- if schema_overrides is not None:
124
- pixeltable_schema.update(schema_overrides)
125
-
126
- if column_name_for_split is not None:
127
- if column_name_for_split in pixeltable_schema:
128
- raise excs.Error(
129
- f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
130
- )
131
- pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
132
-
133
- for field, column_type in pixeltable_schema.items():
134
- if column_type is None:
135
- raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
136
-
137
- if isinstance(dataset, datasets.Dataset):
138
- # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
139
- raw_name = dataset.split._name
140
- split_name = raw_name.split('[')[0] if raw_name is not None else None
141
- dataset_dict = {split_name: dataset}
142
- elif isinstance(dataset, datasets.DatasetDict):
143
- dataset_dict = dataset
144
- else:
145
- raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
146
-
147
- # extract all class labels from the dataset to translate category ints to strings
148
- hf_schema = _get_hf_schema(dataset)
149
- categorical_features = {
150
- feature_name: feature_type.names
151
- for (feature_name, feature_type) in hf_schema.items()
152
- if isinstance(feature_type, datasets.ClassLabel)
153
- }
154
-
155
- try:
156
- # random tmp name
157
- tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
158
- tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
159
-
160
- def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
161
- output_row = row.copy()
162
- # map all class labels to strings
163
- for field, values in categorical_features.items():
164
- output_row[field] = values[row[field]]
165
- # add split name to row
166
- if column_name_for_split is not None:
167
- output_row[column_name_for_split] = split_name
168
- return output_row
169
-
170
- for split_name, split_dataset in dataset_dict.items():
171
- num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
172
- tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
173
- assert tuples_per_batch > 0
174
-
175
- batch = []
176
- for row in split_dataset:
177
- batch.append(_translate_row(row, split_name))
178
- if len(batch) >= tuples_per_batch:
179
- tab.insert(batch)
180
- batch = []
181
- # last batch
182
- if len(batch) > 0:
183
- tab.insert(batch)
184
-
185
- except Exception as e:
186
- _logger.error(f'Error while inserting dataset into table: {tmp_name}')
187
- raise e
188
-
189
- pxt.move(tmp_name, table_path)
190
- return pxt.get_table(table_path)
157
+ return pxt.create_table(
158
+ table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
159
+ )