pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -8,9 +8,11 @@ import urllib.parse
8
8
  import urllib.request
9
9
  from dataclasses import dataclass, field, fields
10
10
  from pathlib import Path
11
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union, cast
11
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, cast
12
12
 
13
+ import numpy as np
13
14
  import pandas as pd
15
+ import PIL
14
16
  from pyarrow.parquet import ParquetDataset
15
17
 
16
18
  import pixeltable as pxt
@@ -47,16 +49,16 @@ class TableDataConduitFormat(str, enum.Enum):
47
49
 
48
50
  @dataclass
49
51
  class TableDataConduit:
50
- source: TableDataSource
51
- source_format: Optional[str] = None
52
- source_column_map: Optional[dict[str, str]] = None
52
+ source: 'TableDataSource'
53
+ source_format: str | None = None
54
+ source_column_map: dict[str, str] | None = None
53
55
  if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
54
- pxt_schema: Optional[dict[str, Any]] = None
55
- src_schema_overrides: Optional[dict[str, Any]] = None
56
- src_schema: Optional[dict[str, Any]] = None
57
- pxt_pk: Optional[list[str]] = None
58
- src_pk: Optional[list[str]] = None
59
- valid_rows: Optional[RowData] = None
56
+ pxt_schema: dict[str, ts.ColumnType] | None = None
57
+ src_schema_overrides: dict[str, ts.ColumnType] | None = None
58
+ src_schema: dict[str, ts.ColumnType] | None = None
59
+ pxt_pk: list[str] | None = None
60
+ src_pk: list[str] | None = None
61
+ valid_rows: RowData | None = None
60
62
  extra_fields: dict[str, Any] = field(default_factory=dict)
61
63
 
62
64
  reqd_col_names: set[str] = field(default_factory=set)
@@ -87,7 +89,7 @@ class TableDataConduit:
87
89
  for name, coltype in self.pxt_schema.items():
88
90
  self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
89
91
 
90
- def infer_schema(self) -> dict[str, Any]:
92
+ def infer_schema(self) -> dict[str, ts.ColumnType]:
91
93
  raise NotImplementedError
92
94
 
93
95
  def valid_row_batch(self) -> Iterator[RowData]:
@@ -101,7 +103,7 @@ class TableDataConduit:
101
103
  def add_table_info(self, table: pxt.Table) -> None:
102
104
  """Add information about the table into which we are inserting data"""
103
105
  assert isinstance(table, pxt.Table)
104
- self.pxt_schema = table._schema
106
+ self.pxt_schema = table._get_schema()
105
107
  self.pxt_pk = table._tbl_version.get().primary_key
106
108
  for col in table._tbl_version_path.columns():
107
109
  if col.is_required_for_insert:
@@ -137,7 +139,7 @@ class DFTableDataConduit(TableDataConduit):
137
139
  t.pxt_df = tds.source
138
140
  return t
139
141
 
140
- def infer_schema(self) -> dict[str, Any]:
142
+ def infer_schema(self) -> dict[str, ts.ColumnType]:
141
143
  self.pxt_schema = self.pxt_df.schema
142
144
  self.pxt_pk = self.src_pk
143
145
  return self.pxt_schema
@@ -149,7 +151,7 @@ class DFTableDataConduit(TableDataConduit):
149
151
 
150
152
 
151
153
  class RowDataTableDataConduit(TableDataConduit):
152
- raw_rows: Optional[RowData] = None
154
+ raw_rows: RowData | None = None
153
155
  disable_mapping: bool = True
154
156
  batch_count: int = 0
155
157
 
@@ -168,7 +170,7 @@ class RowDataTableDataConduit(TableDataConduit):
168
170
  t.batch_count = 0
169
171
  return t
170
172
 
171
- def infer_schema(self) -> dict[str, Any]:
173
+ def infer_schema(self) -> dict[str, ts.ColumnType]:
172
174
  from .datarows import _infer_schema_from_rows
173
175
 
174
176
  if self.source_column_map is None:
@@ -239,7 +241,7 @@ class PandasTableDataConduit(TableDataConduit):
239
241
  t.batch_count = 0
240
242
  return t
241
243
 
242
- def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
244
+ def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
243
245
  """Return inferred schema, inferred primary key, and source column map"""
244
246
  if self.source_column_map is None:
245
247
  if self.src_schema_overrides is None:
@@ -252,7 +254,7 @@ class PandasTableDataConduit(TableDataConduit):
252
254
  else:
253
255
  raise NotImplementedError()
254
256
 
255
- def infer_schema(self) -> dict[str, Any]:
257
+ def infer_schema(self) -> dict[str, ts.ColumnType]:
256
258
  self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
257
259
  self.normalize_pxt_schema_types()
258
260
  _df_check_primary_key_values(self.pd_df, self.src_pk)
@@ -325,10 +327,13 @@ class JsonTableDataConduit(TableDataConduit):
325
327
 
326
328
 
327
329
  class HFTableDataConduit(TableDataConduit):
328
- hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
329
- column_name_for_split: Optional[str] = None
330
+ """
331
+ TODO:
332
+ - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
333
+ """
334
+
335
+ column_name_for_split: str | None = None
330
336
  categorical_features: dict[str, dict[int, str]]
331
- hf_schema: dict[str, Any] = None
332
337
  dataset_dict: dict[str, datasets.Dataset] = None
333
338
  hf_schema_source: dict[str, Any] = None
334
339
 
@@ -340,9 +345,19 @@ class HFTableDataConduit(TableDataConduit):
340
345
  import datasets
341
346
 
342
347
  assert isinstance(tds.source, (datasets.Dataset, datasets.DatasetDict))
343
- t.hf_ds = tds.source
344
348
  if 'column_name_for_split' in t.extra_fields:
345
349
  t.column_name_for_split = t.extra_fields['column_name_for_split']
350
+
351
+ # make sure we get numpy arrays for arrays, not Python lists
352
+ source = tds.source.with_format(type='numpy')
353
+ if isinstance(source, datasets.Dataset):
354
+ # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
355
+ raw_name = source.split._name
356
+ split_name = raw_name.split('[')[0] if raw_name is not None else None
357
+ t.dataset_dict = {split_name: source}
358
+ else:
359
+ assert isinstance(source, datasets.DatasetDict)
360
+ t.dataset_dict = source
346
361
  return t
347
362
 
348
363
  @classmethod
@@ -356,13 +371,13 @@ class HFTableDataConduit(TableDataConduit):
356
371
  except ImportError:
357
372
  return False
358
373
 
359
- def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
374
+ def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
360
375
  from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
361
376
 
362
377
  if self.source_column_map is None:
363
378
  if self.src_schema_overrides is None:
364
379
  self.src_schema_overrides = {}
365
- self.hf_schema_source = _get_hf_schema(self.hf_ds)
380
+ self.hf_schema_source = _get_hf_schema(self.source)
366
381
  self.src_schema = huggingface_schema_to_pxt_schema(
367
382
  self.hf_schema_source, self.src_schema_overrides, self.src_pk
368
383
  )
@@ -397,15 +412,6 @@ class HFTableDataConduit(TableDataConduit):
397
412
  def prepare_insert(self) -> None:
398
413
  import datasets
399
414
 
400
- if isinstance(self.source, datasets.Dataset):
401
- # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
402
- raw_name = self.source.split._name
403
- split_name = raw_name.split('[')[0] if raw_name is not None else None
404
- self.dataset_dict = {split_name: self.source}
405
- else:
406
- assert isinstance(self.source, datasets.DatasetDict)
407
- self.dataset_dict = self.source
408
-
409
415
  # extract all class labels from the dataset to translate category ints to strings
410
416
  self.categorical_features = {
411
417
  feature_name: feature_type.names
@@ -416,26 +422,44 @@ class HFTableDataConduit(TableDataConduit):
416
422
  self.source_column_map = {}
417
423
  self.check_source_columns_are_insertable(self.hf_schema_source.keys())
418
424
 
419
- def _translate_row(self, row: dict[str, Any], split_name: str) -> dict[str, Any]:
425
+ def _translate_row(self, row: dict[str, Any], split_name: str, features: datasets.Features) -> dict[str, Any]:
420
426
  output_row: dict[str, Any] = {}
421
427
  for col_name, val in row.items():
422
428
  # translate category ints to strings
423
429
  new_val = self.categorical_features[col_name][val] if col_name in self.categorical_features else val
424
430
  mapped_col_name = self.source_column_map.get(col_name, col_name)
425
431
 
426
- # Convert values to the appropriate type if needed
427
- try:
428
- checked_val = self.pxt_schema[mapped_col_name].create_literal(new_val)
429
- except TypeError as e:
430
- msg = str(e)
431
- raise excs.Error(f'Error in column {col_name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
432
- output_row[mapped_col_name] = checked_val
432
+ new_val = self._translate_val(new_val, features[col_name])
433
+ output_row[mapped_col_name] = new_val
433
434
 
434
435
  # add split name to output row
435
436
  if self.column_name_for_split is not None:
436
437
  output_row[self.column_name_for_split] = split_name
437
438
  return output_row
438
439
 
440
+ def _translate_val(self, val: Any, feature: datasets.Feature) -> Any:
441
+ """Convert numpy scalars to Python types and images to PIL.Image.Image"""
442
+ import datasets
443
+
444
+ if isinstance(feature, datasets.Value):
445
+ if isinstance(val, (np.generic, np.ndarray)):
446
+ # a scalar, which we want as a standard Python type
447
+ assert np.ndim(val) == 0
448
+ return val.item()
449
+ else:
450
+ # a standard Python object
451
+ return val
452
+ elif isinstance(feature, datasets.Sequence):
453
+ assert np.ndim(val) > 0
454
+ return val
455
+ elif isinstance(feature, datasets.Image):
456
+ return PIL.Image.fromarray(val)
457
+ elif isinstance(feature, dict):
458
+ assert isinstance(val, dict)
459
+ return {k: self._translate_val(v, feature[k]) for k, v in val.items()}
460
+ else:
461
+ return val
462
+
439
463
  def valid_row_batch(self) -> Iterator[RowData]:
440
464
  for split_name, split_dataset in self.dataset_dict.items():
441
465
  num_batches = split_dataset.size_in_bytes / self._K_BATCH_SIZE_BYTES
@@ -444,7 +468,7 @@ class HFTableDataConduit(TableDataConduit):
444
468
 
445
469
  batch = []
446
470
  for row in split_dataset:
447
- batch.append(self._translate_row(row, split_name))
471
+ batch.append(self._translate_row(row, split_name, split_dataset.features))
448
472
  if len(batch) >= tuples_per_batch:
449
473
  yield batch
450
474
  batch = []
@@ -454,7 +478,7 @@ class HFTableDataConduit(TableDataConduit):
454
478
 
455
479
 
456
480
  class ParquetTableDataConduit(TableDataConduit):
457
- pq_ds: Optional[ParquetDataset] = None
481
+ pq_ds: ParquetDataset | None = None
458
482
 
459
483
  @classmethod
460
484
  def from_tds(cls, tds: TableDataConduit) -> 'ParquetTableDataConduit':
@@ -469,13 +493,13 @@ class ParquetTableDataConduit(TableDataConduit):
469
493
  t.pq_ds = parquet.ParquetDataset(str(input_path))
470
494
  return t
471
495
 
472
- def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
473
- from pixeltable.utils.arrow import ar_infer_schema
496
+ def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
497
+ from pixeltable.utils.arrow import to_pxt_schema
474
498
 
475
499
  if self.source_column_map is None:
476
500
  if self.src_schema_overrides is None:
477
501
  self.src_schema_overrides = {}
478
- self.src_schema = ar_infer_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
502
+ self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
479
503
  inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
480
504
  self.src_schema, self.src_pk, self.src_schema_overrides
481
505
  )
@@ -483,7 +507,7 @@ class ParquetTableDataConduit(TableDataConduit):
483
507
  else:
484
508
  raise NotImplementedError()
485
509
 
486
- def infer_schema(self) -> dict[str, Any]:
510
+ def infer_schema(self) -> dict[str, ts.ColumnType]:
487
511
  self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
488
512
  self.normalize_pxt_schema_types()
489
513
  self.prepare_insert()
@@ -504,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
504
528
  from pixeltable.utils.arrow import iter_tuples2
505
529
 
506
530
  try:
507
- for fragment in self.pq_ds.fragments: # type: ignore[attr-defined]
531
+ for fragment in self.pq_ds.fragments:
508
532
  for batch in fragment.to_batches():
509
533
  dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
510
534
  self.total_rows += len(dict_batch)
pixeltable/io/utils.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from keyword import iskeyword as is_python_keyword
2
- from typing import Any, Optional, Union
2
+ from typing import Any
3
3
 
4
4
  import pixeltable as pxt
5
5
  import pixeltable.exceptions as excs
@@ -21,7 +21,7 @@ def normalize_pxt_col_name(name: str) -> str:
21
21
  return id
22
22
 
23
23
 
24
- def normalize_primary_key_parameter(primary_key: Optional[Union[str, list[str]]] = None) -> list[str]:
24
+ def normalize_primary_key_parameter(primary_key: str | list[str] | None = None) -> list[str]:
25
25
  if primary_key is None:
26
26
  primary_key = []
27
27
  elif isinstance(primary_key, str):
@@ -40,7 +40,7 @@ def normalize_schema_names(
40
40
  primary_key: list[str],
41
41
  schema_overrides: dict[str, Any],
42
42
  require_valid_pxt_column_names: bool = False,
43
- ) -> tuple[dict[str, Any], list[str], Optional[dict[str, str]]]:
43
+ ) -> tuple[dict[str, Any], list[str], dict[str, str] | None]:
44
44
  """
45
45
  Convert all names in the input schema from source names to valid Pixeltable identifiers
46
46
  - Ensure that all names are unique.
@@ -1,3 +1,4 @@
1
+ """Iterators for splitting media and documents into components."""
1
2
  # ruff: noqa: F401
2
3
 
3
4
  from .audio import AudioSplitter
@@ -5,7 +6,7 @@ from .base import ComponentIterator
5
6
  from .document import DocumentSplitter
6
7
  from .image import TileIterator
7
8
  from .string import StringSplitter
8
- from .video import FrameIterator
9
+ from .video import FrameIterator, VideoSplitter
9
10
 
10
11
  __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
11
12
  __removed_symbols = {'base', 'document', 'video'}
@@ -1,12 +1,12 @@
1
1
  import logging
2
- import uuid
3
2
  from fractions import Fraction
4
3
  from pathlib import Path
5
- from typing import Any, ClassVar, Optional
4
+ from typing import Any, ClassVar
6
5
 
7
6
  import av
8
7
 
9
- from pixeltable import env, exceptions as excs, type_system as ts
8
+ from pixeltable import exceptions as excs, type_system as ts
9
+ from pixeltable.utils.local_store import TempStore
10
10
 
11
11
  from .base import ComponentIterator
12
12
 
@@ -37,7 +37,7 @@ class AudioSplitter(ComponentIterator):
37
37
 
38
38
  # List of chunks to extract
39
39
  # Each chunk is defined by start and end presentation timestamps in audio file (int)
40
- chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
40
+ chunks_to_extract_in_pts: list[tuple[int, int]] | None
41
41
  # next chunk to extract
42
42
  next_pos: int
43
43
 
@@ -55,12 +55,9 @@ class AudioSplitter(ComponentIterator):
55
55
  def __init__(
56
56
  self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
57
57
  ):
58
- if chunk_duration_sec <= 0.0:
59
- raise excs.Error('chunk_duration_sec must be a positive number')
60
- if chunk_duration_sec < min_chunk_duration_sec:
61
- raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
62
- if overlap_sec >= chunk_duration_sec:
63
- raise excs.Error('overlap_sec must be less than chunk_duration_sec')
58
+ assert chunk_duration_sec > 0.0
59
+ assert chunk_duration_sec >= min_chunk_duration_sec
60
+ assert overlap_sec < chunk_duration_sec
64
61
  audio_path = Path(audio)
65
62
  assert audio_path.exists() and audio_path.is_file()
66
63
  self.audio_path = audio_path
@@ -128,6 +125,19 @@ class AudioSplitter(ComponentIterator):
128
125
 
129
126
  @classmethod
130
127
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
128
+ param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
129
+ params = dict(zip(param_names, args))
130
+ params.update(kwargs)
131
+
132
+ chunk_duration_sec = params['chunk_duration_sec']
133
+ min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
134
+ overlap_sec = params.get('overlap_sec', 0.0)
135
+ if chunk_duration_sec <= 0.0:
136
+ raise excs.Error('chunk_duration_sec must be a positive number')
137
+ if chunk_duration_sec < min_chunk_duration_sec:
138
+ raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
139
+ if overlap_sec >= chunk_duration_sec:
140
+ raise excs.Error('overlap_sec must be less than chunk_duration_sec')
131
141
  return {
132
142
  'start_time_sec': ts.FloatType(),
133
143
  'end_time_sec': ts.FloatType(),
@@ -140,7 +150,7 @@ class AudioSplitter(ComponentIterator):
140
150
  target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
141
151
  chunk_start_pts = 0
142
152
  chunk_end_pts = 0
143
- chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
153
+ chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
144
154
  output_container = av.open(chunk_file, mode='w')
145
155
  input_stream = self.container.streams.audio[0]
146
156
  codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)