pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/utils/__init__.py CHANGED
@@ -2,7 +2,6 @@ import hashlib
2
2
  import urllib.parse
3
3
  import urllib.request
4
4
  from pathlib import Path
5
- from typing import Optional, Union
6
5
 
7
6
 
8
7
  def print_perf_counter_delta(delta: float) -> str:
@@ -24,7 +23,7 @@ def print_perf_counter_delta(delta: float) -> str:
24
23
  return f'{delta:.2f} s'
25
24
 
26
25
 
27
- def sha256sum(path: Union[Path, str]) -> str:
26
+ def sha256sum(path: Path | str) -> str:
28
27
  """
29
28
  Compute the SHA256 hash of a file.
30
29
  """
@@ -39,7 +38,7 @@ def sha256sum(path: Union[Path, str]) -> str:
39
38
  return h.hexdigest()
40
39
 
41
40
 
42
- def parse_local_file_path(file_or_url: str) -> Optional[Path]:
41
+ def parse_local_file_path(file_or_url: str) -> Path | None:
43
42
  """
44
43
  Parses a string that may be either a URL or a local file path.
45
44
 
pixeltable/utils/arrow.py CHANGED
@@ -1,15 +1,23 @@
1
1
  import datetime
2
- from typing import Any, Iterator, Optional, Union
2
+ import io
3
+ import json
4
+ import uuid
5
+ from typing import TYPE_CHECKING, Any, Iterator, cast
3
6
 
4
7
  import numpy as np
8
+ import PIL.Image
5
9
  import pyarrow as pa
6
10
 
11
+ import pixeltable.exceptions as excs
7
12
  import pixeltable.type_system as ts
8
13
 
14
+ if TYPE_CHECKING:
15
+ import pixeltable as pxt
16
+
9
17
  PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
10
18
  pa.string(): ts.StringType(nullable=True),
11
19
  pa.large_string(): ts.StringType(nullable=True),
12
- pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
20
+ pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
13
21
  pa.bool_(): ts.BoolType(nullable=True),
14
22
  pa.int8(): ts.IntType(nullable=True),
15
23
  pa.int16(): ts.IntType(nullable=True),
@@ -23,16 +31,19 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
23
31
  pa.float64(): ts.FloatType(nullable=True),
24
32
  pa.date32(): ts.DateType(nullable=True),
25
33
  pa.date64(): ts.DateType(nullable=True),
26
- pa.binary(): None, # cannot import binary (inline image)
34
+ pa.uuid(): ts.UUIDType(nullable=True),
35
+ pa.binary(): ts.BinaryType(nullable=True),
27
36
  }
28
37
 
29
38
  PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
30
39
  ts.StringType: pa.string(),
31
- ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc), # postgres timestamp is microseconds
40
+ ts.TimestampType: pa.timestamp('us', tz='UTC'), # postgres timestamp is microseconds
32
41
  ts.DateType: pa.date32(), # This could be date64
42
+ ts.UUIDType: pa.uuid(),
33
43
  ts.BoolType: pa.bool_(),
34
44
  ts.IntType: pa.int64(),
35
45
  ts.FloatType: pa.float32(),
46
+ ts.BinaryType: pa.binary(),
36
47
  ts.JsonType: pa.string(), # TODO(orm) pa.struct() is possible
37
48
  ts.ImageType: pa.binary(), # inline image
38
49
  ts.AudioType: pa.string(), # path
@@ -41,7 +52,7 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
41
52
  }
42
53
 
43
54
 
44
- def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
55
+ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
45
56
  """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
46
57
  Returns None if no conversion is currently implemented.
47
58
  """
@@ -54,50 +65,144 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
54
65
  dtype = to_pixeltable_type(arrow_type.value_type, nullable)
55
66
  if dtype is None:
56
67
  return None
57
- return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
68
+ return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
58
69
  else:
59
70
  return None
60
71
 
61
72
 
62
- def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
73
+ def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
63
74
  """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
64
75
  Returns None if no conversion is currently implemented.
65
76
  """
66
77
  if pixeltable_type.__class__ in PXT_TO_PA_TYPES:
67
78
  return PXT_TO_PA_TYPES[pixeltable_type.__class__]
68
79
  elif isinstance(pixeltable_type, ts.ArrayType):
69
- return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
80
+ return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.dtype), pixeltable_type.shape)
70
81
  else:
71
82
  return None
72
83
 
73
84
 
74
- def ar_infer_schema(
85
+ def to_pxt_schema(
75
86
  arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
76
87
  ) -> dict[str, ts.ColumnType]:
77
88
  """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
78
- ar_schema = {
89
+ pxt_schema = {
79
90
  field.name: to_pixeltable_type(field.type, field.name not in primary_key)
80
91
  if field.name not in schema_overrides
81
92
  else schema_overrides[field.name]
82
93
  for field in arrow_schema
83
94
  }
84
- return ar_schema
95
+ return pxt_schema
85
96
 
86
97
 
87
98
  def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
88
- return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]
99
+ return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
100
+
101
+
102
+ def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
103
+ import pyarrow as pa
104
+
105
+ pa_arrays: list[pa.Array] = []
106
+ for field in schema:
107
+ if isinstance(field.type, pa.FixedShapeTensorType):
108
+ stacked_arr = np.stack(column_vals[field.name])
109
+ pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
110
+ else:
111
+ pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
112
+ pa_arrays.append(pa_array)
113
+ return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
114
+
115
+
116
+ def to_record_batches(query: 'pxt.Query', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
117
+ arrow_schema = to_arrow_schema(query.schema)
118
+ batch_columns: dict[str, list[Any]] = {k: [] for k in query.schema}
119
+ current_byte_estimate = 0
120
+ num_batch_rows = 0
89
121
 
122
+ # TODO: in order to avoid having to deal with ExprEvalError here, ResultSet should be an iterator
123
+ # over _exec()
124
+ try:
125
+ for data_row in query._exec():
126
+ num_batch_rows += 1
127
+ for (col_name, col_type), e in zip(query.schema.items(), query._select_list_exprs):
128
+ val = data_row[e.slot_idx]
129
+ val_size_bytes: int
130
+ if val is None:
131
+ batch_columns[col_name].append(val)
132
+ continue
90
133
 
91
- def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, np.ndarray]]:
134
+ assert val is not None
135
+ if col_type.is_image_type():
136
+ # images get inlined into the parquet file
137
+ if data_row.file_paths[e.slot_idx] is not None:
138
+ # if there is a file, read directly to preserve information
139
+ with open(data_row.file_paths[e.slot_idx], 'rb') as f:
140
+ val = f.read()
141
+ elif isinstance(val, PIL.Image.Image):
142
+ # no file available: save as png
143
+ buf = io.BytesIO()
144
+ val.save(buf, format='png')
145
+ val = buf.getvalue()
146
+ else:
147
+ raise excs.Error(f'unknown image type {type(val)}')
148
+ val_size_bytes = len(val)
149
+ elif col_type.is_string_type():
150
+ val_size_bytes = len(val)
151
+ elif col_type.is_uuid_type():
152
+ # pa.uuid() uses fixed_size_binary(16) as storage type
153
+ val = val.bytes # Convert UUID to 16-byte binary for arrow
154
+ val_size_bytes = len(val)
155
+ elif col_type.is_binary_type():
156
+ val_size_bytes = len(val)
157
+ elif col_type.is_media_type():
158
+ assert data_row.file_paths[e.slot_idx] is not None
159
+ val = data_row.file_paths[e.slot_idx]
160
+ val_size_bytes = len(val)
161
+ elif col_type.is_json_type():
162
+ val = json.dumps(val)
163
+ val_size_bytes = len(val)
164
+ elif col_type.is_array_type():
165
+ val_size_bytes = val.nbytes
166
+ elif col_type.is_int_type() or col_type.is_float_type():
167
+ val_size_bytes = 8
168
+ elif col_type.is_bool_type():
169
+ val_size_bytes = 1
170
+ elif col_type.is_date_type():
171
+ val_size_bytes = 4
172
+ elif col_type.is_timestamp_type():
173
+ val = val.astimezone(datetime.timezone.utc)
174
+ val_size_bytes = 8
175
+ else:
176
+ raise excs.Error(f'unknown type {col_type} for {col_name}')
177
+
178
+ batch_columns[col_name].append(val)
179
+ current_byte_estimate += val_size_bytes
180
+
181
+ if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
182
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
183
+ yield record_batch
184
+ batch_columns = {k: [] for k in query.schema}
185
+ current_byte_estimate = 0
186
+ num_batch_rows = 0
187
+
188
+ except excs.ExprEvalError as e:
189
+ query._raise_expr_eval_err(e)
190
+
191
+ if num_batch_rows > 0:
192
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
193
+ yield record_batch
194
+
195
+
196
+ def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
92
197
  """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
93
198
  this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
94
199
  """
95
- out: dict[str, Union[list, np.ndarray]] = {}
200
+ out: dict[str, list | np.ndarray] = {}
96
201
  for k, name in enumerate(batch.schema.names):
97
202
  col = batch.column(k)
98
203
  if isinstance(col.type, pa.FixedShapeTensorType):
99
204
  # treat array columns as numpy arrays to easily preserve numpy type
100
- out[name] = col.to_numpy(zero_copy_only=False) # type: ignore[call-arg]
205
+ out[name] = col.to_numpy(zero_copy_only=False)
101
206
  else:
102
207
  # for the rest, use pydict to preserve python types
103
208
  out[name] = col.to_pylist()
@@ -105,7 +210,7 @@ def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, n
105
210
  return out
106
211
 
107
212
 
108
- def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:
213
+ def iter_tuples(batch: pa.Table | pa.RecordBatch) -> Iterator[dict[str, Any]]:
109
214
  """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
110
215
  pydict = to_pydict(batch)
111
216
  assert len(pydict) > 0, 'empty record batch'
@@ -129,6 +234,15 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
129
234
  return bool(val)
130
235
  elif pxt_type.is_string_type():
131
236
  return str(val)
237
+ elif pxt_type.is_uuid_type():
238
+ if isinstance(val, uuid.UUID):
239
+ return val
240
+ if isinstance(val, bytes):
241
+ return uuid.UUID(bytes=val)
242
+ return uuid.UUID(val)
243
+ elif pxt_type.is_binary_type():
244
+ assert isinstance(val, bytes)
245
+ return val
132
246
  elif pxt_type.is_date_type():
133
247
  if isinstance(val, str):
134
248
  return datetime.date.fromisoformat(val)
@@ -145,7 +259,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
145
259
 
146
260
 
147
261
  def iter_tuples2(
148
- batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
262
+ batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
149
263
  ) -> Iterator[dict[str, Any]]:
150
264
  """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
151
265
  pydict = to_pydict(batch)
pixeltable/utils/av.py ADDED
@@ -0,0 +1,298 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from fractions import Fraction
5
+ from pathlib import Path
6
+ from types import TracebackType
7
+ from typing import Any, Iterator
8
+
9
+ import av
10
+ import av.stream
11
+ import PIL.Image
12
+ from typing_extensions import Self
13
+
14
+ from pixeltable.env import Env
15
+
16
+ # format -> (codec, extension)
17
+ AUDIO_FORMATS: dict[str, tuple[str, str]] = {
18
+ 'wav': ('pcm_s16le', 'wav'),
19
+ 'mp3': ('libmp3lame', 'mp3'),
20
+ 'flac': ('flac', 'flac'),
21
+ 'mp4': ('aac', 'm4a'),
22
+ }
23
+
24
+
25
+ def get_metadata(path: str) -> dict:
26
+ with av.open(path) as container:
27
+ assert isinstance(container, av.container.InputContainer)
28
+ streams_info = [__get_stream_metadata(stream) for stream in container.streams]
29
+ result = {
30
+ 'bit_exact': getattr(container, 'bit_exact', False),
31
+ 'bit_rate': container.bit_rate,
32
+ 'size': container.size,
33
+ 'metadata': container.metadata,
34
+ 'streams': streams_info,
35
+ }
36
+ return result
37
+
38
+
39
+ def __get_stream_metadata(stream: av.stream.Stream) -> dict:
40
+ if stream.type not in ('audio', 'video'):
41
+ return {'type': stream.type} # Currently unsupported
42
+
43
+ codec_context = stream.codec_context
44
+ codec_context_md: dict[str, Any] = {
45
+ 'name': codec_context.name,
46
+ 'codec_tag': codec_context.codec_tag.encode('unicode-escape').decode('utf-8'),
47
+ 'profile': codec_context.profile,
48
+ }
49
+ metadata = {
50
+ 'type': stream.type,
51
+ 'duration': stream.duration,
52
+ 'time_base': float(stream.time_base) if stream.time_base is not None else None,
53
+ 'duration_seconds': float(stream.duration * stream.time_base)
54
+ if stream.duration is not None and stream.time_base is not None
55
+ else None,
56
+ 'frames': stream.frames,
57
+ 'metadata': stream.metadata,
58
+ 'codec_context': codec_context_md,
59
+ }
60
+
61
+ if stream.type == 'audio':
62
+ # Additional metadata for audio
63
+ channels = getattr(stream.codec_context, 'channels', None)
64
+ codec_context_md['channels'] = int(channels) if channels is not None else None
65
+ else:
66
+ assert stream.type == 'video'
67
+ assert isinstance(stream, av.video.stream.VideoStream)
68
+ # Additional metadata for video
69
+ codec_context_md['pix_fmt'] = getattr(stream.codec_context, 'pix_fmt', None)
70
+ metadata.update(
71
+ **{
72
+ 'width': stream.width,
73
+ 'height': stream.height,
74
+ 'frames': stream.frames,
75
+ 'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
76
+ 'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
77
+ 'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
78
+ }
79
+ )
80
+
81
+ return metadata
82
+
83
+
84
+ def get_video_duration(path: str) -> float | None:
85
+ """Return video duration in seconds."""
86
+ with av.open(path) as container:
87
+ video_stream = container.streams.video[0]
88
+ if video_stream is None:
89
+ return None
90
+ if video_stream.duration is not None:
91
+ return float(video_stream.duration * video_stream.time_base)
92
+
93
+ # if duration is not in the header, look for it in the last packet
94
+ last_pts: int | None = None
95
+ for packet in container.demux(video_stream):
96
+ if packet.pts is not None:
97
+ last_pts = packet.pts
98
+ if last_pts is not None:
99
+ return float(last_pts * video_stream.time_base)
100
+
101
+ return None
102
+
103
+
104
+ def has_audio_stream(path: str) -> bool:
105
+ """Check if video has audio stream using PyAV."""
106
+ md = get_metadata(path)
107
+ return any(stream['type'] == 'audio' for stream in md['streams'])
108
+
109
+
110
+ def ffmpeg_clip_cmd(
111
+ input_path: str,
112
+ output_path: str,
113
+ start_time: float,
114
+ duration: float | None = None,
115
+ fast: bool = True,
116
+ video_encoder: str | None = None,
117
+ video_encoder_args: dict[str, Any] | None = None,
118
+ ) -> list[str]:
119
+ cmd = ['ffmpeg']
120
+ if fast:
121
+ # fast: -ss before -i
122
+ cmd.extend(
123
+ [
124
+ '-ss',
125
+ str(start_time),
126
+ '-i',
127
+ input_path,
128
+ '-map',
129
+ '0', # Copy all streams from input
130
+ '-c',
131
+ 'copy', # Stream copy (no re-encoding)
132
+ ]
133
+ )
134
+ else:
135
+ if video_encoder is None:
136
+ video_encoder = Env.get().default_video_encoder
137
+
138
+ # accurate: -ss after -i
139
+ cmd.extend(
140
+ [
141
+ '-i',
142
+ input_path,
143
+ '-ss',
144
+ str(start_time),
145
+ '-map',
146
+ '0', # Copy all streams from input
147
+ '-c:a',
148
+ 'copy', # audio copy
149
+ '-c:s',
150
+ 'copy', # subtitle copy
151
+ '-c:v',
152
+ video_encoder, # re-encode video
153
+ ]
154
+ )
155
+ if video_encoder_args is not None:
156
+ for k, v in video_encoder_args.items():
157
+ cmd.extend([f'-{k}', str(v)])
158
+
159
+ if duration is not None:
160
+ cmd.extend(['-t', str(duration)])
161
+ cmd.extend(['-loglevel', 'error', output_path])
162
+ return cmd
163
+
164
+
165
+ def ffmpeg_segment_cmd(
166
+ input_path: str,
167
+ output_pattern: str,
168
+ segment_duration: float | None = None,
169
+ segment_times: list[float] | None = None,
170
+ video_encoder: str | None = None,
171
+ video_encoder_args: dict[str, Any] | None = None,
172
+ ) -> list[str]:
173
+ """Commandline for frame-accurate segmentation"""
174
+ assert (segment_duration is None) != (segment_times is None)
175
+ if video_encoder is None:
176
+ video_encoder = Env.get().default_video_encoder
177
+
178
+ cmd = [
179
+ 'ffmpeg',
180
+ '-i',
181
+ input_path,
182
+ '-map',
183
+ '0', # Copy all streams from input
184
+ '-c:a',
185
+ 'copy', # don't re-encode audio
186
+ '-c:v',
187
+ video_encoder, # re-encode video
188
+ ]
189
+ if video_encoder_args is not None:
190
+ for k, v in video_encoder_args.items():
191
+ cmd.extend([f'-{k}', str(v)])
192
+ cmd.extend(['-f', 'segment'])
193
+
194
+ # -force_key_frames needs to precede -f segment
195
+ if segment_duration is not None:
196
+ cmd.extend(
197
+ [
198
+ '-force_key_frames',
199
+ f'expr:gte(t,n_forced*{segment_duration})', # Force keyframe at each segment boundary
200
+ '-f',
201
+ 'segment',
202
+ '-segment_time',
203
+ str(segment_duration),
204
+ ]
205
+ )
206
+ else:
207
+ assert segment_times is not None
208
+ times_str = ','.join([str(t) for t in segment_times])
209
+ cmd.extend(['-force_key_frames', times_str, '-f', 'segment', '-segment_times', times_str])
210
+
211
+ cmd.extend(
212
+ [
213
+ '-reset_timestamps',
214
+ '1', # Reset timestamps for each segment
215
+ '-loglevel',
216
+ 'error', # Only show errors
217
+ output_pattern,
218
+ ]
219
+ )
220
+ return cmd
221
+
222
+
223
+ class VideoFrames:
224
+ """
225
+ Context manager for iterating over video frames at a specified frame rate.
226
+
227
+ Args:
228
+ path: Path to the video file
229
+ fps: Number of frames to extract per second. If None or 0.0, extracts all frames.
230
+ """
231
+
232
+ path: Path
233
+ fps: float
234
+ container: av.container.input.InputContainer | None
235
+ video_framerate: Fraction | None
236
+ video_time_base: Fraction | None
237
+ video_start_time: int | None
238
+
239
+ @dataclass
240
+ class Item:
241
+ frame_idx: int
242
+ pts: int
243
+ dts: int
244
+ time: float
245
+ is_corrupt: bool
246
+ key_frame: bool
247
+ pict_type: int
248
+ interlaced_frame: bool
249
+ frame: PIL.Image.Image
250
+
251
+ def __init__(self, path: Path, fps: float | None = None) -> None:
252
+ self.path = path
253
+ self.fps = 0.0 if fps is None else fps
254
+ self.container = None
255
+ self.video_framerate = None
256
+ self.video_time_base = None
257
+ self.video_start_time = None
258
+
259
+ def __enter__(self) -> Self:
260
+ self.container = av.open(self.path)
261
+ stream = self.container.streams.video[0]
262
+ self.video_framerate = stream.average_rate
263
+ self.video_time_base = stream.time_base
264
+ self.video_start_time = stream.start_time or 0
265
+ return self
266
+
267
+ def __exit__(
268
+ self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
269
+ ) -> None:
270
+ # Clean up
271
+ if self.container:
272
+ self.container.close()
273
+
274
+ def __iter__(self) -> Iterator[Item]:
275
+ num_returned = 0
276
+ frame_idx = -1
277
+ while True:
278
+ try:
279
+ frame = next(self.container.decode(video=0))
280
+ except (StopIteration, EOFError):
281
+ return
282
+
283
+ frame_idx += 1
284
+ if self.fps == 0.0 or (num_returned <= frame.time * self.fps):
285
+ img = frame.to_image()
286
+ assert isinstance(img, PIL.Image.Image)
287
+ yield VideoFrames.Item(
288
+ frame_idx=frame_idx,
289
+ pts=frame.pts,
290
+ dts=frame.dts,
291
+ time=frame.time,
292
+ is_corrupt=frame.is_corrupt,
293
+ key_frame=frame.key_frame,
294
+ pict_type=frame.pict_type,
295
+ interlaced_frame=frame.interlaced_frame,
296
+ frame=img,
297
+ )
298
+ num_returned += 1