pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,3 +1,9 @@
1
+ import hashlib
2
+ import urllib.parse
3
+ import urllib.request
4
+ from pathlib import Path
5
+
6
+
1
7
  def print_perf_counter_delta(delta: float) -> str:
2
8
  """Prints a performance counter delta in a human-readable format.
3
9
 
@@ -15,3 +21,37 @@ def print_perf_counter_delta(delta: float) -> str:
15
21
  return f'{delta * 1e3:.2f} ms'
16
22
  else:
17
23
  return f'{delta:.2f} s'
24
+
25
+
26
+ def sha256sum(path: Path | str) -> str:
27
+ """
28
+ Compute the SHA256 hash of a file.
29
+ """
30
+ if isinstance(path, str):
31
+ path = Path(path)
32
+
33
+ h = hashlib.sha256()
34
+ with open(path, 'rb') as file:
35
+ while chunk := file.read(h.block_size):
36
+ h.update(chunk)
37
+
38
+ return h.hexdigest()
39
+
40
+
41
+ def parse_local_file_path(file_or_url: str) -> Path | None:
42
+ """
43
+ Parses a string that may be either a URL or a local file path.
44
+
45
+ If the string is a local file path or a file-scheme URL (file://), then a Path object will be returned.
46
+ Otherwise, None will be returned.
47
+ """
48
+ parsed = urllib.parse.urlparse(file_or_url)
49
+ if len(parsed.scheme) <= 1:
50
+ # We're using `urlparse` to help distinguish file paths from URLs. If there is no scheme, then it's a file path.
51
+ # If there's a single-character scheme, we also interpret this as a file path; this insures that drive letters
52
+ # on Windows pathnames are correctly handled.
53
+ return Path(file_or_url).absolute()
54
+ elif parsed.scheme == 'file':
55
+ return Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
56
+ else:
57
+ return None
pixeltable/utils/arrow.py CHANGED
@@ -1,35 +1,49 @@
1
- import logging
2
- from typing import Any, Iterator, Optional, Union
1
+ import datetime
2
+ import io
3
+ import json
4
+ import uuid
5
+ from typing import TYPE_CHECKING, Any, Iterator, cast
3
6
 
4
7
  import numpy as np
8
+ import PIL.Image
5
9
  import pyarrow as pa
6
- import datetime
7
10
 
11
+ import pixeltable.exceptions as excs
8
12
  import pixeltable.type_system as ts
9
- from pixeltable.env import Env
10
-
11
- _tz_def = Env().get().default_time_zone
12
13
 
13
- _logger = logging.getLogger(__name__)
14
+ if TYPE_CHECKING:
15
+ import pixeltable as pxt
14
16
 
15
- _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
17
+ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
16
18
  pa.string(): ts.StringType(nullable=True),
19
+ pa.large_string(): ts.StringType(nullable=True),
20
+ pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
17
21
  pa.bool_(): ts.BoolType(nullable=True),
18
- pa.uint8(): ts.IntType(nullable=True),
19
22
  pa.int8(): ts.IntType(nullable=True),
20
- pa.uint32(): ts.IntType(nullable=True),
21
- pa.uint64(): ts.IntType(nullable=True),
23
+ pa.int16(): ts.IntType(nullable=True),
22
24
  pa.int32(): ts.IntType(nullable=True),
23
25
  pa.int64(): ts.IntType(nullable=True),
26
+ pa.uint8(): ts.IntType(nullable=True),
27
+ pa.uint16(): ts.IntType(nullable=True),
28
+ pa.uint32(): ts.IntType(nullable=True),
29
+ pa.uint64(): ts.IntType(nullable=True),
24
30
  pa.float32(): ts.FloatType(nullable=True),
31
+ pa.float64(): ts.FloatType(nullable=True),
32
+ pa.date32(): ts.DateType(nullable=True),
33
+ pa.date64(): ts.DateType(nullable=True),
34
+ pa.uuid(): ts.UUIDType(nullable=True),
35
+ pa.binary(): ts.BinaryType(nullable=True),
25
36
  }
26
37
 
27
- _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
38
+ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
28
39
  ts.StringType: pa.string(),
29
- ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc), # postgres timestamp is microseconds
40
+ ts.TimestampType: pa.timestamp('us', tz='UTC'), # postgres timestamp is microseconds
41
+ ts.DateType: pa.date32(), # This could be date64
42
+ ts.UUIDType: pa.uuid(),
30
43
  ts.BoolType: pa.bool_(),
31
44
  ts.IntType: pa.int64(),
32
45
  ts.FloatType: pa.float32(),
46
+ ts.BinaryType: pa.binary(),
33
47
  ts.JsonType: pa.string(), # TODO(orm) pa.struct() is possible
34
48
  ts.ImageType: pa.binary(), # inline image
35
49
  ts.AudioType: pa.string(), # path
@@ -38,48 +52,152 @@ _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
38
52
  }
39
53
 
40
54
 
41
- def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
55
+ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
42
56
  """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
43
57
  Returns None if no conversion is currently implemented.
44
58
  """
45
59
  if isinstance(arrow_type, pa.TimestampType):
46
- return ts.TimestampType(nullable=True)
47
- elif arrow_type in _pa_to_pt:
48
- return _pa_to_pt[arrow_type]
60
+ return ts.TimestampType(nullable=nullable)
61
+ elif arrow_type in PA_TO_PXT_TYPES:
62
+ pt = PA_TO_PXT_TYPES[arrow_type]
63
+ return pt.copy(nullable=nullable) if pt is not None else None
49
64
  elif isinstance(arrow_type, pa.FixedShapeTensorType):
50
- dtype = to_pixeltable_type(arrow_type.value_type)
65
+ dtype = to_pixeltable_type(arrow_type.value_type, nullable)
51
66
  if dtype is None:
52
67
  return None
53
- return ts.ArrayType(shape=arrow_type.shape, dtype=dtype)
68
+ return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
54
69
  else:
55
70
  return None
56
71
 
57
72
 
58
- def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
73
+ def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
59
74
  """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
60
75
  Returns None if no conversion is currently implemented.
61
76
  """
62
- if pixeltable_type.__class__ in _pt_to_pa:
63
- return _pt_to_pa[pixeltable_type.__class__]
77
+ if pixeltable_type.__class__ in PXT_TO_PA_TYPES:
78
+ return PXT_TO_PA_TYPES[pixeltable_type.__class__]
64
79
  elif isinstance(pixeltable_type, ts.ArrayType):
65
- return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
80
+ return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.dtype), pixeltable_type.shape)
66
81
  else:
67
82
  return None
68
83
 
69
84
 
70
- def to_pixeltable_schema(arrow_schema: pa.Schema) -> dict[str, ts.ColumnType]:
71
- return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
85
+ def to_pxt_schema(
86
+ arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
87
+ ) -> dict[str, ts.ColumnType]:
88
+ """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
89
+ pxt_schema = {
90
+ field.name: to_pixeltable_type(field.type, field.name not in primary_key)
91
+ if field.name not in schema_overrides
92
+ else schema_overrides[field.name]
93
+ for field in arrow_schema
94
+ }
95
+ return pxt_schema
72
96
 
73
97
 
74
98
  def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
75
- return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]
99
+ return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
100
+
101
+
102
def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
    """Build a single RecordBatch from per-column value lists.

    Args:
        column_vals: for each field name in `schema`, the list of values of that column
        schema: arrow schema of the batch; its field order determines the order of the arrays

    Returns:
        A RecordBatch with one array per schema field.
    """
    import pyarrow as pa

    arrays: list[pa.Array] = []
    for fld in schema:
        values = column_vals[fld.name]
        if not isinstance(fld.type, pa.FixedShapeTensorType):
            arrays.append(cast(pa.Array, pa.array(values)))
            continue
        # tensor columns: stack the per-row ndarrays into one ndarray and convert that in one go
        arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(np.stack(values)))
    return pa.RecordBatch.from_arrays(arrays, schema=schema)
114
+
115
+
116
def to_record_batches(query: 'pxt.Query', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
    """Execute `query` and yield its result rows as a sequence of pyarrow RecordBatches.

    Values are accumulated column-wise. After each row, if the running size estimate of the accumulated non-null
    values exceeds `batch_size_bytes`, the accumulated rows are emitted as one batch and the accumulators are
    reset. Rows still pending once the query is exhausted are emitted as a final batch.

    Args:
        query: the query to execute; the arrow schema is derived from `query.schema`
        batch_size_bytes: size estimate (in bytes) above which the pending rows are flushed as a batch

    Yields:
        RecordBatches assembled by `_to_record_batch()`.

    Raises:
        excs.Error: for an image value that is neither backed by a file nor a `PIL.Image.Image`, and for a
            column type that none of the branches below handles.
    """
    arrow_schema = to_arrow_schema(query.schema)
    batch_columns: dict[str, list[Any]] = {k: [] for k in query.schema}
    current_byte_estimate = 0
    num_batch_rows = 0

    # TODO: in order to avoid having to deal with ExprEvalError here, ResultSet should be an iterator
    # over _exec()
    try:
        for data_row in query._exec():
            num_batch_rows += 1
            for (col_name, col_type), e in zip(query.schema.items(), query._select_list_exprs):
                val = data_row[e.slot_idx]
                val_size_bytes: int
                if val is None:
                    # nulls are stored as-is and do not contribute to the size estimate
                    batch_columns[col_name].append(val)
                    continue

                assert val is not None
                if col_type.is_image_type():
                    # images get inlined into the parquet file
                    if data_row.file_paths[e.slot_idx] is not None:
                        # if there is a file, read directly to preserve information
                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
                            val = f.read()
                    elif isinstance(val, PIL.Image.Image):
                        # no file available: save as png
                        buf = io.BytesIO()
                        val.save(buf, format='png')
                        val = buf.getvalue()
                    else:
                        raise excs.Error(f'unknown image type {type(val)}')
                    val_size_bytes = len(val)
                elif col_type.is_string_type():
                    # len() of a str counts characters, so this is only an approximation of the byte size
                    val_size_bytes = len(val)
                elif col_type.is_uuid_type():
                    # pa.uuid() uses fixed_size_binary(16) as storage type
                    val = val.bytes  # Convert UUID to 16-byte binary for arrow
                    val_size_bytes = len(val)
                elif col_type.is_binary_type():
                    val_size_bytes = len(val)
                elif col_type.is_media_type():
                    # remaining media types are exported as their file path rather than inlined
                    assert data_row.file_paths[e.slot_idx] is not None
                    val = data_row.file_paths[e.slot_idx]
                    val_size_bytes = len(val)
                elif col_type.is_json_type():
                    # json values are exported in serialized (string) form
                    val = json.dumps(val)
                    val_size_bytes = len(val)
                elif col_type.is_array_type():
                    val_size_bytes = val.nbytes
                elif col_type.is_int_type() or col_type.is_float_type():
                    val_size_bytes = 8
                elif col_type.is_bool_type():
                    val_size_bytes = 1
                elif col_type.is_date_type():
                    val_size_bytes = 4
                elif col_type.is_timestamp_type():
                    # timestamps are converted to UTC before export
                    val = val.astimezone(datetime.timezone.utc)
                    val_size_bytes = 8
                else:
                    raise excs.Error(f'unknown type {col_type} for {col_name}')

                batch_columns[col_name].append(val)
                current_byte_estimate += val_size_bytes

            # checked once per completed row; num_batch_rows was incremented above, so it is always > 0 here
            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
                record_batch = _to_record_batch(batch_columns, arrow_schema)
                yield record_batch
                batch_columns = {k: [] for k in query.schema}
                current_byte_estimate = 0
                num_batch_rows = 0

    except excs.ExprEvalError as e:
        # NOTE(review): presumably re-raises as a user-facing error; if it returns instead, the rows
        # accumulated so far are still flushed below - confirm against Query._raise_expr_eval_err
        query._raise_expr_eval_err(e)

    # flush whatever is left over after the last full batch
    if num_batch_rows > 0:
        record_batch = _to_record_batch(batch_columns, arrow_schema)
        yield record_batch
194
+
195
+
196
+ def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
79
197
  """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
80
198
  this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
81
199
  """
82
- out: dict[str, Union[list, np.ndarray]] = {}
200
+ out: dict[str, list | np.ndarray] = {}
83
201
  for k, name in enumerate(batch.schema.names):
84
202
  col = batch.column(k)
85
203
  if isinstance(col.type, pa.FixedShapeTensorType):
@@ -92,7 +210,7 @@ def to_pydict(batch: pa.RecordBatch) -> dict[str, Union[list, np.ndarray]]:
92
210
  return out
93
211
 
94
212
 
95
- def iter_tuples(batch: pa.RecordBatch) -> Iterator[dict[str, Any]]:
213
+ def iter_tuples(batch: pa.Table | pa.RecordBatch) -> Iterator[dict[str, Any]]:
96
214
  """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
97
215
  pydict = to_pydict(batch)
98
216
  assert len(pydict) > 0, 'empty record batch'
@@ -102,3 +220,57 @@ def iter_tuples(batch: pa.RecordBatch) -> Iterator[dict[str, Any]]:
102
220
 
103
221
  for i in range(batch_size):
104
222
  yield {col_name: values[i] for col_name, values in pydict.items()}
223
+
224
+
225
def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
    """Convert a value to insertable format

    Args:
        val: the value to convert; None is passed through unchanged
        pxt_type: the column type that determines the conversion

    Returns:
        The converted value.

    Raises:
        ValueError: if `pxt_type` has no conversion, or if a date/timestamp value is neither a string nor an
            instance of the corresponding datetime class.
    """
    if val is None:
        return None

    # scalar types: plain constructor calls
    if pxt_type.is_float_type():
        return float(val)
    if pxt_type.is_int_type():
        return int(val)
    if pxt_type.is_bool_type():
        return bool(val)
    if pxt_type.is_string_type():
        return str(val)

    if pxt_type.is_uuid_type():
        if isinstance(val, uuid.UUID):
            return val
        # raw bytes go through the `bytes=` constructor; anything else is treated as the textual form
        return uuid.UUID(bytes=val) if isinstance(val, bytes) else uuid.UUID(val)

    if pxt_type.is_binary_type():
        assert isinstance(val, bytes)
        return val

    # date/timestamp values of an unexpected Python type fall through to the ValueError below
    if pxt_type.is_date_type():
        if isinstance(val, str):
            return datetime.date.fromisoformat(val)
        if isinstance(val, datetime.date):
            return val
    elif pxt_type.is_timestamp_type():
        if isinstance(val, str):
            return datetime.datetime.fromisoformat(val)
        if isinstance(val, datetime.datetime):
            return val
    elif pxt_type.is_array_type():
        return pxt_type.create_literal(val)

    raise ValueError(f'Unsupported type {pxt_type} for value {val}')
259
+
260
+
261
def iter_tuples2(
    batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
) -> Iterator[dict[str, Any]]:
    """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup

    Args:
        batch: the arrow data to iterate over; must contain at least one column
        col_mapping: optional mapping from arrow column names to Pixeltable column names; columns without an
            entry (or all columns, if this is None) keep their arrow name
        schema: Pixeltable column types, keyed by Pixeltable column name

    Yields:
        One dict per row, keyed by Pixeltable column name, with values converted by `_ar_val_to_pxt_val()`.
    """
    pydict = to_pydict(batch)
    assert len(pydict) > 0, 'empty record batch'

    # `col_mapping` is optional: without this guard, a None mapping fails with AttributeError on `.get`
    name_map = {} if col_mapping is None else col_mapping
    # resolve the output names once instead of once per row
    pxt_names = {col_name: name_map.get(col_name, col_name) for col_name in pydict}

    # the length of the first column gives the number of rows
    batch_size = len(next(iter(pydict.values())))

    for i in range(batch_size):
        # Convert a row to insertable format
        row: dict[str, Any] = {}
        for col_name, values in pydict.items():
            pxt_name = pxt_names[col_name]
            row[pxt_name] = _ar_val_to_pxt_val(values[i], schema[pxt_name])
        yield row
pixeltable/utils/av.py ADDED
@@ -0,0 +1,298 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from fractions import Fraction
5
+ from pathlib import Path
6
+ from types import TracebackType
7
+ from typing import Any, Iterator
8
+
9
+ import av
10
+ import av.stream
11
+ import PIL.Image
12
+ from typing_extensions import Self
13
+
14
+ from pixeltable.env import Env
15
+
16
# Supported audio formats: format name -> (codec, extension)
AUDIO_FORMATS: dict[str, tuple[str, str]] = dict(
    wav=('pcm_s16le', 'wav'),
    mp3=('libmp3lame', 'mp3'),
    flac=('flac', 'flac'),
    mp4=('aac', 'm4a'),
)
23
+
24
+
25
def get_metadata(path: str) -> dict:
    """Open the media file at `path` with PyAV and collect its metadata.

    Returns:
        A dict with the container-level entries 'bit_exact', 'bit_rate', 'size' and 'metadata', plus 'streams':
        a list with one metadata dict per stream of the container.
    """
    with av.open(path) as container:
        assert isinstance(container, av.container.InputContainer)
        per_stream = [__get_stream_metadata(s) for s in container.streams]
        return {
            # `bit_exact` is read defensively and reported as False when the attribute is absent
            'bit_exact': getattr(container, 'bit_exact', False),
            'bit_rate': container.bit_rate,
            'size': container.size,
            'metadata': container.metadata,
            'streams': per_stream,
        }
37
+
38
+
39
def __get_stream_metadata(stream: av.stream.Stream) -> dict:
    """Collect the metadata of a single stream.

    Streams other than audio and video are reported with their type only. For audio and video streams, the
    result holds timing information, the stream's own metadata and a nested 'codec_context' dict; audio streams
    add the channel count to 'codec_context', video streams add the pixel format there and
    dimensions/frame rates at the top level.
    """
    stream_type = stream.type
    if stream_type not in ('audio', 'video'):
        return {'type': stream_type}  # Currently unsupported

    ctx = stream.codec_context
    codec_md: dict[str, Any] = {
        'name': ctx.name,
        'codec_tag': ctx.codec_tag.encode('unicode-escape').decode('utf-8'),
        'profile': ctx.profile,
    }

    duration = stream.duration
    time_base = stream.time_base
    duration_seconds: float | None = None
    if duration is not None and time_base is not None:
        duration_seconds = float(duration * time_base)

    metadata = {
        'type': stream_type,
        'duration': duration,
        'time_base': None if time_base is None else float(time_base),
        'duration_seconds': duration_seconds,
        'frames': stream.frames,
        'metadata': stream.metadata,
        'codec_context': codec_md,
    }

    if stream_type == 'audio':
        # Additional metadata for audio
        channels = getattr(ctx, 'channels', None)
        codec_md['channels'] = None if channels is None else int(channels)
        return metadata

    assert stream_type == 'video'
    assert isinstance(stream, av.video.stream.VideoStream)
    # Additional metadata for video
    codec_md['pix_fmt'] = getattr(ctx, 'pix_fmt', None)

    average_rate = stream.average_rate
    base_rate = stream.base_rate
    guessed_rate = stream.guessed_rate
    metadata['width'] = stream.width
    metadata['height'] = stream.height
    metadata['frames'] = stream.frames
    metadata['average_rate'] = None if average_rate is None else float(average_rate)
    metadata['base_rate'] = None if base_rate is None else float(base_rate)
    metadata['guessed_rate'] = None if guessed_rate is None else float(guessed_rate)
    return metadata
82
+
83
+
84
def get_video_duration(path: str) -> float | None:
    """Return video duration in seconds.

    Args:
        path: location of the media file, passed to `av.open()`

    Returns:
        The duration of the first video stream in seconds, or None if the file has no video stream or no
        duration could be determined.
    """
    with av.open(path) as container:
        video_streams = container.streams.video
        # indexing [0] on a file without video streams raises IndexError, so check for emptiness explicitly
        if len(video_streams) == 0:
            return None
        video_stream = video_streams[0]
        if video_stream.duration is not None:
            return float(video_stream.duration * video_stream.time_base)

        # if duration is not in the header, look for it in the last packet
        last_pts: int | None = None
        for packet in container.demux(video_stream):
            if packet.pts is not None:
                last_pts = packet.pts
        if last_pts is not None:
            return float(last_pts * video_stream.time_base)

    return None
102
+
103
+
104
def has_audio_stream(path: str) -> bool:
    """Check if video has audio stream using PyAV."""
    for stream_md in get_metadata(path)['streams']:
        if stream_md['type'] == 'audio':
            return True
    return False
108
+
109
+
110
def ffmpeg_clip_cmd(
    input_path: str,
    output_path: str,
    start_time: float,
    duration: float | None = None,
    fast: bool = True,
    video_encoder: str | None = None,
    video_encoder_args: dict[str, Any] | None = None,
) -> list[str]:
    """Build the ffmpeg command line that extracts a clip from `input_path` into `output_path`.

    Args:
        input_path: the input file
        output_path: the file to write the clip to
        start_time: value for ffmpeg's `-ss` option
        duration: if given, value for ffmpeg's `-t` option
        fast: if True, place `-ss` before `-i` and stream-copy all streams; otherwise place `-ss` after `-i`,
            copy audio and subtitles and re-encode video
        video_encoder: encoder for the re-encoding case; defaults to `Env.get().default_video_encoder`
        video_encoder_args: extra `-<key> <value>` options appended in the re-encoding case

    Returns:
        The command as an argument list.
    """
    if fast:
        # fast: -ss before -i
        cmd = [
            'ffmpeg',
            '-ss', str(start_time),
            '-i', input_path,
            '-map', '0',  # Copy all streams from input
            '-c', 'copy',  # Stream copy (no re-encoding)
        ]  # fmt: skip
    else:
        encoder = Env.get().default_video_encoder if video_encoder is None else video_encoder
        # accurate: -ss after -i
        cmd = [
            'ffmpeg',
            '-i', input_path,
            '-ss', str(start_time),
            '-map', '0',  # Copy all streams from input
            '-c:a', 'copy',  # audio copy
            '-c:s', 'copy',  # subtitle copy
            '-c:v', encoder,  # re-encode video
        ]  # fmt: skip
        for key, value in (video_encoder_args or {}).items():
            cmd += [f'-{key}', str(value)]

    if duration is not None:
        cmd += ['-t', str(duration)]
    cmd += ['-loglevel', 'error', output_path]
    return cmd
163
+
164
+
165
def ffmpeg_segment_cmd(
    input_path: str,
    output_pattern: str,
    segment_duration: float | None = None,
    segment_times: list[float] | None = None,
    video_encoder: str | None = None,
    video_encoder_args: dict[str, Any] | None = None,
) -> list[str]:
    """Commandline for frame-accurate segmentation

    Exactly one of `segment_duration` and `segment_times` must be given.

    Args:
        input_path: the input file
        output_pattern: output file name pattern for the segments
        segment_duration: length of each segment; passed to `-segment_time`
        segment_times: explicit split points; passed to `-segment_times`
        video_encoder: encoder used to re-encode video; defaults to `Env.get().default_video_encoder`
        video_encoder_args: extra `-<key> <value>` options appended after the encoder

    Returns:
        The command as an argument list.
    """
    assert (segment_duration is None) != (segment_times is None)
    if video_encoder is None:
        video_encoder = Env.get().default_video_encoder

    cmd = [
        'ffmpeg',
        '-i',
        input_path,
        '-map',
        '0',  # Copy all streams from input
        '-c:a',
        'copy',  # don't re-encode audio
        '-c:v',
        video_encoder,  # re-encode video
    ]
    if video_encoder_args is not None:
        for k, v in video_encoder_args.items():
            cmd.extend([f'-{k}', str(v)])

    # -force_key_frames needs to precede -f segment; '-f segment' is therefore emitted exactly once, by the
    # branches below, right after -force_key_frames
    if segment_duration is not None:
        cmd.extend(
            [
                '-force_key_frames',
                f'expr:gte(t,n_forced*{segment_duration})',  # Force keyframe at each segment boundary
                '-f',
                'segment',
                '-segment_time',
                str(segment_duration),
            ]
        )
    else:
        assert segment_times is not None
        times_str = ','.join(str(t) for t in segment_times)
        cmd.extend(['-force_key_frames', times_str, '-f', 'segment', '-segment_times', times_str])

    cmd.extend(
        [
            '-reset_timestamps',
            '1',  # Reset timestamps for each segment
            '-loglevel',
            'error',  # Only show errors
            output_pattern,
        ]
    )
    return cmd
221
+
222
+
223
class VideoFrames:
    """
    Context manager for iterating over video frames at a specified frame rate.

    The container is opened on entry and closed on exit, so iteration must happen inside the `with` block.

    Args:
        path: Path to the video file
        fps: Number of frames to extract per second. If None or 0.0, extracts all frames.
    """

    path: Path
    fps: float
    container: av.container.input.InputContainer | None
    video_framerate: Fraction | None
    video_time_base: Fraction | None
    video_start_time: int | None

    @dataclass
    class Item:
        # One decoded frame. `frame_idx` counts every decoded frame (including those skipped because of `fps`);
        # the remaining fields are copied from the PyAV frame, and `frame` is its PIL image.
        frame_idx: int
        pts: int
        dts: int
        time: float
        is_corrupt: bool
        key_frame: bool
        pict_type: int
        interlaced_frame: bool
        frame: PIL.Image.Image

    def __init__(self, path: Path, fps: float | None = None) -> None:
        self.path = path
        self.fps = 0.0 if fps is None else fps
        self.container = None
        self.video_framerate = None
        self.video_time_base = None
        self.video_start_time = None

    def __enter__(self) -> Self:
        self.container = av.open(self.path)
        stream = self.container.streams.video[0]
        self.video_framerate = stream.average_rate
        self.video_time_base = stream.time_base
        self.video_start_time = stream.start_time or 0
        return self

    def __exit__(
        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
    ) -> None:
        # Clean up
        if self.container:
            self.container.close()

    def __iter__(self) -> Iterator[Item]:
        # Use ONE decode generator for the whole iteration. Creating a fresh generator per frame discards any
        # additional frames that were decoded from the same packet (e.g. frames buffered in the decoder that
        # come out together when the stream is flushed at the end).
        frames = self.container.decode(video=0)
        num_returned = 0
        frame_idx = -1
        while True:
            try:
                frame = next(frames)
            except (StopIteration, EOFError):
                return

            frame_idx += 1
            # at a fixed fps, emit a frame only once the number of frames returned so far no longer exceeds
            # what the frame's timestamp calls for
            if self.fps == 0.0 or (num_returned <= frame.time * self.fps):
                img = frame.to_image()
                assert isinstance(img, PIL.Image.Image)
                yield VideoFrames.Item(
                    frame_idx=frame_idx,
                    pts=frame.pts,
                    dts=frame.dts,
                    time=frame.time,
                    is_corrupt=frame.is_corrupt,
                    key_frame=frame.key_frame,
                    pict_type=frame.pict_type,
                    interlaced_frame=frame.interlaced_frame,
                    frame=img,
                )
                num_returned += 1