pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/io/parquet.py CHANGED
@@ -1,158 +1,78 @@
1
1
  from __future__ import annotations
2
2
 
3
- import datetime
4
- import io
5
3
  import json
6
4
  import logging
7
5
  import typing
8
- from collections import deque
9
6
  from pathlib import Path
10
- from typing import Any, Optional, Union
11
-
12
- import numpy as np
13
- import PIL.Image
7
+ from typing import Any
14
8
 
15
9
  import pixeltable as pxt
16
10
  import pixeltable.exceptions as excs
17
- from pixeltable.env import Env
11
+ from pixeltable.catalog import Catalog
18
12
  from pixeltable.utils.transactional_directory import transactional_directory
19
13
 
20
14
  if typing.TYPE_CHECKING:
21
- import pyarrow as pa
22
-
23
15
  import pixeltable as pxt
24
16
 
25
17
  _logger = logging.getLogger('pixeltable')
26
18
 
27
19
 
28
- def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
29
- import pyarrow as pa
30
- from pyarrow import parquet
31
-
32
- pydict = {}
33
- for field in schema:
34
- if isinstance(field.type, pa.FixedShapeTensorType):
35
- stacked_arr = np.stack(value_batch[field.name])
36
- pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
37
- else:
38
- pydict[field.name] = value_batch[field.name]
39
-
40
- tab = pa.Table.from_pydict(pydict, schema=schema)
41
- parquet.write_table(tab, str(output_path))
42
-
43
-
44
20
  def export_parquet(
45
- table_or_df: Union[pxt.Table, pxt.DataFrame],
21
+ table_or_query: pxt.Table | pxt.Query,
46
22
  parquet_path: Path,
47
23
  partition_size_bytes: int = 100_000_000,
48
24
  inline_images: bool = False,
49
25
  ) -> None:
50
26
  """
51
- Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
27
+ Exports a Query's data to one or more Parquet files. Requires pyarrow to be installed.
52
28
 
53
29
  It additionally writes the pixeltable metadata in a json file, which would otherwise
54
30
  not be available in the parquet format.
55
31
 
56
32
  Args:
57
- table_or_df : Table or Dataframe to export.
33
+ table_or_query : Table or Query to export.
58
34
  parquet_path : Path to directory to write the parquet files to.
59
35
  partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
60
36
  inline_images : If True, images are stored inline in the parquet file. This is useful
61
37
  for small images, to be imported as pytorch dataset. But can be inefficient
62
38
  for large images, and cannot be imported into pixeltable.
63
- If False, will raise an error if the Dataframe has any image column.
39
+ If False, will raise an error if the Query has any image column.
64
40
  Default False.
65
41
  """
66
- from pixeltable.utils.arrow import to_arrow_schema
42
+ import pyarrow as pa
67
43
 
68
- df: pxt.DataFrame
69
- if isinstance(table_or_df, pxt.catalog.Table):
70
- df = table_or_df._df()
71
- else:
72
- df = table_or_df
44
+ from pixeltable.utils.arrow import to_record_batches
73
45
 
74
- type_dict = {k: v.as_dict() for k, v in df.schema.items()}
75
- arrow_schema = to_arrow_schema(df.schema)
46
+ query: pxt.Query
47
+ if isinstance(table_or_query, pxt.catalog.Table):
48
+ query = table_or_query.select()
49
+ else:
50
+ query = table_or_query
76
51
 
77
- if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
78
- raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
52
+ if not inline_images and any(col_type.is_image_type() for col_type in query.schema.values()):
53
+ raise excs.Error('Cannot export Query with image columns when inline_images is False')
79
54
 
80
55
  # store the changes atomically
81
56
  with transactional_directory(parquet_path) as temp_path:
82
57
  # dump metadata json file so we can inspect what was the source of the parquet file later on.
83
- json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
58
+ json.dump(query.as_dict(), (temp_path / '.pixeltable.json').open('w'))
59
+ type_dict = {k: v.as_dict() for k, v in query.schema.items()}
84
60
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
85
-
86
61
  batch_num = 0
87
- current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
88
- current_byte_estimate = 0
89
-
90
- with Env.get().begin_xact():
91
- for data_row in df._exec():
92
- for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
93
- val = data_row[e.slot_idx]
94
- if val is None:
95
- current_value_batch[col_name].append(val)
96
- continue
97
-
98
- assert val is not None
99
- if col_type.is_image_type():
100
- # images get inlined into the parquet file
101
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
102
- # if there is a file, read directly to preserve information
103
- with open(data_row.file_paths[e.slot_idx], 'rb') as f:
104
- val = f.read()
105
- elif isinstance(val, PIL.Image.Image):
106
- # if no file available, eg. bc it is computed, convert to png
107
- buf = io.BytesIO()
108
- val.save(buf, format='PNG')
109
- val = buf.getvalue()
110
- else:
111
- raise excs.Error(f'unknown image type {type(val)}')
112
- length = len(val)
113
- elif col_type.is_string_type():
114
- length = len(val)
115
- elif col_type.is_video_type():
116
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
117
- val = data_row.file_paths[e.slot_idx]
118
- else:
119
- raise excs.Error(f'unknown video type {type(val)}')
120
- length = len(val)
121
- elif col_type.is_json_type():
122
- val = json.dumps(val)
123
- length = len(val)
124
- elif col_type.is_array_type():
125
- length = val.nbytes
126
- elif col_type.is_int_type() or col_type.is_float_type():
127
- length = 8
128
- elif col_type.is_bool_type():
129
- length = 1
130
- elif col_type.is_date_type():
131
- length = 4
132
- elif col_type.is_timestamp_type():
133
- val = val.astimezone(datetime.timezone.utc)
134
- length = 8
135
- else:
136
- raise excs.Error(f'unknown type {col_type} for {col_name}')
137
-
138
- current_value_batch[col_name].append(val)
139
- current_byte_estimate += length
140
- if current_byte_estimate > partition_size_bytes:
141
- assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
142
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
143
- batch_num += 1
144
- current_value_batch = {k: deque() for k in df.schema}
145
- current_byte_estimate = 0
146
-
147
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
62
+ with Catalog.get().begin_xact(for_write=False):
63
+ for record_batch in to_record_batches(query, partition_size_bytes):
64
+ output_path = temp_path / f'part-{batch_num:05d}.parquet'
65
+ arrow_tbl = pa.Table.from_batches([record_batch])
66
+ pa.parquet.write_table(arrow_tbl, str(output_path))
67
+ batch_num += 1
148
68
 
149
69
 
150
70
  def import_parquet(
151
71
  table: str,
152
72
  *,
153
73
  parquet_path: str,
154
- schema_overrides: Optional[dict[str, Any]] = None,
155
- primary_key: Optional[Union[str, list[str]]] = None,
74
+ schema_overrides: dict[str, Any] | None = None,
75
+ primary_key: str | list[str] | None = None,
156
76
  **kwargs: Any,
157
77
  ) -> pxt.Table:
158
78
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.