pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/io/lancedb.py ADDED
@@ -0,0 +1,3 @@
+ from pixeltable.utils.lancedb import export_lancedb
+
+ __all__ = ['export_lancedb']
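The new pixeltable/io/lancedb.py module above only re-exports export_lancedb from pixeltable.utils.lancedb. Its parameters are not part of this diff, so the sketch below merely confirms the re-export resolves and defers to the interactive help for the signature:

    # Sketch: confirms the new re-export is importable; the signature of
    # export_lancedb is not shown in this diff, so consult help() for details.
    from pixeltable.io.lancedb import export_lancedb

    help(export_lancedb)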
pixeltable/io/pandas.py CHANGED
@@ -1,18 +1,26 @@
- from typing import Any, Optional, Union
+ import os
+ import uuid
+ from typing import Any

  import numpy as np
  import pandas as pd
+ from pandas._typing import DtypeObj # For pandas dtype type hints
+ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

  import pixeltable as pxt
  import pixeltable.exceptions as excs
  import pixeltable.type_system as ts
+ from pixeltable.env import Env


  def import_pandas(
-     tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
-     primary_key: Optional[Union[str, list[str]]] = None,
+     tbl_name: str,
+     df: pd.DataFrame,
+     *,
+     schema_overrides: dict[str, Any] | None = None,
+     primary_key: str | list[str] | None = None,
      num_retained_versions: int = 10,
-     comment: str = ''
+     comment: str = '',
  ) -> pxt.Table:
      """Creates a new base table from a Pandas
      [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
@@ -36,26 +44,24 @@ def import_pandas(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     if schema_overrides is None:
-         schema_overrides = {}
-     if primary_key is None:
-         primary_key = []
-     elif isinstance(primary_key, str):
-         primary_key = [primary_key]
-
-     schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
-     tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
-     table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
-     table.insert(tbl_rows)
-     return table
+     return pxt.create_table(
+         tbl_name,
+         source=df,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         num_retained_versions=num_retained_versions,
+         comment=comment,
+     )


  def import_csv(
-     tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
-     primary_key: Optional[Union[str, list[str]]] = None,
+     tbl_name: str,
+     filepath_or_buffer: str | os.PathLike,
+     schema_overrides: dict[str, Any] | None = None,
+     primary_key: str | list[str] | None = None,
      num_retained_versions: int = 10,
      comment: str = '',
-     **kwargs
+     **kwargs: Any,
  ) -> pxt.Table:
      """
      Creates a new base table from a csv file. This is a convenience method and is equivalent
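With this change, import_pandas is a thin wrapper over pxt.create_table, passing the DataFrame as source. A minimal usage sketch (table names and columns are illustrative and assume a running Pixeltable environment):

    # Illustrative only: 'films' / 'films_copy' are made-up table names.
    import pandas as pd
    import pixeltable as pxt
    from pixeltable.io.pandas import import_pandas

    df = pd.DataFrame({'title': ['Jaws', 'Alien'], 'year': [1975, 1979]})

    # These two calls are now equivalent: import_pandas forwards to create_table.
    t1 = import_pandas('films', df, primary_key='title')
    t2 = pxt.create_table('films_copy', source=df, primary_key='title')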
@@ -66,16 +72,26 @@ def import_csv(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     df = pd.read_csv(filepath_or_buffer, **kwargs)
-     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+     return pxt.create_table(
+         tbl_name,
+         source=filepath_or_buffer,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         num_retained_versions=num_retained_versions,
+         comment=comment,
+         extra_args=kwargs,
+     )


  def import_excel(
-     tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
-     primary_key: Optional[Union[str, list[str]]] = None,
+     tbl_name: str,
+     io: str | os.PathLike,
+     *,
+     schema_overrides: dict[str, Any] | None = None,
+     primary_key: str | list[str] | None = None,
      num_retained_versions: int = 10,
      comment: str = '',
-     **kwargs
+     **kwargs: Any,
  ) -> pxt.Table:
      """
      Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
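import_csv now also routes through pxt.create_table, passing the file path as source and the reader options as extra_args (import_excel, below, follows the same pattern). A usage sketch with illustrative names and paths:

    # Illustrative only: table name, path, and columns are made up.
    import pixeltable as pxt
    from pixeltable.io.pandas import import_csv

    t = import_csv('ratings', 'data/ratings.csv', primary_key='movie_id', sep=',')

    # Equivalent direct call per the new implementation:
    t2 = pxt.create_table(
        'ratings_copy', source='data/ratings.csv', primary_key='movie_id', extra_args={'sep': ','}
    )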
@@ -86,97 +102,77 @@ def import_excel(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     df = pd.read_excel(io, *args, **kwargs)
-     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+     return pxt.create_table(
+         tbl_name,
+         source=io,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         num_retained_versions=num_retained_versions,
+         comment=comment,
+         extra_args=kwargs,
+     )
+
+
+ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+     for pd_name in primary_key:
+         # This can be faster for large DataFrames
+         has_nulls = df[pd_name].count() < len(df)
+         if has_nulls:
+             raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')


- def __df_to_pxt_schema(
-     df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
- ) -> tuple[dict[str, pxt.ColumnType], list[str]]:
+ def df_infer_schema(
+     df: pd.DataFrame, schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
+ ) -> dict[str, ts.ColumnType]:
      """
      Infers a Pixeltable schema from a Pandas DataFrame.

      Returns:
          A tuple containing a Pixeltable schema and a list of primary key column names.
      """
-     for pd_name in schema_overrides:
-         if pd_name not in df.columns:
-             raise excs.Error(
-                 f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
-             )
-     for pd_name in primary_key:
-         if pd_name not in df.columns:
-             raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
-
-     schema: dict[str, pxt.ColumnType] = {}
-     col_mapping: dict[str, str] = {} # Maps Pandas column names to Pixeltable column names
-
+     pd_schema: dict[str, ts.ColumnType] = {}
      for pd_name, pd_dtype in zip(df.columns, df.dtypes):
          if pd_name in schema_overrides:
+             assert isinstance(schema_overrides[pd_name], ts.ColumnType)
              pxt_type = schema_overrides[pd_name]
          else:
-             # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
-             # general objects, so we need to check for nulls in the specific cases where we might expect them.
-             # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
-             # in object columns (where Pandas uses NaN as a general null).
-             # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
-             has_na = any(
-                 (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
-                 for val in df[pd_name]
-             )
-             if has_na and pd_name in primary_key:
-                 raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
-             pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
-         pxt_name = __normalize_pxt_col_name(pd_name)
-         # Ensure that column names are unique by appending a distinguishing suffix
-         # to any collisions
-         if pxt_name in schema:
-             n = 2
-             while f'{pxt_name}_{n}' in schema:
-                 n += 1
-             pxt_name = f'{pxt_name}_{n}'
-         schema[pxt_name] = pxt_type
-         col_mapping[pd_name] = pxt_name
-
-     pxt_pk = [col_mapping[pk] for pk in primary_key]
-     return schema, pxt_pk
-
-
- def __normalize_pxt_col_name(pd_name: str) -> str:
-     """
-     Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
-     - replacing any non-ascii or non-alphanumeric characters with an underscore _
-     - prefixing the result with the letter 'c' if it starts with an underscore or a number
-     """
-     id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in pd_name)
-     if id[0].isnumeric():
-         id = f'c_{id}'
-     elif id[0] == '_':
-         id = f'c{id}'
-     assert pxt.catalog.is_valid_identifier(id), id
-     return id
-
+             pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
+         pd_schema[pd_name] = pxt_type

- def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
-     """
-     Infers a Pixeltable type based on a Numpy dtype.
-     """
-     if np.issubdtype(np_dtype, np.integer):
-         return pxt.IntType(nullable=nullable)
+     return pd_schema

-     if np.issubdtype(np_dtype, np.floating):
-         return pxt.FloatType(nullable=nullable)

-     if np.issubdtype(np_dtype, np.bool_):
-         return pxt.BoolType(nullable=nullable)
+ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> ts.ColumnType | None:
+     """
+     Determines a pixeltable ColumnType from a pandas dtype

-     if np.issubdtype(np_dtype, np.character):
-         return pxt.StringType(nullable=nullable)
+     Args:
+         pd_dtype: A pandas dtype object

-     if np.issubdtype(np_dtype, np.datetime64):
-         return pxt.TimestampType(nullable=nullable)
+     Returns:
+         ts.ColumnType: A pixeltable ColumnType
+     """
+     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+     # compatible with NumPy dtypes
+     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
+     if is_datetime64_any_dtype(pd_dtype):
+         return ts.TimestampType(nullable=nullable)
+     if is_extension_array_dtype(pd_dtype):
+         return None
+     # Most other pandas dtypes are directly NumPy compatible
+     assert isinstance(pd_dtype, np.dtype)
+     return ts.ColumnType.from_np_dtype(pd_dtype, nullable)
+
+
+ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
+     """
+     Infers a Pixeltable type based on a pandas dtype.
+     """
+     pxttype = __pd_dtype_to_pxt_type(pd_dtype, nullable)
+     if pxttype is not None:
+         return pxttype

-     if np_dtype == np.object_:
+     if pd_dtype == np.object_:
          # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
          # based on the actual data in `data_col`.
          # First drop any null values (they don't contribute to type inference).
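The new helpers above check for datetime and pandas extension dtypes before falling back to NumPy dtype mapping; extension dtypes (e.g. nullable Int64) yield None and are then resolved from the column's values. A small standalone sketch of the pandas-level checks the helper relies on:

    # Standalone sketch of the dtype checks used by __pd_dtype_to_pxt_type.
    import numpy as np
    import pandas as pd
    from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

    df = pd.DataFrame({
        'ts': pd.to_datetime(['2024-01-01', '2024-01-02']).tz_localize('UTC'),
        'n': pd.array([1, None], dtype='Int64'),  # nullable extension dtype
        'x': np.array([1.0, 2.0]),
    })
    for name, dtype in df.dtypes.items():
        if is_datetime64_any_dtype(dtype):
            print(name, '-> TimestampType')          # tz-aware datetimes handled first
        elif is_extension_array_dtype(dtype):
            print(name, '-> None (inferred later from the column values)')
        else:
            print(name, '-> ColumnType.from_np_dtype(...)')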
@@ -184,40 +180,65 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo

          if len(data_col) == 0:
              # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
-             return pxt.FloatType(nullable=nullable)
+             return ts.FloatType(nullable=nullable)

-         inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+         inferred_type = ts.ColumnType.infer_common_literal_type(data_col)
          if inferred_type is None:
              # Fallback on StringType if everything else fails
-             return pxt.StringType(nullable=nullable)
+             return ts.StringType(nullable=nullable)
          else:
              return inferred_type.copy(nullable=nullable)

-     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
+     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')


- def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
-     rows = {}
+ def _df_row_to_pxt_row(
+     row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: dict[str, str] | None
+ ) -> dict[str, Any]:
+     """Convert a row to insertable format"""
+     pxt_row: dict[str, Any] = {}
      for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+         pxt_name = col_mapping.get(col_name, col_name)
+         nval: Any
          if pxt_type.is_float_type():
-             val = float(val)
+             nval = float(val)
          elif isinstance(val, float) and np.isnan(val):
              # pandas uses NaN for empty cells, even for types other than float;
              # for any type but a float, convert these to None
-             val = None
+             nval = None
          elif pxt_type.is_int_type():
-             val = int(val)
+             nval = int(val)
          elif pxt_type.is_bool_type():
-             val = bool(val)
+             nval = bool(val)
          elif pxt_type.is_string_type():
-             val = str(val)
+             nval = str(val)
+         elif pxt_type.is_date_type():
+             if pd.isnull(val):
+                 # pandas has the bespoke 'NaT' valud for a missing timestamp
+                 # This is not supported by postgres, and must be converted to None
+                 nval = None
+             else:
+                 nval = pd.Timestamp(val).date()
          elif pxt_type.is_timestamp_type():
              if pd.isnull(val):
-                 # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
-                 # much not-ok with it. (But if we convert it to None and then load out the
-                 # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
-                 val = None
+                 # pandas has the bespoke 'NaT' value for a missing timestamp
+                 # This is not supported by postgres, and must be converted to None
+                 nval = None
              else:
-                 val = pd.Timestamp(val).to_pydatetime()
-         rows[col_name] = val
-     return rows
+                 tval = pd.Timestamp(val)
+                 # pandas supports tz-aware and naive timestamps.
+                 if tval.tz is None:
+                     nval = pd.Timestamp(tval).tz_localize(tz=Env.get().default_time_zone)
+                 else:
+                     nval = tval.astimezone(Env.get().default_time_zone)
+         elif pxt_type.is_uuid_type():
+             if pd.isnull(val):
+                 nval = None
+             elif isinstance(val, uuid.UUID):
+                 nval = val
+             else:
+                 nval = uuid.UUID(val)
+         else:
+             nval = val
+         pxt_row[pxt_name] = nval
+     return pxt_row
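The rewritten row converter maps pandas NaT to None and normalizes timestamps against the environment's default time zone: naive values are localized, tz-aware values are converted. A sketch of that logic in isolation, with 'America/Los_Angeles' standing in for Env.get().default_time_zone:

    # Sketch of the timestamp normalization above; the zone is an illustrative
    # stand-in for Env.get().default_time_zone.
    from zoneinfo import ZoneInfo

    import pandas as pd

    default_tz = ZoneInfo('America/Los_Angeles')

    def normalize(val):
        if pd.isnull(val):   # NaT -> None (Postgres does not accept NaT)
            return None
        tval = pd.Timestamp(val)
        if tval.tz is None:  # naive timestamps get localized ...
            return tval.tz_localize(tz=default_tz)
        return tval.astimezone(default_tz)  # ... tz-aware ones get converted

    print(normalize(pd.NaT))                                      # None
    print(normalize('2024-06-01 12:00'))                          # 2024-06-01 12:00:00-07:00
    print(normalize(pd.Timestamp('2024-06-01 12:00', tz='UTC')))  # 2024-06-01 05:00:00-07:00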
pixeltable/io/parquet.py CHANGED
@@ -1,168 +1,78 @@
  from __future__ import annotations

- import io
  import json
  import logging
- import random
  import typing
- from collections import deque
  from pathlib import Path
- from typing import Any, Optional, Union
-
- import numpy as np
- import PIL.Image
- import datetime
+ from typing import Any

  import pixeltable as pxt
- from pixeltable.env import Env
- import pixeltable.exceptions as exc
- import pixeltable.type_system as ts
+ import pixeltable.exceptions as excs
+ from pixeltable.catalog import Catalog
  from pixeltable.utils.transactional_directory import transactional_directory

  if typing.TYPE_CHECKING:
-     import pyarrow as pa
      import pixeltable as pxt

- _logger = logging.getLogger(__name__)
-
-
- def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
-     import pyarrow as pa
-     from pyarrow import parquet
-
-     pydict = {}
-     for field in schema:
-         if isinstance(field.type, pa.FixedShapeTensorType):
-             stacked_arr = np.stack(value_batch[field.name])
-             pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
-         else:
-             pydict[field.name] = value_batch[field.name]
-
-     tab = pa.Table.from_pydict(pydict, schema=schema)
-     parquet.write_table(tab, str(output_path))
+ _logger = logging.getLogger('pixeltable')


  def export_parquet(
-     table_or_df: Union[pxt.Table, pxt.DataFrame],
-     parquet_path: Path,
-     partition_size_bytes: int = 100_000_000,
-     inline_images: bool = False
- ) -> None:
+     table_or_query: pxt.Table | pxt.Query,
+     parquet_path: Path,
+     partition_size_bytes: int = 100_000_000,
+     inline_images: bool = False,
+ ) -> None:
      """
-     Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
+     Exports a Query's data to one or more Parquet files. Requires pyarrow to be installed.

      It additionally writes the pixeltable metadata in a json file, which would otherwise
      not be available in the parquet format.

      Args:
-         table_or_df : Table or Dataframe to export.
+         table_or_query : Table or Query to export.
          parquet_path : Path to directory to write the parquet files to.
          partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
          inline_images : If True, images are stored inline in the parquet file. This is useful
                          for small images, to be imported as pytorch dataset. But can be inefficient
                          for large images, and cannot be imported into pixeltable.
-                         If False, will raise an error if the Dataframe has any image column.
+                         If False, will raise an error if the Query has any image column.
                          Default False.
      """
-     from pixeltable.utils.arrow import to_arrow_schema
+     import pyarrow as pa

-     df: pxt.DataFrame
-     if isinstance(table_or_df, pxt.catalog.Table):
-         df = table_or_df._df()
-     else:
-         df = table_or_df
+     from pixeltable.utils.arrow import to_record_batches

-     type_dict = {k: v.as_dict() for k, v in df.schema.items()}
-     arrow_schema = to_arrow_schema(df.schema)
+     query: pxt.Query
+     if isinstance(table_or_query, pxt.catalog.Table):
+         query = table_or_query.select()
+     else:
+         query = table_or_query

-     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
-         raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+     if not inline_images and any(col_type.is_image_type() for col_type in query.schema.values()):
+         raise excs.Error('Cannot export Query with image columns when inline_images is False')

      # store the changes atomically
      with transactional_directory(parquet_path) as temp_path:
          # dump metadata json file so we can inspect what was the source of the parquet file later on.
-         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+         json.dump(query.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+         type_dict = {k: v.as_dict() for k, v in query.schema.items()}
          json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
-
          batch_num = 0
-         current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
-         current_byte_estimate = 0
-
-         for data_row in df._exec():
-             for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
-                 val = data_row[e.slot_idx]
-                 if val is None:
-                     current_value_batch[col_name].append(val)
-                     continue
-
-                 assert val is not None
-                 if col_type.is_image_type():
-                     # images get inlined into the parquet file
-                     if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                         # if there is a file, read directly to preserve information
-                         with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                             val = f.read()
-                     elif isinstance(val, PIL.Image.Image):
-                         # if no file available, eg. bc it is computed, convert to png
-                         buf = io.BytesIO()
-                         val.save(buf, format='PNG')
-                         val = buf.getvalue()
-                     else:
-                         assert False, f'unknown image type {type(val)}'
-                     length = len(val)
-                 elif col_type.is_string_type():
-                     length = len(val)
-                 elif col_type.is_video_type():
-                     if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                         val = data_row.file_paths[e.slot_idx]
-                     else:
-                         assert False, f'unknown video type {type(val)}'
-                     length = len(val)
-                 elif col_type.is_json_type():
-                     val = json.dumps(val)
-                     length = len(val)
-                 elif col_type.is_array_type():
-                     length = val.nbytes
-                 elif col_type.is_int_type():
-                     length = 8
-                 elif col_type.is_float_type():
-                     length = 8
-                 elif col_type.is_bool_type():
-                     length = 1
-                 elif col_type.is_timestamp_type():
-                     val = val.astimezone(datetime.timezone.utc)
-                     length = 8
-                 else:
-                     assert False, f'unknown type {col_type} for {col_name}'
-
-                 current_value_batch[col_name].append(val)
-                 current_byte_estimate += length
-             if current_byte_estimate > partition_size_bytes:
-                 assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                 _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+         with Catalog.get().begin_xact(for_write=False):
+             for record_batch in to_record_batches(query, partition_size_bytes):
+                 output_path = temp_path / f'part-{batch_num:05d}.parquet'
+                 arrow_tbl = pa.Table.from_batches([record_batch])
+                 pa.parquet.write_table(arrow_tbl, str(output_path))
                  batch_num += 1
-                 current_value_batch = {k: deque() for k in df.schema.keys()}
-                 current_byte_estimate = 0
-
-         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-
-
- def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional[ts.ColumnType]]:
-     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-     from pyarrow import parquet
-
-     from pixeltable.utils.arrow import to_pixeltable_schema
-
-     input_path = Path(parquet_path).expanduser()
-     parquet_dataset = parquet.ParquetDataset(str(input_path))
-     return to_pixeltable_schema(parquet_dataset.schema)


  def import_parquet(
      table: str,
      *,
      parquet_path: str,
-     schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
+     schema_overrides: dict[str, Any] | None = None,
+     primary_key: str | list[str] | None = None,
      **kwargs: Any,
  ) -> pxt.Table:
      """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
@@ -174,41 +84,18 @@ def import_parquet(
              name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
              `schema_overrides` should be the column names of the Parquet dataset (whether or not they are valid
              Pixeltable identifiers).
+         primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
          kwargs: Additional arguments to pass to `create_table`.

      Returns:
          A handle to the newly created table.
      """
-     from pyarrow import parquet
-
-     import pixeltable as pxt
-     from pixeltable.utils.arrow import iter_tuples
-
-     input_path = Path(parquet_path).expanduser()
-     parquet_dataset = parquet.ParquetDataset(str(input_path))
-
-     schema = parquet_schema_to_pixeltable_schema(parquet_path)
-     if schema_overrides is None:
-         schema_overrides = {}
-
-     schema.update(schema_overrides)
-     for k, v in schema.items():
-         if v is None:
-             raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
-
-     if table in pxt.list_tables():
-         raise exc.Error(f'Table {table} already exists')
-
-     try:
-         tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-         tab = pxt.create_table(tmp_name, schema, **kwargs)
-         for fragment in parquet_dataset.fragments: # type: ignore[attr-defined]
-             for batch in fragment.to_batches():
-                 dict_batch = list(iter_tuples(batch))
-                 tab.insert(dict_batch)
-     except Exception as e:
-         _logger.error(f'Error while inserting Parquet file into table: {e}')
-         raise e
-
-     pxt.move(tmp_name, table)
-     return pxt.get_table(table)
+     value = kwargs.pop('source_format', None)
+     return pxt.create_table(
+         table,
+         source=parquet_path,
+         source_format=value,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         extra_args=kwargs,
+     )
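import_parquet is likewise reduced to a single create_table call: an optional source_format is popped out of kwargs and everything else is forwarded as extra_args. A usage sketch with illustrative names and paths:

    # Illustrative only: table name, path, and primary key column are made up.
    import pixeltable as pxt
    from pixeltable.io.parquet import import_parquet

    t = import_parquet('films', parquet_path='/data/films/', primary_key='film_id')

    # Equivalent direct call per the new implementation:
    t2 = pxt.create_table('films_copy', source='/data/films/', primary_key='film_id')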