pixeltable-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: the registry has flagged this release of pixeltable as potentially problematic.

Files changed (119)
  1. pixeltable/__init__.py +53 -0
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/__init__.py +13 -0
  4. pixeltable/catalog/catalog.py +159 -0
  5. pixeltable/catalog/column.py +181 -0
  6. pixeltable/catalog/dir.py +32 -0
  7. pixeltable/catalog/globals.py +33 -0
  8. pixeltable/catalog/insertable_table.py +192 -0
  9. pixeltable/catalog/named_function.py +36 -0
  10. pixeltable/catalog/path.py +58 -0
  11. pixeltable/catalog/path_dict.py +139 -0
  12. pixeltable/catalog/schema_object.py +39 -0
  13. pixeltable/catalog/table.py +695 -0
  14. pixeltable/catalog/table_version.py +1026 -0
  15. pixeltable/catalog/table_version_path.py +133 -0
  16. pixeltable/catalog/view.py +203 -0
  17. pixeltable/dataframe.py +749 -0
  18. pixeltable/env.py +466 -0
  19. pixeltable/exceptions.py +17 -0
  20. pixeltable/exec/__init__.py +10 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +94 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +73 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +226 -0
  31. pixeltable/exprs/__init__.py +25 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +114 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +199 -0
  39. pixeltable/exprs/expr.py +594 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +382 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +96 -0
  44. pixeltable/exprs/in_predicate.py +96 -0
  45. pixeltable/exprs/inline_array.py +109 -0
  46. pixeltable/exprs/inline_dict.py +103 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +66 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +329 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/similarity_expr.py +65 -0
  56. pixeltable/exprs/type_cast.py +53 -0
  57. pixeltable/exprs/variable.py +45 -0
  58. pixeltable/ext/__init__.py +5 -0
  59. pixeltable/ext/functions/yolox.py +92 -0
  60. pixeltable/func/__init__.py +7 -0
  61. pixeltable/func/aggregate_function.py +197 -0
  62. pixeltable/func/callable_function.py +113 -0
  63. pixeltable/func/expr_template_function.py +99 -0
  64. pixeltable/func/function.py +141 -0
  65. pixeltable/func/function_registry.py +227 -0
  66. pixeltable/func/globals.py +46 -0
  67. pixeltable/func/nos_function.py +202 -0
  68. pixeltable/func/signature.py +162 -0
  69. pixeltable/func/udf.py +164 -0
  70. pixeltable/functions/__init__.py +95 -0
  71. pixeltable/functions/eval.py +215 -0
  72. pixeltable/functions/fireworks.py +34 -0
  73. pixeltable/functions/huggingface.py +167 -0
  74. pixeltable/functions/image.py +16 -0
  75. pixeltable/functions/openai.py +289 -0
  76. pixeltable/functions/pil/image.py +147 -0
  77. pixeltable/functions/string.py +13 -0
  78. pixeltable/functions/together.py +143 -0
  79. pixeltable/functions/util.py +52 -0
  80. pixeltable/functions/video.py +62 -0
  81. pixeltable/globals.py +425 -0
  82. pixeltable/index/__init__.py +2 -0
  83. pixeltable/index/base.py +51 -0
  84. pixeltable/index/embedding_index.py +168 -0
  85. pixeltable/io/__init__.py +3 -0
  86. pixeltable/io/hf_datasets.py +178 -0
  87. pixeltable/io/pandas.py +148 -0
  88. pixeltable/io/parquet.py +197 -0
  89. pixeltable/iterators/__init__.py +3 -0
  90. pixeltable/iterators/base.py +52 -0
  91. pixeltable/iterators/document.py +432 -0
  92. pixeltable/iterators/video.py +88 -0
  93. pixeltable/metadata/__init__.py +58 -0
  94. pixeltable/metadata/converters/convert_10.py +18 -0
  95. pixeltable/metadata/converters/convert_12.py +3 -0
  96. pixeltable/metadata/converters/convert_13.py +41 -0
  97. pixeltable/metadata/schema.py +234 -0
  98. pixeltable/plan.py +620 -0
  99. pixeltable/store.py +424 -0
  100. pixeltable/tool/create_test_db_dump.py +184 -0
  101. pixeltable/tool/create_test_video.py +81 -0
  102. pixeltable/type_system.py +846 -0
  103. pixeltable/utils/__init__.py +17 -0
  104. pixeltable/utils/arrow.py +98 -0
  105. pixeltable/utils/clip.py +18 -0
  106. pixeltable/utils/coco.py +136 -0
  107. pixeltable/utils/documents.py +69 -0
  108. pixeltable/utils/filecache.py +195 -0
  109. pixeltable/utils/help.py +11 -0
  110. pixeltable/utils/http_server.py +70 -0
  111. pixeltable/utils/media_store.py +76 -0
  112. pixeltable/utils/pytorch.py +91 -0
  113. pixeltable/utils/s3.py +13 -0
  114. pixeltable/utils/sql.py +17 -0
  115. pixeltable/utils/transactional_directory.py +35 -0
  116. pixeltable-0.0.0.dist-info/LICENSE +18 -0
  117. pixeltable-0.0.0.dist-info/METADATA +131 -0
  118. pixeltable-0.0.0.dist-info/RECORD +119 -0
  119. pixeltable-0.0.0.dist-info/WHEEL +4 -0
pixeltable/io/hf_datasets.py
@@ -0,0 +1,178 @@
+ from __future__ import annotations
+
+ import logging
+ import math
+ import random
+ import typing
+ from typing import Union, Optional, Any
+
+ import pixeltable
+ import pixeltable.type_system as ts
+ from pixeltable import exceptions as excs
+
+ if typing.TYPE_CHECKING:
+     import datasets
+
+ _logger = logging.getLogger(__name__)
+
+ # Use 100MB as the batch size limit when loading a huggingface dataset into pixeltable.
+ # The primary goal is to bound memory use, regardless of dataset size.
+ # A secondary goal is to limit overhead; 100MB is presumed reasonable for most storage systems.
+ _K_BATCH_SIZE_BYTES = 100_000_000
+
+ # Note: there are many more feature types. The schema_override parameter allows overrides
+ # for cases where the appropriate type is not yet mapped, or to override this mapping.
+ # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
+ _hf_to_pxt: dict[str, ts.ColumnType] = {
+     'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
+     'int64': ts.IntType(nullable=True),
+     'bool': ts.BoolType(nullable=True),
+     'float32': ts.FloatType(nullable=True),
+     'string': ts.StringType(nullable=True),
+     'timestamp[s]': ts.TimestampType(nullable=True),
+     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
+ }
+
+
+ def _to_pixeltable_type(
+     feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
+ ) -> Optional[ts.ColumnType]:
+     """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
+     import datasets
+
+     if isinstance(feature_type, datasets.ClassLabel):
+         # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
+         return ts.StringType(nullable=True)
+     elif isinstance(feature_type, datasets.Value):
+         # example: Value(dtype='int64', id=None)
+         return _hf_to_pxt.get(feature_type.dtype, None)
+     elif isinstance(feature_type, datasets.Sequence):
+         # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
+         dtype = _to_pixeltable_type(feature_type.feature)
+         length = feature_type.length if feature_type.length != -1 else None
+         return ts.ArrayType(shape=(length,), dtype=dtype)
+     else:
+         return None
+
+
+ def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
+     """Get the schema of a huggingface dataset as a dictionary."""
+     import datasets
+
+     first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
+     return first_dataset.features
+
+
+ def huggingface_schema_to_pixeltable_schema(
+     hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
+ ) -> dict[str, Optional[ts.ColumnType]]:
+     """Generate a pixeltable schema from a huggingface dataset schema.
+     Columns without a known mapping are mapped to None.
+     """
+     hf_schema = _get_hf_schema(hf_dataset)
+     pixeltable_schema = {
+         column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
+     }
+     return pixeltable_schema
+
+
+ def import_huggingface_dataset(
+     table_path: str,
+     dataset: Union[datasets.Dataset, datasets.DatasetDict],
+     *,
+     column_name_for_split: Optional[str] = None,
+     schema_override: Optional[dict[str, Any]] = None,
+     **kwargs,
+ ) -> 'pixeltable.InsertableTable':
+     """Create a new `Table` from a Huggingface dataset, or a dataset dict with multiple splits.
+     Requires the `datasets` library to be installed.
+
+     Args:
+         table_path: Path to the table.
+         dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
+         column_name_for_split: Column name to use for split information. If None, no split information will be stored.
+         schema_override: Optional dictionary mapping column names to column types, overriding the corresponding
+             defaults from `huggingface_schema_to_pixeltable_schema`. The column type should be a pixeltable
+             ColumnType; for example, {'col_vid': VideoType()} rather than {'col_vid': StringType()}.
+         kwargs: Additional arguments to pass to `create_table`.
+
+     Returns:
+         The newly created table, with the data from the dataset already loaded.
+     """
+     import datasets
+     import pixeltable as pxt
+
+     if table_path in pxt.list_tables():
+         raise excs.Error(f'table {table_path} already exists')
+
+     if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
+         raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
+
+     if isinstance(dataset, datasets.Dataset):
+         # when loading an hf dataset partially, dataset.split._name sometimes has the form "train[0:1000]"
+         raw_name = dataset.split._name
+         split_name = raw_name.split('[')[0] if raw_name is not None else None
+         dataset_dict = {split_name: dataset}
+     else:
+         dataset_dict = dataset
+
+     pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
+     if schema_override is not None:
+         pixeltable_schema.update(schema_override)
+
+     if column_name_for_split is not None:
+         if column_name_for_split in pixeltable_schema:
+             raise excs.Error(
+                 f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
+             )
+         pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
+
+     for field, column_type in pixeltable_schema.items():
+         if column_type is None:
+             raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
+
+     # extract all class labels from the dataset to translate category ints to strings
+     hf_schema = _get_hf_schema(dataset)
+     categorical_features = {
+         feature_name: feature_type.names
+         for (feature_name, feature_type) in hf_schema.items()
+         if isinstance(feature_type, datasets.ClassLabel)
+     }
+
+     try:
+         # random tmp name
+         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
+         tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
+
+         def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
+             output_row = row.copy()
+             # map all class labels to strings
+             for field, values in categorical_features.items():
+                 output_row[field] = values[row[field]]
+             # add split name to row
+             if column_name_for_split is not None:
+                 output_row[column_name_for_split] = split_name
+             return output_row
+
+         for split_name, split_dataset in dataset_dict.items():
+             # guard against a zero batch count for datasets smaller than the batch size limit
+             num_batches = max(split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES, 1.0)
+             tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
+             assert tuples_per_batch > 0
+
+             batch = []
+             for row in split_dataset:
+                 batch.append(_translate_row(row, split_name))
+                 if len(batch) >= tuples_per_batch:
+                     tab.insert(batch)
+                     batch = []
+             # last batch
+             if len(batch) > 0:
+                 tab.insert(batch)
+
+     except Exception as e:
+         _logger.error(f'Error while inserting dataset into table: {tmp_name}')
+         raise e
+
+     pxt.move(tmp_name, table_path)
+     return pxt.get_table(table_path)
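
For context, a minimal usage sketch of `import_huggingface_dataset` from the file above. The `rotten_tomatoes` dataset and the `reviews` table path are illustrative only, not part of this release:

    import datasets

    from pixeltable.io.hf_datasets import import_huggingface_dataset

    # load a DatasetDict with 'train'/'validation'/'test' splits (illustrative dataset)
    ds = datasets.load_dataset('rotten_tomatoes')

    # one pixeltable table holds all splits; each row records its split name
    # in the 'split' column
    tbl = import_huggingface_dataset('reviews', ds, column_name_for_split='split')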
pixeltable/io/pandas.py
@@ -0,0 +1,148 @@
+ from typing import Optional, Any
+
+ import numpy as np
+ import pandas as pd
+
+ import pixeltable as pxt
+ import pixeltable.exceptions as excs
+ import pixeltable.type_system as ts
+
+
+ def import_pandas(
+     tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
+ ) -> pxt.catalog.InsertableTable:
+     """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
+     will be inferred from the `DataFrame`, unless `schema_overrides` is specified.
+
+     The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
+     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
+     the following procedure:
+     - first, replace any non-alphanumeric characters with underscores;
+     - then, prefix the result with the letter 'c' if it begins with a number or an underscore;
+     - finally, if there are any duplicate column names, suffix the duplicates with '_2', '_3', etc., in column order.
+
+     Args:
+         tbl_name: The name of the table to create.
+         df: The Pandas `DataFrame`.
+         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
+             name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
+             `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
+             Pixeltable identifiers).
+     """
+     schema = _df_to_pxt_schema(df, schema_overrides)
+     tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
+     table = pxt.create_table(tbl_name, schema)
+     table.insert(tbl_rows)
+     return table
+
+
+ def import_csv(
+     table_path: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
+ ) -> pxt.catalog.InsertableTable:
+     """
+     Creates a new `Table` from a csv file. This is a convenience method and is equivalent to calling
+     `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema_overrides=schema_overrides)`.
+     See the Pandas documentation for `read_csv` for more details.
+     """
+     df = pd.read_csv(filepath_or_buffer, **kwargs)
+     return import_pandas(table_path, df, schema_overrides=schema_overrides)
+
+
+ def import_excel(
+     table_path: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
+ ) -> pxt.catalog.InsertableTable:
+     """
+     Creates a new `Table` from an Excel (.xlsx) file. This is a convenience method and is equivalent to calling
+     `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema_overrides=schema_overrides)`.
+     See the Pandas documentation for `read_excel` for more details.
+     """
+     df = pd.read_excel(io, *args, **kwargs)
+     return import_pandas(table_path, df, schema_overrides=schema_overrides)
+
+
+ def _df_to_pxt_schema(
+     df: pd.DataFrame, schema_overrides: Optional[dict[str, pxt.ColumnType]]
+ ) -> dict[str, pxt.ColumnType]:
+     if schema_overrides is not None:
+         for pd_name in schema_overrides:
+             if pd_name not in df.columns:
+                 raise excs.Error(
+                     f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
+                 )
+     schema = {}
+     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
+         if schema_overrides is not None and pd_name in schema_overrides:
+             pxt_type = schema_overrides[pd_name]
+         else:
+             pxt_type = _np_dtype_to_pxt_type(pd_dtype, df[pd_name])
+         pxt_name = _normalize_pxt_col_name(pd_name)
+         # Ensure that column names are unique by appending a distinguishing suffix
+         # to any collisions
+         if pxt_name in schema:
+             n = 2
+             while f'{pxt_name}_{n}' in schema:
+                 n += 1
+             pxt_name = f'{pxt_name}_{n}'
+         schema[pxt_name] = pxt_type
+     return schema
+
+
+ def _normalize_pxt_col_name(pd_name: str) -> str:
+     """
+     Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
+     - replacing any non-ascii or non-alphanumeric characters with an underscore _
+     - prefixing the result with the letter 'c' if it starts with an underscore or a number
+     """
+     id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in pd_name)
+     if id[0].isnumeric():
+         id = f'c_{id}'
+     elif id[0] == '_':
+         id = f'c{id}'
+     assert pxt.catalog.is_valid_identifier(id), id
+     return id
+
+
+ def _np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series) -> pxt.ColumnType:
+     """
+     Infers a Pixeltable type based on a Numpy dtype.
+     """
+     if np.issubdtype(np_dtype, np.integer):
+         return pxt.IntType()
+     if np.issubdtype(np_dtype, np.floating):
+         return pxt.FloatType()
+     if np.issubdtype(np_dtype, np.bool_):
+         return pxt.BoolType()
+     if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
+         has_nan = any(isinstance(val, float) and np.isnan(val) for val in data_col)
+         return pxt.StringType(nullable=has_nan)
+     if np.issubdtype(np_dtype, np.datetime64):
+         has_nat = any(pd.isnull(val) for val in data_col)
+         return pxt.TimestampType(nullable=has_nat)
+     raise excs.Error(f'Unsupported dtype: {np_dtype}')
+
+
+ def _df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
+     pxt_row = {}
+     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+         if pxt_type.is_float_type():
+             val = float(val)
+         elif isinstance(val, float) and np.isnan(val):
+             # pandas uses NaN for empty cells, even for types other than float;
+             # for any type but a float, convert these to None
+             val = None
+         elif pxt_type.is_int_type():
+             val = int(val)
+         elif pxt_type.is_bool_type():
+             val = bool(val)
+         elif pxt_type.is_string_type():
+             val = str(val)
+         elif pxt_type.is_timestamp_type():
+             if pd.isnull(val):
+                 # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
+                 # much not-ok with it. (But if we convert it to None and then load out the
+                 # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
+                 val = None
+             else:
+                 val = pd.Timestamp(val).to_pydatetime()
+         pxt_row[col_name] = val
+     return pxt_row
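
To make the normalization and override rules above concrete, a small sketch (the column names and the `demo` table name are made up):

    import pandas as pd
    import pixeltable as pxt
    from pixeltable.io.pandas import import_pandas

    df = pd.DataFrame({'1 name': ['a', 'b'], 'score': [1.5, None]})

    # '1 name' is not a valid identifier: the space becomes an underscore and the
    # leading digit gains a 'c_' prefix, yielding the column name 'c_1_name'
    tbl = import_pandas('demo', df, schema_overrides={'score': pxt.FloatType(nullable=True)})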
pixeltable/io/parquet.py
@@ -0,0 +1,197 @@
+ from __future__ import annotations
+
+ import io
+ import json
+ import logging
+ import random
+ import typing
+ from collections import deque
+ from pathlib import Path
+ from typing import Dict, Optional
+
+ import PIL.Image
+ import numpy as np
+
+ import pixeltable.exceptions as exc
+ import pixeltable.type_system as ts
+ from pixeltable.utils.transactional_directory import transactional_directory
+
+ if typing.TYPE_CHECKING:
+     import pixeltable as pxt
+     import pyarrow as pa
+
+ _logger = logging.getLogger(__name__)
+
+
+ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
+     import pyarrow as pa
+     import pyarrow.parquet  # ensure pa.parquet is available
+
+     pydict = {}
+     for field in schema:
+         if isinstance(field.type, pa.FixedShapeTensorType):
+             stacked_arr = np.stack(value_batch[field.name])
+             pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
+         else:
+             pydict[field.name] = value_batch[field.name]
+
+     tab = pa.Table.from_pydict(pydict, schema=schema)
+     pa.parquet.write_table(tab, output_path)
+
+
+ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
+     """
+     Internal method to stream DataFrame data to parquet format
+     without materializing the dataset in memory.
+
+     Pixeltable type metadata is preserved in a json file, since it would otherwise
+     not be available in the parquet format.
+
+     Images are stored inline in a compressed format in their parquet file.
+
+     Args:
+         df: DataFrame to save.
+         dest_path: Path of the directory to save the parquet files to.
+         partition_size_bytes: Maximum target size for each chunk. Default 100_000_000 bytes.
+     """
+     from pixeltable.utils.arrow import to_arrow_schema
+
+     column_names = df.get_column_names()
+     column_types = df.get_column_types()
+     type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
+     arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+
+     # store the changes atomically
+     with transactional_directory(dest_path) as temp_path:
+         # dump metadata json file so we can inspect the source of the parquet file later on
+         json.dump(df._as_dict(), (temp_path / '.pixeltable.json').open('w'))  # pylint: disable=protected-access
+         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
+
+         batch_num = 0
+         current_value_batch: Dict[str, deque] = {k: deque() for k in column_names}
+         current_byte_estimate = 0
+
+         for data_row in df._exec():  # pylint: disable=protected-access
+             for col_name, col_type, e in zip(column_names, column_types, df._select_list_exprs):  # pylint: disable=protected-access
+                 val = data_row[e.slot_idx]
+                 if val is None:
+                     current_value_batch[col_name].append(val)
+                     continue
+
+                 assert val is not None
+                 if col_type.is_image_type():
+                     # images get inlined into the parquet file
+                     if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
+                         # if there is a file, read directly to preserve information
+                         with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                             val = f.read()
+                     elif isinstance(val, PIL.Image.Image):
+                         # if no file is available, e.g. because the image is computed, convert to png
+                         buf = io.BytesIO()
+                         val.save(buf, format='PNG')
+                         val = buf.getvalue()
+                     else:
+                         assert False, f'unknown image type {type(val)}'
+                     length = len(val)
+                 elif col_type.is_string_type():
+                     length = len(val)
+                 elif col_type.is_video_type():
+                     if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
+                         val = data_row.file_paths[e.slot_idx]
+                     else:
+                         assert False, f'unknown video type {type(val)}'
+                     length = len(val)
+                 elif col_type.is_json_type():
+                     val = json.dumps(val)
+                     length = len(val)
+                 elif col_type.is_array_type():
+                     length = val.nbytes
+                 elif col_type.is_int_type():
+                     length = 8
+                 elif col_type.is_float_type():
+                     length = 8
+                 elif col_type.is_bool_type():
+                     length = 1
+                 elif col_type.is_timestamp_type():
+                     length = 8
+                 else:
+                     assert False, f'unknown type {col_type} for {col_name}'
+
+                 current_value_batch[col_name].append(val)
+                 current_byte_estimate += length
+
+             # flush only at row boundaries, so that all columns stay the same length
+             if current_byte_estimate > partition_size_bytes:
+                 assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
+                 _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+                 batch_num += 1
+                 current_value_batch = {k: deque() for k in column_names}
+                 current_byte_estimate = 0
+
+         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+
+
+ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
+     """Generate a default pixeltable schema for the given parquet file. Unknown types map to None."""
+     import pyarrow as pa
+     import pyarrow.parquet  # ensure pa.parquet is available
+     from pixeltable.utils.arrow import to_pixeltable_schema
+
+     input_path = Path(parquet_path).expanduser()
+     parquet_dataset = pa.parquet.ParquetDataset(input_path)
+     return to_pixeltable_schema(parquet_dataset.schema)
+
+
+ def import_parquet(
+     table_path: str,
+     *,
+     parquet_path: str,
+     schema_override: Optional[Dict[str, ts.ColumnType]] = None,
+     **kwargs,
+ ) -> pxt.catalog.InsertableTable:
+     """Create a new `Table` from a Parquet file or set of files. Requires pyarrow to be installed.
+
+     Args:
+         table_path: Path to the table within pixeltable.
+         parquet_path: Path to an individual Parquet file or a directory of Parquet files.
+         schema_override: Optional dictionary mapping column names to column types, overriding the default
+             schema inferred from the Parquet file. The column type should be a pixeltable ColumnType;
+             for example, {'col_vid': VideoType()} rather than {'col_vid': StringType()}. Any fields not
+             provided explicitly are mapped with `parquet_schema_to_pixeltable_schema`.
+         kwargs: Additional arguments to pass to `create_table`.
+
+     Returns:
+         The newly created table, with the data from the Parquet file(s) already loaded.
+     """
+     import pixeltable as pxt
+     import pyarrow as pa
+     import pyarrow.parquet  # ensure pa.parquet is available
+     from pixeltable.utils.arrow import iter_tuples
+
+     input_path = Path(parquet_path).expanduser()
+     parquet_dataset = pa.parquet.ParquetDataset(input_path)
+
+     schema = parquet_schema_to_pixeltable_schema(parquet_path)
+     if schema_override is None:
+         schema_override = {}
+
+     schema.update(schema_override)
+     for k, v in schema.items():
+         if v is None:
+             raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
+
+     if table_path in pxt.list_tables():
+         raise exc.Error(f'Table {table_path} already exists')
+
+     try:
+         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
+         tab = pxt.create_table(tmp_name, schema, **kwargs)
+         for fragment in parquet_dataset.fragments:
+             for batch in fragment.to_batches():
+                 dict_batch = list(iter_tuples(batch))
+                 tab.insert(dict_batch)
+     except Exception as e:
+         _logger.error(f'Error while inserting Parquet file into table: {e}')
+         raise e
+
+     pxt.move(tmp_name, table_path)
+     return pxt.get_table(table_path)
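
A round-trip sketch for the functions above; the `~/data/events` path and the `events` table name are placeholders:

    from pixeltable.io.parquet import import_parquet, parquet_schema_to_pixeltable_schema

    # inspect the inferred schema first; any column mapping to None needs a schema_override
    print(parquet_schema_to_pixeltable_schema('~/data/events'))

    # import the parquet file(s) into a new table
    tbl = import_parquet('events', parquet_path='~/data/events')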
pixeltable/iterators/__init__.py
@@ -0,0 +1,3 @@
+ from .base import ComponentIterator
+ from .document import DocumentSplitter
+ from .video import FrameIterator
pixeltable/iterators/base.py
@@ -0,0 +1,52 @@
+ from __future__ import annotations
+ from typing import Dict, Any, Tuple, List
+ from abc import abstractmethod, ABC
+
+ from pixeltable.type_system import ColumnType
+
+
+ class ComponentIterator(ABC):
+     """Base class for iterators."""
+
+     @classmethod
+     @abstractmethod
+     def input_schema(cls) -> Dict[str, ColumnType]:
+         """Provide the Pixeltable types of the init() parameters.
+
+         The keys need to match the names of the init() parameters. This is equivalent to the
+         parameters_types parameter of the @function decorator.
+         """
+         raise NotImplementedError
+
+     @classmethod
+     @abstractmethod
+     def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+         """Specify the dictionary returned by next() and a list of unstored column names.
+
+         Returns:
+             a dictionary which is turned into a list of columns in the output table
+             a list of unstored column names
+         """
+         raise NotImplementedError
+
+     def __iter__(self) -> ComponentIterator:
+         return self
+
+     @abstractmethod
+     def __next__(self) -> Dict[str, Any]:
+         """Return the next element of the iterator as a dictionary, or raise StopIteration."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def close(self) -> None:
+         """Close the iterator and release all resources."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def set_pos(self, pos: int) -> None:
+         """Set the iterator position to pos."""
+         raise NotImplementedError
+
+     @classmethod
+     def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+         return cls, kwargs
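
To make the ComponentIterator contract concrete, a minimal sketch of a hypothetical subclass (not part of this release) that emits one output row per line of a text file:

    from typing import Any, Dict, List, Tuple

    import pixeltable.type_system as ts
    from pixeltable.iterators import ComponentIterator


    class LineIterator(ComponentIterator):
        """One output row per line of a text file (illustrative only)."""

        def __init__(self, file: str):
            self._lines = open(file, encoding='utf-8').read().splitlines()
            self._pos = 0

        @classmethod
        def input_schema(cls) -> Dict[str, ts.ColumnType]:
            # keys must match the __init__ parameter names
            return {'file': ts.StringType()}

        @classmethod
        def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ts.ColumnType], List[str]]:
            # the output columns, plus an (empty) list of unstored column names
            return {'line': ts.StringType()}, []

        def __next__(self) -> Dict[str, Any]:
            if self._pos >= len(self._lines):
                raise StopIteration
            row = {'line': self._lines[self._pos]}
            self._pos += 1
            return row

        def close(self) -> None:
            self._lines = []

        def set_pos(self, pos: int) -> None:
            self._pos = pos

`LineIterator.create(file='notes.txt')` then returns the `(class, kwargs)` pair that the rest of the codebase passes around when constructing component views.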