pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (110)
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/io/globals.py (new file)

@@ -0,0 +1,59 @@
+from typing import Any, Optional, Literal
+
+import pixeltable as pxt
+from pixeltable import Table
+
+
+def create_label_studio_project(
+    t: Table,
+    label_config: str,
+    col_mapping: Optional[dict[str, str]] = None,
+    title: Optional[str] = None,
+    media_import_method: Literal['post', 'file'] = 'file',
+    sync_immediately: bool = True,
+    **kwargs: Any
+) -> None:
+    """
+    Creates a new Label Studio project and links it to the specified `Table`.
+
+    The required parameter `label_config` specifies the Label Studio project configuration,
+    in XML format, as described in the Label Studio documentation. The linked project will
+    have one column for each data field in the configuration; for example, if the
+    configuration has an entry
+    ```
+    <Image name="image_obj" value="$image"/>
+    ```
+    then the linked project will have a column named `image`. In addition, the linked project
+    will always have a JSON-typed column `annotations` representing the output.
+
+    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    with the same name. If any of the data fields are missing, an exception will be thrown. If
+    the `annotations` column is missing, it will be created. The default names can be overridden
+    by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
+    Studio field names as values.
+
+    Args:
+        t: The Table to link to.
+        label_config: The Label Studio project configuration, in XML format.
+        col_mapping: An optional mapping of local column names to remote column names.
+        title: An optional title for the Label Studio project. If not specified, the
+            name of the `Table` will be used as a default.
+        sync_immediately: If `True`, immediately perform an initial synchronization by
+            importing all rows of the `Table` as Label Studio tasks.
+    """
+    from pixeltable.datatransfer.label_studio import LabelStudioProject, ANNOTATIONS_COLUMN
+
+    ls_project = LabelStudioProject.create(title or t.get_name(), label_config, media_import_method, **kwargs)
+
+    # Create a column to hold the annotations, if one does not yet exist.
+    if col_mapping is not None and ANNOTATIONS_COLUMN in col_mapping.values():
+        local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
+    else:
+        local_annotations_column = ANNOTATIONS_COLUMN
+    if local_annotations_column not in t.column_names():
+        t[local_annotations_column] = pxt.JsonType(nullable=True)
+
+    # Link the project to `t`, and sync if appropriate.
+    t._link(ls_project, col_mapping)
+    if sync_immediately:
+        t.sync()

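For orientation, here is a minimal usage sketch of the new `create_label_studio_project` API. The table name, schema, and label config are illustrative; it assumes a running Label Studio server already configured for Pixeltable, that `pxt.ImageType` is exported at the package top level (as the other Pixeltable types in this diff are), and it imports the function directly from the module shown above.

```
# Hypothetical usage of create_label_studio_project() (illustrative, not part of the diff)
import pixeltable as pxt
from pixeltable.io.globals import create_label_studio_project  # module path per this diff

# Assumes a table with an 'image' column; the table name and schema are made up.
t = pxt.create_table('ls_demo', {'image': pxt.ImageType()})

label_config = '''
<View>
  <Image name="image_obj" value="$image"/>
  <Choices name="label" toName="image_obj">
    <Choice value="cat"/>
    <Choice value="dog"/>
  </Choices>
</View>
'''

# Creates the project, adds a JSON 'annotations' column if missing, links the
# project to `t`, and (by default) imports all existing rows as Label Studio tasks.
create_label_studio_project(t, label_config, title='ls_demo')
```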
pixeltable/io/hf_datasets.py (renamed from pixeltable/utils/hf_datasets.py)

@@ -1,11 +1,17 @@
-import datasets
-from typing import Union, Optional, List, Dict, Any
-import pixeltable.type_system as ts
-from pixeltable import exceptions as excs
-import math
+from __future__ import annotations
+
 import logging
-import pixeltable
+import math
 import random
+import typing
+from typing import Union, Optional, Any
+
+import pixeltable
+import pixeltable.type_system as ts
+from pixeltable import exceptions as excs
+
+if typing.TYPE_CHECKING:
+    import datasets
 
 _logger = logging.getLogger(__name__)
 
@@ -17,7 +23,7 @@ _K_BATCH_SIZE_BYTES = 100_000_000
 # note, there are many more types. we allow overrides in the schema_override parameter
 # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
 # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
-_hf_to_pxt: Dict[str, ts.ColumnType] = {
+_hf_to_pxt: dict[str, ts.ColumnType] = {
     'int32': ts.IntType(nullable=True), # pixeltable widens to big int
     'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
@@ -27,10 +33,13 @@ _hf_to_pxt: Dict[str, ts.ColumnType] = {
     'timestamp[ms]': ts.TimestampType(nullable=True), # HF dataset iterator converts timestamps to datetime.datetime
 }
 
+
 def _to_pixeltable_type(
     feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
 ) -> Optional[ts.ColumnType]:
     """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
+    import datasets
+
     if isinstance(feature_type, datasets.ClassLabel):
         # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
         return ts.StringType(nullable=True)
@@ -45,14 +54,18 @@ def _to_pixeltable_type(
     else:
         return None
 
+
 def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
     """Get the schema of a huggingface dataset as a dictionary."""
+    import datasets
+
     first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
     return first_dataset.features
 
+
 def huggingface_schema_to_pixeltable_schema(
     hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
-) -> Dict[str, Optional[ts.ColumnType]]:
+) -> dict[str, Optional[ts.ColumnType]]:
     """Generate a pixeltable schema from a huggingface dataset schema.
     Columns without a known mapping are mapped to None
     """
@@ -62,17 +75,35 @@
     }
     return pixeltable_schema
 
+
 def import_huggingface_dataset(
-    cl: 'pixeltable.Client',
     table_path: str,
     dataset: Union[datasets.Dataset, datasets.DatasetDict],
     *,
-    column_name_for_split: Optional[str],
-    schema_override: Optional[Dict[str, Any]],
+    column_name_for_split: Optional[str] = None,
+    schema_override: Optional[dict[str, Any]] = None,
     **kwargs,
 ) -> 'pixeltable.InsertableTable':
-    """See `pixeltable.Client.import_huggingface_dataset` for documentation"""
-    if table_path in cl.list_tables():
+    """Create a new `Table` from a Huggingface dataset, or dataset dict with multiple splits.
+    Requires datasets library to be installed.
+
+    Args:
+        path_str: Path to the table.
+        dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
+        column_name_for_split: column name to use for split information. If None, no split information will be stored.
+        schema_override: Optional dictionary mapping column names to column type to override the corresponding defaults from
+            `pixeltable.utils.hf_datasets.huggingface_schema_to_pixeltable_schema`. The column type should be a pixeltable ColumnType.
+            For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
+
+        kwargs: Additional arguments to pass to `create_table`.
+
+    Returns:
+        The newly created table. The table will have loaded the data from the dataset.
+    """
+    import datasets
+    import pixeltable as pxt
+
+    if table_path in pxt.list_tables():
         raise excs.Error(f'table {table_path} already exists')
 
     if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
@@ -122,9 +153,9 @@
     try:
         # random tmp name
         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = cl.create_table(tmp_name, pixeltable_schema, **kwargs)
+        tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
 
-        def _translate_row(row: Dict[str, Any], split_name: str) -> Dict[str, Any]:
+        def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
             output_row = row.copy()
             # map all class labels to strings
             for field, values in categorical_features.items():
@@ -153,5 +184,5 @@
         _logger.error(f'Error while inserting dataset into table: {tmp_name}')
         raise e
 
-    cl.move(tmp_name, table_path)
-    return cl.get_table(table_path)
+    pxt.move(tmp_name, table_path)
+    return pxt.get_table(table_path)

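A quick usage sketch of `import_huggingface_dataset` after this change. The dataset and table names are illustrative, the `datasets` library must be installed, and the function is imported from the module path shown in this diff.

```
# Hypothetical usage of import_huggingface_dataset() (illustrative, not part of the diff)
import datasets
from pixeltable.io.hf_datasets import import_huggingface_dataset  # module path per this diff

# Any small Huggingface dataset works; 'rotten_tomatoes' is just an example.
ds = datasets.load_dataset('rotten_tomatoes')  # a DatasetDict with train/validation/test splits

# Maps HF feature types to Pixeltable types, records each row's split in a
# 'split' column, and loads all rows into a new table.
tbl = import_huggingface_dataset('rotten_tomatoes_demo', ds, column_name_for_split='split')
```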
pixeltable/io/pandas.py (new file)

@@ -0,0 +1,148 @@
+from typing import Optional, Any, Iterable
+
+import numpy as np
+import pandas as pd
+
+import pixeltable as pxt
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+
+
+def import_pandas(
+    tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
+) -> pxt.catalog.InsertableTable:
+    """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
+    will be inferred from the `DataFrame`, unless `schema` is specified.
+
+    The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
+    Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
+    the following procedure:
+    - first replace any non-alphanumeric characters with underscores;
+    - then, preface the result with the letter 'c' if it begins with a number or an underscore;
+    - then, if there are any duplicate column names, suffix the duplicates with '_2', '_3', etc., in column order.
+
+    Args:
+        tbl_name: The name of the table to create.
+        df: The Pandas `DataFrame`.
+        schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
+            name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
+            `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
+            Pixeltable identifiers).
+    """
+    schema = _df_to_pxt_schema(df, schema_overrides)
+    tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
+    table = pxt.create_table(tbl_name, schema)
+    table.insert(tbl_rows)
+    return table
+
+
+def import_csv(
+    table_path: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
+) -> pxt.catalog.InsertableTable:
+    """
+    Creates a new `Table` from a csv file. This is a convenience method and is equivalent
+    to calling `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema=schema)`.
+    See the Pandas documentation for `read_csv` for more details.
+    """
+    df = pd.read_csv(filepath_or_buffer, **kwargs)
+    return import_pandas(table_path, df, schema_overrides=schema_overrides)
+
+
+def import_excel(
+    table_path: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
+) -> pxt.catalog.InsertableTable:
+    """
+    Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
+    to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
+    See the Pandas documentation for `read_excel` for more details.
+    """
+    df = pd.read_excel(io, *args, **kwargs)
+    return import_pandas(table_path, df, schema_overrides=schema_overrides)
+
+
+def _df_to_pxt_schema(
+    df: pd.DataFrame, schema_overrides: Optional[dict[str, pxt.ColumnType]]
+) -> dict[str, pxt.ColumnType]:
+    if schema_overrides is not None:
+        for pd_name in schema_overrides:
+            if pd_name not in df.columns:
+                raise excs.Error(
+                    f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
+                )
+    schema = {}
+    for pd_name, pd_dtype in zip(df.columns, df.dtypes):
+        if schema_overrides is not None and pd_name in schema_overrides:
+            pxt_type = schema_overrides[pd_name]
+        else:
+            pxt_type = _np_dtype_to_pxt_type(pd_dtype, df[pd_name])
+        pxt_name = _normalize_pxt_col_name(pd_name)
+        # Ensure that column names are unique by appending a distinguishing suffix
+        # to any collisions
+        if pxt_name in schema:
+            n = 2
+            while f'{pxt_name}_{n}' in schema:
+                n += 1
+            pxt_name = f'{pxt_name}_{n}'
+        schema[pxt_name] = pxt_type
+    return schema
+
+
+def _normalize_pxt_col_name(pd_name: str) -> str:
+    """
+    Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
+    - replacing any non-ascii or non-alphanumeric characters with an underscore _
+    - prefixing the result with the letter 'c' if it starts with an underscore or a number
+    """
+    id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in pd_name)
+    if id[0].isnumeric():
+        id = f'c_{id}'
+    elif id[0] == '_':
+        id = f'c{id}'
+    assert pxt.catalog.is_valid_identifier(id), id
+    return id
+
+
+def _np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series) -> pxt.ColumnType:
+    """
+    Infers a Pixeltable type based on a Numpy dtype.
+    """
+    if np.issubdtype(np_dtype, np.integer):
+        return pxt.IntType()
+    if np.issubdtype(np_dtype, np.floating):
+        return pxt.FloatType()
+    if np.issubdtype(np_dtype, np.bool_):
+        return pxt.BoolType()
+    if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
+        has_nan = any(isinstance(val, float) and np.isnan(val) for val in data_col)
+        return pxt.StringType(nullable=has_nan)
+    if np.issubdtype(np_dtype, np.datetime64):
+        has_nat = any(pd.isnull(val) for val in data_col)
+        return pxt.TimestampType(nullable=has_nat)
+    raise excs.Error(f'Unsupported dtype: {np_dtype}')
+
+
+def _df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
+    rows = {}
+    for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+        if pxt_type.is_float_type():
+            val = float(val)
+        elif isinstance(val, float) and np.isnan(val):
+            # pandas uses NaN for empty cells, even for types other than float;
+            # for any type but a float, convert these to None
+            val = None
+        elif pxt_type.is_int_type():
+            val = int(val)
+        elif pxt_type.is_bool_type():
+            val = bool(val)
+        elif pxt_type.is_string_type():
+            val = str(val)
+        elif pxt_type.is_timestamp_type():
+            if pd.isnull(val):
+                # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
+                # much not-ok with it. (But if we convert it to None and then load out the
+                # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
+                val = None
+            else:
+                val = pd.Timestamp(val).to_pydatetime()
+        rows[col_name] = val
+    return rows

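A usage sketch of the new pandas import helpers. The DataFrame contents and table name are illustrative; the import path is the module added in this diff.

```
# Hypothetical usage of import_pandas() (illustrative, not part of the diff)
import pandas as pd
import pixeltable as pxt
from pixeltable.io.pandas import import_pandas  # module path per this diff

df = pd.DataFrame({
    'name': ['a', 'b', 'c'],
    '2nd value': [1, 2, 3],  # not a valid identifier; normalized to 'c_2nd_value'
})

# Types are inferred from the DataFrame dtypes; schema_overrides is keyed by the
# *original* pandas column name and takes precedence over inference.
tbl = import_pandas('pandas_demo', df, schema_overrides={'2nd value': pxt.FloatType()})
```

`import_csv` and `import_excel` are thin wrappers that call `pd.read_csv` / `pd.read_excel` and then delegate to `import_pandas`.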
pixeltable/io/parquet.py (renamed from pixeltable/utils/parquet.py)

@@ -1,26 +1,31 @@
+from __future__ import annotations
+
 import io
 import json
 import logging
+import random
+import typing
 from collections import deque
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Dict, Optional
 
-import numpy as np
 import PIL.Image
-import pyarrow as pa
-import pyarrow.parquet
+import numpy as np
 
+import pixeltable.exceptions as exc
 import pixeltable.type_system as ts
-from pixeltable.utils.arrow import iter_tuples, to_arrow_schema, to_pixeltable_schema
 from pixeltable.utils.transactional_directory import transactional_directory
-import pixeltable.exceptions as exc
 
-import random
+if typing.TYPE_CHECKING:
+    import pixeltable as pxt
+    import pyarrow as pa
 
 _logger = logging.getLogger(__name__)
 
 
-def _write_batch(value_batch : Dict[str, deque], schema : pa.Schema, output_path : Path) -> None:
+def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
+    import pyarrow as pa
+
     pydict = {}
     for field in schema:
         if isinstance(field.type, pa.FixedShapeTensorType):
@@ -32,21 +37,24 @@ def _write_batch(value_batch : Dict[str, deque], schema : pa.Schema, output_path
     tab = pa.Table.from_pydict(pydict, schema=schema)
     pa.parquet.write_table(tab, output_path)
 
-def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
+
+def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
     """
-        Internal method to stream dataframe data to parquet format.
-        Does not materialize the dataset to memory.
+    Internal method to stream dataframe data to parquet format.
+    Does not materialize the dataset to memory.
 
-        It preserves pixeltable type metadata in a json file, which would otherwise
-        not be available in the parquet format.
+    It preserves pixeltable type metadata in a json file, which would otherwise
+    not be available in the parquet format.
 
-        Images are stored inline in a compressed format in their parquet file.
+    Images are stored inline in a compressed format in their parquet file.
 
-        Args:
-            df : dataframe to save.
-            dest_path : path to directory to save the parquet files to.
-            partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
+    Args:
+        df : dataframe to save.
+        dest_path : path to directory to save the parquet files to.
+        partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
     """
+    from pixeltable.utils.arrow import to_arrow_schema
+
     column_names = df.get_column_names()
     column_types = df.get_column_types()
     type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
@@ -55,15 +63,15 @@ def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_byt
     # store the changes atomically
     with transactional_directory(dest_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(df._as_dict(), (temp_path / '.pixeltable.json').open('w')) # pylint: disable=protected-access
-        json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
+        json.dump(df._as_dict(), (temp_path / '.pixeltable.json').open('w'))  # pylint: disable=protected-access
+        json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
 
         batch_num = 0
-        current_value_batch : Dict[str, deque] = {k:deque() for k in column_names}
+        current_value_batch: Dict[str, deque] = {k: deque() for k in column_names}
         current_byte_estimate = 0
 
-        for data_row in df._exec(): # pylint: disable=protected-access
-            for (col_name, col_type, e) in zip(column_names, column_types, df._select_list_exprs): # pylint: disable=protected-access
+        for data_row in df._exec():  # pylint: disable=protected-access
+            for col_name, col_type, e in zip(column_names, column_types, df._select_list_exprs):  # pylint: disable=protected-access
                 val = data_row[e.slot_idx]
                 if val is None:
                     current_value_batch[col_name].append(val)
@@ -112,9 +120,9 @@ def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_byt
                 current_byte_estimate += length
             if current_byte_estimate > partition_size_bytes:
                 assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                 _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+                _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
                 batch_num += 1
-                current_value_batch = {k:deque() for k in column_names}
+                current_value_batch = {k: deque() for k in column_names}
                 current_byte_estimate = 0
 
         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -122,6 +130,8 @@ def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_byt
 
 def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
+    import pyarrow as pa
+    from pixeltable.utils.arrow import to_pixeltable_schema
 
     input_path = Path(parquet_path).expanduser()
     parquet_dataset = pa.parquet.ParquetDataset(input_path)
@@ -129,14 +139,29 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional
 
 
 def import_parquet(
-    cl: 'pixeltable.Client',
     table_path: str,
     *,
     parquet_path: str,
-    schema_override: Optional[Dict[str, ts.ColumnType]],
+    schema_override: Optional[Dict[str, ts.ColumnType]] = None,
     **kwargs,
-) -> 'catalog.InsertableTable':
-    """See `pixeltable.Client.import_parquet` for documentation"""
+) -> pxt.catalog.InsertableTable:
+    """Create a new `Table` from a Parquet file or set of files. Requires pyarrow to be installed.
+    Args:
+        path_str: Path to the table within pixeltable.
+        parquet_path: Path to an individual Parquet file or directory of Parquet files.
+        schema_override: Optional dictionary mapping column names to column type to override the default
+            schema inferred from the Parquet file. The column type should be a pixeltable ColumnType.
+            For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
+            Any fields not provided explicitly will map to types with `pixeltable.utils.parquet.parquet_schema_to_pixeltable_schema`
+        kwargs: Additional arguments to pass to `create_table`.
+
+    Returns:
+        The newly created table. The table will have loaded the data from the Parquet file(s).
+    """
+    import pixeltable as pxt
+    import pyarrow as pa
+    from pixeltable.utils.arrow import iter_tuples
+
    input_path = Path(parquet_path).expanduser()
    parquet_dataset = pa.parquet.ParquetDataset(input_path)
 
@@ -149,12 +174,12 @@
        if v is None:
            raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
 
-    if table_path in cl.list_tables():
+    if table_path in pxt.list_tables():
        raise exc.Error(f'Table {table_path} already exists')
 
    try:
        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = cl.create_table(tmp_name, schema, **kwargs)
+        tab = pxt.create_table(tmp_name, schema, **kwargs)
        for fragment in parquet_dataset.fragments:
            for batch in fragment.to_batches():
                dict_batch = list(iter_tuples(batch))
@@ -163,5 +188,5 @@
        _logger.error(f'Error while inserting Parquet file into table: {e}')
        raise e
 
-    cl.move(tmp_name, table_path)
-    return cl.get_table(table_path)
+    pxt.move(tmp_name, table_path)
+    return pxt.get_table(table_path)

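A usage sketch of `import_parquet` after the move to `pixeltable.io`. The Parquet path and the overridden column name are illustrative.

```
# Hypothetical usage of import_parquet() (illustrative, not part of the diff)
import pixeltable.type_system as ts
from pixeltable.io.parquet import import_parquet  # module path per this diff

# 'data/events.parquet' stands in for an existing Parquet file or directory of files.
tbl = import_parquet(
    'parquet_demo',
    parquet_path='data/events.parquet',
    # Optional: override the inferred type for individual columns.
    schema_override={'col_vid': ts.VideoType()},
)
```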
pixeltable/iterators/__init__.py

@@ -1,3 +1,3 @@
 from .base import ComponentIterator
+from .document import DocumentSplitter
 from .video import FrameIterator
-

pixeltable/iterators/base.py

@@ -6,11 +6,11 @@ from pixeltable.type_system import ColumnType
 
 
 class ComponentIterator(ABC):
-    """Base class for iterators."""
+    """Base class for Pixeltable iterators."""
 
     @classmethod
     @abstractmethod
-    def input_schema(cls) -> Dict[str, ColumnType]:
+    def input_schema(cls) -> dict[str, ColumnType]:
         """Provide the Pixeltable types of the init() parameters
 
         The keys need to match the names of the init() parameters. This is equivalent to the parameters_types
@@ -20,7 +20,7 @@ class ComponentIterator(ABC):
 
     @classmethod
     @abstractmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         """Specify the dictionary returned by next() and a list of unstored column names
 
         Returns:
@@ -33,7 +33,7 @@ class ComponentIterator(ABC):
         return self
 
     @abstractmethod
-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
         """Return the next element of the iterator as a dictionary or raise StopIteration"""
         raise NotImplementedError
 
@@ -46,3 +46,7 @@ class ComponentIterator(ABC):
     def set_pos(self, pos: int) -> None:
         """Set the iterator position to pos"""
         raise NotImplementedError
+
+    @classmethod
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return cls, kwargs
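To illustrate the `ComponentIterator` interface above, here is a minimal, hypothetical subclass. It is not from the package: the iterator, its parameters, and its output columns are invented, and it assumes `close()` is part of the iterator contract alongside `set_pos()`.

```
# Hypothetical ComponentIterator subclass (illustrative, not part of the diff)
from typing import Any

from pixeltable.iterators import ComponentIterator
from pixeltable.type_system import ColumnType, IntType, StringType


class LineIterator(ComponentIterator):
    """Illustrative iterator that yields one row per line of a text file."""

    def __init__(self, document: str):
        with open(document, encoding='utf-8') as fp:
            self._lines = fp.read().splitlines()
        self._pos = 0

    @classmethod
    def input_schema(cls) -> dict[str, ColumnType]:
        # keys must match the __init__() parameter names
        return {'document': StringType()}

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
        # (columns produced by __next__(), names of unstored columns)
        return {'line_idx': IntType(), 'text': StringType()}, []

    def __next__(self) -> dict[str, Any]:
        if self._pos >= len(self._lines):
            raise StopIteration
        row = {'line_idx': self._pos, 'text': self._lines[self._pos]}
        self._pos += 1
        return row

    def close(self) -> None:
        pass

    def set_pos(self, pos: int) -> None:
        self._pos = pos
```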