pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Files changed (122)
  1. pixeltable/__init__.py +2 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -1
  4. pixeltable/catalog/catalog.py +63 -36
  5. pixeltable/catalog/column.py +11 -4
  6. pixeltable/catalog/dir.py +5 -5
  7. pixeltable/catalog/globals.py +28 -14
  8. pixeltable/catalog/insertable_table.py +81 -43
  9. pixeltable/catalog/path.py +2 -2
  10. pixeltable/catalog/table.py +140 -109
  11. pixeltable/catalog/table_version.py +60 -43
  12. pixeltable/catalog/table_version_handle.py +3 -0
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/view.py +17 -9
  15. pixeltable/dataframe.py +5 -3
  16. pixeltable/env.py +109 -43
  17. pixeltable/exec/__init__.py +2 -0
  18. pixeltable/exec/aggregation_node.py +6 -8
  19. pixeltable/exec/cache_prefetch_node.py +4 -7
  20. pixeltable/exec/component_iteration_node.py +1 -3
  21. pixeltable/exec/data_row_batch.py +1 -2
  22. pixeltable/exec/exec_context.py +1 -1
  23. pixeltable/exec/exec_node.py +2 -3
  24. pixeltable/exec/expr_eval/__init__.py +2 -0
  25. pixeltable/exec/expr_eval/evaluators.py +137 -20
  26. pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
  27. pixeltable/exec/expr_eval/globals.py +68 -7
  28. pixeltable/exec/expr_eval/schedulers.py +25 -23
  29. pixeltable/exec/in_memory_data_node.py +8 -6
  30. pixeltable/exec/row_update_node.py +3 -4
  31. pixeltable/exec/sql_node.py +16 -17
  32. pixeltable/exprs/__init__.py +3 -2
  33. pixeltable/exprs/arithmetic_expr.py +2 -0
  34. pixeltable/exprs/column_property_ref.py +1 -1
  35. pixeltable/exprs/column_ref.py +39 -3
  36. pixeltable/exprs/compound_predicate.py +1 -1
  37. pixeltable/exprs/data_row.py +17 -1
  38. pixeltable/exprs/expr.py +51 -21
  39. pixeltable/exprs/function_call.py +34 -2
  40. pixeltable/exprs/globals.py +12 -0
  41. pixeltable/exprs/json_mapper.py +95 -48
  42. pixeltable/exprs/json_path.py +3 -10
  43. pixeltable/exprs/method_ref.py +2 -2
  44. pixeltable/exprs/object_ref.py +2 -2
  45. pixeltable/exprs/row_builder.py +33 -6
  46. pixeltable/exprs/similarity_expr.py +6 -21
  47. pixeltable/exprs/sql_element_cache.py +1 -1
  48. pixeltable/exprs/string_op.py +107 -0
  49. pixeltable/ext/__init__.py +1 -1
  50. pixeltable/ext/functions/__init__.py +1 -1
  51. pixeltable/ext/functions/whisperx.py +1 -1
  52. pixeltable/ext/functions/yolox.py +22 -65
  53. pixeltable/func/aggregate_function.py +1 -1
  54. pixeltable/func/callable_function.py +2 -5
  55. pixeltable/func/expr_template_function.py +22 -2
  56. pixeltable/func/function.py +4 -5
  57. pixeltable/func/function_registry.py +1 -1
  58. pixeltable/func/signature.py +1 -1
  59. pixeltable/func/tools.py +2 -2
  60. pixeltable/func/udf.py +2 -2
  61. pixeltable/functions/__init__.py +2 -2
  62. pixeltable/functions/anthropic.py +2 -2
  63. pixeltable/functions/audio.py +1 -1
  64. pixeltable/functions/deepseek.py +1 -1
  65. pixeltable/functions/fireworks.py +1 -1
  66. pixeltable/functions/globals.py +22 -11
  67. pixeltable/functions/huggingface.py +1 -1
  68. pixeltable/functions/image.py +1 -1
  69. pixeltable/functions/json.py +1 -1
  70. pixeltable/functions/llama_cpp.py +1 -1
  71. pixeltable/functions/math.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +1 -1
  74. pixeltable/functions/openai.py +2 -2
  75. pixeltable/functions/replicate.py +1 -1
  76. pixeltable/functions/string.py +1 -1
  77. pixeltable/functions/timestamp.py +1 -1
  78. pixeltable/functions/together.py +1 -1
  79. pixeltable/functions/util.py +1 -1
  80. pixeltable/functions/video.py +2 -2
  81. pixeltable/functions/vision.py +2 -2
  82. pixeltable/globals.py +85 -33
  83. pixeltable/index/embedding_index.py +12 -1
  84. pixeltable/io/__init__.py +8 -5
  85. pixeltable/io/datarows.py +138 -0
  86. pixeltable/io/external_store.py +8 -5
  87. pixeltable/io/fiftyone.py +6 -7
  88. pixeltable/io/globals.py +7 -160
  89. pixeltable/io/hf_datasets.py +21 -98
  90. pixeltable/io/label_studio.py +21 -20
  91. pixeltable/io/pandas.py +35 -48
  92. pixeltable/io/parquet.py +17 -42
  93. pixeltable/io/table_data_conduit.py +569 -0
  94. pixeltable/io/utils.py +6 -21
  95. pixeltable/iterators/__init__.py +1 -1
  96. pixeltable/metadata/__init__.py +6 -4
  97. pixeltable/metadata/converters/convert_24.py +3 -3
  98. pixeltable/metadata/converters/convert_25.py +1 -1
  99. pixeltable/metadata/converters/convert_29.py +1 -1
  100. pixeltable/metadata/converters/convert_30.py +50 -0
  101. pixeltable/metadata/converters/util.py +26 -1
  102. pixeltable/metadata/notes.py +1 -0
  103. pixeltable/metadata/schema.py +3 -0
  104. pixeltable/store.py +2 -2
  105. pixeltable/type_system.py +19 -7
  106. pixeltable/utils/arrow.py +32 -7
  107. pixeltable/utils/console_output.py +3 -2
  108. pixeltable/utils/coroutine.py +3 -3
  109. pixeltable/utils/dbms.py +66 -0
  110. pixeltable/utils/documents.py +61 -67
  111. pixeltable/utils/filecache.py +1 -1
  112. pixeltable/utils/http_server.py +3 -2
  113. pixeltable/utils/pytorch.py +1 -1
  114. pixeltable/utils/sql.py +1 -1
  115. pixeltable-0.3.11.dist-info/METADATA +436 -0
  116. pixeltable-0.3.11.dist-info/RECORD +179 -0
  117. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
  118. pixeltable/catalog/path_dict.py +0 -169
  119. pixeltable-0.3.9.dist-info/METADATA +0 -382
  120. pixeltable-0.3.9.dist-info/RECORD +0 -175
  121. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
  122. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py CHANGED
@@ -1,41 +1,38 @@
 from __future__ import annotations

-import logging
-import math
-import random
 import typing
 from typing import Any, Optional, Union

 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exceptions as excs
-
-from .utils import normalize_import_parameters, normalize_schema_names

 if typing.TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]

-_logger = logging.getLogger('pixeltable')
-
-# use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
-# The primary goal is to bound memory use, regardless of dataset size.
-# Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
-_K_BATCH_SIZE_BYTES = 100_000_000

-# note, there are many more types. we allow overrides in the schema_override parameter
+# note, there are many more types. we allow overrides in the schema_overrides parameter
 # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
 # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
 _hf_to_pxt: dict[str, ts.ColumnType] = {
-    'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
-    'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
+    'int8': ts.IntType(nullable=True),
+    'int16': ts.IntType(nullable=True),
+    'int32': ts.IntType(nullable=True),
+    'int64': ts.IntType(nullable=True),
+    'uint8': ts.IntType(nullable=True),
+    'uint16': ts.IntType(nullable=True),
+    'uint32': ts.IntType(nullable=True),
+    'uint64': ts.IntType(nullable=True),
+    'float16': ts.FloatType(nullable=True),
     'float32': ts.FloatType(nullable=True),
     'float64': ts.FloatType(nullable=True),
-    'large_string': ts.StringType(nullable=True),
     'string': ts.StringType(nullable=True),
+    'large_string': ts.StringType(nullable=True),
     'timestamp[s]': ts.TimestampType(nullable=True),
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
     'timestamp[us]': ts.TimestampType(nullable=True),
+    'date32': ts.StringType(nullable=True),  # date32 is not supported in pixeltable, use string
+    'date64': ts.StringType(nullable=True),  # date64 is not supported in pixeltable, use string
 }


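For context: schema inference consults the `_hf_to_pxt` mapping above, and a dtype it does not cover has to be supplied through `schema_overrides`. A minimal sketch of that lookup-with-override logic (the helper name and the abbreviated mapping are illustrative, not part of the package):

    import pixeltable.type_system as ts

    # abbreviated stand-in for the _hf_to_pxt mapping shown in the hunk above
    _hf_to_pxt: dict[str, ts.ColumnType] = {
        'bool': ts.BoolType(nullable=True),
        'int64': ts.IntType(nullable=True),
    }

    def infer_pxt_type(hf_dtype: str, col_name: str, overrides: dict[str, ts.ColumnType]) -> ts.ColumnType:
        # an explicit schema_overrides entry wins, whether or not the dtype is mapped
        if col_name in overrides:
            return overrides[col_name]
        if hf_dtype not in _hf_to_pxt:
            # unmapped dtypes must be overridden by the caller
            raise ValueError(f'unmapped dtype {hf_dtype!r}; add a schema_overrides entry for {col_name!r}')
        return _hf_to_pxt[hf_dtype]
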
@@ -88,7 +85,6 @@ def import_huggingface_dataset(
     table_path: str,
     dataset: Union[datasets.Dataset, datasets.DatasetDict],
     *,
-    column_name_for_split: Optional[str] = None,
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     **kwargs: Any,
@@ -101,91 +97,18 @@ def import_huggingface_dataset(
         dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
             or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
             to insert into the table.
-        column_name_for_split: column name to use for split information. If None, no split information will be stored.
         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
-            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
-            `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
-            Pixeltable identifiers).
+            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
+            The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
+            they are valid Pixeltable identifiers).
         primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.
+            An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
+            This column name will contain the split information. If None, no split information will be stored.

     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    import datasets
-
-    import pixeltable as pxt
-
-    if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-
-    # Create the pixeltable schema from the huggingface schema
-    hf_schema_source = _get_hf_schema(dataset)
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
-
-    # Add the split column to the schema if requested
-    if column_name_for_split is not None:
-        if column_name_for_split in hf_schema:
-            raise excs.Error(
-                f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
-            )
-        hf_schema[column_name_for_split] = ts.StringType(nullable=True)
-
-    schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
-
-    # Prepare to create table and insert data
-    if table_path in pxt.list_tables():
-        raise excs.Error(f'table {table_path} already exists')
-
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    else:
-        dataset_dict = dataset
-
-    # extract all class labels from the dataset to translate category ints to strings
-    categorical_features = {
-        feature_name: feature_type.names
-        for (feature_name, feature_type) in hf_schema_source.items()
-        if isinstance(feature_type, datasets.ClassLabel)
-    }
-
-    try:
-        # random tmp name
-        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-
-        def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
-            output_row = row.copy()
-            # map all class labels to strings
-            for field, values in categorical_features.items():
-                output_row[field] = values[row[field]]
-            # add split name to row
-            if column_name_for_split is not None:
-                output_row[column_name_for_split] = split_name
-            return output_row
-
-        for split_name, split_dataset in dataset_dict.items():
-            num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
-            tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
-            assert tuples_per_batch > 0
-
-            batch = []
-            for row in split_dataset:
-                batch.append(_translate_row(row, split_name))
-                if len(batch) >= tuples_per_batch:
-                    tab.insert(batch)
-                    batch = []
-            # last batch
-            if len(batch) > 0:
-                tab.insert(batch)
-
-    except Exception as e:
-        _logger.error(f'Error while inserting dataset into table: {tmp_name}')
-        raise e
-
-    pxt.move(tmp_name, table_path)
-    return pxt.get_table(table_path)
+    return pxt.create_table(
+        table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
+    )
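With this change, `import_huggingface_dataset` is a thin wrapper over `pxt.create_table` and its new `source` parameter. A usage sketch against the 0.3.11 signature shown above (table and dataset names are illustrative; `column_name_for_split` rides along in `**kwargs` per the revised docstring):

    import datasets
    import pixeltable as pxt
    from pixeltable.io import import_huggingface_dataset

    ds = datasets.load_dataset('rotten_tomatoes')  # a DatasetDict with train/validation/test splits
    tbl = import_huggingface_dataset(
        'demo.reviews',
        ds,
        column_name_for_split='split',  # forwarded to create_table via **kwargs
    )
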
pixeltable/io/label_studio.py CHANGED
@@ -5,16 +5,14 @@ import os
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Iterator, Literal, Optional, cast
-from xml.etree import ElementTree
+from xml.etree import ElementTree as ET

 import label_studio_sdk  # type: ignore[import-untyped]
 import PIL.Image
 from requests.exceptions import HTTPError

 import pixeltable as pxt
-import pixeltable.env as env
-import pixeltable.exceptions as excs
-from pixeltable import Column, Table
+from pixeltable import Column, Table, env, exceptions as excs
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project, SyncStatus
@@ -140,7 +138,8 @@ class LabelStudioProject(Project):
            page += 1
        if unknown_task_count > 0:
            _logger.warning(
-                f'Skipped {unknown_task_count} unrecognized task(s) when syncing Label Studio project "{self.project_title}".'
+                f'Skipped {unknown_task_count} unrecognized task(s) when syncing '
+                f'Label Studio project {self.project_title!r}.'
            )

    def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
@@ -174,11 +173,11 @@
            # Send media to Label Studio by HTTP post.
            assert len(t_data_cols) == 1  # This was verified when the project was set up
            return self.__update_tasks_by_post(t, existing_tasks, t_data_cols[0], t_rl_cols, rl_info)
-        elif self.media_import_method == 'file' or self.media_import_method == 'url':
+        elif self.media_import_method in ('file', 'url'):
            # Send media to Label Studio by file reference (local file or URL).
            return self.__update_tasks_by_files(t, existing_tasks, t_data_cols, t_rl_cols, rl_info)
        else:
-            assert False
+            raise AssertionError()

    def __update_tasks_by_post(
        self,
@@ -227,7 +226,7 @@
                )
                for i in range(len(coco_annotations))
            ]
-            _logger.debug(f'`predictions`: %s', predictions)
+            _logger.debug('`predictions`: {%s}', predictions)
            self.project.create_predictions(predictions)
            tasks_created += 1

@@ -358,7 +357,7 @@
    def __localpath_to_lspath(cls, localpath: str) -> str:
        # Transform the local path into Label Studio's bespoke path format.
        relpath = Path(localpath).relative_to(Config.get().home)
-        return f'/data/local-files/?d={str(relpath)}'
+        return f'/data/local-files/?d={relpath}'

    def __delete_stale_tasks(
        self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
@@ -405,7 +404,8 @@
        updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
        if len(updates) > 0:
            _logger.info(
-                f'Updating table `{t._name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
+                f'Updating table {t._name!r}, column {local_annotations_col.name!r} '
+                f'with {len(updates)} total annotations.'
            )
            # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
            # batch_update on the actual ancestor table that holds the annotations column.
@@ -451,7 +451,7 @@
        Parses a Label Studio XML config, extracting the names and Pixeltable types of
        all input variables.
        """
-        root: ElementTree.Element = ElementTree.fromstring(xml_config)
+        root: ET.Element = ET.fromstring(xml_config)
        if root.tag.lower() != 'view':
            raise excs.Error('Root of Label Studio config must be a `View`')
        config = _LabelStudioConfig(
@@ -461,7 +461,7 @@
        return config

    @classmethod
-    def __parse_data_keys_config(cls, root: ElementTree.Element) -> dict[str, '_DataKey']:
+    def __parse_data_keys_config(cls, root: ET.Element) -> dict[str, '_DataKey']:
        """Parses the data keys from a Label Studio XML config."""
        config: dict[str, '_DataKey'] = {}
        for element in root:
@@ -477,7 +477,7 @@
        return config

    @classmethod
-    def __parse_rectangle_labels_config(cls, root: ElementTree.Element) -> dict[str, '_RectangleLabel']:
+    def __parse_rectangle_labels_config(cls, root: ET.Element) -> dict[str, '_RectangleLabel']:
        """Parses the RectangleLabels from a Label Studio XML config."""
        config: dict[str, '_RectangleLabel'] = {}
        for element in root:
@@ -534,7 +534,7 @@
        _label_studio_client().delete_project(self.project_id)
        env.Env.get().console_logger.info(f'Deleted Label Studio project: {title}')

-    def __eq__(self, other) -> bool:
+    def __eq__(self, other: object) -> bool:
        return isinstance(other, LabelStudioProject) and self.project_id == other.project_id

    def __hash__(self) -> int:
@@ -576,7 +576,7 @@
            local_annotations_column = ANNOTATIONS_COLUMN
        else:
            local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t._schema.keys():
+        if local_annotations_column not in t._schema:
            t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})

        resolved_col_mapping = cls.validate_columns(
@@ -591,9 +591,9 @@
            if media_import_method != 'url':
                raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
            s3_configuration = copy.copy(s3_configuration)
-            if not 'bucket' in s3_configuration:
+            if 'bucket' not in s3_configuration:
                raise excs.Error('`s3_configuration` must contain a `bucket` field')
-            if not 'title' in s3_configuration:
+            if 'title' not in s3_configuration:
                s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
            if (
                'aws_access_key_id' not in s3_configuration
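The checks above imply the expected shape of `s3_configuration`; an illustrative value (field names taken from this hunk; the condition that begins above, truncated here, validates the remaining credential fields):

    # illustrative value for the s3_configuration parameter
    s3_configuration = {
        'bucket': 'my-media-bucket',  # required; excs.Error is raised if missing
        'title': 'Pixeltable-S3-Import-Storage',  # optional; this exact default is filled in when omitted
        'aws_access_key_id': '...',  # checked by the (truncated) condition shown above
    }
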
@@ -633,7 +633,8 @@
                raise excs.Error(
                    '`media_import_method` is set to `file`, but your Label Studio server is not configured '
                    'for local file storage.\nPlease set the `LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED` '
-                    'environment variable to `true` in the environment where your Label Studio server is running.'
+                    'environment variable to `true` in the environment where your Label Studio server '
+                    'is running.'
                ) from exc
            raise  # Handle any other exception type normally

@@ -663,7 +664,7 @@ class _LabelStudioConfig:
    rectangle_labels: dict[str, _RectangleLabel]

    def validate(self) -> None:
-        data_key_names = set(key.name for key in self.data_keys.values() if key.name is not None)
+        data_key_names = {key.name for key in self.data_keys.values() if key.name is not None}
        for name, rl in self.rectangle_labels.items():
            if rl.to_name not in data_key_names:
                raise excs.Error(
@@ -674,7 +675,7 @@
    @property
    def export_columns(self) -> dict[str, pxt.ColumnType]:
        data_key_cols = {key_id: key_info.column_type for key_id, key_info in self.data_keys.items()}
-        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels.keys()}
+        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels}
        return {**data_key_cols, **rl_cols}


pixeltable/io/pandas.py CHANGED
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Optional, Union

 import numpy as np
@@ -7,9 +8,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

 import pixeltable as pxt
 import pixeltable.exceptions as excs
-from pixeltable import Table
-
-from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names


 def import_pandas(
@@ -43,30 +41,24 @@ def import_pandas(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    pd_schema = df_infer_schema(df, schema_overrides, primary_key)
-    schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
-
-    __check_primary_key_values(df, primary_key)
-
-    # Convert all rows to insertable format
-    tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
-
-    table = find_or_create_table(
-        tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
+    return pxt.create_table(
+        tbl_name,
+        source=df,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
     )
-    table.insert(tbl_rows)
-    return table


 def import_csv(
     tbl_name: str,
-    filepath_or_buffer,
+    filepath_or_buffer: Union[str, os.PathLike],
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs,
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from a csv file. This is a convenience method and is equivalent
@@ -77,26 +69,26 @@ def import_csv(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    df = pd.read_csv(filepath_or_buffer, **kwargs)
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-        df,
+        source=filepath_or_buffer,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
     )


 def import_excel(
     tbl_name: str,
-    io,
-    *args,
+    io: Union[str, os.PathLike],
+    *,
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs,
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
@@ -107,18 +99,18 @@ def import_excel(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    df = pd.read_excel(io, *args, **kwargs)
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-        df,
+        source=io,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
     )


-def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
     for pd_name in primary_key:
         # This can be faster for large DataFrames
         has_nulls = df[pd_name].count() < len(df)
@@ -146,15 +138,6 @@ def df_infer_schema(
     return pd_schema


-"""
-# Check if a datetime64[ns, UTC] dtype
-def is_datetime_tz_utc(x: Any) -> bool:
-    if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
-        return True
-    return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
-"""
-
-
 def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
@@ -165,7 +148,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
     Returns:
         pxt.ColumnType: A pixeltable ColumnType
     """
-    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+    # compatible with NumPy dtypes
     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
     if is_datetime64_any_dtype(pd_dtype):
         return pxt.TimestampType(nullable=nullable)
@@ -204,32 +188,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')


-def __df_row_to_pxt_row(
+def _df_row_to_pxt_row(
     row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+        pxt_name = col_mapping.get(col_name, col_name)
+        nval: Any
         if pxt_type.is_float_type():
-            val = float(val)
+            nval = float(val)
         elif isinstance(val, float) and np.isnan(val):
             # pandas uses NaN for empty cells, even for types other than float;
             # for any type but a float, convert these to None
-            val = None
+            nval = None
         elif pxt_type.is_int_type():
-            val = int(val)
+            nval = int(val)
         elif pxt_type.is_bool_type():
-            val = bool(val)
+            nval = bool(val)
         elif pxt_type.is_string_type():
-            val = str(val)
+            nval = str(val)
         elif pxt_type.is_timestamp_type():
             if pd.isnull(val):
                 # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
                 # much not-ok with it. (But if we convert it to None and then load out the
                 # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
-                val = None
+                nval = None
             else:
-                val = pd.Timestamp(val).to_pydatetime()
-        pxt_name = col_name if col_mapping is None else col_mapping[col_name]
-        pxt_row[pxt_name] = val
+                nval = pd.Timestamp(val).to_pydatetime()
+        else:
+            nval = val
+        pxt_row[pxt_name] = nval
     return pxt_row
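All three importers now reduce to a single `pxt.create_table` call. A usage sketch under the 0.3.11 signatures above (table name, file name, and column names are illustrative; reader options such as `sep` are assumed to reach the CSV reader via `extra_args`):

    import pixeltable as pxt
    from pixeltable.io import import_csv

    tbl = import_csv('sales.orders', 'orders.csv', primary_key='order_id', sep=';')
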
pixeltable/io/parquet.py CHANGED
@@ -4,7 +4,6 @@ import datetime
 import io
 import json
 import logging
-import random
 import typing
 from collections import deque
 from pathlib import Path
@@ -14,12 +13,10 @@ import numpy as np
 import PIL.Image

 import pixeltable as pxt
-import pixeltable.exceptions as exc
+import pixeltable.exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils.transactional_directory import transactional_directory

-from .utils import normalize_import_parameters, normalize_schema_names
-
 if typing.TYPE_CHECKING:
     import pyarrow as pa

@@ -78,7 +75,7 @@ def export_parquet(
     arrow_schema = to_arrow_schema(df.schema)

     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
-        raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+        raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')

     # store the changes atomically
     with transactional_directory(parquet_path) as temp_path:
@@ -87,7 +84,7 @@
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata

         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
+        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
         current_byte_estimate = 0

         with Env.get().begin_xact():
@@ -111,7 +108,7 @@
                             val.save(buf, format='PNG')
                             val = buf.getvalue()
                         else:
-                            assert False, f'unknown image type {type(val)}'
+                            raise excs.Error(f'unknown image type {type(val)}')
                         length = len(val)
                     elif col_type.is_string_type():
                         length = len(val)
@@ -119,16 +116,14 @@
                         if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
                             val = data_row.file_paths[e.slot_idx]
                         else:
-                            assert False, f'unknown video type {type(val)}'
+                            raise excs.Error(f'unknown video type {type(val)}')
                         length = len(val)
                     elif col_type.is_json_type():
                         val = json.dumps(val)
                         length = len(val)
                     elif col_type.is_array_type():
                         length = val.nbytes
-                    elif col_type.is_int_type():
-                        length = 8
-                    elif col_type.is_float_type():
+                    elif col_type.is_int_type() or col_type.is_float_type():
                         length = 8
                     elif col_type.is_bool_type():
                         length = 1
@@ -136,7 +131,7 @@
                         val = val.astimezone(datetime.timezone.utc)
                         length = 8
                     else:
-                        assert False, f'unknown type {col_type} for {col_name}'
+                        raise excs.Error(f'unknown type {col_type} for {col_name}')

                     current_value_batch[col_name].append(val)
                     current_byte_estimate += length
@@ -144,7 +139,7 @@
                     assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
                     _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
                     batch_num += 1
-                    current_value_batch = {k: deque() for k in df.schema.keys()}
+                    current_value_batch = {k: deque() for k in df.schema}
                     current_byte_estimate = 0

         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
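The export loop batches rows by an estimated byte size rather than a fixed row count. A standalone sketch of the same pattern (names and threshold are placeholders; the per-row estimator stands in for the per-type `length` logic above):

    from collections import deque
    from typing import Any, Callable, Iterable

    MAX_BATCH_BYTES = 100_000_000  # same order of magnitude as the old _K_BATCH_SIZE_BYTES

    def write_in_batches(
        rows: Iterable[Any],
        estimate_bytes: Callable[[Any], int],
        write_batch: Callable[[deque], None],
    ) -> None:
        batch: deque = deque()
        size = 0
        for row in rows:
            batch.append(row)
            size += estimate_bytes(row)
            if size >= MAX_BATCH_BYTES:
                write_batch(batch)  # flush once the running estimate crosses the threshold
                batch, size = deque(), 0
        if batch:
            write_batch(batch)  # final partial batch
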
@@ -173,32 +168,12 @@ def import_parquet(
     Returns:
         A handle to the newly created table.
     """
-    from pyarrow import parquet
-
-    from pixeltable.utils.arrow import ar_infer_schema, iter_tuples2
-
-    input_path = Path(parquet_path).expanduser()
-    parquet_dataset = parquet.ParquetDataset(str(input_path))
-
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    ar_schema = ar_infer_schema(parquet_dataset.schema, schema_overrides, primary_key)
-    schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
-
-    if table in pxt.list_tables():
-        raise exc.Error(f'Table {table} already exists')
-
-    tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-    total_rows = 0
-    try:
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-        for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
-            for batch in fragment.to_batches():
-                dict_batch = list(iter_tuples2(batch, col_mapping, schema))
-                total_rows += len(dict_batch)
-                tab.insert(dict_batch)
-    except Exception as e:
-        _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
-        raise e
-
-    pxt.move(tmp_name, table)
-    return pxt.get_table(table)
+    value = kwargs.pop('source_format', None)
+    return pxt.create_table(
+        table,
+        source=parquet_path,
+        source_format=value,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        extra_args=kwargs,
+    )
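`import_parquet` likewise becomes a thin wrapper around `pxt.create_table`. A usage sketch against the signature shown above (table name and path are illustrative; `source_format` is popped out of `**kwargs` as in the new body):

    import pixeltable as pxt
    from pixeltable.io import import_parquet

    tbl = import_parquet('datasets.sensor_readings', parquet_path='~/data/readings_parquet')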