pixeltable 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (127) hide show
  1. pixeltable/__init__.py +5 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -0
  4. pixeltable/catalog/catalog.py +335 -128
  5. pixeltable/catalog/column.py +22 -5
  6. pixeltable/catalog/dir.py +19 -6
  7. pixeltable/catalog/insertable_table.py +34 -37
  8. pixeltable/catalog/named_function.py +0 -4
  9. pixeltable/catalog/schema_object.py +28 -42
  10. pixeltable/catalog/table.py +193 -158
  11. pixeltable/catalog/table_version.py +191 -232
  12. pixeltable/catalog/table_version_handle.py +50 -0
  13. pixeltable/catalog/table_version_path.py +49 -33
  14. pixeltable/catalog/view.py +56 -96
  15. pixeltable/config.py +103 -0
  16. pixeltable/dataframe.py +89 -89
  17. pixeltable/env.py +98 -168
  18. pixeltable/exec/aggregation_node.py +5 -4
  19. pixeltable/exec/cache_prefetch_node.py +1 -1
  20. pixeltable/exec/component_iteration_node.py +13 -9
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +0 -4
  23. pixeltable/exec/exec_node.py +3 -2
  24. pixeltable/exec/expr_eval/schedulers.py +2 -1
  25. pixeltable/exec/in_memory_data_node.py +9 -4
  26. pixeltable/exec/row_update_node.py +1 -2
  27. pixeltable/exec/sql_node.py +20 -16
  28. pixeltable/exprs/__init__.py +2 -0
  29. pixeltable/exprs/arithmetic_expr.py +7 -11
  30. pixeltable/exprs/array_slice.py +1 -1
  31. pixeltable/exprs/column_property_ref.py +3 -3
  32. pixeltable/exprs/column_ref.py +12 -13
  33. pixeltable/exprs/comparison.py +3 -6
  34. pixeltable/exprs/compound_predicate.py +4 -4
  35. pixeltable/exprs/expr.py +31 -22
  36. pixeltable/exprs/expr_dict.py +3 -3
  37. pixeltable/exprs/expr_set.py +1 -1
  38. pixeltable/exprs/function_call.py +110 -80
  39. pixeltable/exprs/globals.py +3 -3
  40. pixeltable/exprs/in_predicate.py +1 -1
  41. pixeltable/exprs/inline_expr.py +3 -3
  42. pixeltable/exprs/is_null.py +1 -1
  43. pixeltable/exprs/json_mapper.py +2 -2
  44. pixeltable/exprs/json_path.py +17 -10
  45. pixeltable/exprs/literal.py +1 -1
  46. pixeltable/exprs/method_ref.py +2 -2
  47. pixeltable/exprs/row_builder.py +8 -17
  48. pixeltable/exprs/rowid_ref.py +21 -10
  49. pixeltable/exprs/similarity_expr.py +5 -5
  50. pixeltable/exprs/sql_element_cache.py +1 -1
  51. pixeltable/exprs/type_cast.py +2 -3
  52. pixeltable/exprs/variable.py +2 -2
  53. pixeltable/ext/__init__.py +2 -0
  54. pixeltable/ext/functions/__init__.py +2 -0
  55. pixeltable/ext/functions/yolox.py +3 -3
  56. pixeltable/func/__init__.py +3 -1
  57. pixeltable/func/aggregate_function.py +9 -9
  58. pixeltable/func/callable_function.py +3 -4
  59. pixeltable/func/expr_template_function.py +6 -16
  60. pixeltable/func/function.py +48 -14
  61. pixeltable/func/function_registry.py +1 -3
  62. pixeltable/func/query_template_function.py +5 -12
  63. pixeltable/func/signature.py +23 -22
  64. pixeltable/func/tools.py +3 -3
  65. pixeltable/func/udf.py +6 -4
  66. pixeltable/functions/__init__.py +2 -0
  67. pixeltable/functions/fireworks.py +7 -4
  68. pixeltable/functions/globals.py +4 -5
  69. pixeltable/functions/huggingface.py +1 -5
  70. pixeltable/functions/image.py +17 -7
  71. pixeltable/functions/llama_cpp.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +4 -4
  74. pixeltable/functions/openai.py +19 -19
  75. pixeltable/functions/string.py +23 -30
  76. pixeltable/functions/timestamp.py +11 -6
  77. pixeltable/functions/together.py +14 -12
  78. pixeltable/functions/util.py +1 -1
  79. pixeltable/functions/video.py +5 -4
  80. pixeltable/functions/vision.py +6 -9
  81. pixeltable/functions/whisper.py +3 -3
  82. pixeltable/globals.py +246 -260
  83. pixeltable/index/__init__.py +2 -0
  84. pixeltable/index/base.py +1 -1
  85. pixeltable/index/btree.py +3 -1
  86. pixeltable/index/embedding_index.py +11 -5
  87. pixeltable/io/external_store.py +11 -12
  88. pixeltable/io/label_studio.py +4 -3
  89. pixeltable/io/parquet.py +57 -56
  90. pixeltable/iterators/__init__.py +4 -2
  91. pixeltable/iterators/audio.py +11 -11
  92. pixeltable/iterators/document.py +10 -10
  93. pixeltable/iterators/string.py +1 -2
  94. pixeltable/iterators/video.py +14 -15
  95. pixeltable/metadata/__init__.py +9 -5
  96. pixeltable/metadata/converters/convert_10.py +0 -1
  97. pixeltable/metadata/converters/convert_15.py +0 -2
  98. pixeltable/metadata/converters/convert_23.py +0 -2
  99. pixeltable/metadata/converters/convert_24.py +3 -3
  100. pixeltable/metadata/converters/convert_25.py +1 -1
  101. pixeltable/metadata/converters/convert_27.py +0 -2
  102. pixeltable/metadata/converters/convert_28.py +0 -2
  103. pixeltable/metadata/converters/convert_29.py +7 -8
  104. pixeltable/metadata/converters/util.py +7 -7
  105. pixeltable/metadata/schema.py +27 -19
  106. pixeltable/plan.py +68 -40
  107. pixeltable/share/__init__.py +2 -0
  108. pixeltable/share/packager.py +15 -12
  109. pixeltable/share/publish.py +3 -5
  110. pixeltable/store.py +37 -38
  111. pixeltable/type_system.py +41 -28
  112. pixeltable/utils/coco.py +4 -4
  113. pixeltable/utils/console_output.py +1 -3
  114. pixeltable/utils/description_helper.py +1 -1
  115. pixeltable/utils/documents.py +3 -3
  116. pixeltable/utils/filecache.py +20 -9
  117. pixeltable/utils/formatter.py +2 -3
  118. pixeltable/utils/media_store.py +1 -1
  119. pixeltable/utils/pytorch.py +1 -1
  120. pixeltable/utils/sql.py +4 -4
  121. pixeltable/utils/transactional_directory.py +2 -1
  122. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/METADATA +1 -1
  123. pixeltable-0.3.8.dist-info/RECORD +174 -0
  124. pixeltable-0.3.6.dist-info/RECORD +0 -172
  125. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/LICENSE +0 -0
  126. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/WHEEL +0 -0
  127. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/entry_points.txt +0 -0
pixeltable/index/base.py CHANGED
@@ -37,7 +37,7 @@ class IndexBase(abc.ABC):
37
37
  pass
38
38
 
39
39
  @abc.abstractmethod
40
- def create_index(self, index_name: str, index_value_col: catalog.Column, conn: sql.engine.Connection) -> None:
40
+ def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
41
41
  """Create the index on the index value column"""
42
42
  pass
43
43
 
pixeltable/index/btree.py CHANGED
@@ -6,6 +6,7 @@ import sqlalchemy as sql
6
6
  # import pixeltable.catalog as catalog
7
7
  import pixeltable.exceptions as excs
8
8
  from pixeltable import catalog, exprs
9
+ from pixeltable.env import Env
9
10
  from pixeltable.func.udf import udf
10
11
 
11
12
  from .base import IndexBase
@@ -52,9 +53,10 @@ class BtreeIndex(IndexBase):
52
53
  """Return the sqlalchemy type of the index value column"""
53
54
  return self.value_expr.col_type.to_sa_type()
54
55
 
55
- def create_index(self, index_name: str, index_value_col: 'catalog.Column', conn: sql.engine.Connection) -> None:
56
+ def create_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
56
57
  """Create the index on the index value column"""
57
58
  idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
59
+ conn = Env.get().conn
58
60
  idx.create(bind=conn)
59
61
 
60
62
  @classmethod
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import enum
4
- from typing import Any, Optional
4
+ from typing import Any, ClassVar, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pgvector.sqlalchemy # type: ignore[import-untyped]
@@ -11,6 +11,7 @@ import sqlalchemy as sql
11
11
  import pixeltable.exceptions as excs
12
12
  import pixeltable.type_system as ts
13
13
  from pixeltable import catalog, exprs, func
14
+ from pixeltable.env import Env
14
15
 
15
16
  from .base import IndexBase
16
17
 
@@ -31,7 +32,11 @@ class EmbeddingIndex(IndexBase):
31
32
  IP = 2
32
33
  L2 = 3
33
34
 
34
- PGVECTOR_OPS = {Metric.COSINE: 'vector_cosine_ops', Metric.IP: 'vector_ip_ops', Metric.L2: 'vector_l2_ops'}
35
+ PGVECTOR_OPS: ClassVar[dict[Metric, str]] = {
36
+ Metric.COSINE: 'vector_cosine_ops',
37
+ Metric.IP: 'vector_ip_ops',
38
+ Metric.L2: 'vector_l2_ops',
39
+ }
35
40
 
36
41
  metric: Metric
37
42
  value_expr: exprs.FunctionCall
@@ -55,7 +60,7 @@ class EmbeddingIndex(IndexBase):
55
60
  if metric.lower() not in metric_names:
56
61
  raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
57
62
  if not c.col_type.is_string_type() and not c.col_type.is_image_type():
58
- raise excs.Error(f'Embedding index requires string or image column')
63
+ raise excs.Error('Embedding index requires string or image column')
59
64
 
60
65
  self.string_embed = None
61
66
  self.image_embed = None
@@ -131,7 +136,7 @@ class EmbeddingIndex(IndexBase):
131
136
  """Return the sqlalchemy type of the index value column"""
132
137
  return self.index_col_type
133
138
 
134
- def create_index(self, index_name: str, index_value_col: catalog.Column, conn: sql.engine.Connection) -> None:
139
+ def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
135
140
  """Create the index on the index value column"""
136
141
  idx = sql.Index(
137
142
  index_name,
@@ -140,6 +145,7 @@ class EmbeddingIndex(IndexBase):
140
145
  postgresql_with={'m': 16, 'ef_construction': 64},
141
146
  postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
142
147
  )
148
+ conn = Env.get().conn
143
149
  idx.create(bind=conn)
144
150
 
145
151
  def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
@@ -219,7 +225,7 @@ class EmbeddingIndex(IndexBase):
219
225
  )
220
226
 
221
227
  shape = return_type.shape
222
- if len(shape) != 1 or shape[0] == None:
228
+ if len(shape) != 1 or shape[0] is None:
223
229
  raise excs.Error(
224
230
  f'The function `{embed_fn.name}` is not a valid embedding: '
225
231
  f'it must return a 1-dimensional array of a specific length, but returns {return_type}'
@@ -8,12 +8,10 @@ from dataclasses import dataclass
8
8
  from typing import Any, Optional
9
9
  from uuid import UUID
10
10
 
11
- import sqlalchemy as sql
12
-
13
11
  import pixeltable.exceptions as excs
14
12
  import pixeltable.type_system as ts
15
13
  from pixeltable import Column, Table
16
- from pixeltable.catalog import TableVersion
14
+ from pixeltable.catalog import TableVersion, TableVersionHandle
17
15
 
18
16
  _logger = logging.getLogger('pixeltable')
19
17
 
@@ -33,13 +31,13 @@ class ExternalStore(abc.ABC):
33
31
  return self.__name
34
32
 
35
33
  @abc.abstractmethod
36
- def link(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
34
+ def link(self, tbl_version: TableVersion) -> None:
37
35
  """
38
36
  Called by `TableVersion.link()` to implement store-specific logic.
39
37
  """
40
38
 
41
39
  @abc.abstractmethod
42
- def unlink(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
40
+ def unlink(self, tbl_version: TableVersion) -> None:
43
41
  """
44
42
  Called by `TableVersion.unlink()` to implement store-specific logic.
45
43
  """
@@ -94,7 +92,7 @@ class Project(ExternalStore, abc.ABC):
94
92
  def get_local_columns(self) -> list[Column]:
95
93
  return list(self.col_mapping.keys())
96
94
 
97
- def link(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
95
+ def link(self, tbl_version: TableVersion) -> None:
98
96
  # All of the media columns being linked need to either be stored computed columns, or else have stored proxies.
99
97
  # This ensures that the media in those columns resides in the media store.
100
98
  # First determine which columns (if any) need stored proxies, but don't have one yet.
@@ -110,6 +108,7 @@ class Project(ExternalStore, abc.ABC):
110
108
  if col not in self.stored_proxies:
111
109
  # We didn't find it in an existing Project
112
110
  stored_proxies_needed.append(col)
111
+
113
112
  if len(stored_proxies_needed) > 0:
114
113
  _logger.info(f'Creating stored proxies for columns: {[col.name for col in stored_proxies_needed]}')
115
114
  # Create stored proxies for columns that need one. Increment the schema version
@@ -119,12 +118,12 @@ class Project(ExternalStore, abc.ABC):
119
118
  tbl_version.schema_version = tbl_version.version
120
119
  proxy_cols = [self.create_stored_proxy(tbl_version, col) for col in stored_proxies_needed]
121
120
  # Add the columns; this will also update table metadata.
122
- tbl_version._add_columns(proxy_cols, conn, print_stats=False, on_error='ignore')
121
+ tbl_version._add_columns(proxy_cols, print_stats=False, on_error='ignore')
123
122
  # We don't need to retain `UpdateStatus` since the stored proxies are intended to be
124
123
  # invisible to the user.
125
- tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
124
+ tbl_version._update_md(time.time(), preceding_schema_version=preceding_schema_version)
126
125
 
127
- def unlink(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
126
+ def unlink(self, tbl_version: TableVersion) -> None:
128
127
  # Determine which stored proxies can be deleted. (A stored proxy can be deleted if it is not referenced by
129
128
  # any *other* external store for this table.)
130
129
  deletions_needed: set[Column] = set(self.stored_proxies.values())
@@ -139,7 +138,7 @@ class Project(ExternalStore, abc.ABC):
139
138
  tbl_version.schema_version = tbl_version.version
140
139
  tbl_version._drop_columns(deletions_needed)
141
140
  self.stored_proxies.clear()
142
- tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
141
+ tbl_version._update_md(time.time(), preceding_schema_version=preceding_schema_version)
143
142
 
144
143
  def create_stored_proxy(self, tbl_version: TableVersion, col: Column) -> Column:
145
144
  """
@@ -163,7 +162,7 @@ class Project(ExternalStore, abc.ABC):
163
162
  sa_col_type=col.col_type.to_sa_type(),
164
163
  schema_version_add=tbl_version.schema_version,
165
164
  )
166
- proxy_col.tbl = tbl_version
165
+ proxy_col.tbl = TableVersionHandle(tbl_version.id, tbl_version.effective_version, tbl_version=tbl_version)
167
166
  tbl_version.next_col_id += 1
168
167
  self.stored_proxies[col] = proxy_col
169
168
  return proxy_col
@@ -279,7 +278,7 @@ class Project(ExternalStore, abc.ABC):
279
278
 
280
279
  tbl_id = UUID(d['tbl_id'])
281
280
  col_id = d['col_id']
282
- return Catalog.get().tbl_versions[(tbl_id, None)].cols_by_id[col_id]
281
+ return Catalog.get().get_tbl_version(tbl_id, None).cols_by_id[col_id]
283
282
 
284
283
 
285
284
  @dataclass(frozen=True)
@@ -15,6 +15,7 @@ import pixeltable as pxt
15
15
  import pixeltable.env as env
16
16
  import pixeltable.exceptions as excs
17
17
  from pixeltable import Column, Table
18
+ from pixeltable.config import Config
18
19
  from pixeltable.exprs import ColumnRef, DataRow, Expr
19
20
  from pixeltable.io.external_store import Project, SyncStatus
20
21
  from pixeltable.utils import coco
@@ -356,7 +357,7 @@ class LabelStudioProject(Project):
356
357
  @classmethod
357
358
  def __localpath_to_lspath(cls, localpath: str) -> str:
358
359
  # Transform the local path into Label Studio's bespoke path format.
359
- relpath = Path(localpath).relative_to(env.Env.get().home)
360
+ relpath = Path(localpath).relative_to(Config.get().home)
360
361
  return f'/data/local-files/?d={str(relpath)}'
361
362
 
362
363
  def __delete_stale_tasks(
@@ -410,7 +411,7 @@ class LabelStudioProject(Project):
410
411
  # batch_update on the actual ancestor table that holds the annotations column.
411
412
  # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
412
413
  ancestor = t
413
- while local_annotations_col not in ancestor._tbl_version.cols:
414
+ while local_annotations_col not in ancestor._tbl_version.get().cols:
414
415
  assert ancestor._base is not None
415
416
  ancestor = ancestor._base
416
417
  update_status = ancestor.batch_update(updates)
@@ -618,7 +619,7 @@ class LabelStudioProject(Project):
618
619
 
619
620
  if media_import_method == 'file':
620
621
  # We need to set up a local storage connection to receive media files
621
- os.environ['LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT'] = str(env.Env.get().home)
622
+ os.environ['LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT'] = str(Config.get().home)
622
623
  try:
623
624
  project.connect_local_import_storage(local_store_path=str(env.Env.get().media_dir))
624
625
  except HTTPError as exc:
pixeltable/io/parquet.py CHANGED
@@ -90,63 +90,64 @@ def export_parquet(
90
90
  current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
91
91
  current_byte_estimate = 0
92
92
 
93
- for data_row in df._exec():
94
- for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
95
- val = data_row[e.slot_idx]
96
- if val is None:
97
- current_value_batch[col_name].append(val)
98
- continue
99
-
100
- assert val is not None
101
- if col_type.is_image_type():
102
- # images get inlined into the parquet file
103
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
104
- # if there is a file, read directly to preserve information
105
- with open(data_row.file_paths[e.slot_idx], 'rb') as f:
106
- val = f.read()
107
- elif isinstance(val, PIL.Image.Image):
108
- # if no file available, eg. bc it is computed, convert to png
109
- buf = io.BytesIO()
110
- val.save(buf, format='PNG')
111
- val = buf.getvalue()
112
- else:
113
- assert False, f'unknown image type {type(val)}'
114
- length = len(val)
115
- elif col_type.is_string_type():
116
- length = len(val)
117
- elif col_type.is_video_type():
118
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
119
- val = data_row.file_paths[e.slot_idx]
93
+ with Env.get().begin_xact():
94
+ for data_row in df._exec():
95
+ for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
96
+ val = data_row[e.slot_idx]
97
+ if val is None:
98
+ current_value_batch[col_name].append(val)
99
+ continue
100
+
101
+ assert val is not None
102
+ if col_type.is_image_type():
103
+ # images get inlined into the parquet file
104
+ if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
105
+ # if there is a file, read directly to preserve information
106
+ with open(data_row.file_paths[e.slot_idx], 'rb') as f:
107
+ val = f.read()
108
+ elif isinstance(val, PIL.Image.Image):
109
+ # if no file available, eg. bc it is computed, convert to png
110
+ buf = io.BytesIO()
111
+ val.save(buf, format='PNG')
112
+ val = buf.getvalue()
113
+ else:
114
+ assert False, f'unknown image type {type(val)}'
115
+ length = len(val)
116
+ elif col_type.is_string_type():
117
+ length = len(val)
118
+ elif col_type.is_video_type():
119
+ if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
120
+ val = data_row.file_paths[e.slot_idx]
121
+ else:
122
+ assert False, f'unknown video type {type(val)}'
123
+ length = len(val)
124
+ elif col_type.is_json_type():
125
+ val = json.dumps(val)
126
+ length = len(val)
127
+ elif col_type.is_array_type():
128
+ length = val.nbytes
129
+ elif col_type.is_int_type():
130
+ length = 8
131
+ elif col_type.is_float_type():
132
+ length = 8
133
+ elif col_type.is_bool_type():
134
+ length = 1
135
+ elif col_type.is_timestamp_type():
136
+ val = val.astimezone(datetime.timezone.utc)
137
+ length = 8
120
138
  else:
121
- assert False, f'unknown video type {type(val)}'
122
- length = len(val)
123
- elif col_type.is_json_type():
124
- val = json.dumps(val)
125
- length = len(val)
126
- elif col_type.is_array_type():
127
- length = val.nbytes
128
- elif col_type.is_int_type():
129
- length = 8
130
- elif col_type.is_float_type():
131
- length = 8
132
- elif col_type.is_bool_type():
133
- length = 1
134
- elif col_type.is_timestamp_type():
135
- val = val.astimezone(datetime.timezone.utc)
136
- length = 8
137
- else:
138
- assert False, f'unknown type {col_type} for {col_name}'
139
-
140
- current_value_batch[col_name].append(val)
141
- current_byte_estimate += length
142
- if current_byte_estimate > partition_size_bytes:
143
- assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
144
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
145
- batch_num += 1
146
- current_value_batch = {k: deque() for k in df.schema.keys()}
147
- current_byte_estimate = 0
148
-
149
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
139
+ assert False, f'unknown type {col_type} for {col_name}'
140
+
141
+ current_value_batch[col_name].append(val)
142
+ current_byte_estimate += length
143
+ if current_byte_estimate > partition_size_bytes:
144
+ assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
145
+ _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
146
+ batch_num += 1
147
+ current_value_batch = {k: deque() for k in df.schema.keys()}
148
+ current_byte_estimate = 0
149
+
150
+ _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
150
151
 
151
152
 
152
153
  def import_parquet(
@@ -1,3 +1,5 @@
1
+ # ruff: noqa: F401
2
+
1
3
  from .audio import AudioSplitter
2
4
  from .base import ComponentIterator
3
5
  from .document import DocumentSplitter
@@ -5,9 +7,9 @@ from .image import TileIterator
5
7
  from .string import StringSplitter
6
8
  from .video import FrameIterator
7
9
 
8
- __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
10
+ __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
9
11
  __removed_symbols = {'base', 'document', 'video'}
10
- __all__ = sorted(list(__default_dir - __removed_symbols))
12
+ __all__ = sorted(__default_dir - __removed_symbols)
11
13
 
12
14
 
13
15
  def __dir__():
@@ -1,15 +1,12 @@
1
1
  import logging
2
- import math
3
2
  import uuid
4
3
  from fractions import Fraction
5
4
  from pathlib import Path
6
- from typing import Any, Optional
5
+ from typing import Any, ClassVar, Optional
7
6
 
8
7
  import av
9
8
 
10
- import pixeltable.env as env
11
- import pixeltable.exceptions as excs
12
- import pixeltable.type_system as ts
9
+ from pixeltable import env, exceptions as excs, type_system as ts
13
10
 
14
11
  from .base import ComponentIterator
15
12
 
@@ -18,7 +15,8 @@ _logger = logging.getLogger('pixeltable')
18
15
 
19
16
  class AudioSplitter(ComponentIterator):
20
17
  """
21
- Iterator over chunks of an audio file. The audio file is split into smaller chunks, where the duration of each chunk is determined by chunk_duration_sec.
18
+ Iterator over chunks of an audio file. The audio file is split into smaller chunks,
19
+ where the duration of each chunk is determined by chunk_duration_sec.
22
20
  The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
23
21
  If the input contains no audio, no chunks are yielded.
24
22
 
@@ -39,11 +37,11 @@ class AudioSplitter(ComponentIterator):
39
37
 
40
38
  # List of chunks to extract
41
39
  # Each chunk is defined by start and end presentation timestamps in audio file (int)
42
- chunks_to_extract_in_pts: Optional[list[tuple[int, int]]] = []
40
+ chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
43
41
  # next chunk to extract
44
42
  next_pos: int
45
43
 
46
- __codec_map = {
44
+ __codec_map: ClassVar[dict[str, str]] = {
47
45
  'mp3': 'mp3', # MP3 decoder -> mp3/libmp3lame encoder
48
46
  'mp3float': 'mp3', # MP3float decoder -> mp3 encoder
49
47
  'aac': 'aac', # AAC decoder -> AAC encoder
@@ -88,7 +86,8 @@ class AudioSplitter(ComponentIterator):
88
86
  )
89
87
  ]
90
88
  _logger.debug(
91
- f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
89
+ f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} '
90
+ f'chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
92
91
  )
93
92
 
94
93
  @classmethod
@@ -155,7 +154,7 @@ class AudioSplitter(ComponentIterator):
155
154
  try:
156
155
  frame = next(self.container.decode(audio=0))
157
156
  except EOFError as e:
158
- raise excs.Error(f'Failed to read audio file `{self.audio_path}`, error `{e}`')
157
+ raise excs.Error(f"Failed to read audio file '{self.audio_path}': {e}") from e
159
158
  except StopIteration:
160
159
  # no more frames to scan
161
160
  break
@@ -163,7 +162,8 @@ class AudioSplitter(ComponentIterator):
163
162
  # Current frame is behind chunk's start time, always get frame next to chunk's start time
164
163
  continue
165
164
  if frame.pts >= target_chunk_end:
166
- # Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away the current frame
165
+ # Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away
166
+ # the current frame
167
167
  break
168
168
  frame_end = frame.pts + frame.samples
169
169
  if frame_count == 0:
@@ -1,7 +1,7 @@
1
1
  import dataclasses
2
2
  import enum
3
3
  import logging
4
- from typing import Any, Iterable, Iterator, Optional, Union
4
+ from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
5
5
 
6
6
  import ftfy
7
7
 
@@ -96,7 +96,7 @@ class DocumentSplitter(ComponentIterator):
96
96
  Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
97
97
  """
98
98
 
99
- METADATA_COLUMN_TYPES = {
99
+ METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
100
100
  ChunkMetadata.TITLE: StringType(nullable=True),
101
101
  ChunkMetadata.HEADING: JsonType(nullable=True),
102
102
  ChunkMetadata.SOURCELINE: IntType(nullable=True),
@@ -164,7 +164,7 @@ class DocumentSplitter(ComponentIterator):
164
164
  assert self._doc_handle.txt_doc is not None
165
165
  self._sections = self._txt_sections()
166
166
  else:
167
- assert False, f'Unsupported document format: {self._doc_handle.format}'
167
+ raise AssertionError(f'Unsupported document format: {self._doc_handle.format}')
168
168
 
169
169
  if Separator.SENTENCE in self._separators:
170
170
  self._sections = self._sentence_sections(self._sections)
@@ -215,7 +215,7 @@ class DocumentSplitter(ComponentIterator):
215
215
 
216
216
  # check dependencies at the end
217
217
  if Separator.SENTENCE in separators:
218
- Env.get().require_package('spacy')
218
+ _ = Env.get().spacy_nlp
219
219
  if Separator.TOKEN_LIMIT in separators:
220
220
  Env.get().require_package('tiktoken')
221
221
 
@@ -259,9 +259,9 @@ class DocumentSplitter(ComponentIterator):
259
259
  sourceline = el.sourceline
260
260
  if el.name in _HTML_HEADINGS:
261
261
  # remove the previously seen lower levels
262
- lower_levels = [l for l in headings if l > el.name]
263
- for l in lower_levels:
264
- del headings[l]
262
+ lower_levels = [lv for lv in headings if lv > el.name]
263
+ for lv in lower_levels:
264
+ del headings[lv]
265
265
  headings[el.name] = el.get_text().strip()
266
266
 
267
267
  def emit() -> Iterator[DocumentSection]:
@@ -320,9 +320,9 @@ class DocumentSplitter(ComponentIterator):
320
320
  level = f'h{lint}'
321
321
  text = heading['children'][0]['raw'].strip()
322
322
  # remove the previously seen lower levels
323
- lower_levels = [l for l in headings.keys() if l > level]
324
- for l in lower_levels:
325
- del headings[l]
323
+ lower_levels = [lv for lv in headings if lv > level]
324
+ for lv in lower_levels:
325
+ del headings[lv]
326
326
  headings[level] = text
327
327
 
328
328
  def emit() -> Iterator[DocumentSection]:
@@ -1,7 +1,6 @@
1
1
  from typing import Any, Iterator
2
2
 
3
- import pixeltable.exceptions as excs
4
- import pixeltable.type_system as ts
3
+ from pixeltable import exceptions as excs, type_system as ts
5
4
  from pixeltable.env import Env
6
5
  from pixeltable.iterators.base import ComponentIterator
7
6
 
@@ -2,7 +2,7 @@ import logging
2
2
  import math
3
3
  from fractions import Fraction
4
4
  from pathlib import Path
5
- from typing import Any, Optional, Sequence
5
+ from typing import Any, Optional
6
6
 
7
7
  import av
8
8
  import pandas as pd
@@ -91,21 +91,20 @@ class FrameIterator(ComponentIterator):
91
91
  self.frames_to_extract = None
92
92
  else:
93
93
  spacing = float(self.video_frame_count) / float(num_frames)
94
- self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
94
+ self.frames_to_extract = [round(i * spacing) for i in range(num_frames)]
95
95
  assert len(self.frames_to_extract) == num_frames
96
+ elif fps is None or fps == 0.0:
97
+ # Extract all frames
98
+ self.frames_to_extract = None
99
+ elif fps > float(self.video_framerate):
100
+ raise excs.Error(
101
+ f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
102
+ )
96
103
  else:
97
- if fps is None or fps == 0.0:
98
- # Extract all frames
99
- self.frames_to_extract = None
100
- elif fps > float(self.video_framerate):
101
- raise excs.Error(
102
- f'Video {video}: requested fps ({fps}) exceeds that of the video ({float(self.video_framerate)})'
103
- )
104
- else:
105
- # Extract frames at the implied frequency
106
- freq = fps / float(self.video_framerate)
107
- n = math.ceil(self.video_frame_count * freq) # number of frames to extract
108
- self.frames_to_extract = list(round(i / freq) for i in range(n))
104
+ # Extract frames at the implied frequency
105
+ freq = fps / float(self.video_framerate)
106
+ n = math.ceil(self.video_frame_count * freq) # number of frames to extract
107
+ self.frames_to_extract = [round(i / freq) for i in range(n)]
109
108
 
110
109
  _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
111
110
  self.next_pos = 0
@@ -149,7 +148,7 @@ class FrameIterator(ComponentIterator):
149
148
  try:
150
149
  frame = next(self.container.decode(video=0))
151
150
  except EOFError:
152
- raise StopIteration
151
+ raise StopIteration from None
153
152
  # Compute the index of the current frame in the video based on the presentation timestamp (pts);
154
153
  # this ensures we have a canonical understanding of frame index, regardless of how we got here
155
154
  # (seek or iteration)
@@ -1,14 +1,20 @@
1
1
  import dataclasses
2
2
  import importlib
3
+ import logging
3
4
  import os
4
5
  import pkgutil
5
6
  from typing import Callable
6
7
 
7
8
  import sqlalchemy as sql
8
- import sqlalchemy.orm as orm
9
+ from sqlalchemy import orm
10
+
11
+ from pixeltable.utils.console_output import ConsoleLogger
9
12
 
10
13
  from .schema import SystemInfo, SystemInfoMd
11
14
 
15
+ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
16
+
17
+
12
18
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
19
  VERSION = 30
14
20
 
@@ -30,7 +36,6 @@ converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
30
36
 
31
37
  def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
32
38
  def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
33
- global converter_cbs
34
39
  assert version not in converter_cbs
35
40
  converter_cbs[version] = fn
36
41
 
@@ -53,9 +58,8 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
53
58
  while md_version < VERSION:
54
59
  if md_version not in converter_cbs:
55
60
  raise RuntimeError(f'No metadata converter for version {md_version}')
56
- from pixeltable.env import Env
57
-
58
- Env.get().console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
61
+ # We can't use the console logger in Env, because Env might not have been initialized yet.
62
+ _console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
59
63
  converter_cbs[md_version](engine)
60
64
  md_version += 1
61
65
  # update system info
@@ -13,4 +13,3 @@ def _(engine: sql.engine.Engine) -> None:
13
13
  conn.execute(sql.update(Table).values(md=Table.md - 'parameters'))
14
14
  # Add `table_attrs` to all instances of tableschemaversions.md.
15
15
  conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
16
- return
@@ -5,8 +5,6 @@ from typing import Any
5
5
  import cloudpickle # type: ignore[import-untyped]
6
6
  import sqlalchemy as sql
7
7
 
8
- import pixeltable.func as func
9
- import pixeltable.type_system as ts
10
8
  from pixeltable.metadata import register_converter
11
9
  from pixeltable.metadata.schema import Function
12
10
 
@@ -1,12 +1,10 @@
1
1
  import logging
2
- from typing import Any, Optional
3
2
  from uuid import UUID
4
3
 
5
4
  import sqlalchemy as sql
6
5
 
7
6
  from pixeltable.metadata import register_converter
8
7
  from pixeltable.metadata.converters.util import convert_table_md
9
- from pixeltable.metadata.schema import Table
10
8
 
11
9
  _logger = logging.getLogger('pixeltable')
12
10
 
@@ -19,11 +19,11 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
19
19
  isinstance(v, dict)
20
20
  and '_classpath' in v
21
21
  and v['_classpath']
22
- in [
22
+ in {
23
23
  'pixeltable.func.callable_function.CallableFunction',
24
24
  'pixeltable.func.aggregate_function.AggregateFunction',
25
25
  'pixeltable.func.expr_template_function.ExprTemplateFunction',
26
- ]
26
+ }
27
27
  ):
28
28
  if 'path' in v:
29
29
  assert 'signature' not in v
@@ -50,6 +50,6 @@ def __substitute_path(path: str) -> str:
50
50
  # versions, it's necessary to resolve the function symbol to get the signature. The following
51
51
  # adjustment is necessary for function names that are stored in db artifacts of version < 25, but
52
52
  # have changed in some version > 25.
53
- if path in ['pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image']:
53
+ if path in {'pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image'}:
54
54
  return 'pixeltable.functions.huggingface.clip'
55
55
  return path