pixeltable 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (88)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/globals.py +3 -0
  5. pixeltable/catalog/insertable_table.py +9 -7
  6. pixeltable/catalog/table.py +220 -143
  7. pixeltable/catalog/table_version.py +36 -18
  8. pixeltable/catalog/table_version_path.py +0 -8
  9. pixeltable/catalog/view.py +3 -3
  10. pixeltable/dataframe.py +9 -24
  11. pixeltable/env.py +107 -36
  12. pixeltable/exceptions.py +7 -4
  13. pixeltable/exec/__init__.py +1 -1
  14. pixeltable/exec/aggregation_node.py +22 -15
  15. pixeltable/exec/component_iteration_node.py +62 -41
  16. pixeltable/exec/data_row_batch.py +7 -7
  17. pixeltable/exec/exec_node.py +35 -7
  18. pixeltable/exec/expr_eval_node.py +2 -1
  19. pixeltable/exec/in_memory_data_node.py +9 -9
  20. pixeltable/exec/sql_node.py +265 -136
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/data_row.py +30 -19
  23. pixeltable/exprs/expr.py +15 -14
  24. pixeltable/exprs/expr_dict.py +55 -0
  25. pixeltable/exprs/expr_set.py +21 -15
  26. pixeltable/exprs/function_call.py +21 -8
  27. pixeltable/exprs/json_path.py +3 -6
  28. pixeltable/exprs/rowid_ref.py +2 -2
  29. pixeltable/exprs/sql_element_cache.py +5 -1
  30. pixeltable/ext/functions/whisperx.py +7 -2
  31. pixeltable/func/callable_function.py +2 -2
  32. pixeltable/func/function_registry.py +6 -7
  33. pixeltable/func/query_template_function.py +11 -12
  34. pixeltable/func/signature.py +17 -15
  35. pixeltable/func/udf.py +0 -4
  36. pixeltable/functions/__init__.py +1 -1
  37. pixeltable/functions/audio.py +4 -6
  38. pixeltable/functions/globals.py +86 -42
  39. pixeltable/functions/huggingface.py +12 -14
  40. pixeltable/functions/image.py +59 -45
  41. pixeltable/functions/json.py +0 -1
  42. pixeltable/functions/mistralai.py +2 -2
  43. pixeltable/functions/openai.py +22 -25
  44. pixeltable/functions/string.py +50 -50
  45. pixeltable/functions/timestamp.py +20 -20
  46. pixeltable/functions/together.py +26 -12
  47. pixeltable/functions/video.py +11 -20
  48. pixeltable/functions/whisper.py +2 -20
  49. pixeltable/globals.py +57 -56
  50. pixeltable/index/base.py +2 -2
  51. pixeltable/index/btree.py +7 -7
  52. pixeltable/index/embedding_index.py +8 -10
  53. pixeltable/io/external_store.py +11 -5
  54. pixeltable/io/globals.py +3 -1
  55. pixeltable/io/hf_datasets.py +4 -4
  56. pixeltable/io/label_studio.py +6 -6
  57. pixeltable/io/parquet.py +14 -13
  58. pixeltable/iterators/document.py +10 -8
  59. pixeltable/iterators/video.py +10 -1
  60. pixeltable/metadata/__init__.py +3 -2
  61. pixeltable/metadata/converters/convert_14.py +4 -2
  62. pixeltable/metadata/converters/convert_15.py +1 -1
  63. pixeltable/metadata/converters/convert_19.py +1 -0
  64. pixeltable/metadata/converters/convert_20.py +1 -1
  65. pixeltable/metadata/converters/util.py +9 -8
  66. pixeltable/metadata/schema.py +32 -21
  67. pixeltable/plan.py +136 -154
  68. pixeltable/store.py +51 -36
  69. pixeltable/tool/create_test_db_dump.py +7 -7
  70. pixeltable/tool/doc_plugins/griffe.py +3 -34
  71. pixeltable/tool/mypy_plugin.py +32 -0
  72. pixeltable/type_system.py +243 -60
  73. pixeltable/utils/arrow.py +10 -9
  74. pixeltable/utils/coco.py +4 -4
  75. pixeltable/utils/documents.py +1 -1
  76. pixeltable/utils/filecache.py +131 -84
  77. pixeltable/utils/formatter.py +1 -1
  78. pixeltable/utils/http_server.py +2 -5
  79. pixeltable/utils/media_store.py +6 -6
  80. pixeltable/utils/pytorch.py +10 -11
  81. pixeltable/utils/sql.py +2 -1
  82. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/METADATA +16 -7
  83. pixeltable-0.2.21.dist-info/RECORD +148 -0
  84. pixeltable/utils/help.py +0 -11
  85. pixeltable-0.2.19.dist-info/RECORD +0 -147
  86. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/LICENSE +0 -0
  87. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/WHEEL +0 -0
  88. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/entry_points.txt +0 -0
pixeltable/index/embedding_index.py CHANGED
@@ -1,18 +1,17 @@
  from __future__ import annotations
 
- from typing import Optional, Any
  import enum
+ from typing import Any, Optional
 
- import PIL.Image
  import numpy as np
- import pgvector.sqlalchemy
+ import pgvector.sqlalchemy # type: ignore[import-untyped]
  import PIL.Image
  import sqlalchemy as sql
 
- import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
- import pixeltable.func as func
  import pixeltable.type_system as ts
+ from pixeltable import catalog, exprs, func
+
  from .base import IndexBase
 
 
@@ -58,16 +57,15 @@ class EmbeddingIndex(IndexBase):
  self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
 
  self.metric = self.Metric[metric.upper()]
- from pixeltable.exprs import ColumnRef
- self.value_expr = string_embed(ColumnRef(c)) if c.col_type.is_string_type() else image_embed(ColumnRef(c))
- assert self.value_expr.col_type.is_array_type()
+ self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
+ assert isinstance(self.value_expr.col_type, ts.ArrayType)
  self.string_embed = string_embed
  self.image_embed = image_embed
  vector_size = self.value_expr.col_type.shape[0]
  assert vector_size is not None
  self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
 
- def index_value_expr(self) -> 'pixeltable.exprs.Expr':
+ def index_value_expr(self) -> exprs.Expr:
  """Return expression that computes the value that goes into the index"""
  return self.value_expr
 
@@ -151,7 +149,7 @@ class EmbeddingIndex(IndexBase):
  img = PIL.Image.new('RGB', (512, 512))
  return_type = embed_fn.call_return_type({param_name: img})
  assert return_type is not None
- if not return_type.is_array_type():
+ if not isinstance(return_type, ts.ArrayType):
  raise excs.Error(f'{name} must return an array, but returns {return_type}')
  else:
  shape = return_type.shape
pixeltable/io/external_store.py CHANGED
@@ -69,6 +69,9 @@ class Project(ExternalStore, abc.ABC):
  An `ExternalStore` that represents a labeling project. Extends `ExternalStore` with a few
  additional capabilities specific to such projects.
  """
+
+ stored_proxies: dict[Column, Column]
+
  def __init__(self, name: str, col_mapping: dict[Column, str], stored_proxies: Optional[dict[Column, Column]]):
  super().__init__(name)
  self._col_mapping = col_mapping
@@ -116,7 +119,7 @@ class Project(ExternalStore, abc.ABC):
  tbl_version.schema_version = tbl_version.version
  proxy_cols = [self.create_stored_proxy(tbl_version, col) for col in stored_proxies_needed]
  # Add the columns; this will also update table metadata.
- tbl_version._add_columns(proxy_cols, conn)
+ tbl_version._add_columns(proxy_cols, conn, print_stats=False, on_error='ignore')
  # We don't need to retain `UpdateStatus` since the stored proxies are intended to be
  # invisible to the user.
  tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
@@ -126,7 +129,7 @@ class Project(ExternalStore, abc.ABC):
  # any *other* external store for this table.)
  deletions_needed: set[Column] = set(self.stored_proxies.values())
  for name, store in tbl_version.external_stores.items():
- if name != self.name:
+ if isinstance(store, Project) and name != self.name:
  deletions_needed = deletions_needed.difference(set(store.stored_proxies.values()))
  if len(deletions_needed) > 0:
  _logger.info(f'Removing stored proxies for columns: {[col.name for col in deletions_needed]}')
@@ -210,6 +213,8 @@ class Project(ExternalStore, abc.ABC):
  If validation fails, an exception will be raised. If validation succeeds, a new mapping will be returned
  in which the Pixeltable column names are resolved to the corresponding `Column` objects.
  """
+ from pixeltable import exprs
+
  is_user_specified_col_mapping = col_mapping is not None
  if col_mapping is None:
  col_mapping = {col: col for col in itertools.chain(export_cols.keys(), import_cols.keys())}
@@ -235,8 +240,9 @@ class Project(ExternalStore, abc.ABC):
  f'Column name `{ext_col}` appears as a value in `col_mapping`, but the external store '
  f'configuration has no column `{ext_col}`.'
  )
- col = table[t_col].col
- resolved_col_mapping[col] = ext_col
+ col_ref = table[t_col]
+ assert isinstance(col_ref, exprs.ColumnRef)
+ resolved_col_mapping[col_ref.col] = ext_col
  # Validate column specs
  t_col_types = table._schema
  for t_col, ext_col in col_mapping.items():
@@ -329,7 +335,7 @@ class MockProject(Project):
  def get_import_columns(self) -> dict[str, ts.ColumnType]:
  return self.import_cols
 
- def sync(self, t: Table, export_data: bool, import_data: bool) -> NotImplemented:
+ def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
  raise NotImplementedError()
 
  def delete(self) -> None:
pixeltable/io/globals.py CHANGED
@@ -43,7 +43,7 @@ def create_label_studio_project(
  The API key and URL for a valid Label Studio server must be specified in Pixeltable config. Either:
 
  * Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
- * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
+ * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.toml`.
 
  __Requirements:__
 
@@ -187,6 +189,8 @@ def import_rows(
  # If `key` is not in `schema_overrides`, then we infer its type from the data.
  # The column type will always be nullable by default.
  col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
+ if col_type is None:
+ raise excs.Error(f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}')
  if col_name not in schema:
  schema[col_name] = col_type
  else:
pixeltable/io/hf_datasets.py CHANGED
@@ -11,7 +11,7 @@ import pixeltable.type_system as ts
  from pixeltable import exceptions as excs
 
  if typing.TYPE_CHECKING:
- import datasets
+ import datasets # type: ignore[import-untyped]
 
  _logger = logging.getLogger(__name__)
 
@@ -34,9 +34,7 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
  }
 
 
- def _to_pixeltable_type(
- feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
- ) -> Optional[ts.ColumnType]:
+ def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
  """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
  import datasets
 
@@ -51,6 +49,8 @@
  dtype = _to_pixeltable_type(feature_type.feature)
  length = feature_type.length if feature_type.length != -1 else None
  return ts.ArrayType(shape=(length,), dtype=dtype)
+ elif isinstance(feature_type, datasets.Image):
+ return ts.ImageType(nullable=True)
  else:
  return None
 
pixeltable/io/label_studio.py CHANGED
@@ -4,17 +4,17 @@ import logging
  import os
  from dataclasses import dataclass
  from pathlib import Path
- from typing import Any, Iterator, Optional, Literal
+ from typing import Any, Iterator, Literal, Optional, cast
  from xml.etree import ElementTree
 
+ import label_studio_sdk # type: ignore[import-untyped]
  import PIL.Image
- import label_studio_sdk
  from requests.exceptions import HTTPError
 
  import pixeltable as pxt
  import pixeltable.env as env
  import pixeltable.exceptions as excs
- from pixeltable import Table, Column
+ from pixeltable import Column, Table
  from pixeltable.exprs import ColumnRef, DataRow, Expr
  from pixeltable.io.external_store import Project, SyncStatus
  from pixeltable.utils import coco
@@ -211,7 +211,7 @@ class LabelStudioProject(Project):
  assert isinstance(row[media_col_idx], PIL.Image.Image)
  file = env.Env.get().create_tmp_path(extension='.png')
  row[media_col_idx].save(file, format='png')
- task_id: int = self.project.import_tasks(file)[0]
+ task_id = self.project.import_tasks(file)[0]
  os.remove(file)
 
  # Update the task with `rowid` metadata
@@ -256,7 +256,7 @@ class LabelStudioProject(Project):
  assert self.media_import_method == 'file'
  if not col.col_type.is_media_type():
  # Not a media column; query the data directly
- expr_refs[col_name] = t[col_name]
+ expr_refs[col_name] = cast(ColumnRef, t[col_name])
  elif col in self.stored_proxies:
  # Media column that has a stored proxy; use it. We have to give it a name,
  # since it's an anonymous column
@@ -267,7 +267,7 @@ class LabelStudioProject(Project):
  # and we can just use the localpath
  expr_refs[col_name] = t[col_name].localpath
 
- df = t.select(*[t[col] for col in t_rl_cols], **expr_refs)
+ df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
  # The following buffers will hold `DataRow` indices that correspond to each of the selected
  # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
  # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
pixeltable/io/parquet.py CHANGED
@@ -7,24 +7,23 @@ import random
  import typing
  from collections import deque
  from pathlib import Path
- from typing import Dict, Optional, Any
+ from typing import Any, Optional
 
- import PIL.Image
  import numpy as np
+ import PIL.Image
 
  import pixeltable.exceptions as exc
  import pixeltable.type_system as ts
  from pixeltable.utils.transactional_directory import transactional_directory
 
  if typing.TYPE_CHECKING:
- import pixeltable as pxt
  import pyarrow as pa
- from pyarrow import parquet
+ import pixeltable as pxt
 
  _logger = logging.getLogger(__name__)
 
 
- def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
+ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
  import pyarrow as pa
  from pyarrow import parquet
 
@@ -37,7 +36,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path:
  pydict[field.name] = value_batch[field.name]
 
  tab = pa.Table.from_pydict(pydict, schema=schema)
- parquet.write_table(tab, output_path)
+ parquet.write_table(tab, str(output_path))
 
 
  def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -67,7 +66,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
 
  batch_num = 0
- current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
+ current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
  current_byte_estimate = 0
 
  for data_row in df._exec():
@@ -128,13 +127,14 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
  _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
 
 
- def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
+ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional[ts.ColumnType]]:
  """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
  from pyarrow import parquet
+
  from pixeltable.utils.arrow import to_pixeltable_schema
 
  input_path = Path(parquet_path).expanduser()
- parquet_dataset = parquet.ParquetDataset(input_path)
+ parquet_dataset = parquet.ParquetDataset(str(input_path))
  return to_pixeltable_schema(parquet_dataset.schema)
 
 
@@ -142,7 +142,7 @@ def import_parquet(
  table_path: str,
  *,
  parquet_path: str,
- schema_overrides: Optional[Dict[str, ts.ColumnType]] = None,
+ schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
  **kwargs: Any,
  ) -> pxt.Table:
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
@@ -159,12 +159,13 @@ def import_parquet(
  Returns:
  A handle to the newly created [`Table`][pixeltable.Table].
  """
- import pixeltable as pxt
  from pyarrow import parquet
+
+ import pixeltable as pxt
  from pixeltable.utils.arrow import iter_tuples
 
  input_path = Path(parquet_path).expanduser()
- parquet_dataset = parquet.ParquetDataset(input_path)
+ parquet_dataset = parquet.ParquetDataset(str(input_path))
 
  schema = parquet_schema_to_pixeltable_schema(parquet_path)
  if schema_overrides is None:
@@ -181,7 +182,7 @@ def import_parquet(
  try:
  tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
  tab = pxt.create_table(tmp_name, schema, **kwargs)
- for fragment in parquet_dataset.fragments:
+ for fragment in parquet_dataset.fragments: # type: ignore[attr-defined]
  for batch in fragment.to_batches():
  dict_batch = list(iter_tuples(batch))
  tab.insert(dict_batch)
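
As a usage note for the `import_parquet` signature shown above, a minimal sketch (the table name and Parquet path below are hypothetical, and importing directly from `pixeltable.io.parquet` is an assumption based on the module layout in this diff):

```python
from pixeltable.io.parquet import import_parquet, parquet_schema_to_pixeltable_schema

pq_path = '~/data/events.parquet'  # hypothetical Parquet file

# Inspect the inferred schema first; Arrow types without a Pixeltable mapping come back as None.
print(parquet_schema_to_pixeltable_schema(pq_path))

# Create a new base table from the Parquet data ('events' is a hypothetical table path).
tbl = import_parquet('events', parquet_path=pq_path)
print(tbl)
```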
pixeltable/iterators/document.py CHANGED
@@ -1,7 +1,7 @@
  import dataclasses
  import enum
  import logging
- from typing import Any, Iterable, Iterator, Optional
+ from typing import Any, Iterable, Iterator, Optional, Union
 
  import ftfy
 
@@ -166,7 +166,7 @@ class DocumentSplitter(ComponentIterator):
  return {
  'document': DocumentType(nullable=False),
  'separators': StringType(nullable=False),
- 'metadata': StringType(nullable=True),
+ 'metadata': StringType(nullable=False),
  'limit': IntType(nullable=True),
  'overlap': IntType(nullable=True),
  'skip_tags': StringType(nullable=True),
@@ -176,7 +176,7 @@
 
  @classmethod
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
- schema = {'text': StringType()}
+ schema: dict[str, ColumnType] = {'text': StringType()}
  md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
 
  for md_field in md_fields:
@@ -214,7 +214,7 @@
  section = next(self._sections)
  if section.text is None:
  continue
- result = {'text': section.text}
+ result: dict[str, Any] = {'text': section.text}
  for md_field in self._metadata_fields:
  if md_field == ChunkMetadata.TITLE:
  result[md_field.name.lower()] = self._doc_title
@@ -234,7 +234,7 @@
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
  # current state
- accumulated_text = [] # currently accumulated text
+ accumulated_text: list[str] = [] # currently accumulated text
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
 
  headings: dict[str, str] = {} # current state of observed headings (level -> text)
@@ -260,9 +260,10 @@
  yield DocumentSection(text=full_text, metadata=md)
  accumulated_text = []
 
- def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
+ def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
  # process the element and emit sections as necessary
  nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
+
  if el.name in self._skip_tags:
  return
 
@@ -282,6 +283,7 @@
  yield from emit()
  update_metadata(el)
  for child in el.children:
+ assert isinstance(child, (bs4.element.Tag, bs4.NavigableString)), type(el)
  yield from process_element(child)
 
  yield from process_element(self._doc_handle.bs_doc)
@@ -293,7 +295,7 @@
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
  # current state
- accumulated_text = [] # currently accumulated text
+ accumulated_text: list[str] = [] # currently accumulated text
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
  headings: dict[str, str] = {} # current state of observed headings (level -> text)
 
@@ -347,7 +349,7 @@
 
  def _pdf_sections(self) -> Iterator[DocumentSection]:
  """Create DocumentSections reflecting the pdf-specific separators"""
- import fitz
+ import fitz # type: ignore[import-untyped]
  doc: fitz.Document = self._doc_handle.pdf_doc
  assert doc is not None
 
pixeltable/iterators/video.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  import math
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Optional, Sequence
 
  import cv2
  import PIL.Image
@@ -29,6 +29,15 @@ class FrameIterator(ComponentIterator):
  num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
  `num_frames` is greater than the number of frames in the video, all frames will be extracted.
  """
+
+ video_path: Path
+ video_reader: cv2.VideoCapture
+ fps: Optional[float]
+ num_frames: Optional[int]
+ frames_to_extract: Sequence[int]
+ frames_set: set[int]
+ next_frame_idx: int
+
  def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
  if fps is not None and num_frames is not None:
  raise Error('At most one of `fps` or `num_frames` may be specified')
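
The `num_frames` docstring above promises evenly spaced frames, capped at the video's frame count. A rough, self-contained sketch of that selection logic (not `FrameIterator`'s actual implementation, which reads the frame count via `cv2.VideoCapture`):

```python
def evenly_spaced_frame_indices(total_frames: int, num_frames: int) -> list[int]:
    """Pick `num_frames` indices spread as evenly as possible over `total_frames`.

    If `num_frames` >= `total_frames`, every frame index is returned, matching the
    documented behavior above.
    """
    if num_frames >= total_frames:
        return list(range(total_frames))
    # Spread the requested count across the full range; round to the nearest frame.
    spacing = total_frames / num_frames
    return [min(total_frames - 1, round(i * spacing)) for i in range(num_frames)]

# Example: 5 frames out of a 100-frame video -> [0, 20, 40, 60, 80]
print(evenly_spaced_frame_indices(100, 5))
```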
pixeltable/metadata/__init__.py CHANGED
@@ -2,7 +2,7 @@ import dataclasses
  import importlib
  import os
  import pkgutil
- from typing import Callable, Dict
+ from typing import Callable
 
  import sqlalchemy as sql
  import sqlalchemy.orm as orm
@@ -24,7 +24,7 @@ def create_system_info(engine: sql.engine.Engine) -> None:
 
  # conversion functions for upgrading the metadata schema from one version to the following
  # key: old schema version
- converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}
+ converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
 
  def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
  def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
@@ -41,6 +41,7 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
  with orm.Session(engine) as session:
  system_info = session.query(SystemInfo).one().md
  md_version = system_info['schema_version']
+ assert isinstance(md_version, int)
  if md_version == VERSION:
  return
  while md_version < VERSION:
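
The `converter_cbs` registry and the `upgrade_md` loop above implement a chained-converter pattern: each converter is registered under the schema version it upgrades *from*, and the loop applies them one version at a time. A generic, self-contained sketch of the pattern (toy metadata dict and version numbers, not Pixeltable's actual converters):

```python
from typing import Callable

VERSION = 3  # hypothetical current schema version
converter_cbs: dict[int, Callable[[dict], None]] = {}

def register_converter(version: int) -> Callable[[Callable[[dict], None]], None]:
    # Decorator: each converter module registers the function under the version it upgrades from.
    def decorator(fn: Callable[[dict], None]) -> None:
        converter_cbs[version] = fn
    return decorator

@register_converter(version=1)
def _(md: dict) -> None:
    md['renamed_field'] = md.pop('old_field', None)

@register_converter(version=2)
def _(md: dict) -> None:
    md.setdefault('new_field', [])

def upgrade_md(md: dict) -> None:
    # Apply converters one version at a time until the metadata is current.
    version = md['schema_version']
    while version < VERSION:
        converter_cbs[version](md)
        version += 1
        md['schema_version'] = version

md = {'schema_version': 1, 'old_field': 42}
upgrade_md(md)
print(md)  # {'schema_version': 3, 'renamed_field': 42, 'new_field': []}
```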
pixeltable/metadata/converters/convert_14.py CHANGED
@@ -1,11 +1,13 @@
+ from typing import Any
+
  import sqlalchemy as sql
 
- from pixeltable.metadata.schema import Table
  from pixeltable.metadata import register_converter
+ from pixeltable.metadata.schema import Table
 
 
  @register_converter(version=14)
  def _(engine: sql.engine.Engine) -> None:
- default_remotes = {'remotes': []}
+ default_remotes: dict[str, Any] = {'remotes': []}
  with engine.begin() as conn:
  conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
pixeltable/metadata/converters/convert_15.py CHANGED
@@ -3,7 +3,7 @@ import inspect
  import logging
  from typing import Any
 
- import cloudpickle
+ import cloudpickle # type: ignore[import-untyped]
  import sqlalchemy as sql
 
  import pixeltable.func as func
pixeltable/metadata/converters/convert_19.py CHANGED
@@ -44,3 +44,4 @@ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
  dt_utc = dt.astimezone(datetime.timezone.utc)
  v['val'] = dt_utc.isoformat()
  return k, v
+ return None
pixeltable/metadata/converters/convert_20.py CHANGED
@@ -35,7 +35,7 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
  # but it might actually be transformed into an InlineList when it is instantiated
  # (unfortunately, there is no way to disambiguate at this stage; see comments in
  # InlineArray._from_dict() for more details).
- updated_v = {'_classname': 'InlineList' if v.get('is_json') else 'InlineArray'}
+ updated_v: dict[str, Any] = {'_classname': 'InlineList' if v.get('is_json') else 'InlineArray'}
  if len(updated_components) > 0:
  updated_v['components'] = updated_components
  return k, updated_v
pixeltable/metadata/converters/util.py CHANGED
@@ -68,24 +68,25 @@ def __substitute_md_rec(
  substitution_fn: Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]
  ) -> Any:
  if isinstance(md, dict):
- updated_md = {}
+ updated_dict: dict[str, Any] = {}
  for k, v in md.items():
+ assert isinstance(k, str)
  substitute = substitution_fn(k, v)
  if substitute is not None:
  updated_k, updated_v = substitute
- updated_md[updated_k] = __substitute_md_rec(updated_v, substitution_fn)
+ updated_dict[updated_k] = __substitute_md_rec(updated_v, substitution_fn)
  else:
- updated_md[k] = __substitute_md_rec(v, substitution_fn)
- return updated_md
+ updated_dict[k] = __substitute_md_rec(v, substitution_fn)
+ return updated_dict
  elif isinstance(md, list):
- updated_md = []
+ updated_list: list[Any] = []
  for v in md:
  substitute = substitution_fn(None, v)
  if substitute is not None:
  _, updated_v = substitute
- updated_md.append(__substitute_md_rec(updated_v, substitution_fn))
+ updated_list.append(__substitute_md_rec(updated_v, substitution_fn))
  else:
- updated_md.append(__substitute_md_rec(v, substitution_fn))
- return updated_md
+ updated_list.append(__substitute_md_rec(v, substitution_fn))
+ return updated_list
  else:
  return md
pixeltable/metadata/schema.py CHANGED
@@ -1,37 +1,48 @@
  import dataclasses
+ import typing
  import uuid
- from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
+ from typing import Any, Optional, TypeVar, Union, get_type_hints
 
  import sqlalchemy as sql
  import sqlalchemy.orm as orm
- from sqlalchemy import ForeignKey
- from sqlalchemy import Integer, BigInteger, LargeBinary
- from sqlalchemy.dialects.postgresql import UUID, JSONB
+ from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary
+ from sqlalchemy.dialects.postgresql import JSONB, UUID
  from sqlalchemy.orm import declarative_base
+ from sqlalchemy.orm.decl_api import DeclarativeMeta
 
- Base = declarative_base()
+ # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
+ # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
+ # outside of the module in a typesafe way.
+ Base: type = declarative_base()
+ assert isinstance(Base, DeclarativeMeta)
+ base_metadata = Base.metadata
 
  T = TypeVar('T')
 
- def md_from_dict(data_class_type: Type[T], data: Any) -> T:
+ def md_from_dict(data_class_type: type[T], data: Any) -> T:
  """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
  if dataclasses.is_dataclass(data_class_type):
  fieldtypes = {f: t for f, t in get_type_hints(data_class_type).items()}
- return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
- elif hasattr(data_class_type, '__origin__'):
- if data_class_type.__origin__ is Union and type(None) in data_class_type.__args__:
+ return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data}) # type: ignore[return-value]
+
+ origin = typing.get_origin(data_class_type)
+ if origin is not None:
+ type_args = typing.get_args(data_class_type)
+ if origin is Union and type(None) in type_args:
  # Handling Optional types
- non_none_args = [arg for arg in data_class_type.__args__ if arg is not type(None)]
- if len(non_none_args) == 1:
- return md_from_dict(non_none_args[0], data) if data is not None else None
- elif data_class_type.__origin__ is list:
- return [md_from_dict(data_class_type.__args__[0], elem) for elem in data]
- elif data_class_type.__origin__ is dict:
- key_type = data_class_type.__args__[0]
- val_type = data_class_type.__args__[1]
- return {key_type(key): md_from_dict(val_type, val) for key, val in data.items()}
- elif data_class_type.__origin__ is tuple:
- return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(data_class_type.__args__, data))
+ non_none_args = [arg for arg in type_args if arg is not type(None)]
+ assert len(non_none_args) == 1
+ return md_from_dict(non_none_args[0], data) if data is not None else None
+ elif origin is list:
+ return [md_from_dict(type_args[0], elem) for elem in data] # type: ignore[return-value]
+ elif origin is dict:
+ key_type = type_args[0]
+ val_type = type_args[1]
+ return {key_type(key): md_from_dict(val_type, val) for key, val in data.items()} # type: ignore[return-value]
+ elif origin is tuple:
+ return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(type_args, data)) # type: ignore[return-value]
+ else:
+ assert False
  else:
  return data
 
@@ -115,7 +126,7 @@ class ViewMd:
  is_snapshot: bool
 
  # (table id, version); for mutable views, all versions are None
- base_versions: List[Tuple[str, Optional[int]]]
+ base_versions: list[tuple[str, Optional[int]]]
 
  # filter predicate applied to the base table; view-only
  predicate: Optional[dict[str, Any]]
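
The reworked `md_from_dict` above switches from the private `__origin__`/`__args__` attributes to `typing.get_origin`/`typing.get_args`. A self-contained sketch of that dispatch pattern on a toy dataclass (unrelated to Pixeltable's actual metadata classes):

```python
import dataclasses
import typing
from typing import Optional, Union, get_type_hints

@dataclasses.dataclass
class Inner:
    name: str
    tags: list[str]

@dataclasses.dataclass
class Outer:
    inner: Inner
    version: Optional[int]

def from_dict(t, data):
    """Rebuild nested dataclasses from plain dicts, dispatching on typing metadata."""
    if dataclasses.is_dataclass(t):
        hints = get_type_hints(t)
        return t(**{f: from_dict(hints[f], data[f]) for f in data})
    origin = typing.get_origin(t)        # e.g. list for list[str], Union for Optional[int]
    if origin is None:
        return data                       # plain type: use the value as-is
    args = typing.get_args(t)
    if origin is Union and type(None) in args:
        return None if data is None else from_dict(args[0], data)
    if origin is list:
        return [from_dict(args[0], v) for v in data]
    return data

print(from_dict(Outer, {'inner': {'name': 'a', 'tags': ['x', 'y']}, 'version': None}))
# Outer(inner=Inner(name='a', tags=['x', 'y']), version=None)
```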