pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (120) hide show
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/column.py +37 -11
  5. pixeltable/catalog/globals.py +21 -0
  6. pixeltable/catalog/insertable_table.py +6 -4
  7. pixeltable/catalog/table.py +227 -148
  8. pixeltable/catalog/table_version.py +66 -28
  9. pixeltable/catalog/table_version_path.py +0 -8
  10. pixeltable/catalog/view.py +18 -19
  11. pixeltable/dataframe.py +16 -32
  12. pixeltable/env.py +6 -1
  13. pixeltable/exec/__init__.py +1 -2
  14. pixeltable/exec/aggregation_node.py +27 -17
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/data_row_batch.py +9 -26
  17. pixeltable/exec/exec_node.py +36 -7
  18. pixeltable/exec/expr_eval_node.py +19 -11
  19. pixeltable/exec/in_memory_data_node.py +14 -11
  20. pixeltable/exec/sql_node.py +266 -138
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/arithmetic_expr.py +3 -1
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +93 -14
  26. pixeltable/exprs/comparison.py +5 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +56 -36
  29. pixeltable/exprs/expr.py +65 -63
  30. pixeltable/exprs/expr_dict.py +55 -0
  31. pixeltable/exprs/expr_set.py +26 -15
  32. pixeltable/exprs/function_call.py +53 -24
  33. pixeltable/exprs/globals.py +4 -1
  34. pixeltable/exprs/in_predicate.py +8 -7
  35. pixeltable/exprs/inline_expr.py +4 -4
  36. pixeltable/exprs/is_null.py +4 -4
  37. pixeltable/exprs/json_mapper.py +11 -12
  38. pixeltable/exprs/json_path.py +5 -10
  39. pixeltable/exprs/literal.py +5 -5
  40. pixeltable/exprs/method_ref.py +5 -4
  41. pixeltable/exprs/object_ref.py +2 -1
  42. pixeltable/exprs/row_builder.py +88 -36
  43. pixeltable/exprs/rowid_ref.py +14 -13
  44. pixeltable/exprs/similarity_expr.py +12 -7
  45. pixeltable/exprs/sql_element_cache.py +12 -6
  46. pixeltable/exprs/type_cast.py +8 -6
  47. pixeltable/exprs/variable.py +5 -4
  48. pixeltable/ext/functions/whisperx.py +7 -2
  49. pixeltable/func/aggregate_function.py +1 -1
  50. pixeltable/func/callable_function.py +2 -2
  51. pixeltable/func/function.py +11 -10
  52. pixeltable/func/function_registry.py +6 -7
  53. pixeltable/func/query_template_function.py +11 -12
  54. pixeltable/func/signature.py +17 -15
  55. pixeltable/func/udf.py +0 -4
  56. pixeltable/functions/__init__.py +2 -2
  57. pixeltable/functions/audio.py +4 -6
  58. pixeltable/functions/globals.py +84 -42
  59. pixeltable/functions/huggingface.py +31 -34
  60. pixeltable/functions/image.py +59 -45
  61. pixeltable/functions/json.py +0 -1
  62. pixeltable/functions/llama_cpp.py +106 -0
  63. pixeltable/functions/mistralai.py +2 -2
  64. pixeltable/functions/ollama.py +147 -0
  65. pixeltable/functions/openai.py +22 -25
  66. pixeltable/functions/replicate.py +72 -0
  67. pixeltable/functions/string.py +59 -50
  68. pixeltable/functions/timestamp.py +20 -20
  69. pixeltable/functions/together.py +2 -2
  70. pixeltable/functions/video.py +11 -20
  71. pixeltable/functions/whisper.py +2 -20
  72. pixeltable/globals.py +65 -74
  73. pixeltable/index/base.py +2 -2
  74. pixeltable/index/btree.py +20 -7
  75. pixeltable/index/embedding_index.py +12 -14
  76. pixeltable/io/__init__.py +1 -2
  77. pixeltable/io/external_store.py +11 -5
  78. pixeltable/io/fiftyone.py +178 -0
  79. pixeltable/io/globals.py +98 -2
  80. pixeltable/io/hf_datasets.py +1 -1
  81. pixeltable/io/label_studio.py +6 -6
  82. pixeltable/io/parquet.py +14 -13
  83. pixeltable/iterators/base.py +3 -2
  84. pixeltable/iterators/document.py +10 -8
  85. pixeltable/iterators/video.py +126 -60
  86. pixeltable/metadata/__init__.py +4 -3
  87. pixeltable/metadata/converters/convert_14.py +4 -2
  88. pixeltable/metadata/converters/convert_15.py +1 -1
  89. pixeltable/metadata/converters/convert_19.py +1 -0
  90. pixeltable/metadata/converters/convert_20.py +1 -1
  91. pixeltable/metadata/converters/convert_21.py +34 -0
  92. pixeltable/metadata/converters/util.py +54 -12
  93. pixeltable/metadata/notes.py +1 -0
  94. pixeltable/metadata/schema.py +40 -21
  95. pixeltable/plan.py +149 -165
  96. pixeltable/py.typed +0 -0
  97. pixeltable/store.py +57 -37
  98. pixeltable/tool/create_test_db_dump.py +6 -6
  99. pixeltable/tool/create_test_video.py +1 -1
  100. pixeltable/tool/doc_plugins/griffe.py +3 -34
  101. pixeltable/tool/embed_udf.py +1 -1
  102. pixeltable/tool/mypy_plugin.py +55 -0
  103. pixeltable/type_system.py +260 -61
  104. pixeltable/utils/arrow.py +10 -9
  105. pixeltable/utils/coco.py +4 -4
  106. pixeltable/utils/documents.py +16 -2
  107. pixeltable/utils/filecache.py +9 -9
  108. pixeltable/utils/formatter.py +10 -11
  109. pixeltable/utils/http_server.py +2 -5
  110. pixeltable/utils/media_store.py +6 -6
  111. pixeltable/utils/pytorch.py +10 -11
  112. pixeltable/utils/sql.py +2 -1
  113. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
  114. pixeltable-0.2.22.dist-info/RECORD +153 -0
  115. pixeltable/exec/media_validation_node.py +0 -43
  116. pixeltable/utils/help.py +0 -11
  117. pixeltable-0.2.20.dist-info/RECORD +0 -147
  118. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  119. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  120. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,178 @@
1
+ import os
2
+ from typing import Iterator, Optional, Union
3
+
4
+ import fiftyone as fo # type: ignore[import-untyped]
5
+ import fiftyone.utils.data as foud # type: ignore[import-untyped]
6
+ import PIL.Image
7
+ import puremagic
8
+
9
+ import pixeltable as pxt
10
+ import pixeltable.exceptions as excs
11
+ from pixeltable import exprs
12
+ from pixeltable.env import Env
13
+
14
+
15
class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
    """
    Implementation of a FiftyOne `DatasetImporter` that reads image data from a Pixeltable table.

    The importer runs a single query against the table whose select list contains, in order:
    one expression per label set, the image expression, and (only when the image is a stored
    column) the image's `localpath`. Each call to `__next__` converts one result row into the
    `(filepath, metadata, labels)` triple that is returned to FiftyOne.
    """
    __image_format: str  # format to use for any exported images that are not already stored on disk
    __labels: dict[str, tuple[exprs.Expr, type[fo.Label]]]  # label_name -> (expr, label_cls)
    __image_idx: int  # index of the image expr in the select list
    __localpath_idx: Optional[int]  # index of the image localpath in the select list, if present
    __row_iter: Iterator[list]  # iterator over the table rows, to be converted to FiftyOne samples

    def __init__(
        self,
        tbl: pxt.Table,
        image: exprs.Expr,
        image_format: str,
        classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
        detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
        dataset_dir: Optional[os.PathLike] = None,
        shuffle: bool = False,
        seed: Union[int, float, str, bytes, bytearray, None] = None,
        max_samples: Optional[int] = None,
    ):
        """
        Args:
            tbl: The table to read from.
            image: The expression that yields the image for each row.
            image_format: Format used when an image has to be written to a temp file.
            classifications: Classification label expression(s); a dict supplies explicit label names.
            detections: Detection label expression(s); a dict supplies explicit label names.
            dataset_dir: Forwarded unchanged to the FiftyOne base class.
            shuffle: Forwarded unchanged to the FiftyOne base class.
            seed: Forwarded unchanged to the FiftyOne base class.
            max_samples: Forwarded unchanged to the FiftyOne base class.

        Raises:
            excs.Error: If an explicit label name is not a valid identifier or is used twice.
        """
        super().__init__(
            dataset_dir=dataset_dir,
            shuffle=shuffle,
            seed=seed,
            max_samples=max_samples
        )

        self.__image_format = image_format

        label_categories = [
            (classifications, fo.Classifications, 'classifications'),
            (detections, fo.Detections, 'detections'),
        ]

        # Construct the labels. First add labels for all label types that have named dictionaries.
        # Explicit names are registered first so that the default names assigned below can never
        # take a name the caller asked for.
        self.__labels = {}
        for exprs_, label_cls, _ in label_categories:
            if isinstance(exprs_, dict):
                for label_name, expr in exprs_.items():
                    if not label_name.isidentifier():
                        raise excs.Error(f"Invalid label name: {label_name}")
                    if label_name in self.__labels:
                        raise excs.Error(f"Duplicate label name: {label_name}")
                    self.__labels[label_name] = (expr, label_cls)

        # Now add the remaining labels, assigning unused default names.
        for exprs_, label_cls, default_name in label_categories:
            if exprs_ is None or isinstance(exprs_, dict):
                continue
            if isinstance(exprs_, exprs.Expr):
                exprs_ = [exprs_]
            assert isinstance(exprs_, list)
            for expr in exprs_:
                if default_name not in self.__labels:
                    name = default_name
                else:
                    # Probe `<default>_1`, `<default>_2`, ... until a free name is found.
                    i = 1
                    while f'{default_name}_{i}' in self.__labels:
                        i += 1
                    name = f'{default_name}_{i}'
                self.__labels[name] = (expr, label_cls)

        # Build the select list:
        # - Labels first, in the order they appear in self.__labels
        # - Then the `image` expr
        # - Then `image.localpath`, if `image` is a stored columnref

        selection = [expr for expr, _ in self.__labels.values()]
        self.__image_idx = len(selection)
        selection.append(image)

        if isinstance(image, exprs.ColumnRef) and image.col.is_stored:
            # A stored image column; we can use the existing localpaths
            self.__localpath_idx = len(selection)
            selection.append(image.localpath)
        else:
            self.__localpath_idx = None

        df = tbl.select(*selection)
        self.__row_iter = df._output_row_iterator()

    def __next__(self) -> tuple[str, Optional[fo.ImageMetadata], Optional[dict[str, fo.Label]]]:
        """
        Convert the next table row into a FiftyOne sample.

        Returns:
            A `(filepath, metadata, labels)` tuple. Label sets whose value is `None` for this
            row are omitted from `labels`.

        Raises:
            StopIteration: Propagated from the underlying row iterator when it is exhausted.
            excs.Error: If a label value does not have the expected structure.
        """
        row = next(self.__row_iter)
        img = row[self.__image_idx]
        assert isinstance(img, PIL.Image.Image)
        if self.__localpath_idx is not None:
            # Use the existing localpath of the stored image
            file = row[self.__localpath_idx]
            assert isinstance(file, str)
        else:
            # Write the dynamically created image to a temp file
            file = str(Env.get().create_tmp_path(f'.{self.__image_format}'))
            img.save(file, format=self.__image_format)

        metadata = fo.ImageMetadata(
            size_bytes=os.path.getsize(file),
            mime_type=puremagic.from_file(file, mime=True),
            width=img.width,
            height=img.height,
            filepath=file,
            num_channels=len(img.getbands()),
        )

        labels: dict[str, fo.Label] = {}
        # The label exprs occupy the first len(self.__labels) slots of the select list, in dict
        # order, so the enumeration index is also the row index.
        for idx, (label_name, (_, label_cls)) in enumerate(self.__labels.items()):
            label_data = row[idx]
            if label_data is None:
                continue

            label: fo.Label
            if label_cls is fo.Classifications:
                label = fo.Classifications(classifications=self.__as_fo_classifications(label_data))
            elif label_cls is fo.Detections:
                label = fo.Detections(detections=self.__as_fo_detections(label_data))
            else:
                # Unreachable by construction; raised explicitly (rather than `assert False`) so
                # the check is not stripped under `python -O`.
                raise AssertionError(f'Unexpected label class: {label_cls}')
            labels[label_name] = label

        return file, metadata, labels

    def __as_fo_classifications(self, data: list) -> list[fo.Classification]:
        """
        Convert a list of `{'label': ..., 'confidence': ...}` dicts into FiftyOne classifications.

        Raises:
            excs.Error: If `data` is not a list of dicts that each contain a 'label' key.
        """
        # The isinstance check on each entry ensures malformed entries (e.g. ints or strings)
        # produce the descriptive error below instead of a TypeError or a substring match.
        if not isinstance(data, list) or any(
            not isinstance(entry, dict) or 'label' not in entry for entry in data
        ):
            raise excs.Error(
                f'Invalid classifications data: {data}\n'
                "(Expected a list of dicts, each containing a 'label' key)"
            )
        return [
            fo.Classification(label=entry['label'], confidence=entry.get('confidence'))
            for entry in data
        ]

    def __as_fo_detections(self, data: list) -> list[fo.Detection]:
        """
        Convert a list of `{'label': ..., 'bounding_box': ..., 'confidence': ...}` dicts into
        FiftyOne detections.

        Raises:
            excs.Error: If `data` is not a list of dicts that each contain both a 'label' and a
                'bounding_box' key.
        """
        if not isinstance(data, list) or any(
            not isinstance(entry, dict) or 'label' not in entry or 'bounding_box' not in entry
            for entry in data
        ):
            raise excs.Error(
                f'Invalid detections data: {data}\n'
                "(Expected a list of dicts, each containing a 'label' and 'bounding_box' key)"
            )
        return [
            fo.Detection(label=entry['label'], bounding_box=entry['bounding_box'], confidence=entry.get('confidence'))
            for entry in data
        ]

    @property
    def has_dataset_info(self) -> bool:
        """Always False: this importer supplies no dataset-level info."""
        return False

    @property
    def has_image_metadata(self) -> bool:
        """Always True: `__next__` returns an `fo.ImageMetadata` for every sample."""
        return True

    @property
    def label_cls(self) -> dict[str, type]:
        """Mapping of label name to the FiftyOne label class produced for it."""
        return {label_name: label_cls for label_name, (_, label_cls) in self.__labels.items()}

    def setup(self) -> None:
        """No-op: the row iterator is already created in `__init__`."""
        pass

    def get_dataset_info(self) -> dict:
        """Return an empty dict, matching the declared return type (`has_dataset_info` is False)."""
        return {}

    def close(self, *args) -> None:
        """No-op."""
        pass
pixeltable/io/globals.py CHANGED
@@ -1,10 +1,14 @@
1
- from typing import Any, Literal, Optional, Union
1
+ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
2
2
 
3
3
  import pixeltable as pxt
4
4
  import pixeltable.exceptions as excs
5
- from pixeltable import Table
5
+ from pixeltable import Table, exprs
6
+ from pixeltable.env import Env
6
7
  from pixeltable.io.external_store import SyncStatus
7
8
 
9
+ if TYPE_CHECKING:
10
+ import fiftyone as fo # type: ignore[import-untyped]
11
+
8
12
 
9
13
  def create_label_studio_project(
10
14
  t: Table,
@@ -116,6 +120,8 @@ def create_label_studio_project(
116
120
  s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
117
121
  )
118
122
  """
123
+ Env.get().require_package('label_studio_sdk')
124
+
119
125
  from pixeltable.io.label_studio import LabelStudioProject
120
126
 
121
127
  ls_project = LabelStudioProject.create(
@@ -187,6 +193,8 @@ def import_rows(
187
193
  # If `key` is not in `schema_overrides`, then we infer its type from the data.
188
194
  # The column type will always be nullable by default.
189
195
  col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
196
+ if col_type is None:
197
+ raise excs.Error(f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}')
190
198
  if col_name not in schema:
191
199
  schema[col_name] = col_type
192
200
  else:
@@ -265,3 +273,91 @@ def import_json(
265
273
  contents = urllib.request.urlopen(filepath_or_url).read()
266
274
  data = json.loads(contents, **kwargs)
267
275
  return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
276
+
277
+
278
def export_images_as_fo_dataset(
    tbl: pxt.Table,
    images: exprs.Expr,
    image_format: str = 'webp',
    classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
    detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
) -> 'fo.Dataset':
    """
    Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
    (or expression) containing image data, along with optional additional columns containing labels. Currently, only
    classification and detection labels are supported.

    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
    fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.

    Images in the dataset that already exist on disk will be exported directly, in whatever format they
    are stored in. Images that are not already on disk (such as frames extracted using a
    [`FrameIterator`][pixeltable.iterators.FrameIterator]) will first be written to disk in the specified
    `image_format`.

    The label parameters accept one or more sets of labels of each type. If a single `Expr` is provided, then it will
    be exported as a single set of labels with a default name such as `classifications`.
    (The single set of labels may still contain multiple individual labels; see below.)
    If a list of `Expr`s is provided, then each one will be exported as a separate set of labels with a default name
    such as `classifications`, `classifications_1`, etc. If a dictionary of `Expr`s is provided, then each entry will
    be exported as a set of labels with the specified name.

    __Requirements:__

    - `pip install fiftyone`

    Args:
        tbl: The table from which to export data.
        images: A column or expression that contains the images to export.
        image_format: The format to use when writing out images for export.
        classifications: Optional image classification labels. If a single `Expr` is provided, it must be a table
            column or an expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds
            to an image class and must have the following structure:

            ```python
            {'label': 'zebra', 'confidence': 0.325}
            ```

            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.
        detections: Optional image detection labels. If a single `Expr` is provided, it must be a table column or an
            expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds to an image
            detection, and must have the following structure:

            ```python
            {
                'label': 'giraffe',
                'confidence': 0.99,
                'bounding_box': [0.081, 0.836, 0.202, 0.136]  # [x, y, w, h], fractional coordinates
            }
            ```

            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.

    Returns:
        A Voxel51 dataset.

    Raises:
        excs.Error: If `images` is not an expression of image type.

    Example:
        Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
        labels from `tbl.classifications`:

        >>> export_images_as_fo_dataset(
        ...     tbl,
        ...     tbl.image,
        ...     classifications=tbl.classifications
        ... )

        See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
        for a fully worked example.
    """
    Env.get().require_package('fiftyone')

    # Imported only after the package check above, so a missing `fiftyone` install is reported
    # by `require_package` rather than by a failing import.
    import fiftyone as fo

    from pixeltable.io.fiftyone import PxtImageDatasetImporter

    if not images.col_type.is_image_type():
        raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')

    return fo.Dataset.from_importer(PxtImageDatasetImporter(
        tbl, images, image_format, classifications=classifications, detections=detections
    ))
@@ -11,7 +11,7 @@ import pixeltable.type_system as ts
11
11
  from pixeltable import exceptions as excs
12
12
 
13
13
  if typing.TYPE_CHECKING:
14
- import datasets
14
+ import datasets # type: ignore[import-untyped]
15
15
 
16
16
  _logger = logging.getLogger(__name__)
17
17
 
@@ -4,17 +4,17 @@ import logging
4
4
  import os
5
5
  from dataclasses import dataclass
6
6
  from pathlib import Path
7
- from typing import Any, Iterator, Optional, Literal
7
+ from typing import Any, Iterator, Literal, Optional, cast
8
8
  from xml.etree import ElementTree
9
9
 
10
+ import label_studio_sdk # type: ignore[import-untyped]
10
11
  import PIL.Image
11
- import label_studio_sdk
12
12
  from requests.exceptions import HTTPError
13
13
 
14
14
  import pixeltable as pxt
15
15
  import pixeltable.env as env
16
16
  import pixeltable.exceptions as excs
17
- from pixeltable import Table, Column
17
+ from pixeltable import Column, Table
18
18
  from pixeltable.exprs import ColumnRef, DataRow, Expr
19
19
  from pixeltable.io.external_store import Project, SyncStatus
20
20
  from pixeltable.utils import coco
@@ -211,7 +211,7 @@ class LabelStudioProject(Project):
211
211
  assert isinstance(row[media_col_idx], PIL.Image.Image)
212
212
  file = env.Env.get().create_tmp_path(extension='.png')
213
213
  row[media_col_idx].save(file, format='png')
214
- task_id: int = self.project.import_tasks(file)[0]
214
+ task_id = self.project.import_tasks(file)[0]
215
215
  os.remove(file)
216
216
 
217
217
  # Update the task with `rowid` metadata
@@ -256,7 +256,7 @@ class LabelStudioProject(Project):
256
256
  assert self.media_import_method == 'file'
257
257
  if not col.col_type.is_media_type():
258
258
  # Not a media column; query the data directly
259
- expr_refs[col_name] = t[col_name]
259
+ expr_refs[col_name] = cast(ColumnRef, t[col_name])
260
260
  elif col in self.stored_proxies:
261
261
  # Media column that has a stored proxy; use it. We have to give it a name,
262
262
  # since it's an anonymous column
@@ -267,7 +267,7 @@ class LabelStudioProject(Project):
267
267
  # and we can just use the localpath
268
268
  expr_refs[col_name] = t[col_name].localpath
269
269
 
270
- df = t.select(*[t[col] for col in t_rl_cols], **expr_refs)
270
+ df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
271
271
  # The following buffers will hold `DataRow` indices that correspond to each of the selected
272
272
  # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
273
273
  # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
pixeltable/io/parquet.py CHANGED
@@ -7,24 +7,23 @@ import random
7
7
  import typing
8
8
  from collections import deque
9
9
  from pathlib import Path
10
- from typing import Dict, Optional, Any
10
+ from typing import Any, Optional
11
11
 
12
- import PIL.Image
13
12
  import numpy as np
13
+ import PIL.Image
14
14
 
15
15
  import pixeltable.exceptions as exc
16
16
  import pixeltable.type_system as ts
17
17
  from pixeltable.utils.transactional_directory import transactional_directory
18
18
 
19
19
  if typing.TYPE_CHECKING:
20
- import pixeltable as pxt
21
20
  import pyarrow as pa
22
- from pyarrow import parquet
21
+ import pixeltable as pxt
23
22
 
24
23
  _logger = logging.getLogger(__name__)
25
24
 
26
25
 
27
- def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
26
+ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
28
27
  import pyarrow as pa
29
28
  from pyarrow import parquet
30
29
 
@@ -37,7 +36,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path:
37
36
  pydict[field.name] = value_batch[field.name]
38
37
 
39
38
  tab = pa.Table.from_pydict(pydict, schema=schema)
40
- parquet.write_table(tab, output_path)
39
+ parquet.write_table(tab, str(output_path))
41
40
 
42
41
 
43
42
  def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -67,7 +66,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
67
66
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
68
67
 
69
68
  batch_num = 0
70
- current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
69
+ current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
71
70
  current_byte_estimate = 0
72
71
 
73
72
  for data_row in df._exec():
@@ -128,13 +127,14 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
128
127
  _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
129
128
 
130
129
 
131
- def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
130
+ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional[ts.ColumnType]]:
132
131
  """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
133
132
  from pyarrow import parquet
133
+
134
134
  from pixeltable.utils.arrow import to_pixeltable_schema
135
135
 
136
136
  input_path = Path(parquet_path).expanduser()
137
- parquet_dataset = parquet.ParquetDataset(input_path)
137
+ parquet_dataset = parquet.ParquetDataset(str(input_path))
138
138
  return to_pixeltable_schema(parquet_dataset.schema)
139
139
 
140
140
 
@@ -142,7 +142,7 @@ def import_parquet(
142
142
  table_path: str,
143
143
  *,
144
144
  parquet_path: str,
145
- schema_overrides: Optional[Dict[str, ts.ColumnType]] = None,
145
+ schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
146
146
  **kwargs: Any,
147
147
  ) -> pxt.Table:
148
148
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
@@ -159,12 +159,13 @@ def import_parquet(
159
159
  Returns:
160
160
  A handle to the newly created [`Table`][pixeltable.Table].
161
161
  """
162
- import pixeltable as pxt
163
162
  from pyarrow import parquet
163
+
164
+ import pixeltable as pxt
164
165
  from pixeltable.utils.arrow import iter_tuples
165
166
 
166
167
  input_path = Path(parquet_path).expanduser()
167
- parquet_dataset = parquet.ParquetDataset(input_path)
168
+ parquet_dataset = parquet.ParquetDataset(str(input_path))
168
169
 
169
170
  schema = parquet_schema_to_pixeltable_schema(parquet_path)
170
171
  if schema_overrides is None:
@@ -181,7 +182,7 @@ def import_parquet(
181
182
  try:
182
183
  tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
183
184
  tab = pxt.create_table(tmp_name, schema, **kwargs)
184
- for fragment in parquet_dataset.fragments:
185
+ for fragment in parquet_dataset.fragments: # type: ignore[attr-defined]
185
186
  for batch in fragment.to_batches():
186
187
  dict_batch = list(iter_tuples(batch))
187
188
  tab.insert(dict_batch)
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
- from typing import Dict, Any, Tuple, List
3
- from abc import abstractmethod, ABC
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
4
5
 
5
6
  from pixeltable.type_system import ColumnType
6
7
 
@@ -1,7 +1,7 @@
1
1
  import dataclasses
2
2
  import enum
3
3
  import logging
4
- from typing import Any, Iterable, Iterator, Optional
4
+ from typing import Any, Iterable, Iterator, Optional, Union
5
5
 
6
6
  import ftfy
7
7
 
@@ -152,7 +152,7 @@ class DocumentSplitter(ComponentIterator):
152
152
  assert self._doc_handle.pdf_doc is not None
153
153
  self._sections = self._pdf_sections()
154
154
  else:
155
- assert False, f'unknown document format: {self._doc_handle.format}'
155
+ assert False, f'Unsupported document format: {self._doc_handle.format}'
156
156
 
157
157
  if Separator.SENTENCE in self._separators:
158
158
  self._sections = self._sentence_sections(self._sections)
@@ -176,7 +176,7 @@ class DocumentSplitter(ComponentIterator):
176
176
 
177
177
  @classmethod
178
178
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
179
- schema = {'text': StringType()}
179
+ schema: dict[str, ColumnType] = {'text': StringType()}
180
180
  md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
181
181
 
182
182
  for md_field in md_fields:
@@ -214,7 +214,7 @@ class DocumentSplitter(ComponentIterator):
214
214
  section = next(self._sections)
215
215
  if section.text is None:
216
216
  continue
217
- result = {'text': section.text}
217
+ result: dict[str, Any] = {'text': section.text}
218
218
  for md_field in self._metadata_fields:
219
219
  if md_field == ChunkMetadata.TITLE:
220
220
  result[md_field.name.lower()] = self._doc_title
@@ -234,7 +234,7 @@ class DocumentSplitter(ComponentIterator):
234
234
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
235
235
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
236
236
  # current state
237
- accumulated_text = [] # currently accumulated text
237
+ accumulated_text: list[str] = [] # currently accumulated text
238
238
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
239
239
 
240
240
  headings: dict[str, str] = {} # current state of observed headings (level -> text)
@@ -260,9 +260,10 @@ class DocumentSplitter(ComponentIterator):
260
260
  yield DocumentSection(text=full_text, metadata=md)
261
261
  accumulated_text = []
262
262
 
263
- def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
263
+ def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
264
264
  # process the element and emit sections as necessary
265
265
  nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
266
+
266
267
  if el.name in self._skip_tags:
267
268
  return
268
269
 
@@ -282,6 +283,7 @@ class DocumentSplitter(ComponentIterator):
282
283
  yield from emit()
283
284
  update_metadata(el)
284
285
  for child in el.children:
286
+ assert isinstance(child, (bs4.element.Tag, bs4.NavigableString)), type(el)
285
287
  yield from process_element(child)
286
288
 
287
289
  yield from process_element(self._doc_handle.bs_doc)
@@ -293,7 +295,7 @@ class DocumentSplitter(ComponentIterator):
293
295
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
294
296
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
295
297
  # current state
296
- accumulated_text = [] # currently accumulated text
298
+ accumulated_text: list[str] = [] # currently accumulated text
297
299
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
298
300
  headings: dict[str, str] = {} # current state of observed headings (level -> text)
299
301
 
@@ -347,7 +349,7 @@ class DocumentSplitter(ComponentIterator):
347
349
 
348
350
  def _pdf_sections(self) -> Iterator[DocumentSection]:
349
351
  """Create DocumentSections reflecting the pdf-specific separators"""
350
- import fitz
352
+ import fitz # type: ignore[import-untyped]
351
353
  doc: fitz.Document = self._doc_handle.pdf_doc
352
354
  assert doc is not None
353
355