pixeltable 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (147)
  1. pixeltable/__init__.py +64 -11
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/catalog.py +50 -27
  5. pixeltable/catalog/column.py +27 -11
  6. pixeltable/catalog/dir.py +6 -4
  7. pixeltable/catalog/globals.py +8 -1
  8. pixeltable/catalog/insertable_table.py +25 -15
  9. pixeltable/catalog/named_function.py +10 -6
  10. pixeltable/catalog/path.py +3 -2
  11. pixeltable/catalog/path_dict.py +8 -6
  12. pixeltable/catalog/schema_object.py +2 -1
  13. pixeltable/catalog/table.py +123 -103
  14. pixeltable/catalog/table_version.py +292 -143
  15. pixeltable/catalog/table_version_path.py +8 -5
  16. pixeltable/catalog/view.py +68 -27
  17. pixeltable/dataframe.py +102 -72
  18. pixeltable/env.py +39 -23
  19. pixeltable/exec/__init__.py +2 -2
  20. pixeltable/exec/aggregation_node.py +10 -4
  21. pixeltable/exec/cache_prefetch_node.py +5 -3
  22. pixeltable/exec/component_iteration_node.py +9 -8
  23. pixeltable/exec/data_row_batch.py +21 -10
  24. pixeltable/exec/exec_context.py +10 -3
  25. pixeltable/exec/exec_node.py +23 -12
  26. pixeltable/exec/expr_eval/evaluators.py +18 -17
  27. pixeltable/exec/expr_eval/expr_eval_node.py +29 -16
  28. pixeltable/exec/expr_eval/globals.py +33 -11
  29. pixeltable/exec/expr_eval/row_buffer.py +5 -6
  30. pixeltable/exec/expr_eval/schedulers.py +170 -42
  31. pixeltable/exec/in_memory_data_node.py +8 -7
  32. pixeltable/exec/row_update_node.py +15 -5
  33. pixeltable/exec/sql_node.py +56 -27
  34. pixeltable/exprs/__init__.py +2 -2
  35. pixeltable/exprs/arithmetic_expr.py +57 -26
  36. pixeltable/exprs/array_slice.py +1 -1
  37. pixeltable/exprs/column_property_ref.py +2 -1
  38. pixeltable/exprs/column_ref.py +20 -15
  39. pixeltable/exprs/comparison.py +6 -2
  40. pixeltable/exprs/compound_predicate.py +1 -3
  41. pixeltable/exprs/data_row.py +2 -2
  42. pixeltable/exprs/expr.py +101 -72
  43. pixeltable/exprs/expr_dict.py +2 -1
  44. pixeltable/exprs/expr_set.py +3 -1
  45. pixeltable/exprs/function_call.py +39 -41
  46. pixeltable/exprs/globals.py +1 -0
  47. pixeltable/exprs/in_predicate.py +2 -2
  48. pixeltable/exprs/inline_expr.py +20 -17
  49. pixeltable/exprs/json_mapper.py +4 -2
  50. pixeltable/exprs/json_path.py +12 -18
  51. pixeltable/exprs/literal.py +5 -9
  52. pixeltable/exprs/method_ref.py +1 -0
  53. pixeltable/exprs/object_ref.py +1 -1
  54. pixeltable/exprs/row_builder.py +31 -16
  55. pixeltable/exprs/rowid_ref.py +14 -5
  56. pixeltable/exprs/similarity_expr.py +11 -6
  57. pixeltable/exprs/sql_element_cache.py +1 -1
  58. pixeltable/exprs/type_cast.py +24 -9
  59. pixeltable/ext/__init__.py +1 -0
  60. pixeltable/ext/functions/__init__.py +1 -0
  61. pixeltable/ext/functions/whisperx.py +2 -2
  62. pixeltable/ext/functions/yolox.py +11 -11
  63. pixeltable/func/aggregate_function.py +17 -13
  64. pixeltable/func/callable_function.py +6 -6
  65. pixeltable/func/expr_template_function.py +15 -14
  66. pixeltable/func/function.py +16 -16
  67. pixeltable/func/function_registry.py +11 -8
  68. pixeltable/func/globals.py +4 -2
  69. pixeltable/func/query_template_function.py +12 -13
  70. pixeltable/func/signature.py +18 -9
  71. pixeltable/func/tools.py +10 -17
  72. pixeltable/func/udf.py +106 -11
  73. pixeltable/functions/__init__.py +21 -2
  74. pixeltable/functions/anthropic.py +21 -15
  75. pixeltable/functions/fireworks.py +63 -5
  76. pixeltable/functions/gemini.py +13 -3
  77. pixeltable/functions/globals.py +18 -6
  78. pixeltable/functions/huggingface.py +20 -38
  79. pixeltable/functions/image.py +7 -3
  80. pixeltable/functions/json.py +1 -0
  81. pixeltable/functions/llama_cpp.py +1 -4
  82. pixeltable/functions/mistralai.py +31 -20
  83. pixeltable/functions/ollama.py +4 -18
  84. pixeltable/functions/openai.py +214 -109
  85. pixeltable/functions/replicate.py +11 -10
  86. pixeltable/functions/string.py +70 -7
  87. pixeltable/functions/timestamp.py +21 -8
  88. pixeltable/functions/together.py +66 -52
  89. pixeltable/functions/video.py +1 -0
  90. pixeltable/functions/vision.py +14 -11
  91. pixeltable/functions/whisper.py +2 -1
  92. pixeltable/globals.py +61 -28
  93. pixeltable/index/__init__.py +1 -1
  94. pixeltable/index/btree.py +5 -3
  95. pixeltable/index/embedding_index.py +15 -14
  96. pixeltable/io/__init__.py +1 -1
  97. pixeltable/io/external_store.py +30 -25
  98. pixeltable/io/fiftyone.py +6 -14
  99. pixeltable/io/globals.py +33 -27
  100. pixeltable/io/hf_datasets.py +3 -2
  101. pixeltable/io/label_studio.py +80 -71
  102. pixeltable/io/pandas.py +33 -9
  103. pixeltable/io/parquet.py +10 -13
  104. pixeltable/iterators/__init__.py +1 -0
  105. pixeltable/iterators/audio.py +205 -0
  106. pixeltable/iterators/document.py +19 -8
  107. pixeltable/iterators/image.py +6 -24
  108. pixeltable/iterators/string.py +3 -6
  109. pixeltable/iterators/video.py +1 -7
  110. pixeltable/metadata/__init__.py +9 -2
  111. pixeltable/metadata/converters/convert_10.py +2 -2
  112. pixeltable/metadata/converters/convert_15.py +1 -5
  113. pixeltable/metadata/converters/convert_16.py +2 -4
  114. pixeltable/metadata/converters/convert_17.py +2 -4
  115. pixeltable/metadata/converters/convert_18.py +2 -4
  116. pixeltable/metadata/converters/convert_19.py +2 -5
  117. pixeltable/metadata/converters/convert_20.py +1 -4
  118. pixeltable/metadata/converters/convert_21.py +4 -6
  119. pixeltable/metadata/converters/convert_22.py +1 -0
  120. pixeltable/metadata/converters/convert_23.py +5 -5
  121. pixeltable/metadata/converters/convert_24.py +12 -13
  122. pixeltable/metadata/converters/convert_26.py +23 -0
  123. pixeltable/metadata/converters/util.py +3 -4
  124. pixeltable/metadata/notes.py +1 -0
  125. pixeltable/metadata/schema.py +13 -2
  126. pixeltable/plan.py +173 -98
  127. pixeltable/store.py +42 -26
  128. pixeltable/type_system.py +130 -85
  129. pixeltable/utils/arrow.py +1 -7
  130. pixeltable/utils/coco.py +16 -17
  131. pixeltable/utils/code.py +1 -1
  132. pixeltable/utils/console_output.py +44 -0
  133. pixeltable/utils/description_helper.py +7 -7
  134. pixeltable/utils/documents.py +3 -1
  135. pixeltable/utils/filecache.py +13 -8
  136. pixeltable/utils/http_server.py +9 -8
  137. pixeltable/utils/media_store.py +2 -1
  138. pixeltable/utils/pytorch.py +11 -14
  139. pixeltable/utils/s3.py +1 -0
  140. pixeltable/utils/sql.py +1 -0
  141. pixeltable/utils/transactional_directory.py +2 -2
  142. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/METADATA +7 -8
  143. pixeltable-0.3.3.dist-info/RECORD +163 -0
  144. pixeltable-0.3.1.dist-info/RECORD +0 -160
  145. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
  146. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
  147. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/io/pandas.py CHANGED
@@ -9,10 +9,13 @@ import pixeltable.type_system as ts
9
9
 
10
10
 
11
11
  def import_pandas(
12
- tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
12
+ tbl_name: str,
13
+ df: pd.DataFrame,
14
+ *,
15
+ schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
13
16
  primary_key: Optional[Union[str, list[str]]] = None,
14
17
  num_retained_versions: int = 10,
15
- comment: str = ''
18
+ comment: str = '',
16
19
  ) -> pxt.Table:
17
20
  """Creates a new base table from a Pandas
18
21
  [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
@@ -45,17 +48,21 @@ def import_pandas(
45
48
 
46
49
  schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
47
50
  tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
48
- table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
51
+ table = pxt.create_table(
52
+ tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
53
+ )
49
54
  table.insert(tbl_rows)
50
55
  return table
51
56
 
52
57
 
53
58
  def import_csv(
54
- tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
59
+ tbl_name: str,
60
+ filepath_or_buffer,
61
+ schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
55
62
  primary_key: Optional[Union[str, list[str]]] = None,
56
63
  num_retained_versions: int = 10,
57
64
  comment: str = '',
58
- **kwargs
65
+ **kwargs,
59
66
  ) -> pxt.Table:
60
67
  """
61
68
  Creates a new base table from a csv file. This is a convenience method and is equivalent
@@ -67,15 +74,25 @@ def import_csv(
67
74
  A handle to the newly created [`Table`][pixeltable.Table].
68
75
  """
69
76
  df = pd.read_csv(filepath_or_buffer, **kwargs)
70
- return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
77
+ return import_pandas(
78
+ tbl_name,
79
+ df,
80
+ schema_overrides=schema_overrides,
81
+ primary_key=primary_key,
82
+ num_retained_versions=num_retained_versions,
83
+ comment=comment,
84
+ )
71
85
 
72
86
 
73
87
  def import_excel(
74
- tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
88
+ tbl_name: str,
89
+ io,
90
+ *args,
91
+ schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
75
92
  primary_key: Optional[Union[str, list[str]]] = None,
76
93
  num_retained_versions: int = 10,
77
94
  comment: str = '',
78
- **kwargs
95
+ **kwargs,
79
96
  ) -> pxt.Table:
80
97
  """
81
98
  Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
@@ -87,7 +104,14 @@ def import_excel(
87
104
  A handle to the newly created [`Table`][pixeltable.Table].
88
105
  """
89
106
  df = pd.read_excel(io, *args, **kwargs)
90
- return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
107
+ return import_pandas(
108
+ tbl_name,
109
+ df,
110
+ schema_overrides=schema_overrides,
111
+ primary_key=primary_key,
112
+ num_retained_versions=num_retained_versions,
113
+ comment=comment,
114
+ )
91
115
 
92
116
 
93
117
  def __df_to_pxt_schema(
pixeltable/io/parquet.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import datetime
3
4
  import io
4
5
  import json
5
6
  import logging
@@ -11,19 +12,19 @@ from typing import Any, Optional, Union
11
12
 
12
13
  import numpy as np
13
14
  import PIL.Image
14
- import datetime
15
15
 
16
16
  import pixeltable as pxt
17
- from pixeltable.env import Env
18
17
  import pixeltable.exceptions as exc
19
18
  import pixeltable.type_system as ts
19
+ from pixeltable.env import Env
20
20
  from pixeltable.utils.transactional_directory import transactional_directory
21
21
 
22
22
  if typing.TYPE_CHECKING:
23
23
  import pyarrow as pa
24
+
24
25
  import pixeltable as pxt
25
26
 
26
- _logger = logging.getLogger(__name__)
27
+ _logger = logging.getLogger('pixeltable')
27
28
 
28
29
 
29
30
  def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
@@ -43,11 +44,11 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
43
44
 
44
45
 
45
46
  def export_parquet(
46
- table_or_df: Union[pxt.Table, pxt.DataFrame],
47
- parquet_path: Path,
48
- partition_size_bytes: int = 100_000_000,
49
- inline_images: bool = False
50
- ) -> None:
47
+ table_or_df: Union[pxt.Table, pxt.DataFrame],
48
+ parquet_path: Path,
49
+ partition_size_bytes: int = 100_000_000,
50
+ inline_images: bool = False,
51
+ ) -> None:
51
52
  """
52
53
  Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
53
54
 
@@ -159,11 +160,7 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional
159
160
 
160
161
 
161
162
  def import_parquet(
162
- table: str,
163
- *,
164
- parquet_path: str,
165
- schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
166
- **kwargs: Any,
163
+ table: str, *, parquet_path: str, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs: Any
167
164
  ) -> pxt.Table:
168
165
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
169
166
 
@@ -1,3 +1,4 @@
1
+ from .audio import AudioSplitter
1
2
  from .base import ComponentIterator
2
3
  from .document import DocumentSplitter
3
4
  from .image import TileIterator
@@ -0,0 +1,205 @@
1
+ import logging
2
+ import math
3
+ import uuid
4
+ from fractions import Fraction
5
+ from pathlib import Path
6
+ from typing import Any, Optional
7
+
8
+ import av # type: ignore[import-untyped]
9
+
10
+ import pixeltable.env as env
11
+ import pixeltable.exceptions as excs
12
+ import pixeltable.type_system as ts
13
+
14
+ from .base import ComponentIterator
15
+
16
+ _logger = logging.getLogger('pixeltable')
17
+
18
+
19
+ class AudioSplitter(ComponentIterator):
20
+ """
21
+ Iterator over chunks of an audio file. The audio file is split into smaller chunks, where the duration of each chunk is determined by chunk_duration_sec.
22
+ The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
23
+ If the input contains no audio, no chunks are yielded.
24
+
25
+ Args:
26
+ chunk_duration_sec: Audio chunk duration in seconds
27
+ overlap_sec: Overlap between consecutive chunks in seconds.
28
+ min_chunk_duration_sec: Drop the last chunk if it is smaller than min_chunk_duration_sec
29
+ """
30
+
31
+ # Input parameters
32
+ audio_path: Path
33
+ chunk_duration_sec: float
34
+ overlap_sec: float
35
+
36
+ # audio stream details
37
+ container: av.container.input.InputContainer
38
+ audio_time_base: Fraction # seconds per presentation time
39
+
40
+ # List of chunks to extract
41
+ # Each chunk is defined by start and end presentation timestamps in audio file (int)
42
+ chunks_to_extract_in_pts: Optional[list[tuple[int, int]]] = []
43
+ # next chunk to extract
44
+ next_pos: int
45
+
46
+ __codec_map = {
47
+ 'mp3': 'mp3', # MP3 decoder -> mp3/libmp3lame encoder
48
+ 'mp3float': 'mp3', # MP3float decoder -> mp3 encoder
49
+ 'aac': 'aac', # AAC decoder -> AAC encoder
50
+ 'vorbis': 'libvorbis', # Vorbis decoder -> libvorbis encoder
51
+ 'opus': 'libopus', # Opus decoder -> libopus encoder
52
+ 'flac': 'flac', # FLAC decoder -> FLAC encoder
53
+ 'wavpack': 'wavpack', # WavPack decoder -> WavPack encoder
54
+ 'alac': 'alac', # ALAC decoder -> ALAC encoder
55
+ }
56
+
57
+ def __init__(
58
+ self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
59
+ ):
60
+ if chunk_duration_sec <= 0.0:
61
+ raise excs.Error('chunk_duration_sec must be a positive number')
62
+ if chunk_duration_sec < min_chunk_duration_sec:
63
+ raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
64
+ if overlap_sec >= chunk_duration_sec:
65
+ raise excs.Error('overlap_sec must be less than chunk_duration_sec')
66
+ audio_path = Path(audio)
67
+ assert audio_path.exists() and audio_path.is_file()
68
+ self.audio_path = audio_path
69
+ self.next_pos = 0
70
+ self.container = av.open(str(audio_path))
71
+ if len(self.container.streams.audio) == 0:
72
+ # No audio stream
73
+ return
74
+ self.chunk_duration_sec = chunk_duration_sec
75
+ self.overlap_sec = overlap_sec
76
+ self.min_chunk_duration_sec = min_chunk_duration_sec
77
+ self.audio_time_base = self.container.streams.audio[0].time_base
78
+
79
+ audio_start_time_pts = self.container.streams.audio[0].start_time or 0
80
+ audio_start_time_sec = float(audio_start_time_pts * self.audio_time_base)
81
+ total_audio_duration_pts = self.container.streams.audio[0].duration or 0
82
+ total_audio_duration_sec = float(total_audio_duration_pts * self.audio_time_base)
83
+
84
+ self.chunks_to_extract_in_pts = [
85
+ (round(start / self.audio_time_base), round(end / self.audio_time_base))
86
+ for (start, end) in self.build_chunks(
87
+ audio_start_time_sec, total_audio_duration_sec, chunk_duration_sec, overlap_sec, min_chunk_duration_sec
88
+ )
89
+ ]
90
+ _logger.debug(
91
+ f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
92
+ )
93
+
94
+ @classmethod
95
+ def build_chunks(
96
+ cls,
97
+ start_time_sec: float,
98
+ total_duration_sec: float,
99
+ chunk_duration_sec: float,
100
+ overlap_sec: float,
101
+ min_chunk_duration_sec: float,
102
+ ) -> list[tuple[float, float]]:
103
+ chunks_to_extract_in_sec: list[tuple[float, float]] = []
104
+ current_pos = start_time_sec
105
+ end_time = start_time_sec + total_duration_sec
106
+ while current_pos < end_time:
107
+ chunk_start = current_pos
108
+ chunk_end = min(chunk_start + chunk_duration_sec, end_time)
109
+ chunks_to_extract_in_sec.append((chunk_start, chunk_end))
110
+ if chunk_end >= end_time:
111
+ break
112
+ current_pos = chunk_end - overlap_sec
113
+ # If the last chunk is smaller than min_chunk_duration_sec then drop the last chunk from the list
114
+ if (
115
+ len(chunks_to_extract_in_sec) > 0
116
+ and (chunks_to_extract_in_sec[-1][1] - chunks_to_extract_in_sec[-1][0]) < min_chunk_duration_sec
117
+ ):
118
+ return chunks_to_extract_in_sec[:-1] # return all but the last chunk
119
+ return chunks_to_extract_in_sec
120
+
121
+ @classmethod
122
+ def input_schema(cls) -> dict[str, ts.ColumnType]:
123
+ return {
124
+ 'audio': ts.AudioType(nullable=False),
125
+ 'chunk_duration_sec': ts.FloatType(nullable=True),
126
+ 'overlap_sec': ts.FloatType(nullable=True),
127
+ 'min_chunk_duration_sec': ts.FloatType(nullable=True),
128
+ }
129
+
130
+ @classmethod
131
+ def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
132
+ return {
133
+ 'start_time_sec': ts.FloatType(),
134
+ 'end_time_sec': ts.FloatType(),
135
+ 'audio_chunk': ts.AudioType(nullable=True),
136
+ }, []
137
+
138
+ def __next__(self) -> dict[str, Any]:
139
+ if self.next_pos >= len(self.chunks_to_extract_in_pts):
140
+ raise StopIteration
141
+ target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
142
+ chunk_start_pts = 0
143
+ chunk_end_pts = 0
144
+ chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
145
+ output_container = av.open(chunk_file, mode='w')
146
+ input_stream = self.container.streams.audio[0]
147
+ codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
148
+ output_stream = output_container.add_stream(codec_name, rate=input_stream.codec_context.sample_rate)
149
+ frame_count = 0
150
+ # Since frames don't align with chunk boundaries, we may have read an extra frame in previous iteration
151
+ # Seek to the nearest frame in stream at current chunk start time
152
+ self.container.seek(target_chunk_start, backward=True, stream=self.container.streams.audio[0])
153
+ while True:
154
+ try:
155
+ frame = next(self.container.decode(audio=0))
156
+ except EOFError as e:
157
+ raise excs.Error(f'Failed to read audio file `{self.audio_path}`, error `{e}`')
158
+ except StopIteration:
159
+ # no more frames to scan
160
+ break
161
+ if frame.pts < target_chunk_start:
162
+ # Current frame is behind chunk's start time, always get frame next to chunk's start time
163
+ continue
164
+ if frame.pts >= target_chunk_end:
165
+ # Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away the current frame
166
+ break
167
+ frame_end = frame.pts + frame.samples
168
+ if frame_count == 0:
169
+ # Record start of the first frame
170
+ chunk_start_pts = frame.pts
171
+ # Write frame to output container
172
+ frame_count += 1
173
+ # If encode returns packets, write them to output container. Some encoders will buffer the frames.
174
+ output_container.mux(output_stream.encode(frame))
175
+ # record this frame's end as chunks end
176
+ chunk_end_pts = frame_end
177
+ # Check if frame's end has crossed the chunk boundary
178
+ if frame_end >= target_chunk_end:
179
+ break
180
+
181
+ # record result
182
+ if frame_count > 0:
183
+ # flush encoder
184
+ output_container.mux(output_stream.encode(None))
185
+ output_container.close()
186
+ result = {
187
+ 'start_time_sec': round(float(chunk_start_pts * self.audio_time_base), 4),
188
+ 'end_time_sec': round(float(chunk_end_pts * self.audio_time_base), 4),
189
+ 'audio_chunk': chunk_file if frame_count > 0 else None,
190
+ }
191
+ _logger.debug('audio chunk result: %s', result)
192
+ self.next_pos += 1
193
+ return result
194
+ else:
195
+ # It's possible that there are no frames in the range of the last chunk, stop the iterator in this case.
196
+ # Note that start_time points at the first frame so case applies only for the last chunk
197
+ assert self.next_pos == len(self.chunks_to_extract_in_pts) - 1
198
+ self.next_pos += 1
199
+ raise StopIteration
200
+
201
+ def close(self) -> None:
202
+ self.container.close()
203
+
204
+ def set_pos(self, pos: int) -> None:
205
+ pass
@@ -35,6 +35,7 @@ class Separator(enum.Enum):
35
35
  @dataclasses.dataclass
36
36
  class DocumentSectionMetadata:
37
37
  """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
38
+
38
39
  # html and markdown metadata
39
40
  sourceline: Optional[int] = None
40
41
  # the stack of headings up to the most recently observed one;
@@ -50,6 +51,7 @@ class DocumentSectionMetadata:
50
51
  @dataclasses.dataclass
51
52
  class DocumentSection:
52
53
  """A single document chunk, according to some of the splitting criteria"""
54
+
53
55
  text: Optional[str]
54
56
  metadata: Optional[DocumentSectionMetadata]
55
57
 
@@ -93,6 +95,7 @@ class DocumentSplitter(ComponentIterator):
93
95
 
94
96
  Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
95
97
  """
98
+
96
99
  METADATA_COLUMN_TYPES = {
97
100
  ChunkMetadata.TITLE: StringType(nullable=True),
98
101
  ChunkMetadata.HEADING: JsonType(nullable=True),
@@ -102,10 +105,16 @@ class DocumentSplitter(ComponentIterator):
102
105
  }
103
106
 
104
107
  def __init__(
105
- self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
106
- metadata: str = '',
107
- html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
108
- tiktoken_target_model: Optional[str] = None
108
+ self,
109
+ document: str,
110
+ *,
111
+ separators: str,
112
+ limit: Optional[int] = None,
113
+ overlap: Optional[int] = None,
114
+ metadata: str = '',
115
+ html_skip_tags: Optional[list[str]] = None,
116
+ tiktoken_encoding: Optional[str] = 'cl100k_base',
117
+ tiktoken_target_model: Optional[str] = None,
109
118
  ):
110
119
  """Init method for `DocumentSplitter` class.
111
120
 
@@ -234,13 +243,14 @@ class DocumentSplitter(ComponentIterator):
234
243
  def _html_sections(self) -> Iterator[DocumentSection]:
235
244
  """Create DocumentSections reflecting the html-specific separators"""
236
245
  import bs4
246
+
237
247
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
238
248
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
239
249
  # current state
240
250
  accumulated_text: list[str] = [] # currently accumulated text
241
251
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
242
252
 
243
- headings: dict[str, str] = {} # current state of observed headings (level -> text)
253
+ headings: dict[str, str] = {} # current state of observed headings (level -> text)
244
254
  sourceline = 0 # most recently seen sourceline
245
255
 
246
256
  def update_metadata(el: bs4.Tag) -> None:
@@ -300,7 +310,7 @@ class DocumentSplitter(ComponentIterator):
300
310
  # current state
301
311
  accumulated_text: list[str] = [] # currently accumulated text
302
312
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
303
- headings: dict[str, str] = {} # current state of observed headings (level -> text)
313
+ headings: dict[str, str] = {} # current state of observed headings (level -> text)
304
314
 
305
315
  def update_headings(heading: dict) -> None:
306
316
  # update current state
@@ -353,6 +363,7 @@ class DocumentSplitter(ComponentIterator):
353
363
  def _pdf_sections(self) -> Iterator[DocumentSection]:
354
364
  """Create DocumentSections reflecting the pdf-specific separators"""
355
365
  import fitz # type: ignore[import-untyped]
366
+
356
367
  doc: fitz.Document = self._doc_handle.pdf_doc
357
368
  assert doc is not None
358
369
 
@@ -385,8 +396,7 @@ class DocumentSplitter(ComponentIterator):
385
396
  yield DocumentSection(text=_emit_text(), metadata=metadata)
386
397
 
387
398
  if accumulated_text and emit_on_page and not emit_on_paragraph:
388
- yield DocumentSection(text=_emit_text(),
389
- metadata=DocumentSectionMetadata(page=page_number))
399
+ yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
390
400
  accumulated_text = []
391
401
 
392
402
  if accumulated_text and not emit_on_page:
@@ -411,6 +421,7 @@ class DocumentSplitter(ComponentIterator):
411
421
 
412
422
  def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
413
423
  import tiktoken
424
+
414
425
  if self._tiktoken_target_model is not None:
415
426
  encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
416
427
  else:
@@ -30,15 +30,9 @@ class TileIterator(ComponentIterator):
30
30
  __i: int
31
31
  __j: int
32
32
 
33
- def __init__(
34
- self,
35
- image: PIL.Image.Image,
36
- *,
37
- tile_size: tuple[int, int],
38
- overlap: tuple[int, int] = (0, 0),
39
- ):
33
+ def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
40
34
  if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
41
- raise excs.Error(f"overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}")
35
+ raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
42
36
 
43
37
  self.__image = image
44
38
  self.__image.load()
@@ -64,11 +58,7 @@ class TileIterator(ComponentIterator):
64
58
  x2 = x1 + self.__tile_size[0]
65
59
  y2 = y1 + self.__tile_size[1]
66
60
  tile = self.__image.crop((x1, y1, x2, y2))
67
- result = {
68
- 'tile': tile,
69
- 'tile_coord': [self.__i, self.__j],
70
- 'tile_box': [x1, y1, x2, y2]
71
- }
61
+ result = {'tile': tile, 'tile_coord': [self.__i, self.__j], 'tile_box': [x1, y1, x2, y2]}
72
62
 
73
63
  self.__i += 1
74
64
  if self.__i >= self.__xlen:
@@ -85,16 +75,8 @@ class TileIterator(ComponentIterator):
85
75
 
86
76
  @classmethod
87
77
  def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
88
- return {
89
- 'image': ts.ImageType(),
90
- 'tile_size': ts.JsonType(),
91
- 'overlap': ts.JsonType(),
92
- }
78
+ return {'image': ts.ImageType(), 'tile_size': ts.JsonType(), 'overlap': ts.JsonType()}
93
79
 
94
80
  @classmethod
95
- def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
96
- return {
97
- 'tile': ts.ImageType(),
98
- 'tile_coord': ts.JsonType(),
99
- 'tile_box': ts.JsonType(),
100
- }, ['tile']
81
+ def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
82
+ return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
@@ -1,4 +1,4 @@
1
- from typing import Iterator, Any
1
+ from typing import Any, Iterator
2
2
 
3
3
  import pixeltable.exceptions as excs
4
4
  import pixeltable.type_system as ts
@@ -30,11 +30,8 @@ class StringSplitter(ComponentIterator):
30
30
 
31
31
  @classmethod
32
32
  def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
33
- return {
34
- 'text': ts.StringType(),
35
- 'separators': ts.StringType(),
36
- }
33
+ return {'text': ts.StringType(), 'separators': ts.StringType()}
37
34
 
38
35
  @classmethod
39
- def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
36
+ def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
40
37
  return {'text': ts.StringType()}, []
@@ -24,7 +24,6 @@ class FrameIterator(ComponentIterator):
24
24
  frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
25
25
 
26
26
  Args:
27
- video: URL or path of the video to use for frame extraction.
28
27
  fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
29
28
  If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
30
29
  extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
@@ -167,12 +166,7 @@ class FrameIterator(ComponentIterator):
167
166
  img = frame.to_image()
168
167
  assert isinstance(img, PIL.Image.Image)
169
168
  pos_msec = float(pts * self.video_time_base * 1000)
170
- result = {
171
- 'frame_idx': self.next_pos,
172
- 'pos_msec': pos_msec,
173
- 'pos_frame': video_idx,
174
- 'frame': img,
175
- }
169
+ result = {'frame_idx': self.next_pos, 'pos_msec': pos_msec, 'pos_frame': video_idx, 'frame': img}
176
170
  self.next_pos += 1
177
171
  return result
178
172
 
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 26
13
+ VERSION = 27
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -22,20 +22,25 @@ def create_system_info(engine: sql.engine.Engine) -> None:
22
22
  session.flush()
23
23
  session.commit()
24
24
 
25
+
25
26
  # conversion functions for upgrading the metadata schema from one version to the following
26
27
  # key: old schema version
27
28
  converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
28
29
 
30
+
29
31
  def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
30
32
  def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
31
33
  global converter_cbs
32
34
  converter_cbs[version] = fn
35
+
33
36
  return decorator
34
37
 
38
+
35
39
  # load all converter modules
36
40
  for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
37
41
  importlib.import_module('pixeltable.metadata.converters.' + modname)
38
42
 
43
+
39
44
  def upgrade_md(engine: sql.engine.Engine) -> None:
40
45
  """Upgrade the metadata schema to the current version"""
41
46
  with orm.Session(engine) as session:
@@ -47,7 +52,9 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
47
52
  while md_version < VERSION:
48
53
  if md_version not in converter_cbs:
49
54
  raise RuntimeError(f'No metadata converter for version {md_version}')
50
- print(f'Converting metadata from version {md_version} to {md_version + 1}')
55
+ from pixeltable.env import Env
56
+
57
+ Env.get().console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
51
58
  converter_cbs[md_version](engine)
52
59
  md_version += 1
53
60
  # update system info
@@ -1,12 +1,12 @@
1
1
  import sqlalchemy as sql
2
2
 
3
- from pixeltable.metadata.schema import Table, TableSchemaVersion
4
3
  from pixeltable.metadata import register_converter
4
+ from pixeltable.metadata.schema import Table, TableSchemaVersion
5
5
 
6
6
 
7
7
  @register_converter(version=10)
8
8
  def _(engine: sql.engine.Engine) -> None:
9
- default_table_attrs = {"comment": None, "num_retained_versions": 10}
9
+ default_table_attrs = {'comment': None, 'num_retained_versions': 10}
10
10
  with engine.begin() as conn:
11
11
  # Because `parameters` wasn't actually used for anything,
12
12
  # we can simply delete it without any data loss.
@@ -1,4 +1,3 @@
1
-
2
1
  import inspect
3
2
  import logging
4
3
  from typing import Any
@@ -37,8 +36,5 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
37
36
  params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
38
37
  is_batched = 'batch_size' in orig_d
39
38
  sig = func.Signature(return_type, params, is_batched=is_batched)
40
- d = {
41
- 'signature': sig.as_dict(),
42
- 'batch_size': orig_d['batch_size'] if is_batched else None,
43
- }
39
+ d = {'signature': sig.as_dict(), 'batch_size': orig_d['batch_size'] if is_batched else None}
44
40
  return d
@@ -1,4 +1,5 @@
1
1
  from uuid import UUID
2
+
2
3
  import sqlalchemy as sql
3
4
 
4
5
  from pixeltable.metadata import register_converter
@@ -7,10 +8,7 @@ from pixeltable.metadata.converters.util import convert_table_md
7
8
 
8
9
  @register_converter(version=16)
9
10
  def _(engine: sql.engine.Engine) -> None:
10
- convert_table_md(
11
- engine,
12
- table_md_updater=__update_table_md
13
- )
11
+ convert_table_md(engine, table_md_updater=__update_table_md)
14
12
 
15
13
 
16
14
  def __update_table_md(table_md: dict, table_id: UUID) -> None:
@@ -1,4 +1,5 @@
1
1
  from uuid import UUID
2
+
2
3
  import sqlalchemy as sql
3
4
 
4
5
  from pixeltable.metadata import register_converter
@@ -7,10 +8,7 @@ from pixeltable.metadata.converters.util import convert_table_md
7
8
 
8
9
  @register_converter(version=17)
9
10
  def _(engine: sql.engine.Engine) -> None:
10
- convert_table_md(
11
- engine,
12
- table_md_updater=__update_table_md
13
- )
11
+ convert_table_md(engine, table_md_updater=__update_table_md)
14
12
 
15
13
 
16
14
  def __update_table_md(table_md: dict, table_id: UUID) -> None:
@@ -1,4 +1,5 @@
1
1
  from typing import Any, Optional
2
+
2
3
  import sqlalchemy as sql
3
4
 
4
5
  from pixeltable.metadata import register_converter
@@ -7,10 +8,7 @@ from pixeltable.metadata.converters.util import convert_table_md
7
8
 
8
9
  @register_converter(version=18)
9
10
  def _(engine: sql.engine.Engine) -> None:
10
- convert_table_md(
11
- engine,
12
- substitution_fn=__substitute_md
13
- )
11
+ convert_table_md(engine, substitution_fn=__substitute_md)
14
12
 
15
13
 
16
14
  def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]: