pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,232 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import sys
|
|
3
|
-
import time
|
|
4
|
-
import warnings
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
from typing import Iterable, Optional
|
|
7
|
-
|
|
8
|
-
from tqdm import TqdmWarning, tqdm
|
|
9
|
-
|
|
10
|
-
from pixeltable import exprs
|
|
11
|
-
from pixeltable.func import CallableFunction
|
|
12
|
-
|
|
13
|
-
from .data_row_batch import DataRowBatch
|
|
14
|
-
from .exec_node import ExecNode
|
|
15
|
-
|
|
16
|
-
_logger = logging.getLogger('pixeltable')
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class ExprEvalNode(ExecNode):
    """Execution node that materializes expressions for the batches produced by its input node.

    Only output exprs whose slots are not already provided by the input (the "target exprs") are computed.
    The target exprs and their dependencies are grouped into cohorts (see `Cohort`); every batch pulled
    from the input is run through all cohorts in order.
    """

    @dataclass
    class Cohort:
        """List of exprs that form an evaluation context and contain calls to at most one external function"""

        exprs_: list[exprs.Expr]  # all exprs evaluated by this cohort
        batched_fn: Optional[CallableFunction]  # the batched function called in this cohort, if any
        segment_ctxs: list['exprs.RowBuilder.EvalCtx']  # one eval ctx per segment, in evaluation order
        target_slot_idxs: list[int]  # slots of the target exprs materialized by this cohort
        batch_size: int = 8  # rows per sub-batch; overwritten with the function's batch size once known

    def __init__(
        self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr], input_exprs: Iterable[exprs.Expr],
        input: ExecNode
    ):
        """
        Args:
            row_builder: used to resolve expr dependencies and to evaluate exprs row by row
            output_exprs: exprs that need to be available in the output rows
            input_exprs: exprs that are already materialized in the rows produced by `input`
            input: node that supplies the input batches
        """
        # Both iterables are traversed more than once (here, in the base class and in _create_cohorts());
        # materialize them so that one-shot iterables such as generators are not silently exhausted.
        output_exprs = list(output_exprs)
        input_exprs = list(input_exprs)
        super().__init__(row_builder, output_exprs, input_exprs, input)
        self.input_exprs = input_exprs
        input_slot_idxs = {e.slot_idx for e in input_exprs}
        # we're only materializing exprs that are not already in the input
        self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
        self.pbar: Optional[tqdm] = None
        self.cohorts: list[ExprEvalNode.Cohort] = []
        self._create_cohorts()

    def __next__(self) -> DataRowBatch:
        """Pull the next batch from the input, evaluate all cohorts on it in place and return it."""
        input_batch = next(self.input)
        # compute target exprs
        for cohort in self.cohorts:
            self._exec_cohort(cohort, input_batch)
        _logger.debug(f'ExprEvalNode: returning {len(input_batch)} rows')
        return input_batch

    def _open(self) -> None:
        """Create the progress bar, if the execution context asks for one."""
        warnings.simplefilter("ignore", category=TqdmWarning)
        # This is a temporary hack. When B-tree indices on string columns were implemented (via computed columns
        # that invoke the `BtreeIndex.str_filter` udf), it resulted in frivolous progress bars appearing on every
        # insertion. This special-cases the `str_filter` call to suppress the corresponding progress bar.
        # TODO(aaron-siegel) Remove this hack once we clean up progress bars more generally.
        is_str_filter_node = all(
            isinstance(expr, exprs.FunctionCall) and expr.fn.name == 'str_filter' for expr in self.output_exprs
        )
        if self.ctx.show_pbar and not is_str_filter_node:
            self.pbar = tqdm(
                total=len(self.target_exprs) * self.ctx.num_rows,
                desc='Computing cells',
                unit=' cells',
                ncols=100,
                file=sys.stdout
            )

    def _close(self) -> None:
        """Close the progress bar, if one was created."""
        if self.pbar is not None:
            self.pbar.close()

    def _get_batched_fn(self, expr: exprs.Expr) -> Optional[CallableFunction]:
        """Return the function called by `expr` if `expr` is a call to a batched CallableFunction, else None."""
        if isinstance(expr, exprs.FunctionCall) and isinstance(expr.fn, CallableFunction) and expr.fn.is_batched:
            return expr.fn
        return None

    def _is_batched_fn_call(self, expr: exprs.Expr) -> bool:
        """Return True if `expr` is a call to a batched CallableFunction."""
        return self._get_batched_fn(expr) is not None

    def _create_cohorts(self) -> None:
        """Partition the target exprs and their dependencies into cohorts and populate `self.cohorts`."""
        all_exprs = self.row_builder.get_dependencies(self.target_exprs)
        # break up all_exprs into cohorts such that each cohort contains calls to at most one external function;
        # seed the cohorts with only the ext fn calls
        cohorts: list[list[exprs.Expr]] = []
        current_batched_fn: Optional[CallableFunction] = None
        for e in all_exprs:
            if not self._is_batched_fn_call(e):
                continue
            assert isinstance(e, exprs.FunctionCall)
            assert isinstance(e.fn, CallableFunction)
            if current_batched_fn is None or current_batched_fn != e.fn:
                # create a new cohort
                cohorts.append([])
                current_batched_fn = e.fn
            cohorts[-1].append(e)

        # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
        # cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
        exclude = {e.slot_idx for e in self.input_exprs}
        all_target_slot_idxs = {e.slot_idx for e in self.target_exprs}
        target_slot_idxs: list[list[int]] = []  # the ones materialized by each cohort
        for i, seed_exprs in enumerate(cohorts):
            cohorts[i] = self.row_builder.get_dependencies(
                seed_exprs, exclude=[self.row_builder.unique_exprs[slot_idx] for slot_idx in exclude])
            cohort_target_idxs = [e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs]
            target_slot_idxs.append(cohort_target_idxs)
            exclude.update(cohort_target_idxs)

        # target exprs that don't share an evaluation context with any batched call go into a final cohort
        all_cohort_slot_idxs = {e.slot_idx for cohort in cohorts for e in cohort}
        remaining_slot_idxs = all_target_slot_idxs - all_cohort_slot_idxs
        if remaining_slot_idxs:
            cohorts.append(self.row_builder.get_dependencies(
                [self.row_builder.unique_exprs[slot_idx] for slot_idx in remaining_slot_idxs],
                exclude=[self.row_builder.unique_exprs[slot_idx] for slot_idx in exclude]))
            target_slot_idxs.append(list(remaining_slot_idxs))
        # we need to have captured all target slots at this point
        assert all_target_slot_idxs == set().union(*target_slot_idxs)

        for cohort, cohort_target_idxs in zip(cohorts, target_slot_idxs):
            # segment the cohort into sublists that contain either a single ext. function call or no ext. function
            # calls (i.e., only computed cols)
            assert len(cohort) > 0
            # create the first segment here, so we can avoid checking for an empty list in the loop
            segments = [[cohort[0]]]
            batched_fn: Optional[CallableFunction] = self._get_batched_fn(cohort[0])
            is_batched_segment = batched_fn is not None
            for e in cohort[1:]:
                fn = self._get_batched_fn(e)
                if fn is not None:
                    # every batched call gets a segment of its own
                    segments.append([e])
                    is_batched_segment = True
                    batched_fn = fn
                else:
                    if is_batched_segment:
                        # start a new segment
                        segments.append([])
                        is_batched_segment = False
                    segments[-1].append(e)

            # we create the EvalCtxs manually because create_eval_ctx() would repeat the dependencies of each segment
            segment_ctxs = [
                exprs.RowBuilder.EvalCtx(
                    slot_idxs=[e.slot_idx for e in s], exprs=s, target_slot_idxs=[], target_exprs=[])
                for s in segments
            ]
            self.cohorts.append(self.Cohort(cohort, batched_fn, segment_ctxs, cohort_target_idxs))

    def _exec_cohort(self, cohort: Cohort, rows: DataRowBatch) -> None:
        """Compute the cohort for the entire input batch by dividing it up into sub-batches"""
        batch_start_idx = 0  # start row of the current sub-batch
        # for multi-resolution models, we re-assess the correct ext fn batch size for each input batch
        ext_batch_size = cohort.batched_fn.get_batch_size() if cohort.batched_fn is not None else None
        if ext_batch_size is not None:
            cohort.batch_size = ext_batch_size

        while batch_start_idx < len(rows):
            num_batch_rows = min(cohort.batch_size, len(rows) - batch_start_idx)
            batch_row_idxs = range(batch_start_idx, batch_start_idx + num_batch_rows)
            for segment_ctx in cohort.segment_ctxs:
                if not self._is_batched_fn_call(segment_ctx.exprs[0]):
                    # compute batch row-wise
                    for row_idx in batch_row_idxs:
                        self.row_builder.eval(
                            rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
                    continue

                fn_call = segment_ctx.exprs[0]
                assert isinstance(fn_call, exprs.FunctionCall)
                assert isinstance(fn_call.fn, CallableFunction)
                # make a batched external function call
                arg_batches: list[list[exprs.Expr]] = [[] for _ in range(len(fn_call.args))]
                kwarg_batches: dict[str, list[exprs.Expr]] = {k: [] for k in fn_call.kwargs.keys()}

                valid_batch_idxs: list[int] = []  # rows with exceptions are not valid
                for row_idx in batch_row_idxs:
                    row = rows[row_idx]
                    if row.has_exc(fn_call.slot_idx):
                        # one of our inputs had an exception, skip this row
                        continue
                    valid_batch_idxs.append(row_idx)
                    args, kwargs = fn_call._make_args(row)
                    for i, arg in enumerate(args):
                        arg_batches[i].append(arg)
                    for k, v in kwargs.items():
                        kwarg_batches[k].append(v)
                num_valid_batch_rows = len(valid_batch_idxs)
                if num_valid_batch_rows == 0:
                    # nothing to call the function with; in particular there are no sample args to derive a
                    # batch size from, and cohort.batch_size must not be replaced with an unknown (None) size
                    continue

                if ext_batch_size is None:
                    # we need to choose a batch size based on the args
                    sample_args = [arg_batch[0] for arg_batch in arg_batches]
                    ext_batch_size = fn_call.fn.get_batch_size(*sample_args)

                num_remaining_batch_rows = num_valid_batch_rows
                while num_remaining_batch_rows > 0:
                    # we make ext. fn calls in batches of ext_batch_size
                    num_ext_batch_rows = min(ext_batch_size, num_remaining_batch_rows)
                    ext_batch_offset = num_valid_batch_rows - num_remaining_batch_rows  # offset into args, not rows
                    ext_batch_slice = slice(ext_batch_offset, ext_batch_offset + num_ext_batch_rows)
                    call_args = [arg_batch[ext_batch_slice] for arg_batch in arg_batches]
                    call_kwargs = {k: kwarg_batch[ext_batch_slice] for k, kwarg_batch in kwarg_batches.items()}
                    start_ts = time.perf_counter()
                    result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
                    self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
                    self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows

                    # move the result into the row batch
                    for result_idx, result in enumerate(result_batch):
                        row_idx = valid_batch_idxs[ext_batch_offset + result_idx]
                        rows[row_idx][fn_call.slot_idx] = result

                    num_remaining_batch_rows -= num_ext_batch_rows

                # switch to the ext fn batch size
                cohort.batch_size = ext_batch_size

            # make sure images for stored cols have been saved to files before moving on to the next batch
            rows.flush_imgs(
                slice(batch_start_idx, batch_start_idx + num_batch_rows), self.stored_img_cols, self.flushed_img_slots)
            if self.pbar is not None:
                self.pbar.update(num_batch_rows * len(cohort.target_slot_idxs))
            batch_start_idx += num_batch_rows
|
|
232
|
-
|
pixeltable/ext/__init__.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Extended integrations for Pixeltable. This package contains experimental or demonstration features that
|
|
3
|
-
are not intended for production use. Long-term support cannot be guaranteed, usually because the features
|
|
4
|
-
have dependencies whose future support is unclear.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from pixeltable.utils.code import local_public_names
|
|
8
|
-
from . import functions
|
|
9
|
-
|
|
10
|
-
__all__ = local_public_names(__name__)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def __dir__():
    """Limit `dir()` on this module to the names listed in `__all__`."""
    return __all__
|
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING, Optional
|
|
2
|
-
|
|
3
|
-
from pixeltable.utils.code import local_public_names
|
|
4
|
-
|
|
5
|
-
if TYPE_CHECKING:
|
|
6
|
-
from whisperx.asr import FasterWhisperPipeline # type: ignore[import-untyped]
|
|
7
|
-
|
|
8
|
-
import pixeltable as pxt
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@pxt.udf
def transcribe(
    audio: pxt.Audio,
    *,
    model: str,
    compute_type: Optional[str] = None,
    language: Optional[str] = None,
    chunk_size: int = 30
) -> dict:
    """
    Transcribe an audio file using WhisperX.

    This UDF runs a transcription model _locally_ using the WhisperX library,
    equivalent to the WhisperX `transcribe` function, as described in the
    [WhisperX library documentation](https://github.com/m-bain/whisperX).

    WhisperX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.

    __Requirements:__

    - `pip install whisperx`

    Args:
        audio: The audio file to transcribe.
        model: The name of the model to use for transcription.
        compute_type: Passed to WhisperX when loading the model. If not specified, `float16` is used
            when CUDA is available and `int8` otherwise.

    See the [WhisperX library documentation](https://github.com/m-bain/whisperX) for details
    on the remaining parameters.

    Returns:
        A dictionary containing the transcription and various other metadata.

    Examples:
        Add a computed column that applies the model `tiny.en` to an existing Pixeltable column `tbl.audio`
        of the table `tbl`:

        >>> tbl['result'] = transcribe(tbl.audio, model='tiny.en')
    """
    import torch
    import whisperx  # type: ignore[import-untyped]

    # run on the GPU when one is available
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    if not compute_type:
        compute_type = 'float16' if device == 'cuda' else 'int8'

    # loaded pipelines are cached per (model, device, compute_type)
    pipeline = _lookup_model(model, device, compute_type)
    waveform = whisperx.load_audio(audio)
    return pipeline.transcribe(waveform, batch_size=16, language=language, chunk_size=chunk_size)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def _lookup_model(model_id: str, device: str, compute_type: str) -> 'FasterWhisperPipeline':
    """Return the WhisperX model for (model_id, device, compute_type), loading and caching it on first use."""
    import whisperx

    cache_key = (model_id, device, compute_type)
    if cache_key in _model_cache:
        return _model_cache[cache_key]

    pipeline = whisperx.load_model(model_id, device, compute_type=compute_type)
    _model_cache[cache_key] = pipeline
    return pipeline
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
_model_cache: dict[tuple[str, str, str], 'FasterWhisperPipeline'] = {}
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
__all__ = local_public_names(__name__)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def __dir__():
    """Limit `dir()` on this module to the names listed in `__all__`."""
    return __all__
|
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
from typing import TYPE_CHECKING, Iterable, Iterator
|
|
4
|
-
from urllib.request import urlretrieve
|
|
5
|
-
|
|
6
|
-
import numpy as np
|
|
7
|
-
import PIL.Image
|
|
8
|
-
|
|
9
|
-
import pixeltable as pxt
|
|
10
|
-
from pixeltable import env
|
|
11
|
-
from pixeltable.func import Batch
|
|
12
|
-
from pixeltable.functions.util import normalize_image_mode
|
|
13
|
-
from pixeltable.utils.code import local_public_names
|
|
14
|
-
|
|
15
|
-
if TYPE_CHECKING:
|
|
16
|
-
import torch
|
|
17
|
-
from yolox.exp import Exp # type: ignore[import-untyped]
|
|
18
|
-
from yolox.models import YOLOX # type: ignore[import-untyped]
|
|
19
|
-
|
|
20
|
-
_logger = logging.getLogger('pixeltable')
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pxt.udf(batch_size=4)
def yolox(images: Batch[PIL.Image.Image], *, model_id: str, threshold: float = 0.5) -> Batch[dict]:
    """
    Computes YOLOX object detections for the specified image. `model_id` should reference one of the models
    defined in the [YOLOX documentation](https://github.com/Megvii-BaseDetection/YOLOX).

    YOLOX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.

    __Requirements__:

    - `pip install git+https://github.com/Megvii-BaseDetection/YOLOX`

    Args:
        images: the image to run object detection on
        model_id: one of: `yolox_nano`, `yolox_tiny`, `yolox_s`, `yolox_m`, `yolox_l`, `yolox_x`
        threshold: the threshold for object detection

    Returns:
        A dictionary containing the output of the object detection model, with the keys
        `bboxes`, `scores` and `labels`.

    Examples:
        Add a computed column that applies the model `yolox_m` to an existing
        Pixeltable column `tbl.image` of the table `tbl`:

        >>> tbl['detections'] = yolox(tbl.image, model_id='yolox_m', threshold=0.8)
    """
    import torch
    from yolox.utils import postprocess  # type: ignore[import-untyped]

    model, exp = _lookup_model(model_id, 'cpu')
    image_tensors = list(_images_to_tensors(images, exp))
    batch_tensor = torch.stack(image_tensors)

    with torch.no_grad():
        output_tensor = model(batch_tensor)

    outputs = postprocess(
        output_tensor, 80, threshold, exp.nmsthre, class_agnostic=False
    )

    results: list[dict] = []
    # `outputs` holds one entry per input image (None if nothing was detected); pair each image with its own
    # entry instead of reusing the first image's detections for the whole batch
    for image, image_output in zip(images, outputs):
        # scale factor that was applied to fit the image into the model's input size; dividing by it maps
        # the boxes back to the original image
        ratio = min(exp.test_size[0] / image.height, exp.test_size[1] / image.width)
        if image_output is None:
            results.append({'bboxes': [], 'scores': [], 'labels': []})
        else:
            results.append({
                'bboxes': [(output[:4] / ratio).tolist() for output in image_output],
                'scores': [output[4].item() * output[5].item() for output in image_output],
                'labels': [int(output[6]) for output in image_output]
            })
    return results
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
@pxt.udf
def yolo_to_coco(detections: dict) -> list:
    """
    Converts the output of a YOLOX object detection model to COCO format.

    YOLOX is part of the `pixeltable.ext` package: long-term support in Pixeltable is not guaranteed.

    Args:
        detections: The output of a YOLOX object detection model, as returned by `yolox`.

    Returns:
        A list with one dictionary per detection, containing the data from `detections` converted to
        COCO format: `bbox` holds the rounded `[x, y, width, height]` of the box and `category` its label.

    Examples:
        Add a computed column that converts the output `tbl.detections` to COCO format, where `tbl.image`
        is the image for which detections were computed:

        >>> tbl['detections'] = yolox(tbl.image, model_id='yolox_m', threshold=0.8)
        ... tbl['detections_coco'] = yolo_to_coco(tbl.detections)
    """
    bboxes, labels = detections['bboxes'], detections['labels']
    assert len(detections['bboxes']) == len(detections['labels'])
    # the incoming boxes are corner pairs (x1, y1, x2, y2); emit the first corner plus width and height
    return [
        {
            'bbox': [round(bbox[0]), round(bbox[1]), round(bbox[2] - bbox[0]), round(bbox[3] - bbox[1])],
            'category': label,
        }
        for bbox, label in zip(bboxes, labels)
    ]
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def _images_to_tensors(images: Iterable[PIL.Image.Image], exp: 'Exp') -> Iterator['torch.Tensor']:
    """Lazily convert each image into a tensor, using YOLOX's `ValTransform` with `exp.test_size`."""
    import torch
    from yolox.data import ValTransform  # type: ignore[import-untyped]

    transform = ValTransform(legacy=False)
    for img in images:
        pixels = np.array(normalize_image_mode(img))
        # the transform returns a pair; only the transformed image is needed here
        transformed, _ = transform(pixels, None, exp.test_size)
        yield torch.from_numpy(transformed)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def _lookup_model(model_id: str, device: str) -> tuple['YOLOX', 'Exp']:
    """Return the YOLOX model and its `Exp` for (model_id, device), loading and caching them on first use.

    The weights are downloaded from the YOLOX GitHub release into Pixeltable's tmp dir unless a weights
    file for `model_id` is already present there.
    """
    import torch
    from yolox.exp import get_exp

    key = (model_id, device)
    if key in _model_cache:
        return _model_cache[key]

    weights_url = f'https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/{model_id}.pth'
    weights_file = Path(f'{env.Env.get().tmp_dir}/{model_id}.pth')
    if not weights_file.exists():
        _logger.info(f'Downloading weights for YOLOX model {model_id}: from {weights_url} -> {weights_file}')
        # Download to a side file and rename it once complete: an interrupted download must not leave a
        # truncated file at `weights_file`, which would pass the exists() check on every later call.
        partial_file = weights_file.with_name(f'{weights_file.name}.part')
        urlretrieve(weights_url, partial_file)
        partial_file.replace(weights_file)

    exp = get_exp(exp_name=model_id)
    model = exp.get_model().to(device)

    # put the model into inference mode
    model.eval()
    model.head.training = False
    model.training = False

    # Load in the weights from training
    # NOTE(review): torch.load() unpickles the checkpoint, i.e. it can execute code contained in the file.
    # The file comes from the fixed release URL above; consider `weights_only=True` if the supported torch
    # versions allow it.
    weights = torch.load(weights_file, map_location=torch.device(device))
    model.load_state_dict(weights['model'])

    _model_cache[key] = (model, exp)
    return model, exp
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
_model_cache: dict[tuple[str, str], tuple['YOLOX', 'Exp']] = {}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
__all__ = local_public_names(__name__)
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def __dir__():
    """Limit `dir()` on this module to the names listed in `__all__`."""
    return __all__
|