pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/exceptions.py
CHANGED
@@ -10,6 +10,12 @@ class Error(Exception):


 class ExprEvalError(Exception):
+    """
+    Used during query execution to signal expr evaluation failures.
+
+    NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
+    """
+
     expr: 'exprs.Expr'
     expr_msg: str
     exc: Exception
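The new docstring states a contract rather than an API: ExprEvalError carries the failing expression and the original exception, and it must be converted into a user-facing Error before it leaves query execution. A minimal sketch of that conversion, with a hypothetical plan.execute() as the driver (only ExprEvalError, Error, and their fields come from the diff above):

from pixeltable import exceptions as excs

def run_plan(plan) -> list:
    # `plan` and `execute()` are placeholders; the conversion pattern is the point
    try:
        return list(plan.execute())
    except excs.ExprEvalError as e:
        # internal failure -> user-facing Error, preserving the original cause
        raise excs.Error(f'error evaluating {e.expr}: {e.expr_msg}') from e.exc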
pixeltable/exec/__init__.py
CHANGED
@@ -2,11 +2,14 @@

 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
 from .exec_node import ExecNode
 from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
-from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
+from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
pixeltable/exec/aggregation_node.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations

 import logging
 import sys
-from typing import Any, AsyncIterator, Iterable,
+from typing import Any, AsyncIterator, Iterable, cast

 from pixeltable import catalog, exceptions as excs, exprs

@@ -19,18 +19,18 @@ class AggregationNode(ExecNode):
     At the moment, this returns all results in a single DataRowBatch.
     """

-    group_by:
+    group_by: list[exprs.Expr] | None
     input_exprs: list[exprs.Expr]
     agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
     agg_fn_calls: list[exprs.FunctionCall]
     output_batch: DataRowBatch
-    limit:
+    limit: int | None

     def __init__(
         self,
         tbl: catalog.TableVersionHandle,
         row_builder: exprs.RowBuilder,
-        group_by:
+        group_by: list[exprs.Expr] | None,
         agg_fn_calls: list[exprs.FunctionCall],
         input_exprs: Iterable[exprs.Expr],
         input: ExecNode,
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
         # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
-        self.output_batch = DataRowBatch(
+        self.output_batch = DataRowBatch(row_builder)
         self.limit = None

     def set_limit(self, limit: int) -> None:
@@ -72,8 +72,8 @@ class AggregationNode(ExecNode):
             raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc

     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        prev_row:
-        current_group:
+        prev_row: exprs.DataRow | None = None
+        current_group: list[Any] | None = None  # the values of the group-by exprs
        num_input_rows = 0
        num_output_rows = 0
        async for row_batch in self.input:
@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
            self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
            self.output_batch.add_row(prev_row)

-        self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
        _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
        yield self.output_batch
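The prev_row/current_group fields above follow the standard sorted-stream aggregation pattern: input rows arrive ordered by the group-by values, and a group is finalized whenever those values change from one row to the next. A self-contained sketch of that pattern, using plain dicts and a row count in place of DataRow and the aggregate FunctionCalls (none of these names are pixeltable API):

from typing import Any, Iterable, Iterator

def count_by_group(rows: Iterable[dict[str, Any]], group_by: list[str]) -> Iterator[tuple[tuple, int]]:
    """Yield (group-by values, row count) for input presorted on group_by."""
    current_group: tuple | None = None
    count = 0
    for row in rows:
        group = tuple(row[k] for k in group_by)
        if current_group is not None and group != current_group:
            yield current_group, count  # group boundary: emit the finished group
            count = 0
        current_group = group
        count += 1
    if current_group is not None:
        yield current_group, count  # final group

# list(count_by_group([{'k': 'a'}, {'k': 'a'}, {'k': 'b'}], ['k'])) == [(('a',), 2), (('b',), 1)]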
pixeltable/exec/cache_prefetch_node.py
CHANGED
@@ -9,11 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import
+from typing import AsyncIterator, Iterator
 from uuid import UUID

-from pixeltable import
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.object_stores import ObjectOps

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -25,35 +26,35 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache

     TODO:
-    -
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """

+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-
+    MAX_WORKERS = 15

     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock

     # execution state
-    batch_tbl_version: Optional[catalog.TableVersionHandle]  # needed to construct output batches
     num_returned_rows: int

     # ready_rows: rows that are ready to be returned, ordered by row idx;
     # the implied row idx of ready_rows[0] is num_returned_rows
-    ready_rows: deque[
+    ready_rows: deque[exprs.DataRow | None]

     in_flight_rows: dict[int, CachePrefetchNode.RowState]  # rows with in-flight urls; id(row) -> RowState
     in_flight_requests: dict[futures.Future, str]  # in-flight requests for urls; future -> URL
     in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]]  # URL -> [(row, info)]
     input_finished: bool
-    row_idx: Iterator[
+    row_idx: Iterator[int | None]

     @dataclasses.dataclass
     class RowState:
         row: exprs.DataRow
-        idx:
+        idx: int | None  # position in input stream; None if we don't retain input order
         num_missing: int  # number of missing URLs in this row

     def __init__(
@@ -64,11 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info

-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
-        self.batch_tbl_version = None
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -76,26 +72,44 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER

-
-
-
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)

-
-
-
-
-
-
-
-
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None

-
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
-                    batch = DataRowBatch(self.
+                    batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
                     for row in rows:
                         assert row is not None
@@ -104,23 +118,16 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch

-                if self.input_finished and self.
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return

-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )

-    def
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
-    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
         else:
@@ -130,52 +137,36 @@ class CachePrefetchNode(ExecNode):
             self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row

-    def
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-        if self.batch_tbl_version is None:
-            self.batch_tbl_version = input_batch.tbl
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Path | None = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()

         # URLs from this input batch that aren't already in the file cache;
@@ -183,7 +174,7 @@ class CachePrefetchNode(ExecNode):
         # the time it takes to get the next batch together
         cache_misses: list[str] = []

-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, int | None] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -222,8 +213,10 @@ class CachePrefetchNode(ExecNode):
             _logger.debug(f'submitted {url} for idx {url_pos[url]}')
             self.in_flight_requests[f] = url

-    def __fetch_url(self, url: str) -> tuple[
-        """Fetches a remote URL into
+    def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
+        """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,34 +227,14 @@ class CachePrefetchNode(ExecNode):
         if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path =
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
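The rewritten __aiter__ drops the old per-batch submission helpers in favor of a high/low watermark loop: submit downloads to the thread pool until QUEUE_DEPTH_HIGH_WATER (50) requests are in flight, drain completions until the backlog falls below QUEUE_DEPTH_LOW_WATER (20), and emit BATCH_SIZE rows whenever a full batch is ready. A stripped-down, standalone sketch of that control flow, with a no-op fetch() standing in for the real download (nothing below is pixeltable API):

from concurrent import futures

HIGH_WATER, LOW_WATER = 50, 20

def fetch(url: str) -> str:
    return url  # stand-in for downloading `url` to a local temp file

def prefetch(urls: list[str]) -> None:
    pending: dict[futures.Future, str] = {}  # future -> url, like in_flight_requests above
    it = iter(urls)
    exhausted = False
    with futures.ThreadPoolExecutor(max_workers=15) as executor:
        while True:
            # fill the queue up to the high water mark
            while not exhausted and len(pending) < HIGH_WATER:
                try:
                    url = next(it)
                except StopIteration:
                    exhausted = True
                    break
                pending[executor.submit(fetch, url)] = url
            # drain completions below the low water mark (fully, once the input is exhausted)
            while len(pending) > LOW_WATER or (exhausted and len(pending) > 0):
                done, _ = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    pending.pop(f)
                    f.result()  # surfaces download errors
            if exhausted and not pending:
                return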
pixeltable/exec/cell_materialization_node.py
ADDED
@@ -0,0 +1,268 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
+import PIL.Image
+import sqlalchemy as sql
+
+import pixeltable.type_system as ts
+import pixeltable.utils.image as image_utils
+from pixeltable import catalog, exprs
+from pixeltable.env import Env
+from pixeltable.utils.local_store import LocalStore
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+class CellMaterializationNode(ExecNode):
+    """
+    Node to populate DataRow.cell_vals/cell_md.
+
+    For now, the scope is limited to populating DataRow.cells_vals for json and array columns.
+
+    Array values:
+    - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+    - Larger arrays are written to inlined_obj_files
+    - Bool arrays are stored as packed bits (uint8)
+    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+      (this allows us to query cell_md to get the total external storage size of an array column)
+
+    Json values:
+    - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+      location
+    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+    - cell_md contains the list of urls for the inlined objects.
+
+    TODO:
+    - execute file IO via asyncio Tasks in a thread pool?
+      (we already seem to be getting 90% of hardware IO throughput)
+    - subsume all cell materialization
+    """
+
+    output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+    # execution state
+    inlined_obj_files: list[Path]  # only [-1] is open for writing
+    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+    MIN_FILE_SIZE = 8 * 2**20  # 8MB
+    MAX_DB_BINARY_SIZE = 512  # max size of binary data stored in table column; in bytes
+
+    def __init__(self, input: ExecNode):
+        super().__init__(input.row_builder, [], [], input)
+        self.output_col_info = {
+            col: slot_idx
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None and col.col_type.supports_file_offloading()
+        }
+        self.inlined_obj_files = []
+        self.buffered_writer = None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col, slot_idx in self.output_col_info.items():
+                    if row.has_exc(slot_idx):
+                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        exc = row.get_exc(slot_idx)
+                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                        continue
+
+                    val = row[slot_idx]
+                    if val is None:
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        row.cell_md[col.id] = None
+                        continue
+
+                    if col.col_type.is_json_type():
+                        self._materialize_json_cell(row, col, val)
+                    elif col.col_type.is_array_type():
+                        assert isinstance(val, np.ndarray)
+                        self._materialize_array_cell(row, col, val)
+                    else:
+                        assert col.col_type.is_binary_type()
+                        assert isinstance(val, bytes)
+                        self._materialize_binary_cell(row, col, val)
+
+                # continue with only the currently open file
+                self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+            yield batch
+
+        self._flush_buffer(finalize=True)
+
+    def init_writer(self) -> None:
+        if self.buffered_writer is None:
+            self._reset_buffer()
+        assert self.buffered_writer is not None
+
+    def close(self) -> None:
+        if self.buffered_writer is not None:
+            # there must have been an error, otherwise _flush_full_buffer(finalize=True) would have set this to None
+            self.buffered_writer.close()
+            self.buffered_writer = None
+
+    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+        if self._json_has_inlined_objs(val):
+            row.cell_vals[col.id] = self._rewrite_json(val)
+            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+        else:
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+
+    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+            # this is a vector column (ie, used for a vector index): store the array itself
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        elif val.nbytes <= self.MAX_DB_BINARY_SIZE:
+            # this array is small enough to store in the db column (type: binary) directly
+            buffer = io.BytesIO()
+            np.save(buffer, val, allow_pickle=False)
+            row.cell_vals[col.id] = buffer.getvalue()
+            row.cell_md[col.id] = None
+        else:
+            # append this array to the buffer and store its location in the cell md
+            ar: np.ndarray
+            if np.issubdtype(val.dtype, np.bool_):
+                # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                ar = np.packbits(val)
+            else:
+                ar = val
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            np.save(self.buffered_writer, ar, allow_pickle=False)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+            )
+            if np.issubdtype(val.dtype, np.bool_):
+                cell_md.array_md.is_bool = True
+                cell_md.array_md.shape = val.shape
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _materialize_binary_cell(self, row: exprs.DataRow, col: catalog.Column, val: bytes) -> None:
+        if len(val) <= self.MAX_DB_BINARY_SIZE:
+            # this `bytes` object is small enough to store in the db column (type: binary) directly
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        else:
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            self.buffered_writer.write(val)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], binary_md=exprs.BinaryMd(start=start, end=end)
+            )
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _json_has_inlined_objs(self, element: Any) -> bool:
+        if isinstance(element, list):
+            return any(self._json_has_inlined_objs(v) for v in element)
+        if isinstance(element, dict):
+            return any(self._json_has_inlined_objs(v) for v in element.values())
+        return isinstance(element, (np.ndarray, PIL.Image.Image, bytes))
+
+    def _rewrite_json(self, element: Any) -> Any:
+        """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+        if isinstance(element, list):
+            return [self._rewrite_json(v) for v in element]
+        if isinstance(element, dict):
+            return {k: self._rewrite_json(v) for k, v in element.items()}
+        if isinstance(element, np.ndarray):
+            obj_md = self._write_inlined_array(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, PIL.Image.Image):
+            obj_md = self._write_inlined_image(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, bytes):
+            obj_md = self._write_inlined_bytes(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        return element
+
+    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+        """Write an ndarray to buffered_writer and return its metadata."""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        shape: tuple[int, ...] | None
+        is_bool_array: bool
+        if np.issubdtype(ar.dtype, np.bool_):
+            shape = ar.shape
+            ar = np.packbits(ar)
+            is_bool_array = True
+        else:
+            shape = None
+            is_bool_array = False
+        np.save(self.buffered_writer, ar, allow_pickle=False)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.ARRAY.name,
+            url_idx=url_idx,
+            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+        )
+
+    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+        """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        img.save(self.buffered_writer, format=image_utils.default_format(img))
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+    def _write_inlined_bytes(self, data: bytes) -> InlinedObjectMd:
+        """Write raw bytes to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        self.buffered_writer.write(data)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.BINARY.name, url_idx=url_idx, binary_md=exprs.BinaryMd(start, end)
+        )
+
+    def _reset_buffer(self) -> None:
+        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+        )
+        self.inlined_obj_files.append(local_path)
+        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+        assert isinstance(fh, io.BufferedWriter)
+        self.buffered_writer = fh
+
+    def _flush_buffer(self, finalize: bool = False) -> None:
+        """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+        if self.buffered_writer is None:
+            return
+        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+            return
+        self.buffered_writer.flush()
+        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+        self.buffered_writer.close()
+        if finalize:
+            self.buffered_writer = None
+        else:
+            self._reset_buffer()
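The offload format described in the class docstring is easiest to follow end to end: a large array is appended to a shared file, the cell records only (start, end) offsets, and bool arrays are bit-packed, with their original shape kept so they can be restored on read. A standalone numpy-only sketch of that round trip (the file name and the md dict layout are illustrative, not the actual CellMd schema):

import io
import numpy as np

def write_cell(buf: io.BufferedWriter, val: np.ndarray) -> dict:
    is_bool = np.issubdtype(val.dtype, np.bool_)
    ar = np.packbits(val) if is_bool else val  # 1 bit instead of 1 byte per bool element
    start = buf.tell()
    np.save(buf, ar, allow_pickle=False)
    return {'start': start, 'end': buf.tell(), 'is_bool': is_bool, 'shape': val.shape}

def read_cell(path: str, md: dict) -> np.ndarray:
    with open(path, 'rb') as f:
        f.seek(md['start'])  # jump to the cell's slice of the shared file
        ar = np.load(f, allow_pickle=False)
    if md['is_bool']:
        n = int(np.prod(md['shape']))
        ar = np.unpackbits(ar)[:n].astype(bool).reshape(md['shape'])
    return ar

# round trip
with open('/tmp/inlined_objs.bin', 'wb') as f:
    md = write_cell(f, np.ones((3, 5), dtype=bool))
assert read_cell('/tmp/inlined_objs.bin', md).all()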