pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/utils/http.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import time
|
|
3
|
+
from http import HTTPStatus
|
|
4
|
+
from random import random
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
_RETRIABLE_ERROR_INDICATORS = (
|
|
8
|
+
'rate limit',
|
|
9
|
+
'too many requests',
|
|
10
|
+
'429',
|
|
11
|
+
'quota exceeded',
|
|
12
|
+
'throttled',
|
|
13
|
+
'rate exceeded',
|
|
14
|
+
'connection error',
|
|
15
|
+
'timed out',
|
|
16
|
+
)
|
|
17
|
+
_RETRY_AFTER_PATTERNS = (
|
|
18
|
+
r'retry after (\d+(?:\.\d+)?)\s*seconds?',
|
|
19
|
+
r'try again in (\d+(?:\.\d+)?)\s*seconds?',
|
|
20
|
+
r'wait (\d+(?:\.\d+)?)\s*seconds?',
|
|
21
|
+
r'retry-after:\s*(\d+(?:\.\d+)?)',
|
|
22
|
+
)
|
|
23
|
+
_RETRIABLE_HTTP_STATUSES: dict[str, int] = {
|
|
24
|
+
'TOO_MANY_REQUESTS': HTTPStatus.TOO_MANY_REQUESTS.value,
|
|
25
|
+
'SERVICE_UNAVAILABLE': HTTPStatus.SERVICE_UNAVAILABLE.value,
|
|
26
|
+
'REQUEST_TIMEOUT': HTTPStatus.REQUEST_TIMEOUT.value,
|
|
27
|
+
'GATEWAY_TIMEOUT': HTTPStatus.GATEWAY_TIMEOUT.value,
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def is_retriable_error(exc: Exception) -> tuple[bool, float | None]:
|
|
32
|
+
"""Attempts to guess if the exception indicates a retriable eror. If that is the case, returns True
|
|
33
|
+
and the retry delay in seconds."""
|
|
34
|
+
|
|
35
|
+
# Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
|
|
36
|
+
# We look for attributes that contain status codes, instead of checking the type of the exception,
|
|
37
|
+
# in order to handle a wider variety of exception classes.
|
|
38
|
+
err_md = _extract_error_metadata(exc)
|
|
39
|
+
if (err_md is None or not err_md[0]) and hasattr(exc, 'response'):
|
|
40
|
+
err_md = _extract_error_metadata(exc.response)
|
|
41
|
+
|
|
42
|
+
if err_md is not None and err_md[0]:
|
|
43
|
+
retry_after = err_md[1]
|
|
44
|
+
return err_md[0], retry_after if retry_after is not None and retry_after >= 0 else None
|
|
45
|
+
|
|
46
|
+
# Check common rate limit keywords in exception message
|
|
47
|
+
error_msg = str(exc).lower()
|
|
48
|
+
if any(indicator in error_msg for indicator in _RETRIABLE_ERROR_INDICATORS):
|
|
49
|
+
retry_delay = _extract_retry_delay_from_message(error_msg)
|
|
50
|
+
return True, retry_delay if retry_delay is not None and retry_delay >= 0 else None
|
|
51
|
+
|
|
52
|
+
return False, None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _extract_error_metadata(obj: Any) -> tuple[bool, float | None] | None:
|
|
56
|
+
is_retriable: bool | None = None
|
|
57
|
+
retry_delay: float | None = None
|
|
58
|
+
for attr in ['status', 'code', 'status_code']:
|
|
59
|
+
if hasattr(obj, attr):
|
|
60
|
+
is_retriable = getattr(obj, attr) in _RETRIABLE_HTTP_STATUSES.values()
|
|
61
|
+
is_retriable |= str(getattr(obj, attr)).upper() in _RETRIABLE_HTTP_STATUSES
|
|
62
|
+
|
|
63
|
+
if hasattr(obj, 'headers'):
|
|
64
|
+
retry_delay = _extract_retry_delay_from_headers(obj.headers)
|
|
65
|
+
if retry_delay is not None:
|
|
66
|
+
is_retriable = True
|
|
67
|
+
|
|
68
|
+
return (is_retriable, retry_delay) if is_retriable is not None else None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _extract_retry_delay_from_headers(headers: Any | None) -> float | None:
|
|
72
|
+
"""Extract retry delay from HTTP headers."""
|
|
73
|
+
if headers is None:
|
|
74
|
+
return None
|
|
75
|
+
|
|
76
|
+
# convert headers to dict-like object for consistent access
|
|
77
|
+
header_dict: dict
|
|
78
|
+
if hasattr(headers, 'get'):
|
|
79
|
+
header_dict = headers
|
|
80
|
+
else:
|
|
81
|
+
# headers are a list of tuples or other format
|
|
82
|
+
try:
|
|
83
|
+
header_dict = dict(headers)
|
|
84
|
+
except (TypeError, ValueError):
|
|
85
|
+
return None
|
|
86
|
+
# normalize dict keys: lowercase and remove dashes
|
|
87
|
+
header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
|
|
88
|
+
|
|
89
|
+
# check Retry-After header
|
|
90
|
+
retry_after = header_dict.get('retryafter')
|
|
91
|
+
if retry_after is not None:
|
|
92
|
+
try:
|
|
93
|
+
return float(retry_after)
|
|
94
|
+
except (ValueError, TypeError):
|
|
95
|
+
pass
|
|
96
|
+
|
|
97
|
+
# check X-RateLimit-Reset (Unix timestamp)
|
|
98
|
+
reset_time = header_dict.get('xratelimitreset')
|
|
99
|
+
if reset_time is not None:
|
|
100
|
+
try:
|
|
101
|
+
reset_timestamp = float(reset_time)
|
|
102
|
+
delay = max(0, reset_timestamp - time.time())
|
|
103
|
+
return delay
|
|
104
|
+
except (ValueError, TypeError):
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
# check X-RateLimit-Reset-After (seconds from now)
|
|
108
|
+
reset_after = header_dict.get('xratelimitresetafter')
|
|
109
|
+
if reset_after is not None:
|
|
110
|
+
try:
|
|
111
|
+
return float(reset_after)
|
|
112
|
+
except (ValueError, TypeError):
|
|
113
|
+
pass
|
|
114
|
+
|
|
115
|
+
return None
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _extract_retry_delay_from_message(msg: str) -> float | None:
|
|
119
|
+
msg_lower = msg.lower()
|
|
120
|
+
for pattern in _RETRY_AFTER_PATTERNS:
|
|
121
|
+
match = re.search(pattern, msg_lower)
|
|
122
|
+
if match is not None:
|
|
123
|
+
try:
|
|
124
|
+
return float(match.group(1))
|
|
125
|
+
except (ValueError, TypeError):
|
|
126
|
+
continue
|
|
127
|
+
return None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def exponential_backoff(attempt: int, base: float = 2.0, max_delay: float = 16.0) -> float:
|
|
131
|
+
"""Generates the retry delay using exponential backoff strategy with jitter. Attempt count starts from 0."""
|
|
132
|
+
basic_delay = min(max_delay, base**attempt) / 2
|
|
133
|
+
return basic_delay + random() * basic_delay
|
pixeltable/utils/http_server.py
CHANGED
|
@@ -2,7 +2,7 @@ import http
|
|
|
2
2
|
import http.server
|
|
3
3
|
import logging
|
|
4
4
|
import pathlib
|
|
5
|
-
import urllib
|
|
5
|
+
import urllib.request
|
|
6
6
|
from typing import Any
|
|
7
7
|
|
|
8
8
|
_logger = logging.getLogger('pixeltable.http.server')
|
|
@@ -36,8 +36,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
36
36
|
path = path.split('?', 1)[0]
|
|
37
37
|
path = path.split('#', 1)[0]
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
return str(path)
|
|
39
|
+
return str(pathlib.Path(urllib.request.url2pathname(path)))
|
|
41
40
|
|
|
42
41
|
def log_message(self, format: str, *args: Any) -> None:
|
|
43
42
|
"""override logging to stderr in http.server.BaseHTTPRequestHandler"""
|
pixeltable/utils/iceberg.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from typing import Union
|
|
3
2
|
|
|
4
3
|
from pyiceberg.catalog.sql import SqlCatalog
|
|
5
4
|
|
|
6
5
|
|
|
7
|
-
def sqlite_catalog(warehouse_path:
|
|
6
|
+
def sqlite_catalog(warehouse_path: str | Path, name: str = 'pixeltable') -> SqlCatalog:
|
|
8
7
|
"""
|
|
9
8
|
Instantiate a sqlite Iceberg catalog at the specified path. If no catalog exists, one will be created.
|
|
10
9
|
"""
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
|
|
4
|
+
import PIL.Image
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def default_format(img: PIL.Image.Image) -> str:
|
|
8
|
+
# Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
|
|
9
|
+
# In that case, use WebP instead.
|
|
10
|
+
return 'webp' if img.has_transparency_data else 'jpeg'
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def to_base64(image: PIL.Image.Image, format: str | None = None) -> str:
|
|
14
|
+
buffer = BytesIO()
|
|
15
|
+
image.save(buffer, format=format or image.format)
|
|
16
|
+
image_bytes = buffer.getvalue()
|
|
17
|
+
return base64.b64encode(image_bytes).decode('utf-8')
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import shutil
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
import pixeltable as pxt
|
|
9
|
+
import pixeltable.exceptions as excs
|
|
10
|
+
from pixeltable.catalog import Catalog
|
|
11
|
+
from pixeltable.env import Env
|
|
12
|
+
|
|
13
|
+
_logger = logging.getLogger('pixeltable')
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def export_lancedb(
|
|
17
|
+
table_or_query: pxt.Table | pxt.Query,
|
|
18
|
+
db_uri: Path,
|
|
19
|
+
table_name: str,
|
|
20
|
+
batch_size_bytes: int = 128 * 2**20,
|
|
21
|
+
if_exists: Literal['error', 'overwrite', 'append'] = 'error',
|
|
22
|
+
) -> None:
|
|
23
|
+
"""
|
|
24
|
+
Exports a Query's data to a LanceDB table.
|
|
25
|
+
|
|
26
|
+
This utilizes LanceDB's streaming interface for efficient table creation, via a sequence of in-memory pyarrow
|
|
27
|
+
`RecordBatches`, the size of which can be controlled with the `batch_size_bytes` parameter.
|
|
28
|
+
|
|
29
|
+
__Requirements:__
|
|
30
|
+
|
|
31
|
+
- `pip install lancedb`
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
table_or_query : Table or Query to export.
|
|
35
|
+
db_uri: Local Path to the LanceDB database.
|
|
36
|
+
table_name : Name of the table in the LanceDB database.
|
|
37
|
+
batch_size_bytes : Maximum size in bytes for each batch.
|
|
38
|
+
if_exists: Determines the behavior if the table already exists. Must be one of the following:
|
|
39
|
+
|
|
40
|
+
- `'error'`: raise an error
|
|
41
|
+
- `'overwrite'`: overwrite the existing table
|
|
42
|
+
- `'append'`: append to the existing table
|
|
43
|
+
"""
|
|
44
|
+
Env.get().require_package('lancedb')
|
|
45
|
+
|
|
46
|
+
import lancedb # type: ignore[import-untyped]
|
|
47
|
+
|
|
48
|
+
from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
|
|
49
|
+
|
|
50
|
+
if if_exists not in ('error', 'overwrite', 'append'):
|
|
51
|
+
raise excs.Error("export_lancedb(): 'if_exists' must be one of: ['error', 'overwrite', 'append']")
|
|
52
|
+
|
|
53
|
+
query: pxt.Query
|
|
54
|
+
if isinstance(table_or_query, pxt.catalog.Table):
|
|
55
|
+
query = table_or_query.select()
|
|
56
|
+
else:
|
|
57
|
+
query = table_or_query
|
|
58
|
+
|
|
59
|
+
db_exists = False
|
|
60
|
+
if db_uri.exists():
|
|
61
|
+
if not db_uri.is_dir():
|
|
62
|
+
raise excs.Error(f"export_lancedb(): '{db_uri!s}' exists and is not a directory")
|
|
63
|
+
db_exists = True
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
db = lancedb.connect(str(db_uri))
|
|
67
|
+
lance_tbl: lancedb.LanceTable | None = None
|
|
68
|
+
try:
|
|
69
|
+
lance_tbl = db.open_table(table_name)
|
|
70
|
+
if if_exists == 'error':
|
|
71
|
+
raise excs.Error(f'export_lancedb(): table {table_name!r} already exists in {db_uri!r}')
|
|
72
|
+
except ValueError:
|
|
73
|
+
# table doesn't exist
|
|
74
|
+
pass
|
|
75
|
+
|
|
76
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
77
|
+
if lance_tbl is None or if_exists == 'overwrite':
|
|
78
|
+
mode = 'overwrite' if lance_tbl is not None else 'create'
|
|
79
|
+
arrow_schema = to_arrow_schema(query.schema)
|
|
80
|
+
_ = db.create_table(
|
|
81
|
+
table_name, to_record_batches(query, batch_size_bytes), schema=arrow_schema, mode=mode
|
|
82
|
+
)
|
|
83
|
+
else:
|
|
84
|
+
lance_tbl.add(to_record_batches(query, batch_size_bytes))
|
|
85
|
+
|
|
86
|
+
except Exception as e:
|
|
87
|
+
# cleanup
|
|
88
|
+
if not db_exists:
|
|
89
|
+
shutil.rmtree(db_uri)
|
|
90
|
+
raise e
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import glob
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import urllib.parse
|
|
9
|
+
import urllib.request
|
|
10
|
+
import uuid
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
from uuid import UUID
|
|
15
|
+
|
|
16
|
+
import PIL.Image
|
|
17
|
+
|
|
18
|
+
from pixeltable import env, exceptions as excs
|
|
19
|
+
from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from pixeltable.catalog import Column
|
|
23
|
+
|
|
24
|
+
_logger = logging.getLogger('pixeltable')
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class LocalStore(ObjectStoreBase):
|
|
28
|
+
"""
|
|
29
|
+
Utilities to manage files stored in a local filesystem directory.
|
|
30
|
+
|
|
31
|
+
Media file names are a composite of: table id, column id, tbl_version, new uuid:
|
|
32
|
+
the table id/column id/tbl_version are redundant but useful for identifying all files for a table
|
|
33
|
+
or all files created for a particular version of a table
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
__base_dir: Path
|
|
37
|
+
|
|
38
|
+
soa: StorageObjectAddress | None
|
|
39
|
+
|
|
40
|
+
def __init__(self, location: Path | StorageObjectAddress):
|
|
41
|
+
if isinstance(location, Path):
|
|
42
|
+
self.__base_dir = location
|
|
43
|
+
self.soa = None
|
|
44
|
+
else:
|
|
45
|
+
assert isinstance(location, StorageObjectAddress)
|
|
46
|
+
self.__base_dir = location.to_path
|
|
47
|
+
self.soa = location
|
|
48
|
+
|
|
49
|
+
def validate(self, error_col_name: str) -> str:
|
|
50
|
+
"""Convert a Column destination parameter to a URI, else raise errors."""
|
|
51
|
+
dest_path = self.__base_dir
|
|
52
|
+
|
|
53
|
+
# Check if path exists and validate it's a directory
|
|
54
|
+
if not dest_path.exists():
|
|
55
|
+
raise excs.Error(f'{error_col_name}`destination` does not exist')
|
|
56
|
+
if not dest_path.is_dir():
|
|
57
|
+
raise excs.Error(f'{error_col_name}`destination` must be a directory, not a file')
|
|
58
|
+
|
|
59
|
+
# Check if path is absolute
|
|
60
|
+
if dest_path.is_absolute():
|
|
61
|
+
# Convert to file URI
|
|
62
|
+
return dest_path.as_uri()
|
|
63
|
+
|
|
64
|
+
# For relative paths, convert to absolute first
|
|
65
|
+
try:
|
|
66
|
+
absolute_path = dest_path.resolve()
|
|
67
|
+
return absolute_path.as_uri()
|
|
68
|
+
except (OSError, ValueError) as e:
|
|
69
|
+
raise excs.Error(f'{error_col_name}`destination` must be a valid path. Error: {e}') from None
|
|
70
|
+
|
|
71
|
+
@staticmethod
|
|
72
|
+
def file_url_to_path(url: str) -> Path | None:
|
|
73
|
+
"""Convert a file:// URI to a Path object with support for Windows UNC paths."""
|
|
74
|
+
assert isinstance(url, str), type(url)
|
|
75
|
+
parsed = urllib.parse.urlparse(url)
|
|
76
|
+
|
|
77
|
+
# Verify it's a file scheme
|
|
78
|
+
# We should never be passed a local file path here. The "len > 1" ensures that Windows
|
|
79
|
+
# file paths aren't mistaken for URLs with a single-character scheme.
|
|
80
|
+
assert len(parsed.scheme) > 1, url
|
|
81
|
+
if parsed.scheme.lower() != 'file':
|
|
82
|
+
return None
|
|
83
|
+
|
|
84
|
+
pth = parsed.path
|
|
85
|
+
if parsed.netloc:
|
|
86
|
+
# This is a UNC path, ie, file://host/share/path/to/file
|
|
87
|
+
pth = f'//{parsed.netloc}{pth}'
|
|
88
|
+
|
|
89
|
+
path_str = urllib.parse.unquote(urllib.request.url2pathname(pth))
|
|
90
|
+
return Path(path_str)
|
|
91
|
+
|
|
92
|
+
@classmethod
|
|
93
|
+
def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: str | None) -> Path:
|
|
94
|
+
"""Save binary data to a file in a LocalStore. format is ignored for binary data."""
|
|
95
|
+
assert isinstance(file_data, bytes)
|
|
96
|
+
with open(dest_path, 'wb') as f:
|
|
97
|
+
f.write(file_data)
|
|
98
|
+
f.flush() # Ensures Python buffers are written to OS
|
|
99
|
+
os.fsync(f.fileno()) # Forces OS to write to physical storage
|
|
100
|
+
return dest_path
|
|
101
|
+
|
|
102
|
+
@classmethod
|
|
103
|
+
def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: str | None) -> Path:
|
|
104
|
+
"""Save a PIL Image to a file in a LocalStore with the specified format."""
|
|
105
|
+
if dest_path.suffix != f'.{format}':
|
|
106
|
+
dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
|
|
107
|
+
|
|
108
|
+
with open(dest_path, 'wb') as f:
|
|
109
|
+
image.save(f, format=format)
|
|
110
|
+
f.flush() # Ensures Python buffers are written to OS
|
|
111
|
+
os.fsync(f.fileno()) # Forces OS to write to physical storage
|
|
112
|
+
return dest_path
|
|
113
|
+
|
|
114
|
+
def _prepare_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: str | None = None) -> Path:
|
|
115
|
+
"""
|
|
116
|
+
Construct a new, unique Path name in the __base_dir for a persisted file.
|
|
117
|
+
Create the parent directory for the new Path if it does not already exist.
|
|
118
|
+
"""
|
|
119
|
+
prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
|
|
120
|
+
parent = self.__base_dir / Path(prefix)
|
|
121
|
+
parent.mkdir(parents=True, exist_ok=True)
|
|
122
|
+
return parent / filename
|
|
123
|
+
|
|
124
|
+
def _prepare_path(self, col: Column, ext: str | None = None) -> Path:
|
|
125
|
+
"""
|
|
126
|
+
Construct a new, unique Path name in the __base_dir for a persisted file.
|
|
127
|
+
Create the parent directory for the new Path if it does not already exist.
|
|
128
|
+
"""
|
|
129
|
+
assert col.get_tbl() is not None, 'Column must be associated with a table'
|
|
130
|
+
return self._prepare_path_raw(col.get_tbl().id, col.id, col.get_tbl().version, ext)
|
|
131
|
+
|
|
132
|
+
def contains_path(self, file_path: Path) -> bool:
|
|
133
|
+
"""Return True if the given path refers to a file managed by this LocalStore, else False."""
|
|
134
|
+
return str(file_path).startswith(str(self.__base_dir))
|
|
135
|
+
|
|
136
|
+
def resolve_url(self, file_url: str | None) -> Path | None:
|
|
137
|
+
"""Return path if the given url refers to a file managed by this LocalStore, else None.
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
file_url: URL to check
|
|
141
|
+
|
|
142
|
+
Returns:
|
|
143
|
+
If the url is a managed file, return a Path() to the file, None, otherwise
|
|
144
|
+
"""
|
|
145
|
+
if file_url is None:
|
|
146
|
+
return None
|
|
147
|
+
file_path = self.file_url_to_path(file_url)
|
|
148
|
+
if file_path is None:
|
|
149
|
+
return None
|
|
150
|
+
if not str(file_path).startswith(str(self.__base_dir)):
|
|
151
|
+
# not a tmp file
|
|
152
|
+
return None
|
|
153
|
+
return file_path
|
|
154
|
+
|
|
155
|
+
def move_local_file(self, col: Column, src_path: Path) -> str:
|
|
156
|
+
"""Move a local file to this store, and return its new URL"""
|
|
157
|
+
dest_path = self._prepare_path(col, ext=src_path.suffix)
|
|
158
|
+
src_path.rename(dest_path)
|
|
159
|
+
new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
|
|
160
|
+
_logger.debug(f'Media Storage: moved {src_path} to {new_file_url}')
|
|
161
|
+
return new_file_url
|
|
162
|
+
|
|
163
|
+
def copy_local_file(self, col: Column, src_path: Path) -> str:
|
|
164
|
+
"""Copy a local file to a this store, and return its new URL"""
|
|
165
|
+
dest_path = self._prepare_path(col, ext=src_path.suffix)
|
|
166
|
+
shutil.copy2(src_path, dest_path)
|
|
167
|
+
new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
|
|
168
|
+
_logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
|
|
169
|
+
return new_file_url
|
|
170
|
+
|
|
171
|
+
def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: str | None) -> tuple[Path, str]:
|
|
172
|
+
"""Save a data object to a file in a LocalStore
|
|
173
|
+
Returns:
|
|
174
|
+
dest_path: Path to the saved file
|
|
175
|
+
url: URL of the saved file
|
|
176
|
+
"""
|
|
177
|
+
assert col.col_type.is_media_type(), f'LocalStore: request to store non media_type Column {col.name}'
|
|
178
|
+
dest_path = self._prepare_path(col)
|
|
179
|
+
if isinstance(data, bytes):
|
|
180
|
+
dest_path = self._save_binary_media_file(data, dest_path, format)
|
|
181
|
+
elif isinstance(data, PIL.Image.Image):
|
|
182
|
+
dest_path = self._save_pil_image_file(data, dest_path, format)
|
|
183
|
+
else:
|
|
184
|
+
raise ValueError(f'Unsupported object type: {type(data)}')
|
|
185
|
+
new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
|
|
186
|
+
return dest_path, new_file_url
|
|
187
|
+
|
|
188
|
+
def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
|
|
189
|
+
"""Create a presigned URL for local storage (not supported)."""
|
|
190
|
+
raise excs.Error('Cannot generate servable URL for local file storage.')
|
|
191
|
+
|
|
192
|
+
def delete(self, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
|
|
193
|
+
"""Delete all files belonging to tbl_id. If tbl_version is not None, delete
|
|
194
|
+
only those files belonging to the specified tbl_version.
|
|
195
|
+
|
|
196
|
+
Return:
|
|
197
|
+
Number of files deleted or None
|
|
198
|
+
"""
|
|
199
|
+
assert tbl_id is not None
|
|
200
|
+
table_prefix = ObjectPath.table_prefix(tbl_id)
|
|
201
|
+
if tbl_version is None:
|
|
202
|
+
# Remove the entire folder for this table id.
|
|
203
|
+
path = self.__base_dir / table_prefix
|
|
204
|
+
if path.exists():
|
|
205
|
+
shutil.rmtree(path)
|
|
206
|
+
return None
|
|
207
|
+
else:
|
|
208
|
+
# Remove only the elements for the specified tbl_version.
|
|
209
|
+
paths = glob.glob(
|
|
210
|
+
str(self.__base_dir / table_prefix) + f'/**/{table_prefix}_*_{tbl_version}_*', recursive=True
|
|
211
|
+
)
|
|
212
|
+
for p in paths:
|
|
213
|
+
os.remove(p)
|
|
214
|
+
return len(paths)
|
|
215
|
+
|
|
216
|
+
def count(self, tbl_id: UUID | None, tbl_version: int | None = None) -> int:
|
|
217
|
+
"""
|
|
218
|
+
Return number of files for given tbl_id.
|
|
219
|
+
"""
|
|
220
|
+
if tbl_id is None:
|
|
221
|
+
paths = glob.glob(str(self.__base_dir / '*'), recursive=True)
|
|
222
|
+
elif tbl_version is None:
|
|
223
|
+
table_prefix = ObjectPath.table_prefix(tbl_id)
|
|
224
|
+
paths = glob.glob(str(self.__base_dir / table_prefix) + f'/**/{table_prefix}_*', recursive=True)
|
|
225
|
+
else:
|
|
226
|
+
table_prefix = ObjectPath.table_prefix(tbl_id)
|
|
227
|
+
paths = glob.glob(
|
|
228
|
+
str(self.__base_dir / table_prefix) + f'/**/{table_prefix}_*_{tbl_version}_*', recursive=True
|
|
229
|
+
)
|
|
230
|
+
# Filter out directories, only count files
|
|
231
|
+
return len([p for p in paths if not os.path.isdir(p)])
|
|
232
|
+
|
|
233
|
+
def stats(self) -> list[tuple[UUID, int, int, int]]:
    """Aggregate per-(table, column) file counts and byte totals.

    Returns:
        Tuples of (tbl_id, col_id, num_files, total_bytes), sorted by
        total_bytes descending.
    """
    # key: (tbl_id, col_id), value: [num_files, total_bytes]
    totals: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
    for entry in glob.glob(str(self.__base_dir) + '/**', recursive=True):
        if os.path.isdir(entry):
            continue
        matched = re.match(ObjectPath.PATTERN, Path(entry).name)
        assert matched is not None
        key = (UUID(hex=matched[1]), int(matched[2]))
        file_info = os.stat(entry)
        totals[key][0] += 1
        totals[key][1] += file_info.st_size
    rows = [(tbl, col, n, sz) for (tbl, col), (n, sz) in totals.items()]
    rows.sort(key=lambda row: row[3], reverse=True)
    return rows
+
def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
    """Return up to n_max objects found in this store.

    Each returned object includes the full set of prefixes.

    Args:
        return_uri: if True, return file:// URIs (via Path.as_uri()); otherwise
            return plain filesystem paths. (The old docstring said "GCS URI",
            but this is the local store.)
        n_max: maximum number of entries to return.
    """
    result: list[str] = []
    for root, _, files in os.walk(self.__base_dir):
        for file in files:
            # Bug fix: n_max was declared but never enforced; stop once reached.
            if len(result) >= n_max:
                return result
            result.append(Path(root, file).as_uri() if return_uri else os.path.join(root, file))
    return result
+
def clear(self) -> None:
    """Remove every file in the store, leaving an empty base directory."""
    base = self.__base_dir
    if base.exists():
        shutil.rmtree(base)
    base.mkdir()
+
class TempStore:
    """
    A temporary store for files of data that are not yet persisted to their destination(s).
    A destination is typically either a LocalStore (local persisted files) or a cloud object store.

    The TempStore class has no internal state. It provides functionality to manage temporary files
    in the env.Env.get().tmp_dir directory.
    It reuses some of the LocalStore functionality to create unique file names and save objects.
    """

    @classmethod
    def _tmp_dir(cls) -> Path:
        """Returns the path to the temporary directory where files are stored."""
        return env.Env.get().tmp_dir

    @classmethod
    def count(cls, tbl_id: UUID | None = None, tbl_version: int | None = None) -> int:
        """Count temporary files via a LocalStore rooted at the temp dir."""
        store = LocalStore(cls._tmp_dir())
        return store.count(tbl_id, tbl_version)

    @classmethod
    def contains_path(cls, file_path: Path) -> bool:
        """True if file_path lives inside the temp dir."""
        store = LocalStore(cls._tmp_dir())
        return store.contains_path(file_path)

    @classmethod
    def resolve_url(cls, file_url: str | None) -> Path | None:
        """Resolve a file URL to a Path within the temp dir, if applicable."""
        store = LocalStore(cls._tmp_dir())
        return store.resolve_url(file_url)

    @classmethod
    def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: str | None) -> tuple[Path, str]:
        """Persist a media object into the temp dir; returns (path, url)."""
        store = LocalStore(cls._tmp_dir())
        return store.save_media_object(data, col, format)

    @classmethod
    def delete_media_file(cls, file_path: Path) -> None:
        """Delete an object from the temporary store."""
        assert file_path is not None, 'Object path must be provided'
        assert file_path.exists(), f'Object path does not exist: {file_path}'
        assert cls.contains_path(file_path), f'Object path must be in the TempStore: {file_path}'
        file_path.unlink()
        _logger.debug(f'Media Storage: deleted {file_path}')

    @classmethod
    def create_path(cls, tbl_id: UUID | None = None, extension: str | None = None) -> Path:
        """Return a new, unique Path located in the temporary store.

        If tbl_id is provided, the path name will be similar to a LocalStore path based on the tbl_id.
        If tbl_id is None, a random UUID will be used to create the path."""
        ext = extension if extension is not None else ''
        if tbl_id is None:
            return cls._tmp_dir() / f'{uuid.uuid4()}{ext}'
        return LocalStore(cls._tmp_dir())._prepare_path_raw(tbl_id, 0, 0, ext)

    @classmethod
    def clear(cls) -> None:
        """Clear all files from the temporary store."""
        LocalStore(cls._tmp_dir()).clear()