pixeltable 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +63 -36
- pixeltable/catalog/column.py +6 -4
- pixeltable/catalog/dir.py +5 -5
- pixeltable/catalog/globals.py +12 -14
- pixeltable/catalog/insertable_table.py +4 -7
- pixeltable/catalog/path.py +2 -2
- pixeltable/catalog/table.py +64 -56
- pixeltable/catalog/table_version.py +42 -40
- pixeltable/catalog/table_version_handle.py +3 -0
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/view.py +8 -7
- pixeltable/dataframe.py +5 -3
- pixeltable/env.py +108 -42
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/aggregation_node.py +6 -8
- pixeltable/exec/cache_prefetch_node.py +4 -7
- pixeltable/exec/component_iteration_node.py +1 -3
- pixeltable/exec/data_row_batch.py +1 -2
- pixeltable/exec/exec_context.py +1 -1
- pixeltable/exec/exec_node.py +1 -2
- pixeltable/exec/expr_eval/__init__.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +137 -20
- pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
- pixeltable/exec/expr_eval/globals.py +68 -7
- pixeltable/exec/expr_eval/schedulers.py +25 -23
- pixeltable/exec/in_memory_data_node.py +8 -6
- pixeltable/exec/row_update_node.py +3 -4
- pixeltable/exec/sql_node.py +16 -17
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +3 -3
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/data_row.py +17 -1
- pixeltable/exprs/expr.py +12 -12
- pixeltable/exprs/function_call.py +34 -2
- pixeltable/exprs/json_mapper.py +95 -48
- pixeltable/exprs/json_path.py +3 -4
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +33 -6
- pixeltable/exprs/similarity_expr.py +1 -1
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/ext/__init__.py +1 -1
- pixeltable/ext/functions/__init__.py +1 -1
- pixeltable/ext/functions/whisperx.py +1 -1
- pixeltable/ext/functions/yolox.py +1 -1
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -5
- pixeltable/func/expr_template_function.py +22 -2
- pixeltable/func/function.py +4 -5
- pixeltable/func/function_registry.py +1 -1
- pixeltable/func/signature.py +1 -1
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/anthropic.py +2 -2
- pixeltable/functions/audio.py +1 -1
- pixeltable/functions/deepseek.py +1 -1
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +6 -6
- pixeltable/functions/huggingface.py +1 -1
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/replicate.py +1 -1
- pixeltable/functions/string.py +1 -1
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +2 -2
- pixeltable/index/embedding_index.py +12 -1
- pixeltable/io/__init__.py +5 -3
- pixeltable/io/fiftyone.py +6 -7
- pixeltable/io/label_studio.py +21 -20
- pixeltable/io/pandas.py +6 -5
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/metadata/__init__.py +5 -3
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_29.py +1 -1
- pixeltable/store.py +2 -2
- pixeltable/type_system.py +19 -7
- pixeltable/utils/console_output.py +3 -2
- pixeltable/utils/coroutine.py +3 -3
- pixeltable/utils/dbms.py +66 -0
- pixeltable/utils/documents.py +61 -67
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +3 -2
- pixeltable/utils/pytorch.py +1 -1
- pixeltable/utils/sql.py +1 -1
- pixeltable-0.3.11.dist-info/METADATA +436 -0
- pixeltable-0.3.11.dist-info/RECORD +179 -0
- pixeltable/catalog/path_dict.py +0 -169
- pixeltable-0.3.10.dist-info/METADATA +0 -382
- pixeltable-0.3.10.dist-info/RECORD +0 -179
- {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
pixeltable/utils/coroutine.py
CHANGED
|
@@ -7,8 +7,8 @@ T = TypeVar('T')
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
# TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
|
|
10
|
-
# scheduler logic (e.g.,
|
|
11
|
-
# removed.
|
|
10
|
+
# scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
|
|
11
|
+
# general, it can be removed.
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: float = 30) -> T:
|
|
@@ -16,7 +16,7 @@ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: floa
|
|
|
16
16
|
Runs the given coroutine synchronously, even if called in the context of a running event loop.
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
|
-
def run_in_new_loop():
|
|
19
|
+
def run_in_new_loop() -> T:
|
|
20
20
|
new_loop = asyncio.new_event_loop()
|
|
21
21
|
asyncio.set_event_loop(new_loop)
|
|
22
22
|
try:
|
pixeltable/utils/dbms.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import URL
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Dbms(abc.ABC):
|
|
7
|
+
"""
|
|
8
|
+
Provides abstractions for utilities to interact with a database system.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
name: str
|
|
12
|
+
transaction_isolation_level: str
|
|
13
|
+
version_index_type: str
|
|
14
|
+
db_url: URL
|
|
15
|
+
|
|
16
|
+
def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
|
|
17
|
+
self.name = name
|
|
18
|
+
self.transaction_isolation_level = transaction_isolation_level
|
|
19
|
+
self.version_index_type = version_index_type
|
|
20
|
+
self.db_url = db_url
|
|
21
|
+
|
|
22
|
+
@abc.abstractmethod
|
|
23
|
+
def drop_db_stmt(self, database: str) -> str: ...
|
|
24
|
+
|
|
25
|
+
@abc.abstractmethod
|
|
26
|
+
def create_db_stmt(self, database: str) -> str: ...
|
|
27
|
+
|
|
28
|
+
@abc.abstractmethod
|
|
29
|
+
def default_system_db_url(self) -> str: ...
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class PostgresqlDbms(Dbms):
|
|
33
|
+
"""
|
|
34
|
+
Implements utilities to interact with a PostgreSQL database.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
def __init__(self, db_url: URL):
|
|
38
|
+
super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
|
|
39
|
+
|
|
40
|
+
def drop_db_stmt(self, database: str) -> str:
|
|
41
|
+
return f'DROP DATABASE {database}'
|
|
42
|
+
|
|
43
|
+
def create_db_stmt(self, database: str) -> str:
|
|
44
|
+
return f"CREATE DATABASE {database} ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
|
|
45
|
+
|
|
46
|
+
def default_system_db_url(self) -> str:
|
|
47
|
+
a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
|
|
48
|
+
return a
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class CockroachDbms(Dbms):
|
|
52
|
+
"""
|
|
53
|
+
Implements utilities to interact with a CockroachDB database.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
def __init__(self, db_url: URL):
|
|
57
|
+
super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
|
|
58
|
+
|
|
59
|
+
def drop_db_stmt(self, database: str) -> str:
|
|
60
|
+
return f'DROP DATABASE {database} CASCADE'
|
|
61
|
+
|
|
62
|
+
def create_db_stmt(self, database: str) -> str:
|
|
63
|
+
return f"CREATE DATABASE {database} TEMPLATE template0 ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C'"
|
|
64
|
+
|
|
65
|
+
def default_system_db_url(self) -> str:
|
|
66
|
+
return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
|
pixeltable/utils/documents.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
+
import os
|
|
2
3
|
from typing import Optional
|
|
3
4
|
|
|
4
5
|
import bs4
|
|
5
6
|
import fitz # type: ignore[import-untyped]
|
|
6
7
|
import puremagic
|
|
7
8
|
|
|
8
|
-
import
|
|
9
|
+
from pixeltable import exceptions as excs, type_system as ts
|
|
9
10
|
from pixeltable.env import Env
|
|
10
11
|
|
|
11
12
|
|
|
@@ -18,85 +19,78 @@ class DocumentHandle:
|
|
|
18
19
|
txt_doc: Optional[str] = None
|
|
19
20
|
|
|
20
21
|
|
|
21
|
-
def get_document_handle(path: str) ->
|
|
22
|
-
|
|
22
|
+
def get_document_handle(path: str) -> DocumentHandle:
|
|
23
|
+
_, extension = os.path.splitext(path)
|
|
24
|
+
handle = get_handle_by_extension(path, extension)
|
|
25
|
+
if handle is not None:
|
|
26
|
+
return handle
|
|
23
27
|
|
|
24
|
-
if
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
+
# no handle was found via the file extension (e.g., the path has none); use puremagic to determine the type
|
|
29
|
+
extension = puremagic.from_file(path)
|
|
30
|
+
handle = get_handle_by_extension(path, extension)
|
|
31
|
+
if handle is not None:
|
|
32
|
+
return handle
|
|
28
33
|
|
|
29
|
-
|
|
30
|
-
bs_doc = get_html_handle(path)
|
|
31
|
-
if bs_doc is not None:
|
|
32
|
-
return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=bs_doc)
|
|
34
|
+
raise excs.Error(f'Unrecognized document format: {path}')
|
|
33
35
|
|
|
34
|
-
if doc_format == '.md':
|
|
35
|
-
md_ast = get_markdown_handle(path)
|
|
36
|
-
if md_ast is not None:
|
|
37
|
-
return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
|
|
38
36
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
if bs_doc is not None:
|
|
42
|
-
return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
|
|
37
|
+
def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandle]:
|
|
38
|
+
doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
|
|
43
39
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
40
|
+
try:
|
|
41
|
+
if doc_format == ts.DocumentType.DocumentFormat.HTML:
|
|
42
|
+
return DocumentHandle(doc_format, bs_doc=get_html_handle(path))
|
|
43
|
+
if doc_format == ts.DocumentType.DocumentFormat.MD:
|
|
44
|
+
return DocumentHandle(doc_format, md_ast=get_markdown_handle(path))
|
|
45
|
+
if doc_format == ts.DocumentType.DocumentFormat.PDF:
|
|
46
|
+
return DocumentHandle(doc_format, pdf_doc=get_pdf_handle(path))
|
|
47
|
+
if doc_format == ts.DocumentType.DocumentFormat.XML:
|
|
48
|
+
return DocumentHandle(doc_format, bs_doc=get_xml_handle(path))
|
|
49
|
+
if doc_format == ts.DocumentType.DocumentFormat.TXT:
|
|
50
|
+
return DocumentHandle(doc_format, txt_doc=get_txt(path))
|
|
51
|
+
except Exception as exc:
|
|
52
|
+
raise excs.Error(f'An error occurred processing a {doc_format} document: {path}') from exc
|
|
48
53
|
|
|
49
54
|
return None
|
|
50
55
|
|
|
51
56
|
|
|
52
|
-
def
|
|
53
|
-
|
|
54
|
-
doc =
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# try to read one page
|
|
59
|
-
next(page for page in doc)
|
|
60
|
-
return doc
|
|
61
|
-
except Exception:
|
|
62
|
-
return None
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
|
|
66
|
-
try:
|
|
67
|
-
with open(path, 'r', encoding='utf8') as fp:
|
|
68
|
-
doc = bs4.BeautifulSoup(fp, 'lxml')
|
|
69
|
-
return doc if doc.find() is not None else None
|
|
70
|
-
except Exception:
|
|
71
|
-
return None
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
|
|
75
|
-
try:
|
|
76
|
-
with open(path, 'r', encoding='utf8') as fp:
|
|
77
|
-
doc = bs4.BeautifulSoup(fp, 'xml')
|
|
78
|
-
return doc if doc.find() is not None else None
|
|
79
|
-
except Exception:
|
|
80
|
-
return None
|
|
57
|
+
def get_html_handle(path: str) -> bs4.BeautifulSoup:
|
|
58
|
+
with open(path, 'r', encoding='utf8') as fp:
|
|
59
|
+
doc = bs4.BeautifulSoup(fp, 'lxml')
|
|
60
|
+
if doc.find() is None:
|
|
61
|
+
raise excs.Error(f'Not a valid HTML document: {path}')
|
|
62
|
+
return doc
|
|
81
63
|
|
|
82
64
|
|
|
83
|
-
def get_markdown_handle(path: str) ->
|
|
65
|
+
def get_markdown_handle(path: str) -> dict:
|
|
84
66
|
Env.get().require_package('mistune', [3, 0])
|
|
85
67
|
import mistune
|
|
86
68
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
return md_ast(text)
|
|
92
|
-
except Exception:
|
|
93
|
-
return None
|
|
69
|
+
with open(path, encoding='utf8') as file:
|
|
70
|
+
text = file.read()
|
|
71
|
+
md_ast = mistune.create_markdown(renderer=None)
|
|
72
|
+
return md_ast(text)
|
|
94
73
|
|
|
95
74
|
|
|
96
|
-
def
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
75
|
+
def get_pdf_handle(path: str) -> fitz.Document:
|
|
76
|
+
doc = fitz.open(path)
|
|
77
|
+
# check that this is actually a PDF (because fitz.open() will also work for images)
|
|
78
|
+
if not doc.is_pdf:
|
|
79
|
+
raise excs.Error(f'Not a valid PDF document: {path}')
|
|
80
|
+
# try to read one page
|
|
81
|
+
next(page for page in doc)
|
|
82
|
+
return doc
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_xml_handle(path: str) -> bs4.BeautifulSoup:
|
|
86
|
+
with open(path, 'r', encoding='utf8') as fp:
|
|
87
|
+
doc = bs4.BeautifulSoup(fp, 'xml')
|
|
88
|
+
if doc.find() is None:
|
|
89
|
+
raise excs.Error(f'Not a valid XML document: {path}')
|
|
90
|
+
return doc
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_txt(path: str) -> str:
|
|
94
|
+
with open(path, 'r', encoding='utf-8') as fp:
|
|
95
|
+
doc = fp.read()
|
|
96
|
+
return doc
|
pixeltable/utils/filecache.py
CHANGED
|
@@ -102,7 +102,7 @@ class FileCache:
|
|
|
102
102
|
def init(cls) -> None:
|
|
103
103
|
cls.__instance = cls()
|
|
104
104
|
|
|
105
|
-
def __init__(self):
|
|
105
|
+
def __init__(self) -> None:
|
|
106
106
|
self.cache = OrderedDict()
|
|
107
107
|
self.total_size = 0
|
|
108
108
|
self.capacity_bytes = int(Env.get()._file_cache_size_g * (1 << 30))
|
pixeltable/utils/http_server.py
CHANGED
|
@@ -3,6 +3,7 @@ import http.server
|
|
|
3
3
|
import logging
|
|
4
4
|
import pathlib
|
|
5
5
|
import urllib
|
|
6
|
+
from typing import Any
|
|
6
7
|
|
|
7
8
|
_logger = logging.getLogger('pixeltable.http.server')
|
|
8
9
|
|
|
@@ -38,7 +39,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
38
39
|
path = pathlib.Path(urllib.request.url2pathname(path))
|
|
39
40
|
return str(path)
|
|
40
41
|
|
|
41
|
-
def log_message(self, format, *args) -> None:
|
|
42
|
+
def log_message(self, format: str, *args: Any) -> None:
|
|
42
43
|
"""override logging to stderr in http.server.BaseHTTPRequestHandler"""
|
|
43
44
|
message = format % args
|
|
44
45
|
_logger.info(message.translate(self._control_char_table)) # type: ignore[attr-defined]
|
|
@@ -47,7 +48,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
47
48
|
class LoggingHTTPServer(http.server.ThreadingHTTPServer):
|
|
48
49
|
"""Avoids polluting stdout and stderr"""
|
|
49
50
|
|
|
50
|
-
def handle_error(self, request, client_address) -> None:
|
|
51
|
+
def handle_error(self, request, client_address) -> None: # type: ignore[no-untyped-def]
|
|
51
52
|
"""override socketserver.TCPServer.handle_error which prints directly to sys.stderr"""
|
|
52
53
|
import traceback
|
|
53
54
|
|
pixeltable/utils/pytorch.py
CHANGED
|
@@ -32,7 +32,7 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
|
|
|
32
32
|
|
|
33
33
|
self.path = path
|
|
34
34
|
self.image_format = image_format
|
|
35
|
-
assert image_format in
|
|
35
|
+
assert image_format in ('np', 'pt')
|
|
36
36
|
column_type_path = path / '.pixeltable.column_types.json'
|
|
37
37
|
assert column_type_path.exists(), f'missing {column_type_path}'
|
|
38
38
|
with column_type_path.open() as f:
|
pixeltable/utils/sql.py
CHANGED
|
@@ -4,7 +4,7 @@ import sqlalchemy as sql
|
|
|
4
4
|
from sqlalchemy.dialects import postgresql
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def log_stmt(logger: logging.Logger, stmt) -> None:
|
|
7
|
+
def log_stmt(logger: logging.Logger, stmt: sql.sql.ClauseElement) -> None:
|
|
8
8
|
logger.debug(f'executing {stmt.compile(dialect=postgresql.dialect())}')
|
|
9
9
|
|
|
10
10
|
|