pixeltable 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (106)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -1
  4. pixeltable/catalog/catalog.py +63 -36
  5. pixeltable/catalog/column.py +6 -4
  6. pixeltable/catalog/dir.py +5 -5
  7. pixeltable/catalog/globals.py +12 -14
  8. pixeltable/catalog/insertable_table.py +4 -7
  9. pixeltable/catalog/path.py +2 -2
  10. pixeltable/catalog/table.py +64 -56
  11. pixeltable/catalog/table_version.py +42 -40
  12. pixeltable/catalog/table_version_handle.py +3 -0
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/view.py +8 -7
  15. pixeltable/dataframe.py +5 -3
  16. pixeltable/env.py +108 -42
  17. pixeltable/exec/__init__.py +2 -0
  18. pixeltable/exec/aggregation_node.py +6 -8
  19. pixeltable/exec/cache_prefetch_node.py +4 -7
  20. pixeltable/exec/component_iteration_node.py +1 -3
  21. pixeltable/exec/data_row_batch.py +1 -2
  22. pixeltable/exec/exec_context.py +1 -1
  23. pixeltable/exec/exec_node.py +1 -2
  24. pixeltable/exec/expr_eval/__init__.py +2 -0
  25. pixeltable/exec/expr_eval/evaluators.py +137 -20
  26. pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
  27. pixeltable/exec/expr_eval/globals.py +68 -7
  28. pixeltable/exec/expr_eval/schedulers.py +25 -23
  29. pixeltable/exec/in_memory_data_node.py +8 -6
  30. pixeltable/exec/row_update_node.py +3 -4
  31. pixeltable/exec/sql_node.py +16 -17
  32. pixeltable/exprs/__init__.py +1 -1
  33. pixeltable/exprs/column_property_ref.py +1 -1
  34. pixeltable/exprs/column_ref.py +3 -3
  35. pixeltable/exprs/compound_predicate.py +1 -1
  36. pixeltable/exprs/data_row.py +17 -1
  37. pixeltable/exprs/expr.py +12 -12
  38. pixeltable/exprs/function_call.py +34 -2
  39. pixeltable/exprs/json_mapper.py +95 -48
  40. pixeltable/exprs/json_path.py +3 -4
  41. pixeltable/exprs/method_ref.py +2 -2
  42. pixeltable/exprs/object_ref.py +2 -2
  43. pixeltable/exprs/row_builder.py +33 -6
  44. pixeltable/exprs/similarity_expr.py +1 -1
  45. pixeltable/exprs/sql_element_cache.py +1 -1
  46. pixeltable/exprs/string_op.py +2 -2
  47. pixeltable/ext/__init__.py +1 -1
  48. pixeltable/ext/functions/__init__.py +1 -1
  49. pixeltable/ext/functions/whisperx.py +1 -1
  50. pixeltable/ext/functions/yolox.py +1 -1
  51. pixeltable/func/aggregate_function.py +1 -1
  52. pixeltable/func/callable_function.py +2 -5
  53. pixeltable/func/expr_template_function.py +22 -2
  54. pixeltable/func/function.py +4 -5
  55. pixeltable/func/function_registry.py +1 -1
  56. pixeltable/func/signature.py +1 -1
  57. pixeltable/func/udf.py +2 -2
  58. pixeltable/functions/__init__.py +1 -1
  59. pixeltable/functions/anthropic.py +2 -2
  60. pixeltable/functions/audio.py +1 -1
  61. pixeltable/functions/deepseek.py +1 -1
  62. pixeltable/functions/fireworks.py +1 -1
  63. pixeltable/functions/globals.py +6 -6
  64. pixeltable/functions/huggingface.py +1 -1
  65. pixeltable/functions/image.py +1 -1
  66. pixeltable/functions/json.py +1 -1
  67. pixeltable/functions/llama_cpp.py +1 -1
  68. pixeltable/functions/math.py +1 -1
  69. pixeltable/functions/mistralai.py +1 -1
  70. pixeltable/functions/ollama.py +1 -1
  71. pixeltable/functions/openai.py +2 -2
  72. pixeltable/functions/replicate.py +1 -1
  73. pixeltable/functions/string.py +1 -1
  74. pixeltable/functions/timestamp.py +1 -1
  75. pixeltable/functions/together.py +1 -1
  76. pixeltable/functions/util.py +1 -1
  77. pixeltable/functions/video.py +2 -2
  78. pixeltable/functions/vision.py +2 -2
  79. pixeltable/index/embedding_index.py +12 -1
  80. pixeltable/io/__init__.py +5 -3
  81. pixeltable/io/fiftyone.py +6 -7
  82. pixeltable/io/label_studio.py +21 -20
  83. pixeltable/io/pandas.py +6 -5
  84. pixeltable/iterators/__init__.py +1 -1
  85. pixeltable/metadata/__init__.py +5 -3
  86. pixeltable/metadata/converters/convert_24.py +3 -3
  87. pixeltable/metadata/converters/convert_25.py +1 -1
  88. pixeltable/metadata/converters/convert_29.py +1 -1
  89. pixeltable/store.py +2 -2
  90. pixeltable/type_system.py +19 -7
  91. pixeltable/utils/console_output.py +3 -2
  92. pixeltable/utils/coroutine.py +3 -3
  93. pixeltable/utils/dbms.py +66 -0
  94. pixeltable/utils/documents.py +61 -67
  95. pixeltable/utils/filecache.py +1 -1
  96. pixeltable/utils/http_server.py +3 -2
  97. pixeltable/utils/pytorch.py +1 -1
  98. pixeltable/utils/sql.py +1 -1
  99. pixeltable-0.3.11.dist-info/METADATA +436 -0
  100. pixeltable-0.3.11.dist-info/RECORD +179 -0
  101. pixeltable/catalog/path_dict.py +0 -169
  102. pixeltable-0.3.10.dist-info/METADATA +0 -382
  103. pixeltable-0.3.10.dist-info/RECORD +0 -179
  104. {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
  105. {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +0 -0
  106. {pixeltable-0.3.10.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
@@ -7,8 +7,8 @@ T = TypeVar('T')
7
7
 
8
8
 
9
9
  # TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
10
- # scheduler logic (e.g., inside the eval loop of a JsonMapper). Once the scheduler is fully general, it can be
11
- # removed.
10
+ # scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
11
+ # general, it can be removed.
12
12
 
13
13
 
14
14
  def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: float = 30) -> T:
@@ -16,7 +16,7 @@ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: floa
16
16
  Runs the given coroutine synchronously, even if called in the context of a running event loop.
17
17
  """
18
18
 
19
- def run_in_new_loop():
19
+ def run_in_new_loop() -> T:
20
20
  new_loop = asyncio.new_event_loop()
21
21
  asyncio.set_event_loop(new_loop)
22
22
  try:
@@ -0,0 +1,66 @@
1
+ import abc
2
+
3
+ from sqlalchemy import URL
4
+
5
+
6
+ class Dbms(abc.ABC):
7
+ """
8
+ Provides abstractions for utilities to interact with a database system.
9
+ """
10
+
11
+ name: str
12
+ transaction_isolation_level: str
13
+ version_index_type: str
14
+ db_url: URL
15
+
16
+ def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
17
+ self.name = name
18
+ self.transaction_isolation_level = transaction_isolation_level
19
+ self.version_index_type = version_index_type
20
+ self.db_url = db_url
21
+
22
+ @abc.abstractmethod
23
+ def drop_db_stmt(self, database: str) -> str: ...
24
+
25
+ @abc.abstractmethod
26
+ def create_db_stmt(self, database: str) -> str: ...
27
+
28
+ @abc.abstractmethod
29
+ def default_system_db_url(self) -> str: ...
30
+
31
+
32
+ class PostgresqlDbms(Dbms):
33
+ """
34
+ Implements utilities to interact with Postgres database.
35
+ """
36
+
37
+ def __init__(self, db_url: URL):
38
+ super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
39
+
40
+ def drop_db_stmt(self, database: str) -> str:
41
+ return f'DROP DATABASE {database}'
42
+
43
+ def create_db_stmt(self, database: str) -> str:
44
+ return f"CREATE DATABASE {database} ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
45
+
46
+ def default_system_db_url(self) -> str:
47
+ a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
48
+ return a
49
+
50
+
51
+ class CockroachDbms(Dbms):
52
+ """
53
+ Implements utilities to interact with CockroachDb database.
54
+ """
55
+
56
+ def __init__(self, db_url: URL):
57
+ super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
58
+
59
+ def drop_db_stmt(self, database: str) -> str:
60
+ return f'DROP DATABASE {database} CASCADE'
61
+
62
+ def create_db_stmt(self, database: str) -> str:
63
+ return f"CREATE DATABASE {database} TEMPLATE template0 ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C'"
64
+
65
+ def default_system_db_url(self) -> str:
66
+ return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
@@ -1,11 +1,12 @@
1
1
  import dataclasses
2
+ import os
2
3
  from typing import Optional
3
4
 
4
5
  import bs4
5
6
  import fitz # type: ignore[import-untyped]
6
7
  import puremagic
7
8
 
8
- import pixeltable.type_system as ts
9
+ from pixeltable import exceptions as excs, type_system as ts
9
10
  from pixeltable.env import Env
10
11
 
11
12
 
@@ -18,85 +19,78 @@ class DocumentHandle:
18
19
  txt_doc: Optional[str] = None
19
20
 
20
21
 
21
- def get_document_handle(path: str) -> Optional[DocumentHandle]:
22
- doc_format = puremagic.from_file(path)
22
+ def get_document_handle(path: str) -> DocumentHandle:
23
+ _, extension = os.path.splitext(path)
24
+ handle = get_handle_by_extension(path, extension)
25
+ if handle is not None:
26
+ return handle
23
27
 
24
- if doc_format == '.pdf':
25
- pdf_doc = get_pdf_handle(path)
26
- if pdf_doc is not None:
27
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.PDF, pdf_doc=pdf_doc)
28
+ # if no extension, use puremagic to determine the type
29
+ extension = puremagic.from_file(path)
30
+ handle = get_handle_by_extension(path, extension)
31
+ if handle is not None:
32
+ return handle
28
33
 
29
- if doc_format == '.html':
30
- bs_doc = get_html_handle(path)
31
- if bs_doc is not None:
32
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=bs_doc)
34
+ raise excs.Error(f'Unrecognized document format: {path}')
33
35
 
34
- if doc_format == '.md':
35
- md_ast = get_markdown_handle(path)
36
- if md_ast is not None:
37
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
38
36
 
39
- if doc_format == '.xml':
40
- bs_doc = get_xml_handle(path)
41
- if bs_doc is not None:
42
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
37
+ def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandle]:
38
+ doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
43
39
 
44
- if doc_format == '.txt':
45
- txt_doc = get_txt(path)
46
- if txt_doc is not None:
47
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.TXT, txt_doc=txt_doc)
40
+ try:
41
+ if doc_format == ts.DocumentType.DocumentFormat.HTML:
42
+ return DocumentHandle(doc_format, bs_doc=get_html_handle(path))
43
+ if doc_format == ts.DocumentType.DocumentFormat.MD:
44
+ return DocumentHandle(doc_format, md_ast=get_markdown_handle(path))
45
+ if doc_format == ts.DocumentType.DocumentFormat.PDF:
46
+ return DocumentHandle(doc_format, pdf_doc=get_pdf_handle(path))
47
+ if doc_format == ts.DocumentType.DocumentFormat.XML:
48
+ return DocumentHandle(doc_format, bs_doc=get_xml_handle(path))
49
+ if doc_format == ts.DocumentType.DocumentFormat.TXT:
50
+ return DocumentHandle(doc_format, txt_doc=get_txt(path))
51
+ except Exception as exc:
52
+ raise excs.Error(f'An error occurred processing a {doc_format} document: {path}') from exc
48
53
 
49
54
  return None
50
55
 
51
56
 
52
- def get_pdf_handle(path: str) -> Optional[fitz.Document]:
53
- try:
54
- doc = fitz.open(path)
55
- # check pdf (bc it will work for images)
56
- if not doc.is_pdf:
57
- return None
58
- # try to read one page
59
- next(page for page in doc)
60
- return doc
61
- except Exception:
62
- return None
63
-
64
-
65
- def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
66
- try:
67
- with open(path, 'r', encoding='utf8') as fp:
68
- doc = bs4.BeautifulSoup(fp, 'lxml')
69
- return doc if doc.find() is not None else None
70
- except Exception:
71
- return None
72
-
73
-
74
- def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
75
- try:
76
- with open(path, 'r', encoding='utf8') as fp:
77
- doc = bs4.BeautifulSoup(fp, 'xml')
78
- return doc if doc.find() is not None else None
79
- except Exception:
80
- return None
57
+ def get_html_handle(path: str) -> bs4.BeautifulSoup:
58
+ with open(path, 'r', encoding='utf8') as fp:
59
+ doc = bs4.BeautifulSoup(fp, 'lxml')
60
+ if doc.find() is None:
61
+ raise excs.Error(f'Not a valid HTML document: {path}')
62
+ return doc
81
63
 
82
64
 
83
- def get_markdown_handle(path: str) -> Optional[dict]:
65
+ def get_markdown_handle(path: str) -> dict:
84
66
  Env.get().require_package('mistune', [3, 0])
85
67
  import mistune
86
68
 
87
- try:
88
- with open(path, encoding='utf8') as file:
89
- text = file.read()
90
- md_ast = mistune.create_markdown(renderer=None)
91
- return md_ast(text)
92
- except Exception:
93
- return None
69
+ with open(path, encoding='utf8') as file:
70
+ text = file.read()
71
+ md_ast = mistune.create_markdown(renderer=None)
72
+ return md_ast(text)
94
73
 
95
74
 
96
- def get_txt(path: str) -> Optional[str]:
97
- try:
98
- with open(path, 'r', encoding='utf-8') as fp:
99
- doc = fp.read()
100
- return doc or None # replace '' with None
101
- except Exception:
102
- return None
75
+ def get_pdf_handle(path: str) -> fitz.Document:
76
+ doc = fitz.open(path)
77
+ # check pdf (bc it will work for images)
78
+ if not doc.is_pdf:
79
+ raise excs.Error(f'Not a valid PDF document: {path}')
80
+ # try to read one page
81
+ next(page for page in doc)
82
+ return doc
83
+
84
+
85
+ def get_xml_handle(path: str) -> bs4.BeautifulSoup:
86
+ with open(path, 'r', encoding='utf8') as fp:
87
+ doc = bs4.BeautifulSoup(fp, 'xml')
88
+ if doc.find() is None:
89
+ raise excs.Error(f'Not a valid XML document: {path}')
90
+ return doc
91
+
92
+
93
+ def get_txt(path: str) -> str:
94
+ with open(path, 'r', encoding='utf-8') as fp:
95
+ doc = fp.read()
96
+ return doc
@@ -102,7 +102,7 @@ class FileCache:
102
102
  def init(cls) -> None:
103
103
  cls.__instance = cls()
104
104
 
105
- def __init__(self):
105
+ def __init__(self) -> None:
106
106
  self.cache = OrderedDict()
107
107
  self.total_size = 0
108
108
  self.capacity_bytes = int(Env.get()._file_cache_size_g * (1 << 30))
@@ -3,6 +3,7 @@ import http.server
3
3
  import logging
4
4
  import pathlib
5
5
  import urllib
6
+ from typing import Any
6
7
 
7
8
  _logger = logging.getLogger('pixeltable.http.server')
8
9
 
@@ -38,7 +39,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
38
39
  path = pathlib.Path(urllib.request.url2pathname(path))
39
40
  return str(path)
40
41
 
41
- def log_message(self, format, *args) -> None:
42
+ def log_message(self, format: str, *args: Any) -> None:
42
43
  """override logging to stderr in http.server.BaseHTTPRequestHandler"""
43
44
  message = format % args
44
45
  _logger.info(message.translate(self._control_char_table)) # type: ignore[attr-defined]
@@ -47,7 +48,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
47
48
  class LoggingHTTPServer(http.server.ThreadingHTTPServer):
48
49
  """Avoids polluting stdout and stderr"""
49
50
 
50
- def handle_error(self, request, client_address) -> None:
51
+ def handle_error(self, request, client_address) -> None: # type: ignore[no-untyped-def]
51
52
  """override socketserver.TCPServer.handle_error which prints directly to sys.stderr"""
52
53
  import traceback
53
54
 
@@ -32,7 +32,7 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
32
32
 
33
33
  self.path = path
34
34
  self.image_format = image_format
35
- assert image_format in {'np', 'pt'}
35
+ assert image_format in ('np', 'pt')
36
36
  column_type_path = path / '.pixeltable.column_types.json'
37
37
  assert column_type_path.exists(), f'missing {column_type_path}'
38
38
  with column_type_path.open() as f:
pixeltable/utils/sql.py CHANGED
@@ -4,7 +4,7 @@ import sqlalchemy as sql
4
4
  from sqlalchemy.dialects import postgresql
5
5
 
6
6
 
7
- def log_stmt(logger: logging.Logger, stmt) -> None:
7
+ def log_stmt(logger: logging.Logger, stmt: sql.sql.ClauseElement) -> None:
8
8
  logger.debug(f'executing {stmt.compile(dialect=postgresql.dialect())}')
9
9
 
10
10