pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,5 +1,4 @@
1
1
  import dataclasses
2
- from typing import Optional, Union
3
2
 
4
3
  import pandas as pd
5
4
  from pandas.io.formats.style import Styler
@@ -7,11 +6,11 @@ from pandas.io.formats.style import Styler
7
6
 
8
7
  @dataclasses.dataclass
9
8
  class _Descriptor:
10
- body: Union[str, pd.DataFrame]
9
+ body: str | pd.DataFrame
11
10
  # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
12
11
  show_index: bool
13
12
  show_header: bool
14
- styler: Optional[Styler] = None
13
+ styler: Styler | None = None
15
14
 
16
15
 
17
16
  class DescriptionHelper:
@@ -25,6 +24,7 @@ class DescriptionHelper:
25
24
  DescriptionHelper can convert a list of descriptors into either HTML or plaintext and do something reasonable
26
25
  in each case.
27
26
  """
27
+
28
28
  __descriptors: list[_Descriptor]
29
29
 
30
30
  def __init__(self) -> None:
@@ -32,10 +32,10 @@ class DescriptionHelper:
32
32
 
33
33
  def append(
34
34
  self,
35
- descriptor: Union[str, pd.DataFrame],
35
+ descriptor: str | pd.DataFrame,
36
36
  show_index: bool = False,
37
37
  show_header: bool = True,
38
- styler: Optional[Styler] = None,
38
+ styler: Styler | None = None,
39
39
  ) -> None:
40
40
  self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
41
41
 
@@ -69,18 +69,17 @@ class DescriptionHelper:
69
69
  return (
70
70
  # Render the string as a single-cell DataFrame. This will ensure a consistent style of output in
71
71
  # cases where strings appear alongside DataFrames in the same DescriptionHelper.
72
- pd.DataFrame([descriptor.body]).style
73
- .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
74
- .hide(axis='index').hide(axis='columns')
72
+ pd.DataFrame([descriptor.body])
73
+ .style.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
74
+ .hide(axis='index')
75
+ .hide(axis='columns')
75
76
  )
76
77
  else:
77
78
  styler = descriptor.styler
78
79
  if styler is None:
79
80
  styler = descriptor.body.style
80
- styler = (
81
- styler
82
- .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
83
- .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
81
+ styler = styler.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'}).set_table_styles(
82
+ [{'selector': 'th', 'props': [('text-align', 'left')]}]
84
83
  )
85
84
  if not descriptor.show_header:
86
85
  styler = styler.hide(axis='columns')
@@ -1,86 +1,85 @@
1
1
  import dataclasses
2
- from typing import Optional
2
+ import os
3
3
 
4
4
  import bs4
5
- import fitz # type: ignore[import-untyped]
6
5
  import puremagic
6
+ from pypdfium2 import PdfDocument # type: ignore[import-untyped]
7
7
 
8
- import pixeltable.type_system as ts
8
+ from pixeltable import exceptions as excs, type_system as ts
9
9
  from pixeltable.env import Env
10
10
 
11
11
 
12
12
  @dataclasses.dataclass
13
13
  class DocumentHandle:
14
14
  format: ts.DocumentType.DocumentFormat
15
- bs_doc: Optional[bs4.BeautifulSoup] = None
16
- md_ast: Optional[dict] = None
17
- pdf_doc: Optional[fitz.Document] = None
15
+ bs_doc: bs4.BeautifulSoup | None = None
16
+ md_ast: dict | None = None
17
+ pdf_doc: PdfDocument | None = None
18
+ txt_doc: str | None = None
18
19
 
19
20
 
20
- def get_document_handle(path: str) -> Optional[DocumentHandle]:
21
- doc_format = puremagic.from_file(path)
21
+ def get_document_handle(path: str) -> DocumentHandle:
22
+ _, extension = os.path.splitext(path)
23
+ handle = get_handle_by_extension(path, extension)
24
+ if handle is not None:
25
+ return handle
22
26
 
23
- if doc_format == '.pdf':
24
- pdf_doc = get_pdf_handle(path)
25
- if pdf_doc is not None:
26
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.PDF, pdf_doc=pdf_doc)
27
+ # if no extension, use puremagic to determine the type
28
+ extension = puremagic.from_file(path)
29
+ handle = get_handle_by_extension(path, extension)
30
+ if handle is not None:
31
+ return handle
27
32
 
28
- if doc_format == '.html':
29
- bs_doc = get_html_handle(path)
30
- if bs_doc is not None:
31
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=bs_doc)
33
+ raise excs.Error(f'Unrecognized document format: {path}')
32
34
 
33
- if doc_format == '.md':
34
- md_ast = get_markdown_handle(path)
35
- if md_ast is not None:
36
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
37
35
 
38
- if doc_format == '.xml':
39
- bs_doc = get_xml_handle(path)
40
- if bs_doc is not None:
41
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
36
+ def get_handle_by_extension(path: str, extension: str) -> DocumentHandle | None:
37
+ doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
42
38
 
43
- return None
44
-
45
-
46
- def get_pdf_handle(path: str) -> Optional[fitz.Document]:
47
- try:
48
- doc = fitz.open(path)
49
- # check pdf (bc it will work for images)
50
- if not doc.is_pdf:
51
- return None
52
- # try to read one page
53
- next(page for page in doc)
54
- return doc
55
- except Exception:
56
- return None
57
-
58
-
59
- def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
60
39
  try:
61
- with open(path, 'r', encoding='utf8') as fp:
62
- doc = bs4.BeautifulSoup(fp, 'lxml')
63
- return doc if doc.find() is not None else None
64
- except Exception:
65
- return None
40
+ if doc_format == ts.DocumentType.DocumentFormat.HTML:
41
+ return DocumentHandle(doc_format, bs_doc=get_html_handle(path))
42
+ if doc_format == ts.DocumentType.DocumentFormat.MD:
43
+ return DocumentHandle(doc_format, md_ast=get_markdown_handle(path))
44
+ if doc_format == ts.DocumentType.DocumentFormat.PDF:
45
+ return DocumentHandle(doc_format, pdf_doc=PdfDocument(path))
46
+ if doc_format == ts.DocumentType.DocumentFormat.XML:
47
+ return DocumentHandle(doc_format, bs_doc=get_xml_handle(path))
48
+ if doc_format == ts.DocumentType.DocumentFormat.TXT:
49
+ return DocumentHandle(doc_format, txt_doc=get_txt(path))
50
+ except Exception as exc:
51
+ raise excs.Error(f'An error occurred processing a {doc_format} document: {path}') from exc
66
52
 
53
+ return None
67
54
 
68
- def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
69
- try:
70
- with open(path, 'r', encoding='utf8') as fp:
71
- doc = bs4.BeautifulSoup(fp, 'xml')
72
- return doc if doc.find() is not None else None
73
- except Exception:
74
- return None
55
+
56
+ def get_html_handle(path: str) -> bs4.BeautifulSoup:
57
+ with open(path, 'r', encoding='utf8') as fp:
58
+ doc = bs4.BeautifulSoup(fp, 'lxml')
59
+ if doc.find() is None:
60
+ raise excs.Error(f'Not a valid HTML document: {path}')
61
+ return doc
75
62
 
76
63
 
77
- def get_markdown_handle(path: str) -> Optional[dict]:
64
+ def get_markdown_handle(path: str) -> dict:
78
65
  Env.get().require_package('mistune', [3, 0])
79
66
  import mistune
80
- try:
81
- with open(path, encoding='utf8') as file:
82
- text = file.read()
83
- md_ast = mistune.create_markdown(renderer=None)
84
- return md_ast(text)
85
- except Exception:
86
- return None
67
+
68
+ with open(path, encoding='utf8') as file:
69
+ text = file.read()
70
+ md_ast = mistune.create_markdown(renderer=None)
71
+ return md_ast(text)
72
+
73
+
74
+ def get_xml_handle(path: str) -> bs4.BeautifulSoup:
75
+ with open(path, 'r', encoding='utf8') as fp:
76
+ doc = bs4.BeautifulSoup(fp, 'xml')
77
+ if doc.find() is None:
78
+ raise excs.Error(f'Not a valid XML document: {path}')
79
+ return doc
80
+
81
+
82
+ def get_txt(path: str) -> str:
83
+ with open(path, 'r', encoding='utf-8') as fp:
84
+ doc = fp.read()
85
+ return doc
@@ -0,0 +1,36 @@
1
+ import logging
2
+ from typing import Any, Callable, TypeVar
3
+
4
+ R = TypeVar('R')
5
+
6
+ logger = logging.getLogger('pixeltable')
7
+
8
+
9
+ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> R | None:
10
+ """
11
+ Runs a cleanup function. If interrupted, retry cleanup.
12
+ The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
13
+ If the `cleanup_func()` is interrupted during execution, it will be retried.
14
+
15
+ Args:
16
+ cleanup_func: an idempotent function
17
+ raise_error: raise an exception if an error occurs during cleanup.
18
+ """
19
+ try:
20
+ logger.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
21
+ return cleanup_func(*args, **kwargs)
22
+ except KeyboardInterrupt as interrupt:
23
+ # Save original exception and re-attempt cleanup
24
+ original_exception = interrupt
25
+ logger.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
26
+ try:
27
+ return cleanup_func(*args, **kwargs)
28
+ except Exception as e:
29
+ # Suppress this exception
30
+ logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
31
+ raise KeyboardInterrupt from original_exception
32
+ except Exception as e:
33
+ logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
34
+ if raise_error:
35
+ raise e
36
+ return None
@@ -5,21 +5,22 @@ import hashlib
5
5
  import logging
6
6
  import os
7
7
  import warnings
8
- from collections import OrderedDict, defaultdict, namedtuple
8
+ from collections import OrderedDict, defaultdict
9
9
  from dataclasses import dataclass
10
10
  from datetime import datetime, timezone
11
11
  from pathlib import Path
12
- from typing import Optional
12
+ from typing import NamedTuple
13
13
  from uuid import UUID
14
14
 
15
15
  import pixeltable.exceptions as excs
16
+ from pixeltable.config import Config
16
17
  from pixeltable.env import Env
17
18
 
18
19
  _logger = logging.getLogger('pixeltable')
19
20
 
21
+
20
22
  @dataclass
21
23
  class CacheEntry:
22
-
23
24
  key: str
24
25
  tbl_id: UUID
25
26
  col_id: int
@@ -56,7 +57,8 @@ class FileCache:
56
57
  TODO:
57
58
  - implement MRU eviction for queries that exceed the capacity
58
59
  """
59
- __instance: Optional[FileCache] = None
60
+
61
+ __instance: FileCache | None = None
60
62
 
61
63
  cache: OrderedDict[str, CacheEntry]
62
64
  total_size: int
@@ -77,11 +79,18 @@ class FileCache:
77
79
  evicted_working_set_keys: set[str]
78
80
  new_redownload_witnessed: bool # whether a new re-download has occurred since the last time a warning was issued
79
81
 
80
- FileCacheColumnStats = namedtuple('FileCacheColumnStats', ('tbl_id', 'col_id', 'num_files', 'total_size'))
81
- FileCacheStats = namedtuple(
82
- 'FileCacheStats',
83
- ('total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats')
84
- )
82
+ class FileCacheColumnStats(NamedTuple):
83
+ tbl_id: UUID
84
+ col_id: int
85
+ num_files: int
86
+ total_size: int
87
+
88
+ class FileCacheStats(NamedTuple):
89
+ total_size: int
90
+ num_requests: int
91
+ num_hits: int
92
+ num_evictions: int
93
+ column_stats: list[FileCache.FileCacheColumnStats]
85
94
 
86
95
  @classmethod
87
96
  def get(cls) -> FileCache:
@@ -93,7 +102,7 @@ class FileCache:
93
102
  def init(cls) -> None:
94
103
  cls.__instance = cls()
95
104
 
96
- def __init__(self):
105
+ def __init__(self) -> None:
97
106
  self.cache = OrderedDict()
98
107
  self.total_size = 0
99
108
  self.capacity_bytes = int(Env.get()._file_cache_size_g * (1 << 30))
@@ -117,17 +126,18 @@ class FileCache:
117
126
  return 0
118
127
  return int(self.total_size / len(self.cache))
119
128
 
120
- def num_files(self, tbl_id: Optional[UUID] = None) -> int:
129
+ def num_files(self, tbl_id: UUID | None = None) -> int:
121
130
  if tbl_id is None:
122
131
  return len(self.cache)
123
132
  return sum(e.tbl_id == tbl_id for e in self.cache.values())
124
133
 
125
- def clear(self, tbl_id: Optional[UUID] = None) -> None:
134
+ def clear(self, tbl_id: UUID | None = None) -> None:
126
135
  """
127
136
  For testing purposes: allow resetting capacity and stats.
128
137
  """
129
138
  if tbl_id is None:
130
- # We need to store the entries to remove in a list, because we can't remove items from a dict while iterating
139
+ # We need to store the entries to remove in a list, because we can't remove items from a dict
140
+ # while iterating
131
141
  entries_to_remove = list(self.cache.values())
132
142
  _logger.debug(f'clearing {self.num_files()} entries from file cache')
133
143
  self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
@@ -153,8 +163,9 @@ class FileCache:
153
163
  f'of the evicted file(s) is {round(extra_capacity_needed / (1 << 30), 1)} GiB.\n'
154
164
  f'Consider increasing the cache size to at least {round(suggested_cache_size / (1 << 30), 1)} GiB '
155
165
  f'(it is currently {round(self.capacity_bytes / (1 << 30), 1)} GiB).\n'
156
- f'You can do this by setting the value of `file_cache_size_g` in: {str(Env.get()._config_file)}',
157
- excs.PixeltableWarning
166
+ f'You can do this by setting the value of `file_cache_size_g` in: {Config.get().config_file}',
167
+ excs.PixeltableWarning,
168
+ stacklevel=2,
158
169
  )
159
170
  self.new_redownload_witnessed = False
160
171
 
@@ -163,7 +174,7 @@ class FileCache:
163
174
  h.update(url.encode())
164
175
  return h.hexdigest()
165
176
 
166
- def lookup(self, url: str) -> Optional[Path]:
177
+ def lookup(self, url: str) -> Path | None:
167
178
  self.num_requests += 1
168
179
  key = self._url_hash(url)
169
180
  entry = self.cache.get(key, None)
@@ -195,13 +206,15 @@ class FileCache:
195
206
  self.evicted_working_set_keys.add(key)
196
207
  self.new_redownload_witnessed = True
197
208
  self.keys_retrieved.add(key)
198
- entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, datetime.fromtimestamp(file_info.st_mtime), path.suffix)
209
+ entry = CacheEntry(
210
+ key, tbl_id, col_id, file_info.st_size, datetime.fromtimestamp(file_info.st_mtime), path.suffix
211
+ )
199
212
  self.cache[key] = entry
200
213
  self.total_size += entry.size
201
214
  new_path = entry.path
202
215
  os.rename(str(path), str(new_path))
203
216
  new_path.touch(exist_ok=True)
204
- _logger.debug(f'added entry for cell {url} to file cache')
217
+ _logger.debug(f'FileCache: cached url {url} with file name {new_path}')
205
218
  return new_path
206
219
 
207
220
  def ensure_capacity(self, size: int) -> None:
@@ -217,7 +230,9 @@ class FileCache:
217
230
  # Make a record of the eviction, so that we can generate a warning later if the key is retrieved again.
218
231
  self.keys_evicted_after_retrieval.add(lru_entry.key)
219
232
  os.remove(str(lru_entry.path))
220
- _logger.debug(f'evicted entry for cell {lru_entry.key} from file cache (of size {lru_entry.size // (1 << 20)} MiB)')
233
+ _logger.debug(
234
+ f'evicted entry for cell {lru_entry.key} from file cache (of size {lru_entry.size // (1 << 20)} MiB)'
235
+ )
221
236
 
222
237
  def set_capacity(self, capacity_bytes: int) -> None:
223
238
  self.capacity_bytes = capacity_bytes
@@ -228,15 +243,16 @@ class FileCache:
228
243
  # (tbl_id, col_id) -> (num_files, total_size)
229
244
  d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
230
245
  for entry in self.cache.values():
231
- t = d[(entry.tbl_id, entry.col_id)]
246
+ t = d[entry.tbl_id, entry.col_id]
232
247
  t[0] += 1
233
248
  t[1] += entry.size
234
249
  col_stats = [
235
- self.FileCacheColumnStats(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()
250
+ self.FileCacheColumnStats(tbl_id, col_id, num_files, size)
251
+ for (tbl_id, col_id), (num_files, size) in d.items()
236
252
  ]
237
253
  col_stats.sort(key=lambda e: e[3], reverse=True)
238
254
  return self.FileCacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)
239
255
 
240
256
  def debug_print(self) -> None:
241
257
  for entry in self.cache.values():
242
- print(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')
258
+ _logger.debug(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')
@@ -4,12 +4,13 @@ import io
4
4
  import json
5
5
  import logging
6
6
  import mimetypes
7
- from typing import Any, Callable, Optional
7
+ import uuid
8
+ from typing import Any, Callable
8
9
 
9
- import av # type: ignore[import-untyped]
10
+ import av
10
11
  import numpy as np
11
- import PIL
12
- import PIL.Image as Image
12
+ from PIL import Image
13
+ from pypdfium2 import PdfDocument # type: ignore[import-untyped]
13
14
 
14
15
  import pixeltable.type_system as ts
15
16
  from pixeltable.utils.http_server import get_file_uri
@@ -20,11 +21,11 @@ _logger = logging.getLogger('pixeltable')
20
21
  class Formatter:
21
22
  """
22
23
  A factory for constructing HTML formatters for Pixeltable data. The formatters are used to customize
23
- the rendering of `DataFrameResultSet`s in notebooks.
24
+ the rendering of `ResultSet`s in notebooks.
24
25
 
25
26
  Args:
26
- num_rows: Number of rows in the DataFrame being rendered.
27
- num_cols: Number of columns in the DataFrame being rendered.
27
+ num_rows: Number of rows in the `ResultSet` being rendered.
28
+ num_cols: Number of columns in the `ResultSet` being rendered.
28
29
  http_address: Root address of the Pixeltable HTTP server (used to construct URLs for media references).
29
30
  """
30
31
 
@@ -40,9 +41,13 @@ class Formatter:
40
41
  self.__num_cols = num_cols
41
42
  self.__http_address = http_address
42
43
 
43
- def get_pandas_formatter(self, col_type: ts.ColumnType) -> Optional[Callable]:
44
+ def get_pandas_formatter(self, col_type: ts.ColumnType) -> Callable | None:
44
45
  if col_type.is_string_type():
45
46
  return self.format_string
47
+ if col_type.is_uuid_type():
48
+ return self.format_uuid
49
+ if col_type.is_binary_type():
50
+ return self.format_binary
46
51
  if col_type.is_float_type():
47
52
  return self.format_float
48
53
  if col_type.is_json_type():
@@ -64,10 +69,24 @@ class Formatter:
64
69
  """
65
70
  Escapes special characters in `val`, and abbreviates `val` if its length exceeds `_STRING_MAX_LEN`.
66
71
  """
67
- return cls.__escape(cls.__abbreviate(val, cls.__STRING_MAX_LEN))
72
+ return cls.__escape(cls.abbreviate(val))
68
73
 
69
74
  @classmethod
70
- def __abbreviate(cls, val: str, max_len: int) -> str:
75
+ def format_uuid(cls, val: uuid.UUID | None) -> str:
76
+ """
77
+ Formats a UUID by converting it to a string and applying string formatting.
78
+ """
79
+ return '' if val is None else cls.format_string(str(val))
80
+
81
+ @classmethod
82
+ def format_binary(cls, val: bytes) -> str:
83
+ """
84
+ Formats binary data by converting it to an encoded string and applying string formatting.
85
+ """
86
+ return cls.format_string(str(val))
87
+
88
+ @classmethod
89
+ def abbreviate(cls, val: str, max_len: int = __STRING_MAX_LEN) -> str:
71
90
  if len(val) > max_len:
72
91
  edgeitems = (max_len - len(cls.__STRING_SEP)) // 2
73
92
  return f'{val[:edgeitems]}{cls.__STRING_SEP}{val[-edgeitems:]}'
@@ -95,41 +114,45 @@ class Formatter:
95
114
  )
96
115
 
97
116
  @classmethod
98
- def format_json(cls, val: Any) -> str:
117
+ def format_json(cls, val: Any, escape_strings: bool = True) -> str:
99
118
  if isinstance(val, str):
100
119
  # JSON-like formatting will be applied to strings that appear nested within a list or dict
101
120
  # (quote the string; escape any quotes inside the string; shorter abbreviations).
102
121
  # However, if the string appears in top-level position (i.e., the entire JSON value is a
103
122
  # string), then we format it like an ordinary string.
104
- return cls.format_string(val)
123
+ return cls.format_string(val) if escape_strings else cls.abbreviate(val)
105
124
  # In all other cases, dump the JSON struct recursively.
106
- return cls.__format_json_rec(val)
125
+ return cls.__format_json_rec(val, escape_strings)
107
126
 
108
127
  @classmethod
109
- def __format_json_rec(cls, val: Any) -> str:
128
+ def __format_json_rec(cls, val: Any, escape_strings: bool) -> str:
110
129
  if isinstance(val, str):
111
- return cls.__escape(json.dumps(cls.__abbreviate(val, cls.__NESTED_STRING_MAX_LEN)))
130
+ formatted = json.dumps(cls.abbreviate(val, cls.__NESTED_STRING_MAX_LEN))
131
+ return cls.__escape(formatted) if escape_strings else formatted
112
132
  if isinstance(val, float):
113
133
  return cls.format_float(val)
114
134
  if isinstance(val, np.ndarray):
115
135
  return cls.format_array(val)
116
136
  if isinstance(val, list):
117
137
  if len(val) < cls.__LIST_THRESHOLD:
118
- components = [cls.__format_json_rec(x) for x in val]
138
+ components = [cls.__format_json_rec(x, escape_strings) for x in val]
119
139
  else:
120
- components = [cls.__format_json_rec(x) for x in val[: cls.__LIST_EDGEITEMS]]
140
+ components = [cls.__format_json_rec(x, escape_strings) for x in val[: cls.__LIST_EDGEITEMS]]
121
141
  components.append('...')
122
- components.extend(cls.__format_json_rec(x) for x in val[-cls.__LIST_EDGEITEMS :])
142
+ components.extend(cls.__format_json_rec(x, escape_strings) for x in val[-cls.__LIST_EDGEITEMS :])
123
143
  return '[' + ', '.join(components) + ']'
124
144
  if isinstance(val, dict):
125
- kv_pairs = (f'{cls.__format_json_rec(k)}: {cls.__format_json_rec(v)}' for k, v in val.items())
145
+ kv_pairs = (
146
+ f'{cls.__format_json_rec(k, escape_strings)}: {cls.__format_json_rec(v, escape_strings)}'
147
+ for k, v in val.items()
148
+ )
126
149
  return '{' + ', '.join(kv_pairs) + '}'
127
150
 
128
151
  # Everything else
129
152
  try:
130
153
  return json.dumps(val)
131
154
  except TypeError: # Not JSON serializable
132
- return str(val)
155
+ return cls.__escape(str(val))
133
156
 
134
157
  def format_img(self, img: Image.Image) -> str:
135
158
  """
@@ -153,22 +176,19 @@ class Formatter:
153
176
  """
154
177
 
155
178
  def format_video(self, file_path: str) -> str:
156
- thumb_tag = ''
157
179
  # Attempt to extract the first frame of the video to use as a thumbnail,
158
180
  # so that the notebook can be exported as HTML and viewed in contexts where
159
181
  # the video itself is not accessible.
160
182
  # TODO(aaron-siegel): If the video is backed by a concrete external URL,
161
183
  # should we link to that instead?
162
- with av.open(file_path) as container:
163
- try:
164
- thumb = next(container.decode(video=0)).to_image()
165
- assert isinstance(thumb, Image.Image)
166
- with io.BytesIO() as buffer:
167
- thumb.save(buffer, 'jpeg')
168
- thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
169
- thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
170
- except Exception:
171
- pass
184
+ thumb = self.extract_first_video_frame(file_path)
185
+ if thumb is None:
186
+ thumb_tag = ''
187
+ else:
188
+ with io.BytesIO() as buffer:
189
+ thumb.save(buffer, 'jpeg')
190
+ thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
191
+ thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
172
192
  if self.__num_rows > 1:
173
193
  width = 320
174
194
  elif self.__num_cols > 1:
@@ -183,6 +203,16 @@ class Formatter:
183
203
  </div>
184
204
  """
185
205
 
206
+ @classmethod
207
+ def extract_first_video_frame(cls, file_path: str) -> Image.Image | None:
208
+ with av.open(file_path) as container:
209
+ try:
210
+ img = next(container.decode(video=0)).to_image()
211
+ assert isinstance(img, Image.Image)
212
+ return img
213
+ except Exception:
214
+ return None
215
+
186
216
  def format_audio(self, file_path: str) -> str:
187
217
  return f"""
188
218
  <div class="pxt_audio">
@@ -192,29 +222,18 @@ class Formatter:
192
222
  </div>
193
223
  """
194
224
 
195
- def format_document(self, file_path: str) -> str:
196
- max_width = max_height = 320
225
+ def format_document(self, file_path: str, max_width: int = 320, max_height: int = 320) -> str:
197
226
  # by default, file path will be shown as a link
198
227
  inner_element = file_path
199
228
  inner_element = html.escape(inner_element)
200
- # try generating a thumbnail for different types and use that if successful
201
- if file_path.lower().endswith('.pdf'):
202
- try:
203
- import fitz # type: ignore[import-untyped]
204
-
205
- doc = fitz.open(file_path)
206
- p = doc.get_page_pixmap(0)
207
- while p.width > max_width or p.height > max_height:
208
- # shrink(1) will halve each dimension
209
- p.shrink(1)
210
- data = p.tobytes(output='jpeg')
211
- thumb_base64 = base64.b64encode(data).decode()
212
- img_src = f'data:image/jpeg;base64,{thumb_base64}'
213
- inner_element = f"""
214
- <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
215
- """
216
- except:
217
- logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
229
+
230
+ thumb = self.make_document_thumbnail(file_path, max_width, max_height)
231
+ if thumb is not None:
232
+ with io.BytesIO() as buffer:
233
+ thumb.save(buffer, 'webp')
234
+ thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
235
+ thumb_tag = f'data:image/webp;base64,{thumb_base64}'
236
+ inner_element = f'<img style="object-fit: contain; border: 1px solid black;" src="{thumb_tag}" />'
218
237
 
219
238
  return f"""
220
239
  <div class="pxt_document" style="width:{max_width}px;">
@@ -224,6 +243,24 @@ class Formatter:
224
243
  </div>
225
244
  """
226
245
 
246
+ @classmethod
247
+ def make_document_thumbnail(cls, file_path: str, max_width: int = 320, max_height: int = 320) -> Image.Image | None:
248
+ """
249
+ Returns a thumbnail image of a document.
250
+ """
251
+ if file_path.lower().endswith('.pdf'):
252
+ try:
253
+ doc = PdfDocument(file_path)
254
+ if len(doc) == 0:
255
+ return None
256
+ img = doc[0].render().to_pil()
257
+ img.thumbnail((max_width, max_height), Image.LANCZOS)
258
+ return img
259
+ except Exception:
260
+ logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have pypdfium2 installed.')
261
+
262
+ return None
263
+
227
264
  @classmethod
228
265
  def __create_source_tag(cls, http_address: str, file_path: str) -> str:
229
266
  src_url = get_file_uri(http_address, file_path)